In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import make_moons
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import datetime
import nltk
import time
import logging
import itertools

from shared_lib import utils, vocabulary
from shared_lib import ngram_lm
from shared_lib import ngram_utils
from shared_lib import simple_trigram
from scipy import sparse
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

%matplotlib inline

In [3]:
def clean_data(df):
        sentences = []
        new_sentences = list(df['body'].values)
        for sentence in new_sentences:
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)
            sentences.append(sentence)
        return sentences

In [4]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [5]:
boston_bomb_df = pd.read_json('/Users/krista/Desktop/w266-project-master/data/reddit/boston_comments_2013.03-2013.05.txt', lines=True)
boston_series_df = pd.read_json('/Users/krista/Desktop/w266-project-master/data/reddit/boston_comments_2013.10-2013.11.txt', lines=True)
colorado_df = pd.read_json('/Users/krista/Desktop/w266-project-master/data/reddit/colorado_comments_2017.06-2017.09.txt', lines=True)
florida_df = pd.read_json('/Users/krista/Desktop/w266-project-master/data/reddit/florida_comments_2017.06-2017.09.txt', lines=True)
houston_df = pd.read_json('/Users/krista/Desktop/w266-project-master/data/reddit/houston_comments_2017.06-2017.09.txt', lines=True)
miami_df = pd.read_json('/Users/krista/Desktop/w266-project-master/data/reddit/miami_comments_2017.06-2017.09.txt', lines=True)
nyc_df = pd.read_json('/Users/krista/Desktop/w266-project-master/data/reddit/nyc_comments_2012.08-2012.12.txt', lines=True)
puerto_rico_df = pd.read_json('/Users/krista/Desktop/w266-project-master/data/reddit/puerto_rico_comments_2017.06-2017.09.txt', lines=True)
vegas_df = pd.read_json('/Users/krista/Desktop/w266-project-master/data/reddit/vegas_comments_2017.06-2017.09.txt', lines=True)

In [6]:

# setup local times
boston_bomb_df['created_at_local'] = pd.to_datetime(boston_bomb_df['created_utc'], unit='s') \
                                     .dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
boston_series_df['created_at_local'] = pd.to_datetime(boston_series_df['created_utc'], unit='s') \
                                     .dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
florida_df['created_at_local'] = pd.to_datetime(florida_df['created_utc'], unit='s') \
                                     .dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
houston_df['created_at_local'] = pd.to_datetime(houston_df['created_utc'], unit='s') \
                                     .dt.tz_localize('UTC').dt.tz_convert('US/Central')
miami_df['created_at_local'] = pd.to_datetime(miami_df['created_utc'], unit='s') \
                                     .dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
nyc_df['created_at_local'] = pd.to_datetime(nyc_df['created_utc'], unit='s') \
                                 .dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
puerto_rico_df['created_at_local'] = pd.to_datetime(puerto_rico_df['created_utc'], unit='s') \
                                 .dt.tz_localize('UTC').dt.tz_convert('America/Puerto_Rico')
vegas_df['created_at_local'] = pd.to_datetime(vegas_df['created_utc'], unit='s') \
                                 .dt.tz_localize('UTC').dt.tz_convert('US/Pacific')

0       2017-05-31 17:02:09-07:00
1       2017-05-31 17:25:40-07:00
2       2017-05-31 17:34:44-07:00
3       2017-05-31 17:38:33-07:00
4       2017-05-31 18:17:50-07:00
5       2017-05-31 18:34:51-07:00
6       2017-05-31 18:38:49-07:00
7       2017-05-31 18:49:23-07:00
8       2017-05-31 19:03:11-07:00
9       2017-05-31 19:04:52-07:00
10      2017-05-31 19:06:20-07:00
11      2017-05-31 19:10:10-07:00
12      2017-05-31 19:12:37-07:00
13      2017-05-31 19:22:35-07:00
14      2017-05-31 19:23:16-07:00
15      2017-05-31 19:25:15-07:00
16      2017-05-31 19:27:52-07:00
17      2017-05-31 19:36:15-07:00
18      2017-05-31 19:37:57-07:00
19      2017-05-31 19:45:04-07:00
20      2017-05-31 19:52:39-07:00
21      2017-05-31 20:17:16-07:00
22      2017-05-31 20:38:52-07:00
23      2017-05-31 20:39:44-07:00
24      2017-05-31 20:40:17-07:00
25      2017-05-31 20:42:48-07:00
26      2017-05-31 20:45:08-07:00
27      2017-05-31 20:45:30-07:00
28      2017-05-31 20:45:44-07:00
29      2017-0

In [8]:
before_boston = (boston_bomb_df['created_at_local'] > '2013-02-28') & (boston_bomb_df['created_at_local'] <= '2013-04-14')

In [9]:
before_florida = (florida_df['created_at_local'] > '2017-05-31') & (florida_df['created_at_local'] <= '2017-08-30')

In [10]:
before_houston = (houston_df['created_at_local'] > ' 2017-05-31') & (houston_df['created_at_local'] <= '2017-08-16')


In [11]:
before_miami = (miami_df['created_at_local'] > '2017-05-31') & (miami_df['created_at_local'] <= '2017-08-30')

In [12]:
before_nyc = (nyc_df['created_at_local'] > '2012-07-31') & (nyc_df['created_at_local'] <= '2012-10-23')

In [13]:
before_puerto_rico = (puerto_rico_df['created_at_local'] > '2017-05-31') & (puerto_rico_df['created_at_local'] <= '2017-09-15')

In [15]:
before_vegas = (vegas_df['created_at_local'] > '2017-05-31') & (vegas_df['created_at_local'] <= '2017-09-30')

### Boston shooting

In [16]:
#### Before

In [17]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20

In [18]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(clean_data(boston_bomb_df.loc[before_boston]))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("\nTopics in boston before bombing NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in boston before bombing NMF model:
Topic #0: going work home getting make things way probably run high able won school having doesn traffic better different lot time
Topic #1: http com www org reddit jpg boston imgur html watch comments link wikipedia https youtube en wiki news amp facebook
Topic #2: thanks awesome thank check definitely cool ll looking wow look help haha ah god interesting advice info idea interested oh
Topic #3: boston city area cambridge new east places nice live nyc expensive water bars south town cities pizza york check living
Topic #4: like looks sounds feel shit old band yea look sound makes stuff kind real crazy idea nah kinda said piece
Topic #5: don know want boston let need cab people understand uber sorry care cabs forget internet regulations question anymore drivers seriously
Topic #6: just post thought doesn article comment saying mean person guess wanted read wasn globe didn come boston makes say op
Topic #7: good food beer luck house pretty jus

#### boston bombing after

In [19]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(clean_data(boston_bomb_df.loc[-before_boston]))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [20]:
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("\nTopics in Boston after bombing NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in Boston after bombing NMF model:
Topic #0: use time long year pay work having money way public days won school car high used need getting make able
Topic #1: com http www amp imgur jpg boston watch org https twitter youtube comments html link video google en wikipedia page
Topic #2: like looks look feel doesn sounds kind maybe sound bag person looked stuff bomb better dick way pressure different does
Topic #3: boston city new like beer town area year strong big favorite best moved cities sox red lived week beautiful living
Topic #4: ll work check definitely day good try place better time amazing damn soon ok glad doing awesome friend cool hopefully
Topic #5: just went ago saw away called happened hours got came minutes phone edit took explosion morning line said couple watching
Topic #6: don thanks know let link ll boston info want exactly appreciate check ok forget understand posting worry need answer sharing
Topic #7: people lot help trying stop family working world run blo

#### Florida Irma

In [21]:
###
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(clean_data(florida_df.loc[before_florida]))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("\nTopics in before Irma in florida NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in before Irma in florida NMF model:
Topic #0: people work things pay school isn having money doesn insurance cost want income use run instead make high living need
Topic #1: com https reddit http www amp comments post np message news originalpostsearcher bot org compose github faq morgan papernotes jpg
Topic #2: florida south north water central coast living best panhandle area winter favorite fl summer ocean welcome clear cheap east orlando
Topic #3: beach nice area west park orlando key palm city near tampa live hour visit drive beaches fun fl place st
Topic #4: like looks sounds cheese feel stuff america different honestly fall imagine driving weed summer water feels neighborhood bit bunch haha
Topic #5: just publix bad read fucking heat deal say humidity gun article ya holy edit big probably forgot late wish republicans
Topic #6: good man luck idea oh exactly job gets miss guy said thanks kid info know ok rules parents lot company
Topic #7: don know need world mean walk mi

In [22]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(clean_data(florida_df.loc[-before_florida]))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("\nTopics in during Irma in florida NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in during Irma in florida NMF model:
Topic #0: probably storm florida new hit state really far hurricanes hard cat time day having better area things remember storms bad
Topic #1: com https amp reddit www http comments bot news watch link message np imgur nbsp source org youtube info information
Topic #2: power lost went fpl days trees lines night house damage sunday generator tree duke hours ac lucky came grid neighborhood
Topic #3: hope bad pretty good luck best tampa orlando little staying eye friend north family hoping friends guys ll fort myers
Topic #4: don good know want need idea edit op point run luck care understand able option read help neighbor forget family
Topic #5: just maybe fine wait case ago saw wasn turn nope point trying nah like moved ones posted dick wondering plywood
Topic #6: like looks look guy sounds looking nice kind stuff times natural plane small kinda absolutely fake bunch try lmao ones
Topic #7: storm shelter evacuate evacuation leave windows wind

### houston

In [23]:
# tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
#                                    max_features=10**5,
#                                    stop_words='english',
#                                    strip_accents="ascii"
#                                   )
# tfidf = tfidf_vectorizer.fit_transform(clean_data(houston_df.loc[before_houston]))
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# tfidf = tfidf.todense()
# tfidf = np.unique(tfidf, axis=0)
# tfidf = sparse.csr_matrix(tfidf)
# nmf = NMF(n_components=n_components, random_state=1,
#           beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
#           l1_ratio=.5).fit(tfidf)
# print("\nTopics in before Harvey in Houston NMF model:")
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# print_top_words(nmf, tfidf_feature_names, n_top_words)

In [24]:
# tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
#                                    max_features=10**5,
#                                    stop_words='english',
#                                    strip_accents="ascii"
#                                   )
# tfidf = tfidf_vectorizer.fit_transform(clean_data(houston_df.loc[-before_houston]))
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# tfidf = tfidf.todense()
# tfidf = np.unique(tfidf, axis=0)
# tfidf = sparse.csr_matrix(tfidf)
# nmf = NMF(n_components=n_components, random_state=1,
#           beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
#           l1_ratio=.5).fit(tfidf)
# print("\nTopics in during/after Harvey in Houston NMF model:")
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# print_top_words(nmf, tfidf_feature_names, n_top_words)

#### Miami Irma

In [25]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(clean_data(miami_df.loc[before_miami]))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("\nTopics in Miami  before Irma NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in Miami  before Irma NMF model:
Topic #0: people need new problem going lot public start year want things city really bad especially far way isn doesn parking
Topic #1: miami beach north city dade best area la lauderdale season fc broward que stadium county pretty places clubs gonna fl
Topic #2: com https www http amp reddit google watch news pm message comments link facebook events post imgur org bot info
Topic #3: good nice bar cool pretty definitely luck little places happy food cheap restaurant stuff key recommend really havana spot fish
Topic #4: like looks look days feel true club hour super kid doral room range taken busy inside amazon price does apple
Topic #5: don know like sounds think understand interesting forget word kids seriously said says joke reason dude care miami tried sound
Topic #6: beach south park area uber lot downtown wynwood brickell street parking want west usually weekend near kendall everglades water night
Topic #7: thanks week sorry thank wow ll a

In [26]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(clean_data(miami_df.loc[-before_miami]))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("\nTopics in Miami before  Irma  NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in Miami before  Irma  NMF model:
Topic #0: like bad things gonna doesn better won really storm didn having year florida said years big change sure possible worst
Topic #1: power fpl lost internet lines came morning trees days working yesterday area ticket street restored weeks went comcast outage neighborhood
Topic #2: miami beach south north dade florida county west broward fl east city lakes according news local live palm hit center
Topic #3: com https amp www reddit http news comments watch message weather link stop video bot data imgur information channel edit
Topic #4: just hours ago guy love trying case went took moved came google doesnt saw law minutes hour account wanted reason
Topic #5: good hope best pretty luck idea new far soon hoping hopefully homestead evacuated place sucks wings bro friend neighbor sorry
Topic #6: don know need want let help really understand uber work believe dude waiting care doubt paid asking calling welcome flanigans
Topic #7: storm hurrican

### Sandy

In [None]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(clean_data(nyc_df.loc[before_nyc]))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("\nTopics NYC before Sandy NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

In [None]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(clean_data(nyc_df.loc[-before_nyc]))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("\nTopics in NYC after Sandy NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

### Maria

In [21]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(clean_data(puerto_rico_df.loc[before_puerto_rico]))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("\nTopics in Puerto Rico before Maria NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in Puerto Rico before Maria NMF model:
Topic #0: que la es el en para se mas del tiene como los por una ser al lo esta gente hay
Topic #1: like don people know just really work want lot island think way make ve spanish did time pretty thing better
Topic #2: la en del el isla parte estadidad independencia lado mayoria calle nunca casa internet por reggaeton los ya tienda medio
Topic #3: com https www http amp reddit youtube message watch jpg org comments np wiki imgur facebook source compose subject gov
Topic #4: el en es ha aqui esta ese dia todo este plebiscito video gobierno falta tipo otro nada sub yunque una
Topic #5: en que yo hay tambien donde estan una porque ver uno estoy tienen jajaja aqui voy trabajo amazon puedes tengo
Topic #6: puerto rico rican ricans government state america statehood independence congress status states economy independent born laws white colony territory tax
Topic #7: es eso pero si lo una bueno esa verdad muy bien buena pa ponce jaja tienes idea

In [23]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(clean_data(puerto_rico_df.loc[-before_puerto_rico]))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("\nTopics in Puerto Rico after Maria NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in Puerto Rico after Maria NMF model:
Topic #0: que en la el se los es las por pero para si una como hay del ya yo esta lo
Topic #1: going people work island things think pr right time money don lot way ve want thing just flights leave went
Topic #2: com https reddit amp www bot comments message http np totesmessenger thread vote compose twitter facebook puertorico puertoricoinformation links respect
Topic #3: puerto rico ricans government american rican federal state citizens florida states gt white americans country congress relief texas united nation
Topic #4: family hope thank hear heard update able ok haven information safe lives news okay friend friends soon reach caguas area
Topic #5: que lo es mas love contact tu esto te se place le yo eso uno creo estamos para decir espero
Topic #6: la en el gracias esta mi por del informacion guaynabo si muchas baja familia pa casa cabron toa rojo cabo
Topic #7: act jones shipping ships ship law goods american waiver understand foreig

### Vegas

In [16]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(clean_data(vegas_df.loc[before_vegas]))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("\nTopics in Vegas before Shooting NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in Vegas before Shooting NMF model:
Topic #0: way uber want end try little usually bit home just need away car high maybe hour deal lyft long drive
Topic #1: com https www amp http reddit imgur vegas message google try facebook news watch search youtube jpg comments link las
Topic #2: thanks check look info definitely ll help man haha wow ok appreciate sidebar gonna today reply link ah awesome love
Topic #3: vegas las downtown town north city street fremont live miss trip charleston area east blvd living la park grand old
Topic #4: good luck like idea bar food pretty beer recommend best price buffet steak chicken house point decent sounds island fun
Topic #5: like looks sounds feel fuck shit doesn op look things california city live kinda sound doing holy nevada man vacation
Topic #6: know don need let come care wrong use doing sorry question asking happen kind doesn want friends illegal make answer
Topic #7: just read thought post say hope maybe bad good wanted mean internet s

In [17]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(clean_data(vegas_df.loc[-before_vegas]))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("\nTopics in Vegas before Shooting NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.