In [1]:
import pandas as pd
import numpy as np
import re
import glob
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt

import datetime
import nltk
import time

from shared_lib import utils, vocabulary
from shared_lib import ngram_lm
from shared_lib import ngram_utils
from shared_lib import simple_trigram
from scipy import sparse

%matplotlib inline



In [2]:
# Directory holding the Reddit comment dumps. Every file below is read with
# lines=True, i.e. as line-delimited JSON. Change this one constant to run the
# notebook from a different checkout.
DATA_DIR = '/Users/krista/Desktop/w266-project-master/data/reddit/'


def load_comments(filename):
    """Read one line-delimited JSON comment dump from DATA_DIR into a DataFrame."""
    return pd.read_json(DATA_DIR + filename, lines=True)


boston_bomb_df = load_comments('boston_comments_2013.03-2013.05.txt')
boston_series_df = load_comments('boston_comments_2013.10-2013.11.txt')
colorado_df = load_comments('colorado_comments_2017.06-2017.09.txt')
florida_df = load_comments('florida_comments_2017.06-2017.09.txt')
houston_df = load_comments('houston_comments_2017.06-2017.09.txt')
miami_df = load_comments('miami_comments_2017.06-2017.09.txt')
nyc_df = load_comments('nyc_comments_2012.08-2012.12.txt')
puerto_rico_df = load_comments('puerto_rico_comments_2017.06-2017.09.txt')
vegas_df = load_comments('vegas_comments_2017.06-2017.09.txt')

In [3]:
# setup local times
def add_local_time(df, tz):
    """Add a tz-aware `created_at_local` column to `df` (modified in place).

    `created_utc` is parsed as seconds since the epoch, marked as UTC and
    converted to the timezone named by `tz`.
    """
    df['created_at_local'] = (pd.to_datetime(df['created_utc'], unit='s')
                              .dt.tz_localize('UTC')
                              .dt.tz_convert(tz))


# One (frame, timezone) pair per dataset. colorado_df was previously the only
# loaded frame that never received the column; it is included here for
# consistency with the others.
for frame, tz in [
        (boston_bomb_df, 'US/Eastern'),
        (boston_series_df, 'US/Eastern'),
        (colorado_df, 'US/Mountain'),
        (florida_df, 'US/Eastern'),
        (houston_df, 'US/Central'),
        (miami_df, 'US/Eastern'),
        (nyc_df, 'US/Eastern'),
        (puerto_rico_df, 'America/Puerto_Rico'),
        (vegas_df, 'US/Pacific'),
]:
    add_local_time(frame, tz)

In [4]:
# Settings used by the topic-model cells below.
n_components = 20   # number of topics requested from NMF
n_top_words = 10    # words printed per topic

# Not referenced by any of the cells shown in this notebook.
n_features = 1000
n_samples = 2000

In [5]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: object whose `components_` attribute yields one weight vector
            per topic; each vector must support `.argsort()`.
        feature_names: sequence mapping a column index to its word.
        n_top_words: number of words to print for each topic.

    Prints one "Topic #i: w1 w2 ..." line per topic, followed by a blank line.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_words = " ".join([feature_names[i] for i in top_indices])
        print("Topic #%d: %s" % (topic_idx, top_words))
    # print("") emits a blank line on both Python 2 and Python 3; a bare
    # print() shows "()" on Python 2, where print is a statement.
    print("")

### Vegas

In [6]:
# Read the Vegas comment dump and collect the cleaned comment bodies in `sentences`.
pattern = "http"  # not used by this cell
dirs = glob.glob("/Users/krista/Desktop/w266-project-master/data/reddit/vegas_comments_2017.06-2017.09.txt")
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_json(dir_, lines=True)
        new_sentences = list(df['body'].values)
        for sentence in new_sentences:
            # Remove URLs. NOTE(review): '^' with MULTILINE only matches a URL
            # that starts a line, so links in the middle of a line are kept.
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            # Remove space-prefixed digit runs, then any remaining token that contains a digit.
            sentence = re.sub(r" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)

            sentences.append(sentence)
    except Exception as e:
        # Best effort: report the error and move on to the next file.
        # print(e) is valid on Python 2 and 3; `print e` is a SyntaxError on Python 3.
        print(e)


In [7]:
# Turn the cleaned comments into a TF-IDF matrix and keep the vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=10,
    max_features=10**5,
    stop_words='english',
    strip_accents="ascii",
)
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Densify so np.unique can drop duplicate document rows, then go back to CSR.
tfidf = sparse.csr_matrix(np.unique(tfidf.todense(), axis=0))

In [8]:
# Fit an NMF topic model on the de-duplicated TF-IDF matrix (fixed seed).
nmf = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha=.1,
    l1_ratio=.5,
).fit(tfidf)

In [9]:
# Header first, then one line per topic with its top `n_top_words` terms.
print("\nTopics in NMF model:")
# Re-read the vocabulary from the fitted vectorizer.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(model=nmf,
                feature_names=tfidf_feature_names,
                n_top_words=n_top_words)



Topics in NMF model:
Topic #0: time uber away little just going high getting usually don
Topic #1: com https www amp http reddit imgur vegas google watch
Topic #2: thanks check look info definitely ll awesome help man ok
Topic #3: vegas las downtown city live north area fremont circus town
Topic #4: good pretty luck food like idea beer recommend stuff price
Topic #5: like looks sounds shit feel fuck look things doesn op
Topic #6: know don need let come care wrong question sorry doing
Topic #7: just read post say mean good hope saw wanted bad
Topic #8: people gt bad said life fucking wasn start school thing
Topic #9: does week day long got work oh hours guy time
Topic #10: ll tip people money probably don won make tipping tax
Topic #11: strip dont lot people casino places rooms street far hotel
Topic #12: great thank really place nice love amazing agree food awesome
Topic #13: lol room hotel mgm free pay pool dude actually going
Topic #14: ve got best years seen heard new used town hav

### Florida

In [10]:
# Read the Florida comment dump and collect the cleaned comment bodies in `sentences`.
pattern = "http"  # not used by this cell
dirs = glob.glob('/Users/krista/Desktop/w266-project-master/data/reddit/florida_comments_2017.06-2017.09.txt')
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_json(dir_, lines=True)
        new_sentences = list(df['body'].values)
        for sentence in new_sentences:
            # Remove URLs. NOTE(review): '^' with MULTILINE only matches a URL
            # that starts a line, so links in the middle of a line are kept.
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            # Remove space-prefixed digit runs, then any remaining token that contains a digit.
            sentence = re.sub(r" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)

            sentences.append(sentence)
    except Exception as e:
        # Best effort: report the error and move on to the next file.
        # print(e) is valid on Python 2 and 3; `print e` is a SyntaxError on Python 3.
        print(e)

In [11]:
# Turn the cleaned comments into a TF-IDF matrix and keep the vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=10,
    max_features=10**5,
    stop_words='english',
    strip_accents="ascii",
)
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Densify so np.unique can drop duplicate document rows, then go back to CSR.
tfidf = sparse.csr_matrix(np.unique(tfidf.todense(), axis=0))

In [12]:
# Fit an NMF topic model on the de-duplicated TF-IDF matrix (fixed seed).
nmf = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha=.1,
    l1_ratio=.5,
).fit(tfidf)

In [13]:
# Header first, then one line per topic with its top `n_top_words` terms.
print("\nTopics in florida NMF model:")
# Re-read the vocabulary from the fitted vectorizer.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(model=nmf,
                feature_names=tfidf_feature_names,
                n_top_words=n_top_words)



Topics in florida NMF model:
Topic #0: house bad getting hit went don left outside far goes
Topic #1: com https reddit amp www http comments bot news watch
Topic #2: florida south state north love central living lived fl coast
Topic #3: don people want understand trump fucking money government country care
Topic #4: good luck best oh thing idea ok really hear op
Topic #5: like looks look looking sounds guy dude fun old shit
Topic #6: just maybe case gonna got saw ride late moved guess
Topic #7: people going leave evacuate tell evacuation storm shelter problem aren
Topic #8: people power fpl lines duke grid company lost solar poor
Topic #9: thanks yeah man post doing thought guess trying exactly okay
Topic #10: safe stay power hope home sorry friend feel guys glad
Topic #11: water need food buy use store beer ice windows generator
Topic #12: know did way didn work let said half dont does
Topic #13: beach county live area st tampa miami west palm city
Topic #14: ve years got time seen s

### Miami

In [14]:
# Read the Miami comment dump and collect the cleaned comment bodies in `sentences`.
pattern = "http"  # not used by this cell
dirs = glob.glob('/Users/krista/Desktop/w266-project-master/data/reddit/miami_comments_2017.06-2017.09.txt')
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_json(dir_, lines=True)
        new_sentences = list(df['body'].values)
        for sentence in new_sentences:
            # Remove URLs. NOTE(review): '^' with MULTILINE only matches a URL
            # that starts a line, so links in the middle of a line are kept.
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            # Remove space-prefixed digit runs, then any remaining token that contains a digit.
            sentence = re.sub(r" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)

            sentences.append(sentence)
    except Exception as e:
        # Best effort: report the error and move on to the next file.
        # print(e) is valid on Python 2 and 3; `print e` is a SyntaxError on Python 3.
        print(e)

In [15]:

# Turn the cleaned comments into a TF-IDF matrix and keep the vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=10,
    max_features=10**5,
    stop_words='english',
    strip_accents="ascii",
)
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Densify so np.unique can drop duplicate document rows, then go back to CSR.
tfidf = sparse.csr_matrix(np.unique(tfidf.todense(), axis=0))

In [16]:
# Fit an NMF topic model on the de-duplicated TF-IDF matrix (fixed seed).
nmf = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha=.1,
    l1_ratio=.5,
).fit(tfidf)

In [17]:
# Header first, then one line per topic with its top `n_top_words` terms.
print("\nTopics in miami NMF model:")
# Re-read the vocabulary from the fitted vectorizer.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(model=nmf,
                feature_names=tfidf_feature_names,
                n_top_words=n_top_words)


Topics in miami NMF model:
Topic #0: just things way family friends school actually car worth bit
Topic #1: miami beach south north city dade downtown florida county fl
Topic #2: power fpl area lost internet night lines yesterday near hialeah
Topic #3: com https amp www reddit http news watch link message
Topic #4: good best luck brickell idea food place looking far friend
Topic #5: like looks look feel sounds person times guys sound nice
Topic #6: don know want need people understand english spanish speak drivers
Topic #7: people florida isn money state true poor driving price gt
Topic #8: going probably way better getting long time able rain orlando
Topic #9: know just people let didn love man dont help guy
Topic #10: thanks think don just try ok ll cool update want
Topic #11: bad going gt right hope lol say doing said saying
Topic #12: hurricane water storm need cat andrew irma windows doesn damage
Topic #13: got ve years ago time seen day year week went
Topic #14: want water gas r

### Puerto Rico

In [18]:
# Read the Puerto Rico comment dump and collect the cleaned comment bodies in `sentences`.
pattern = "http"  # not used by this cell
dirs = glob.glob('/Users/krista/Desktop/w266-project-master/data/reddit/puerto_rico_comments_2017.06-2017.09.txt')
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_json(dir_, lines=True)
        new_sentences = list(df['body'].values)
        for sentence in new_sentences:
            # Remove URLs. NOTE(review): '^' with MULTILINE only matches a URL
            # that starts a line, so links in the middle of a line are kept.
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            # Remove space-prefixed digit runs, then any remaining token that contains a digit.
            sentence = re.sub(r" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)

            sentences.append(sentence)
    except Exception as e:
        # Best effort: report the error and move on to the next file.
        # print(e) is valid on Python 2 and 3; `print e` is a SyntaxError on Python 3.
        print(e)

In [19]:
# Turn the cleaned comments into a TF-IDF matrix and keep the vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=10,
    max_features=10**5,
    stop_words='english',
    strip_accents="ascii",
)
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Densify so np.unique can drop duplicate document rows, then go back to CSR.
tfidf = sparse.csr_matrix(np.unique(tfidf.todense(), axis=0))

In [20]:
# Fit an NMF topic model on the de-duplicated TF-IDF matrix (fixed seed).
nmf = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha=.1,
    l1_ratio=.5,
).fit(tfidf)

In [21]:
# Header first, then one line per topic with its top `n_top_words` terms.
print("\nTopics in NMF model:")
# Re-read the vocabulary from the fitted vectorizer.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(model=nmf,
                feature_names=tfidf_feature_names,
                n_top_words=n_top_words)


Topics in NMF model:
Topic #0: la que el en para es por una del se
Topic #1: people pr time need way island going right help make
Topic #2: com https www amp http reddit facebook message youtube twitter
Topic #3: que lo hay yo tu como es mas le creo
Topic #4: puerto rico rican ricans statehood state government independence congress federal
Topic #5: el del lol esta video aqui este maria link mobile
Topic #6: en yo tengo que mi hay estoy tambien donde estan
Topic #7: family thank info hope hear news information heard ok help
Topic #8: es eso lo pero si mi bueno pa ponce esa
Topic #9: don think really vote reddit read thread bot place care
Topic #10: los son pr si todos pero op es esos nope
Topic #11: san juan area live island old rio areas near service
Topic #12: por te gracias si lo tu esta alguien pero fue
Topic #13: know ll let don try work spanish dont want really
Topic #14: gt yes man did isn love english year lt leave
Topic #15: people like fuck just said fucking feel shit thank 

### Boston

In [None]:
# Read the Boston (2013.03-2013.05) comment dump and collect the cleaned comment bodies in `sentences`.
pattern = "http"  # not used by this cell
dirs = glob.glob('/Users/krista/Desktop/w266-project-master/data/reddit/boston_comments_2013.03-2013.05.txt')
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_json(dir_, lines=True)
        new_sentences = list(df['body'].values)
        for sentence in new_sentences:
            # Remove URLs. NOTE(review): '^' with MULTILINE only matches a URL
            # that starts a line, so links in the middle of a line are kept.
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            # Remove space-prefixed digit runs, then any remaining token that contains a digit.
            sentence = re.sub(r" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)

            sentences.append(sentence)
    except Exception as e:
        # Best effort: report the error and move on to the next file.
        # print(e) is valid on Python 2 and 3; `print e` is a SyntaxError on Python 3.
        print(e)

In [None]:
# Turn the cleaned comments into a TF-IDF matrix and keep the vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=10,
    max_features=10**5,
    stop_words='english',
    strip_accents="ascii",
)
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Densify so np.unique can drop duplicate document rows, then go back to CSR.
tfidf = sparse.csr_matrix(np.unique(tfidf.todense(), axis=0))

In [None]:
# Fit an NMF topic model on the de-duplicated TF-IDF matrix (fixed seed).
nmf = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha=.1,
    l1_ratio=.5,
).fit(tfidf)

In [None]:
# Name the dataset in the header, as the florida and miami cells do, so the
# printed topic lists can be told apart.
print("\nTopics in boston NMF model:")
# Re-read the vocabulary from the fitted vectorizer.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

In [None]:
### Houston (disabled: the cells below are fully commented out)

In [None]:
# pattern = "http"
# dirs = glob.glob('/Users/krista/Desktop/w266-project-master/data/reddit/houston_comments_2017.06-2017.09.txt')
# sentences = []
# for dir_ in dirs:
#     try:
#         df = pd.read_json(dir_, lines=True)
# #         df = df[~df.text.str.contains(pattern)]
#         new_sentences = list(df['body'].values)
#         for sentence in new_sentences:
# #             regex = re.compile('[^a-zA-Z]')
# #             sentence = regex.sub(sentence, regex)
#             sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
#             sentence = re.sub(" \d+", '', sentence)
#             sentence = re.sub(r'\w*\d\w*', '', sentence)

#             sentences.append(sentence)
#     except Exception as e:
#         print e

In [None]:
# tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
#                                    max_features=10**5,
#                                    stop_words='english',
#                                    strip_accents="ascii"
#                                   )
# tfidf = tfidf_vectorizer.fit_transform(sentences)
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# tfidf = tfidf.todense()
# tfidf = np.unique(tfidf, axis=0)
# tfidf = sparse.csr_matrix(tfidf)
# print 'tfidf done'

In [None]:
# nmf = NMF(n_components=n_components, random_state=1,
#           beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
#           l1_ratio=.5).fit(tfidf)
# print 'nmf done'

In [None]:
# print("\nTopics in NMF model:")
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# print_top_words(nmf, tfidf_feature_names, n_top_words)

### NYC

In [None]:
# Read the NYC comment dump and collect the cleaned comment bodies in `sentences`.
pattern = "http"  # not used by this cell
dirs = glob.glob('/Users/krista/Desktop/w266-project-master/data/reddit/nyc_comments_2012.08-2012.12.txt')
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_json(dir_, lines=True)
        new_sentences = list(df['body'].values)
        for sentence in new_sentences:
            # Remove URLs. NOTE(review): '^' with MULTILINE only matches a URL
            # that starts a line, so links in the middle of a line are kept.
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            # Remove space-prefixed digit runs, then any remaining token that contains a digit.
            sentence = re.sub(r" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)

            sentences.append(sentence)
    except Exception as e:
        # Best effort: report the error and move on to the next file.
        # print(e) is valid on Python 2 and 3; `print e` is a SyntaxError on Python 3.
        print(e)

In [None]:
# Turn the cleaned comments into a TF-IDF matrix and keep the vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=10,
    max_features=10**5,
    stop_words='english',
    strip_accents="ascii",
)
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Densify so np.unique can drop duplicate document rows, then go back to CSR.
tfidf = sparse.csr_matrix(np.unique(tfidf.todense(), axis=0))


In [None]:
# Fit an NMF topic model on the de-duplicated TF-IDF matrix (fixed seed).
nmf = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha=.1,
    l1_ratio=.5,
).fit(tfidf)


In [None]:
# Name the dataset in the header, as the florida and miami cells do, so the
# printed topic lists can be told apart.
print("\nTopics in nyc NMF model:")
# Re-read the vocabulary from the fitted vectorizer.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)