In [1]:
import pandas as pd
import numpy as np
import re
import glob
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import make_moons
from scipy import sparse


%matplotlib inline

In [2]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20

In [3]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [4]:
import string
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords

#Sanitizes the text by removing front and end punctuation, 
#making words lower case, and removing any empty strings.
def get_text_sanitized(tweet):
    return ' '.join([w.lower().strip().rstrip(string.punctuation)\
        .lstrip(string.punctuation).strip()\
        for w in tweet.replace('\xe2\x80\xa6', '').split(" ")\
        if w.strip().rstrip(string.punctuation).strip()])

#Gets the text, clean it, make it lower case, stem the words, and split
#into a vector. Also, remove stop words.
def get_text_normalized(tweet):
    #Sanitize the text first.
    text = get_text_sanitized(tweet).split()
    
    #Remove the stop words.
    text = [t for t in text if t not in [stopwords.words('english')] ]

    return text
    
    #Stemmer gets upset at a lot of tweets
    #Create the stemmer.
    stemmer = LancasterStemmer()
    
    #Stem the words.
    return [stemmer.stem(t) for t in text]

def purge_urls(tweet):
    return re.sub('http[s]?://(www. )?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', tweet)

def sanitize_dataset(dataset):
    sentences = []
    for sentence in dataset:
        sentences.append(get_text_normalized(purge_urls(sentence)))
    return sentences

### vegas


#### vegas before

In [5]:
pattern = "http"
dirs = glob.glob("data/data_Las_Vegas/2017-09*.csv")
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_csv(dir_, delimiter=";")

        new_sentences = list(df['text'].values)
        for sentence in new_sentences:
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)
            sentence = re.sub('http[s]?://(www. )?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', sentence)
            sentences.append(sentence)
    except Exception as e:
        print e

expected string or buffer
expected string or buffer


In [6]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [7]:
nmf = NMF(n_components=n_components, random_state=1,
          max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words=20)

Topic #0: vegas las strip north bellagio venetian cosmopolitan downtown wynn night baby center rock convention weekend welcome fabulous em paris club
Topic #1: bit ly vegastraffic xxmewb accident http blvd rd clark nb ave sb reported approaching ramp right beltway eb dr sahara
Topic #2: beautiful life lifeisbeautiful festival weekend amazing lib friends lifeisbeautifulfestival art people gorillaz best live experience truly like fun really ready
Topic #3: com twitter pic http nfl raidernation mp raiders tour home circlepix week https oakland dlvr listing washington retweet listed jets
Topic #4: lasvegas lasvegasstrip lifeisbeautiful like usa bellagio hiring vegas strip home vivalasvegas travel jobs flamingo bar world morning place ready great
Topic #5: casino hotel resort paris mirage aria flamingo hollywood planet orleans luxor excalibur bay mandalay york rock spa hard island bellagio
Topic #6: just posted photo video like stratosphere got planet fitness bar resort cause live bay manda

#### vegas after 


In [62]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20

In [63]:
pattern = "http"
dirs = glob.glob("data/data_Las_Vegas/2017-10*.csv")
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_csv(dir_, delimiter=";")
#         df = df[~df.text.str.contains(pattern)]
        new_sentences = list(df['text'].values)
        for sentence in new_sentences:
#             regex = re.compile('[^a-zA-Z]')
#             sentence = regex.sub(sentence, regex)
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)

            sentences.append(sentence)
    except Exception as e:
        print e

Error tokenizing data. C error: Expected 10 fields in line 296, saw 11

expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 239, saw 11

expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 1755, saw 11



In [64]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [65]:
nmf = NMF(n_components=n_components, random_state=1,
          max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words=20)

Topic #0: vegas las strip bellagio venetian cosmopolitan north wynn en downtown grand good palazzo mgm vegasbaby fabulous sign welcome view weekend
Topic #1: bit ly vegastraffic xxmewb accident http blvd rd clark ave nb sb reported approaching right eb dr charleston wb west
Topic #2: swarmapp nv las henderson vegas bar center lasairport cafe pic twitter grill lunch home restaurant station burger starbucks buffet picking
Topic #3: beer untp drinking http photo ipa khourysfinewine ale hopnutsbrewing zombies bottle share tenayacreek lager stout light bangerbrewing hop bar eagle
Topic #4: lasvegas bellagio lasvegasstrip sincity en usa lv tour sema home hiring travel depechemode fremont mandalaybay vegas wynn fun circlepix dtlv
Topic #5: casino hotel resort rock luxor hard mirage aria mandalay bay spa paris planet hollywood red york excalibur rio island suite
Topic #6: just posted photo video got park like want listed palace nightclub caesars retweet center red restaurant henderson world st

### houston before

In [11]:
pattern = "http"
dirs = glob.glob("data/data_Houston/2017-08-0*.csv") + glob.glob ("data/data_Houston/2017-08-10.csv") + glob.glob("data/data_Houston/2017-08-11.csv")  + glob.glob("data/data_Houston/2017-08-12.csv")  +glob.glob("data/data_Houston/2017-08-13.csv") +glob.glob("data/data_Houston/2017-08-14.csv") +glob.glob("data/data_Houston/2017-08-15.csv") + glob.glob("data/data_Houston/2017-08-16.csv")
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_csv(dir_, delimiter=";")
#         df = df[~df.text.str.contains(pattern)]
        new_sentences = list(df['text'].values)
        for sentence in new_sentences:
#             regex = re.compile('[^a-zA-Z]')
#             sentence = regex.sub(sentence, regex)
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)

            sentences.append(sentence)
    except Exception as e:
        print e

Error tokenizing data. C error: Expected 10 fields in line 70, saw 11

expected string or buffer
expected string or buffer


In [12]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [13]:
nmf = NMF(n_components=n_components, random_state=1,
          max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words=20)

Topic #0: traffic delay stop mins fwy cleared accident outbound lp inbound hwy sw katy rd ly bit stall northside nb gulf
Topic #1: swarmapp tx houston club pasadena intercontinental airport george pearland bush city brunch checkin sam sports starbucks ubercheckin automatically uber grill
Topic #2: bubly http beds baths tx st dr pearland ln pasadena bath rd houston porte ct deer la way lake bellaire
Topic #3: beer untp drinking http photo ipa flying ale hops hop saintarnold conservatoryhtx premiumdraught summer grill heights meet west casa better
Topic #4: twitter pic dlvr rt status http astros shit did way white lol people road circlepix fun george cool local try
Topic #5: houston texas southeast westside htown downtown htx north museum night en arts work sunday sundayfunday live mi la good fine
Topic #6: just posted photo video club fitness heights trying galleria live know pearland listed free petco events thanks store say game
Topic #7: day family wednesday special amazing school aw

### houston after 

In [55]:
n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20

In [56]:
pattern = "http"
dirs = glob.glob("data/data_Houston/2017-08-2*.csv") + glob.glob("data/data_Houston/2017-08-3*.csv")
#print dirs
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_csv(dir_, delimiter=";")
        new_sentences = list(df['text'].values)
        for sentence in new_sentences:
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)

            sentences.append(sentence)
    except Exception as e:
        print e

expected string or buffer
expected string or buffer
expected string or buffer
expected string or buffer
expected string or buffer


In [57]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [58]:
nmf = NMF(n_components=n_components, random_state=1,
          max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words=20)

Topic #0: houston texas westside pray share southeast downtown byb help city htown en day like god byt prayforhouston time eastside flood
Topic #1: water bit ly high lanes traffic main affecting http fwy sb hwy wb nb lp downtown baytown eb right harris
Topic #2: bubly http beds baths beer tx st untp dr closed drinking astros shelter flooding pasadena pearland new ln porte relief
Topic #3: harvey hurricane houston got rain needs relief prayers center look today flooded friday come good george stuck open thing home
Topic #4: hurricaneharvey safe prayforhouston stay houston need park downtown htx flooded rain ready houstonstrong dry buffalo going water home getting prayers
Topic #5: just posted photo video stadium hard downtown got houston house southeast getting la didn meyerland beer untp old don say
Topic #6: twitter pic status rt like god tour home astros looking getting time pray didn great http know going friday people
Topic #7: repost get_repost help people prayforhouston thanks wo

### puerto rico

In [20]:
pattern = "http"
dirs = glob.glob("data/data_San_Juan/2017-09-0*.csv")
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_csv(dir_, delimiter=";")
        new_sentences = list(df['text'].values)
        for sentence in new_sentences:
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)
            sentences.append(sentence)
    except Exception as e:
        print e
print dirs

['data/data_San_Juan/2017-09-06.csv', 'data/data_San_Juan/2017-09-07.csv', 'data/data_San_Juan/2017-09-05.csv', 'data/data_San_Juan/2017-09-04.csv', 'data/data_San_Juan/2017-09-01.csv', 'data/data_San_Juan/2017-09-03.csv', 'data/data_San_Juan/2017-09-02.csv', 'data/data_San_Juan/2017-09-09.csv', 'data/data_San_Juan/2017-09-08.csv']


In [21]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [22]:
nmf = NMF(n_components=n_components, random_state=1,
          max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words=20)

Topic #0: juan san argentina marquesado capital todo que hoy foto te es gracias feliz estadio dia el la del club boca
Topic #1: la hoy san es sabado siempre del juan gracias foto feliz estadio yo el lo dia club capital boca bicentenario
Topic #2: que lo es pero todo siempre yo por hoy quiero mas capital dia el juan del club boca estadio feliz
Topic #3: el hoy quiero boca yo foto es que capital feliz la juan gracias estadio lo dia del club bicentenario las
Topic #4: del bicentenario siempre estadio juan sabado san que club hoy gracias foto feliz yo es el las dia capital boca
Topic #5: te siempre quiero estadio sabado pero hoy que gracias juan dia la del el es club capital feliz foto boca
Topic #6: por gracias lo quiero siempre sabado la todo del estadio juan hoy bicentenario foto feliz boca es el dia club
Topic #7: sanjuan estadio boca bicentenario san yo juan gracias la hoy foto feliz el es lo dia del club capital las
Topic #8: una foto sabado bicentenario marquesado lo feliz las la ju

### maria  during

In [23]:
pattern = "http"
dirs = glob.glob("data/data_San_Juan/2017-09-2*.csv")
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_csv(dir_, delimiter=";")
        new_sentences = list(df['text'].values)
        for sentence in new_sentences:
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)

            sentences.append(sentence)
    except Exception as e:
        print e

In [24]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [25]:
nmf = NMF(n_components=n_components, random_state=1,
          max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words=20)

Topic #0: juan san argentina lucianopereyra marquesado dia vida foto lo domingo quiero club como del la el es feliz capital al
Topic #1: que lo mute la para el juan foto feliz es vida domingo del como club capital argentina al dia los
Topic #2: la lucianopereyra que una mute vida san es foto feliz el dia domingo del como club capital argentina al juan
Topic #3: el domingo para dia la juan foto feliz es vida lo del como club capital argentina al las los una
Topic #4: mi vida lucianopereyra para marquesado san es feliz foto el domingo la dia del como club capital argentina al juan
Topic #5: te quiero mas vida como que feliz mute san argentina capital club juan del dia al domingo el es foto
Topic #6: los quiero san mute es domingo juan foto feliz el del dia las como club capital argentina al la vida
Topic #7: sabado mute para vida domingo juan foto feliz es el dia las del como club capital argentina al la los
Topic #8: por vida quiero una lo club capital como del la argentina domingo el e

### Miami 

#### Before Irma

In [26]:
pattern = "http"
dirs = glob.glob("data/data_Miami/2017-08*.csv")
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_csv(dir_, delimiter=";")
        new_sentences = list(df['text'].values)
        for sentence in new_sentences:
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)
            sentence = re.sub('http[s]?://(www. )?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', sentence)
            sentences.append(sentence)
    except Exception as e:
        print e

Error tokenizing data. C error: Expected 10 fields in line 987, saw 11

Error tokenizing data. C error: Expected 10 fields in line 974, saw 11

Error tokenizing data. C error: Expected 10 fields in line 846, saw 11

Error tokenizing data. C error: Expected 10 fields in line 579, saw 11

Error tokenizing data. C error: Expected 10 fields in line 644, saw 11

Error tokenizing data. C error: Expected 10 fields in line 425, saw 12

Error tokenizing data. C error: Expected 10 fields in line 1105, saw 11

Error tokenizing data. C error: Expected 10 fields in line 276, saw 11

expected string or buffer


In [27]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [28]:
nmf = NMF(n_components=n_components, random_state=1,
          max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words=20)

Topic #0: miami international airport mia miamibeach southbeach downtown brickell city vacation beach home north vibes iflymia tbt like good fontainebleau miamilife
Topic #1: chance tonight storm pm hi forecast lo augth fl cloudy partly showers tue wednesday sunny storms mon sat thu tuesday
Topic #2: bit ly http sfltraffic blocked lane sb st accident nb disabled vehicle sr left ramp right express tpke cleared nwth
Topic #3: en una los mi acaba publicar las foto doral hoy este del ya por park noche tu kendall esta nuestro
Topic #4: com twitter pic http tour https home realestate keyes listing net dlvr utm_source utm_medium tecnohoy status circlepix virtual looking shop
Topic #5: beach south usa hotel miami southbeach miamibeach sunny ocean isles fontainebleau summer hollywood em hallandale north drive sea life riu
Topic #6: repost get_repost saturday amazing party today tonight tbt join ladies app coming meet tomorrow una esta tune augustth itunes class
Topic #7: florida usa miami doral

### during/after irma 

In [29]:
pattern = "http"
dirs = glob.glob("data/data_Miami/2017-09*.csv")
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_csv(dir_, delimiter=";")
        new_sentences = list(df['text'].values)
        for sentence in new_sentences:
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)
            sentence = re.sub('http[s]?://(www. )?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', sentence)
            sentences.append(sentence)
    except Exception as e:
        print e

expected string or buffer
expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 336, saw 11

expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 357, saw 11

expected string or buffer
expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 706, saw 11

expected string or buffer
expected string or buffer


In [30]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [31]:
nmf = NMF(n_components=n_components, random_state=1,
          max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words=20)

Topic #0: miami airport downtown international miamibeach mia brickell city em north tbt southbeach gardens iflymia good dade hurricaneirma night today live
Topic #1: sfltraffic bit ly http sr blocked accident expy nb lane st nwth sb rd palmetto disabled cleared vehicle ave right
Topic #2: chance storm tonight hi forecast lo pm septh fl showers sat mon storms sunday tue wednesday heights tuesday thu friday
Topic #3: en mi una los doral hoy brickell acaba publicar del por te foto este ya esta mexico para las dios
Topic #4: com twitter pic http tour home realestate https dlvr net keyes utm_medium utm_source tecnohoy listing circlepix looking virtual buyer status
Topic #5: irma hurricane hurricaneirma ready safe post huracan like miamibeach got stay relief week storm aftermath brickell southflorida thank help power
Topic #6: beach south sunny miamibeach southbeach isles miami hallandale north ocean usa isle tbt em hollywood pointe drive hotel fontainebleau vacation
Topic #7: repost get_re

### Sandy

##### Before sandy

In [32]:
pattern = "http"
dirs = glob.glob("data/data_New_York_City/2012-10-0*.csv")+ glob.glob("data/data_New_York_City/2012-10-1*.csv")+glob.glob("data/data_New_York_City/2012-10-20*.csv")+glob.glob("data/data_New_York_City/2012-10-21*.csv")+glob.glob("data/data_New_York_City/2012-10-22*.csv")+ glob.glob("data/data_New_York_City/2012-10-23*.csv")
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_csv(dir_, delimiter=";")

        new_sentences = list(df['text'].values)
        for sentence in new_sentences:
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)
            sentence = re.sub('http[s]?://(www. )?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', sentence)
            sentences.append(sentence)
    except Exception as e:
        print e

Error tokenizing data. C error: Expected 10 fields in line 687, saw 11

Error tokenizing data. C error: Expected 10 fields in line 41, saw 11

Error tokenizing data. C error: Expected 10 fields in line 683, saw 11

Error tokenizing data. C error: Expected 10 fields in line 553, saw 11

Error tokenizing data. C error: Expected 10 fields in line 240, saw 11

Error tokenizing data. C error: Expected 10 fields in line 473, saw 11

expected string or buffer
expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 34, saw 11

Error tokenizing data. C error: Expected 10 fields in line 37, saw 11

Error tokenizing data. C error: Expected 10 fields in line 242, saw 12

expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 3, saw 11

Error tokenizing data. C error: Expected 10 fields in line 856, saw 11

Error tokenizing data. C error: Expected 10 fields in line 105, saw 11



In [33]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [34]:
nmf = NMF(n_components=n_components, random_state=1,
          max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words=20)

Topic #0: http instagr park street newyork hall ly bar photo dinner bit posted city square art music day hotel house theater
Topic #1: new york ny bar city square street hotel hall lincoln times cafe west shop theatre club kitchen st world grill
Topic #2: pic twitter com path event http great tonight party right dinner awesome theatre birthday better food oh work el cool
Topic #3: like feel people look really nice shit looks make girl try fucking stop away open im niggas lmaoo getting ll
Topic #4: just foursquare mayor posted photo ousted think day saw came haha did fuck face wanna oh come noticed baby literally
Topic #5: love baby izod thanks guys lmfao friend omg haha fuck true nigga great team ya damn face tho missing forever
Topic #6: lol know ass right yea im tho think hard yeah smh stop text ya ll cause did use okay dont
Topic #7: brooklyn ny bridge barclays bowl park restaurant pic services studios dog free st club mall family fitness thanks church bronx
Topic #8: time hall part

#### after sandy

In [35]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20

In [36]:
pattern = "http"
dirs = glob.glob("data/data_New_York_City/2012-11*.csv") + glob.glob("data/data_New_York_City/2012-10-3*.csv")
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_csv(dir_, delimiter=";")

        new_sentences = list(df['text'].values)
        for sentence in new_sentences:
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)
            sentence = re.sub('http[s]?://(www. )?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', sentence)
            sentences.append(sentence)
    except Exception as e:
        print e

expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 445, saw 11

Error tokenizing data. C error: Expected 10 fields in line 585, saw 11

Error tokenizing data. C error: Expected 10 fields in line 753, saw 11

Error tokenizing data. C error: Expected 10 fields in line 677, saw 11

Error tokenizing data. C error: Expected 10 fields in line 808, saw 11

Error tokenizing data. C error: Expected 10 fields in line 196, saw 12

expected string or buffer
expected string or buffer
expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 418, saw 11

Error tokenizing data. C error: Expected 10 fields in line 385, saw 11

expected string or buffer
expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 146, saw 11

Error tokenizing data. C error: Expected 10 fields in line 460, saw 11

Error tokenizing data. C error: Expected 10 fields in line 224, saw 11

Error tokenizing data. C error: Expected 10 fields i

In [37]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [38]:
nmf = NMF(n_components=n_components, random_state=1,
          max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words=20)

Topic #0: http instagr center photo com rockefeller posted bar st sandy house dinner store christmas tree hall building red day little
Topic #1: pic com twitter path http sandy true look bad line finally ready fun yes tree storm happy way street rock
Topic #2: new york ny city club jersey bar square manhattan cafe st times terminal restaurant hall station amc bridge broadway flight
Topic #3: lol ok oh suck did rt cool hate gonna big fun right man phone ass omg ur better thank happy
Topic #4: just foursquare mayor posted photo ousted saw want life took did im ve live week finished asked day fuck half
Topic #5: like look looks feel bitch girls shit say bitches eating sure im house sound watching niggas sounds wtf does better
Topic #6: don know think come tell going let want mean gonna bad ll make fuck wait wanna use oh son understand
Topic #7: love baby thanks real_liam_payne miss make ll little forever follow happy let feel birthday gonna use america girl boy person
Topic #8: que la en 

### Boston bombing

#### Before

In [39]:
n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20

In [40]:
pattern = "http"
dirs = glob.glob("data/data_Boston/2013-03*.csv") + glob.glob("data/data_Boston/2013-04-0*.csv")
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_csv(dir_, delimiter=";")

        new_sentences = list(df['text'].values)
        for sentence in new_sentences:
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)
            sentence = re.sub('http[s]?://(www. )?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', sentence)
            sentences.append(sentence)
    except Exception as e:
        print e

Error tokenizing data. C error: Expected 10 fields in line 275, saw 11

expected string or buffer
expected string or buffer
expected string or buffer
expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 248, saw 11

Error tokenizing data. C error: Expected 10 fields in line 946, saw 13

expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 29, saw 11

Error tokenizing data. C error: Expected 10 fields in line 24, saw 11

Error tokenizing data. C error: Expected 10 fields in line 410, saw 11

Error tokenizing data. C error: Expected 10 fields in line 1604, saw 11

expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 189, saw 11

expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 671, saw 11



In [41]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [42]:
nmf = NMF(n_components=n_components, random_state=1,
          max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words=20)

Topic #0: http instagram com dinner photo house td beer day garden st east center dlvr happy little thanks untp bruins square
Topic #1: twitter pic com paxeast great lot today best did pax lmfao new hey bruins tonight wish lmao yes game restaurant
Topic #2: just got think lmao damn said did haha way home going cuz watch want don today new girl time long
Topic #3: like feel people shit ass looks really week rt come seriously ll girls bad live cause makes better look gets
Topic #4: boston ma time ly bit house http center east excited school best cambridge way restaurant paxeast college night report bad
Topic #5: know don let day wanna shit want doesn thanks mt dont time say yeah tomorrow trying probably ve won start
Topic #6: good time going day tonight look im gonna looking yeah bad think looks lmao ve today far ill right pretty
Topic #7: lol got right bro shit okay funny didn bitch ya lot come worst friends actually oh high house new team
Topic #8: love people better watching night hey

#### boston after 

In [51]:
n_samples = 2000
n_features = 1000
n_components = 15
n_top_words = 20

In [52]:
pattern = "http"
dirs = glob.glob("data/data_Boston/2013-04-15.csv") + glob.glob("data/data_Boston/2013-04-16.csv") + glob.glob("data/data_Boston/2013-04-17.csv")+ glob.glob("data/data_Boston/2013-04-18.csv")+ glob.glob("data/data_Boston/2013-04-19.csv")+ glob.glob("data/data_Boston/2013-04-2*.csv")
sentences = []
for dir_ in dirs:
    try:
        df = pd.read_csv(dir_, delimiter=";")

        new_sentences = list(df['text'].values)
        for sentence in new_sentences:
            sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            sentence = re.sub(" \d+", '', sentence)
            sentence = re.sub(r'\w*\d\w*', '', sentence)
            sentence = re.sub('http[s]?://(www. )?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', sentence)
            sentences.append(sentence)
    except Exception as e:
        print e

expected string or buffer
expected string or buffer
expected string or buffer
expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 233, saw 11

Error tokenizing data. C error: Expected 10 fields in line 43, saw 11

expected string or buffer
expected string or buffer
Error tokenizing data. C error: Expected 10 fields in line 787, saw 11

Error tokenizing data. C error: Expected 10 fields in line 392, saw 11

expected string or buffer
expected string or buffer


In [53]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=10,
                                   max_features=10**5,
                                   stop_words='english',
                                   strip_accents="ascii"
                                  )
tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf = tfidf.todense()
tfidf = np.unique(tfidf, axis=0)
tfidf = sparse.csr_matrix(tfidf)

In [54]:
nmf = NMF(n_components=n_components, random_state=1,
          max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words=20)

Topic #0: http instagram com bostonstrong beautiful redsox park yq fenway new prayforboston photo food day bar sunset square police dinner boston
Topic #1: pic twitter com bostonstrong little today game im happy rt gonna prayforboston home bruins night yes tonight school best seen
Topic #2: boston ma marathon house bostonstrong city manhunt way vs watertown bar great grill la police sports http bruins market center
Topic #3: like feel looks really make guy shit dont nigga stop wait yes wanna prayforboston haha lmao old cause school away
Topic #4: fenway park sox red redsox mlb astros vs night houston game lets pic instagram sunset new watching home bostonstrong best
Topic #5: just news saw did said life didn friend posted photo shit walk want gonna watching need heard work getting yes
Topic #6: love friends life miss man baby really girl news beautiful city great weird ya thank alive wish god let oh
Topic #7: don know shit want fuck wanna thingsthatirritateme let food haha tell rt aliv