In [101]:
import pandas as pd
import numpy as np
import gensim
import requests
from bs4 import BeautifulSoup
import time
import re

In [102]:
# Load the previously collected tweets (stacked_classified.csv) and preview the first rows
df = pd.read_csv('./datasets/stacked_classified.csv')
df.head()

Unnamed: 0,tweet_count,City,id,tweet_text,timestamp,hashtags,username,mentions,rewtweets,replies,link,day,hour,minute,during_fire,is-fire-related,text_nourl,key_score,from_locations,sent
0,366.0,"Paradise, CA",1.06e+18,"::paradise baby:: meet JD, the grandson of my ...",2018-11-16 16:31:06+00:00,#campfire,trekcass,@JulieRems,0,0,https://twitter.com/trekcass/status/1063469521...,16,16,31,1,1,"::paradise baby:: meet JD, the grandson of my ...",2,"Paradise, CA",1
1,700.0,"Paradise, CA",1.06e+18,"¡Coño! #CampFire @Chico, California https://ww...",2018-11-09 19:29:06+00:00,#CampFire,VXO,,0,0,https://twitter.com/VXO/status/106097760184037...,9,19,29,1,1,"¡Coño! #CampFire @Chico, California",1,"Chico, CA",1
2,494.0,"Paradise, CA",1.06e+18,. . . . . #photography #instapics #photographe...,2018-11-13 15:51:31+00:00,#photography #instapics #photographersofig #pi...,sidewayseightp,,0,0,https://twitter.com/sidewayseightp/status/1062...,13,15,51,1,0,. . . . . #photography #instapics #photographe...,0,"Chico, CA",1
3,53.0,"Paradise, CA",1.07e+18,.@Stucam7771 nailed it! The man in the White H...,2018-11-24 11:29:40+00:00,,XLComedy,@stucam7771,1,0,https://twitter.com/XLComedy/status/1066292766...,24,11,29,1,1,.@Stucam7771 nailed it! The man in the White H...,0,"Paradise, CA",1
4,201.0,"Oroville, CA",1.06e+18,‘Merurica... #voted #uklastudios #uklapictures...,2018-11-07 02:02:51+00:00,#voted #uklastudios #uklapictures,UKLA_Music,,0,0,https://twitter.com/UKLA_Music/status/10599895...,7,2,2,0,0,‘Merurica... #voted #uklastudios #uklapictures...,0,"Gridley, CA",1


In [103]:
# Count how often each hashtag occurs (after stripping the '#')
def count_hashtags(df):
    """Tally how many times each hashtag appears in ``df['hashtags']``.

    Missing entries are skipped. Every '#' character is removed from an entry
    before it is split on whitespace, so each resulting token is counted
    without its leading '#'.

    Returns:
        dict mapping hashtag text to its occurrence count.
    """
    tally = {}
    for tag_string in df['hashtags'].dropna():
        for tag in tag_string.replace("#", '').split():
            tally[tag] = tally.get(tag, 0) + 1
    return tally

# Created a dictionary of 'root words' related to a wildfire
# Iterate over the hashtags and this list to build a keywords list to score tweets
def build_keywords(hashtags):
    """Select the hashtags that contain one of the wildfire-related root words.

    Args:
        hashtags: iterable of hashtag strings (any capitalization).

    Returns:
        list of the matching hashtags, lowercased and de-duplicated. The list
        is built from a set, so its order is not guaranteed.
    """
    word_filter = ['fire', 'evac', 'smok', 'burn', 'wild', 'blaz', 'hell', 'department',
                   'inferno', 'help']
    matched = set()
    for tag in set(hashtags):
        lowered = tag.lower()
        # A tag qualifies as soon as any root word occurs anywhere inside it
        if any(root in lowered for root in word_filter):
            matched.add(lowered)
    return list(matched)

# check each tweet for keyterms and score them
# trying to also account for plural words and capitalizations
# key_score is just the count of the keywords in the tweet
def keyscoring(df, keywords):
    """Score each tweet by how many of its words are keywords.

    Side effects on ``df`` (nothing is returned):
      * ``df['tweet_text']`` is replaced by its lowercased version.
      * ``df['key_score']`` is added, holding the per-tweet keyword count.

    A word counts when, after punctuation is stripped, it is in ``keywords``,
    or when it ends in 's' and the word without that trailing 's' is in
    ``keywords`` (a simple plural check). Rows whose text is missing score 0.

    Args:
        df: DataFrame with a 'tweet_text' column. Any index is supported;
            rows are processed in positional order.
        keywords: iterable of lowercase keyword strings.
    """
    keyword_set = set(keywords)  # constant-time membership instead of scanning a list
    df['tweet_text'] = df['tweet_text'].str.lower()  # removing all capitals

    scores = []
    # Iterate over the values themselves rather than index labels, so the
    # scores line up with the rows even when the index is not 0..n-1.
    for text in df['tweet_text']:
        score = 0
        if isinstance(text, str):  # missing text (NaN/None) has no words to score
            for token in text.split():
                cleaned = re.sub(r'[^\w\s]', '', token)  # removing all punctuation
                if cleaned in keyword_set:
                    score += 1
                elif cleaned.endswith('s') and cleaned[:-1] in keyword_set:
                    score += 1
        scores.append(score)
    df['key_score'] = scores

# This scrapes each tweet for the from location at the bottom of some tweets
# This is a Twitter generated location if enabled by the user
def get_locs(df, request_timeout=10):
    """Scrape each tweet's page for its location text and store it on ``df``.

    For every row the URL in ``df['link']`` is fetched and the first
    ``<span class="permalink-tweet-geo-text">`` is read. The word "from" is
    removed from its text and the remainder stripped. If the result is exactly
    'California, USA', the row's ``City`` value is used instead. Rows with no
    such span, or whose request fails, get the placeholder 'nolocationfound'.

    Side effect: adds the ``from_locations`` column to ``df``; nothing is returned.

    Args:
        df: DataFrame with 'link' and 'City' columns. Any index is supported.
        request_timeout: seconds to wait for each HTTP request before giving
            up on that tweet.
    """
    locs = []
    df_num_rows = df.shape[0]
    rows = zip(df.index, df['link'], df['City'])
    # `position` drives throttling and progress; `label` is only used in messages.
    for position, (label, url, city) in enumerate(rows):
        location = None
        try:
            res = requests.get(url, timeout=request_timeout)
            soup = BeautifulSoup(res.content, 'lxml')
            spans = soup.find_all('span', {'class' : 'permalink-tweet-geo-text'})
            if spans:
                location = spans[0].text.replace("from",'').strip()
        except requests.RequestException as err:
            # One unreachable tweet should not abort the whole scrape
            print(f'tweet #{label} could not be fetched: {err}')

        if location is None:
            print(f'tweet #{label} has no location info')
            locs.append('nolocationfound')
        elif location == 'California, USA':
            locs.append(city)
        else:
            locs.append(location)

        # Pause every 25 requests and report progress every 75
        if position % 25 == 0:
            time.sleep(3)
        if position % 75 == 0:
            print(f'Located {position} out of {df_num_rows} tweets.  {position/df_num_rows:.2%}')
    df['from_locations'] = locs

# This just removes the url from the tweet text, this is because URLs can do weird things in NLP
def remove_url(df):
    """Add ``df['text_nourl']``: the tweet text with URLs removed.

    Anything shaped like ``scheme://host[/path...]`` is deleted from each
    value of ``df['tweet_text']``. Values that are not strings (e.g. missing
    text) are copied through unchanged instead of raising.

    Side effect: adds the ``text_nourl`` column to ``df``; nothing is returned.
    """
    url_pattern = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')
    df['text_nourl'] = [
        url_pattern.sub('', text) if isinstance(text, str) else text
        for text in df['tweet_text']
    ]

# This runs all the steps to clean an imported tweet dataset
def func_master(df):
    """Run every cleaning step on an imported tweet DataFrame, in place.

    Steps, in order: derive keywords from the hashtags, add the URL-free text
    column, add the keyword score, then look up each tweet's location.
    Nothing is returned; the helpers write their results onto ``df``.
    """
    hashtag_counts = count_hashtags(df)
    keywords = build_keywords(list(hashtag_counts))
    # URL stripping runs before scoring because keyscoring lowercases tweet_text
    remove_url(df)
    keyscoring(df, keywords)
    get_locs(df)

In [104]:
# Run the full cleaning pipeline; it adds columns to df in place and requests every tweet link
func_master(df)


Unnamed: 0,tweet_count,City,id,tweet_text,timestamp,hashtags,username,mentions,rewtweets,replies,link,day,hour,minute,during_fire,is-fire-related,text_nourl,key_score,from_locations,sent
204,488.0,"Paradise, CA",1.06e+18,amazing story coming out on how allyn pierce t...,2018-11-13 16:58:03+00:00,#Tundra #Paradise,andybell,@Toyota,0,1,https://twitter.com/andybell/status/1062389136...,13,16,58,1,1,Amazing story coming out on how Allyn Pierce t...,0,"Paradise, CA",1
327,,"Paradise, CA",1.06e+18,can't wait for tonight. see ya in a bit can't ...,2018-11-05 00:54:06+00:00,,teamwilkins,,0,0,https://twitter.com/teamwilkins/status/1059247...,5,0,54,0,0,Can't wait for tonight. See ya in a bit Can't ...,0,"Chico, CA",1
328,,"Paradise, CA",1.06e+18,can't wait for you to join us to share a meal ...,2018-11-02 18:56:06+00:00,,teamwilkins,,1,0,https://twitter.com/teamwilkins/status/1058432...,2,18,56,0,0,Can't wait for you to join us to share a meal ...,0,"Chico, CA",1


In [98]:
df.head()  # spot-check the first rows of df

Unnamed: 0.1,Unnamed: 0,tweet_count,City,id,tweet_text,timestamp,hashtags,username,mentions,rewtweets,replies,link,day,hour,minute,during_fire,text_nourl,key_score,from_locations
0,144,,"Paradise, CA",1.057787e+18,chico ca wed oct 31st pm forecast: tonight mos...,2018-11-01 00:10:34+00:00,,_ChicoCA,,0,0,https://twitter.com/_ChicoCA/status/1057786941...,1,0,10,0,chico ca wed oct 31st pm forecast: tonight mos...,0,"Chico, CA"
1,143,,"Paradise, CA",1.057787e+18,happy halloween from harry potter and danaerys...,2018-11-01 00:12:17+00:00,,itoddsmama,,0,0,https://twitter.com/itoddsmama/status/10577873...,1,0,12,0,happy halloween from harry potter and danaerys...,0,"Chico, CA"
2,142,,"Paradise, CA",1.057788e+18,"happy halloween • @chico, california https://w...",2018-11-01 00:12:58+00:00,,hannahmariexx_,,0,0,https://twitter.com/hannahmariexx_/status/1057...,1,0,12,0,"happy halloween • @chico, california",0,"Chico, CA"
3,182,,"Oroville, CA",1.057789e+18,just posted a video @ukla studios https://www....,2018-11-01 00:18:43+00:00,,UKLA_Music,,0,0,https://twitter.com/UKLA_Music/status/10577889...,1,0,18,0,just posted a video @ukla studios,0,"Gridley, CA"
4,181,,"Oroville, CA",1.057789e+18,just posted a video @ukla studios https://www....,2018-11-01 00:19:37+00:00,,UKLA_Music,,0,0,https://twitter.com/UKLA_Music/status/10577892...,1,0,19,0,just posted a video @ukla studios,0,"Gridley, CA"


In [99]:
# Number of tweets at each key_score value
df['key_score'].value_counts()

0    604
1    369
2    107
3     28
4     15
5      4
6      3
Name: key_score, dtype: int64

In [106]:
df.to_csv('./datasets/stacked_clean_again.csv', index=False)  # export df; index=False keeps the index out of the file

### Currently unused

In [7]:
# Goal: combine the hashtag list with words suggested by a gensim word-vector model
# to build a dictionary of key terms.
# Load the word vectors into `model`.
model = gensim.models.KeyedVectors.load_word2vec_format('../lexvec.enwiki+newscrawl.300d.W.pos.vectors')


In [8]:
# Expand a few seed words into a larger vocabulary of related words
keywords = ['fire', 'smoke', 'wildfire', 'campfire', 'forest', 'evacuate', 'hell']
# For every seed, keep the first element of each of its 25 most_similar results
keyt = [
    match[0]
    for seed in keywords
    for match in model.most_similar(seed, topn = 25)
]
# Collapse duplicates that were suggested for more than one seed
word_list = list(set(keyt))

In [9]:
# Compare the model-derived words with the hashtags to build a list for scoring tweets.
# `keyterms` is otherwise only a local variable inside count_hashtags, so it is
# computed here explicitly; without this the cell raises NameError on a fresh kernel.
keyterms = count_hashtags(df)
important_words = [word for word in keyterms if word in word_list]

In [None]:
# Sanity check: tweets whose text contains 'fires' but that still have a key_score of 0
df[(df['tweet_text'].str.contains('fires')) & (df['key_score'] == 0)]