## Imports

In [171]:
import pandas as pd
import numpy as np
from IPython.display import display, Markdown, Latex
import GetOldTweets3 as got
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import datetime


# Show up to 75 rows when displaying a frame. The fully-qualified key is used because
# the short form "max_rows" is matched as a pattern and is ambiguous on pandas versions
# that also define styler.render.max_rows.
pd.set_option("display.max_rows", 75)


## Function to Pull Tweets

In [125]:
# This function uses Reem's GOT3 code to import tweets given query, date range and number of tweets and saves it to a csv file

def get_tweets(query, start='2006-03-21', end=None, maxtweets=1000):
    """Pull tweets matching ``query`` with GetOldTweets3 and save them as a CSV.

    Parameters
    ----------
    query : str
        Search string handed to ``setQuerySearch``; also used in the output file name.
    start : str
        Lower date bound handed to ``setSince`` ('YYYY-MM-DD').
    end : str, optional
        Upper date bound handed to ``setUntil`` ('YYYY-MM-DD'). When omitted, today's
        date is used, evaluated at call time rather than when the function was defined.
    maxtweets : int
        Cap handed to ``setMaxTweets``; also used in the output file name.

    Returns
    -------
    The result of ``DataFrame.to_csv`` (``None`` when writing to a path). The data is
    written to ``./data/{query}_{maxtweets}.csv`` without the row index.
    """
    if end is None:
        # computed per call so a long-running kernel does not keep a stale date
        end = datetime.date.today().strftime("%Y-%m-%d")

    tweetCriteria = (got.manager.TweetCriteria()
                     .setQuerySearch(query)
                     .setSince(start)
                     .setUntil(end)
                     .setMaxTweets(maxtweets))
    pulled = got.manager.TweetManager.getTweets(tweetCriteria)

    # one row per tweet, one column per attribute of the tweet object
    df = pd.DataFrame([t.__dict__ for t in pulled])
    return df.to_csv(f'./data/{query}_{maxtweets}.csv', index=False)

## Function to pull word frequency from tweet csv

In [172]:
# Stop words handed to the CountVectorizer: scikit-learn's English list, NLTK's English
# list and a few web/tweet artefacts, de-duplicated through a set.
custom_words = list(set([
    *ENGLISH_STOP_WORDS,
    *stopwords.words('english'),
    'and', 'of', 'for', 'the', 'com', 'amp', 'https', 'http',
]))

# This function outputs a dataframe with text used by good handles and top text that isn't used by good handles
def _term_counts(texts, stop_words):
    """Fit a CountVectorizer on ``texts``; return (term totals sorted descending, vocabulary list)."""
    cvec = CountVectorizer(stop_words=stop_words)
    doc_term = cvec.fit_transform(texts)
    # get_feature_names() is gone from newer scikit-learn releases; keep it only as a
    # fallback for older installs
    if hasattr(cvec, 'get_feature_names_out'):
        vocab = list(cvec.get_feature_names_out())
    else:
        vocab = list(cvec.get_feature_names())
    # sum the sparse matrix column-wise instead of densifying it with .toarray()
    totals = np.asarray(doc_term.sum(axis=0)).ravel()
    counts = pd.Series(totals, index=vocab).sort_values(ascending=False)
    return counts, vocab


def frequency_words(df, handles):
    """Count words used by trusted handles and words used only by everyone else.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweets; the 'username' and 'text' columns are read.
    handles : iterable of str
        Trusted account names, compared to 'username' case-insensitively.

    Returns
    -------
    (info_count, misinfo_count) : tuple of pandas.Series
        Word -> total occurrences, sorted descending. ``info_count`` covers tweets from
        ``handles``; ``misinfo_count`` covers all other tweets, with every word from the
        trusted vocabulary (and ``custom_words``) excluded as a stop word.
    """
    trusted = {h.lower() for h in handles}
    # use the frame that was passed in, not the notebook-level `tweets` variable
    from_trusted = df['username'].str.lower().isin(trusted)

    info_count, info_vocab = _term_counts(df.loc[from_trusted, 'text'], custom_words)

    # words used by the trusted accounts become extra stop words for the remaining tweets
    new_stops = custom_words + info_vocab
    misinfo_count, _ = _term_counts(df.loc[~from_trusted, 'text'], new_stops)
    return info_count, misinfo_count


## Set Disaster Info

In [170]:
# --- query parameters for the tweet pull ---
search = 'bushfire'
start_date = '2019-12-20'
end_date = '2020-02-29'
tweet_count = 10000

# --- candidate lists of trusted handles, one per disaster ---
mendocino_handles = [
    'fema',
    'usaid',
    'calfire',
    'MendoSheriff',
    'calfire_meu',
    'cagovernor',
    'FEMARegion9',
    'r5_fire_news',
    'MendocinoNF',
]
aussie_handles = [
    'afacnews',
    'act_esa',
    'abcemergency',
    'frnsw',
    'qldfes',
    'cfsalerts',
]

# the list actually used by the cells below
good_handles = aussie_handles

### Run function to pull tweets with above parameters and save as CSV

In [145]:
# Scrape tweets for the configured keyword; the date window and the tweet cap are
# optional arguments (the cap defaults to 1000 when omitted).
get_tweets(
    search,
    start=start_date,
    end=end_date,
    maxtweets=tweet_count,
)

### Import CSV back into Jupyter as a DataFrame

In [146]:
# Reload the saved pull as a DataFrame and discard rows whose text is missing
tweets = (
    pd.read_csv(f'./data/{search}_{tweet_count}.csv')
    .dropna(axis=0, subset=['text'])
)
tweets.shape

(10000, 15)

In [177]:
# load a previously saved #MendocinoComplex tweet pull; this frame is re-weighted by
# retweet count further down
retweets = pd.read_csv('./Mendocino/#MendocinoComplex2018-07-262018-10-1.csv')

In [264]:
# (rows, columns) of the retweets frame at the time this cell is run
retweets.shape

(8619, 16)

In [266]:
# Weight each tweet by its retweet count: every row is kept once and then repeated
# `retweets` more times, with the copies placed after the original rows and keeping
# their source row's index label.
# NOTE(review): the earlier loop reindexed each copy to `test.columns`, but `test` is not
# defined in this notebook; all of this frame's own columns are kept here. Confirm that
# `test` had the same columns.
# NOTE: this cell reassigns `retweets`, so running it twice weights the data twice.
repeat_counts = retweets['retweets'].clip(lower=0).values  # negative counts add no copies
copy_positions = np.arange(len(retweets)).repeat(repeat_counts)
retweets = pd.concat([retweets, retweets.iloc[copy_positions]])
retweets.shape


(59707, 16)

In [268]:
# Save the retweet-weighted frame. index=False keeps the row index out of the file, so
# reading it back does not add another 'Unnamed: 0' column.
retweets.to_csv('./data/mendocino_retweets.csv', index=False)

In [211]:
# look at one tweet by index label
# NOTE(review): after the retweet-weighting cell has run, copies appear to share their
# source row's label, so this lookup would return several rows instead of one. Confirm
# the intended run order.
retweets.loc[8618]

Unnamed: 0                                                     8618
username                                                CALFIRE_MEU
to                                                              NaN
text              Incident Update #RanchFire #RiverFire is now t...
retweets                                                         48
favorites                                                        44
replies                                                           1
id                                              1023112231397949440
permalink         https://twitter.com/CALFIRE_MEU/status/1023112...
author_id                                                3962153832
date                                      2018-07-28 07:45:39+00:00
formatted_date                       Sat Jul 28 07:45:39 +0000 2018
hashtags                    #RanchFire #RiverFire #MendocinoComplex
mentions                                                          0
geo                                             

In [203]:
# Rows whose text contains this exact phrase. regex=False matches the trailing '.'
# literally instead of as a wildcard; na=False treats missing text as "no match" so the
# boolean mask never contains NaN.
texts = retweets[retweets['text'].str.contains("Just a quick FYI peeps.", regex=False, na=False)]
texts.head()

Unnamed: 0.1,Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,formatted_date,hashtags,mentions,geo,urls
8616,8616,Jasamsdestiny,,Just a quick FYI peeps. Ranch and River have m...,5,3,4,1023117759104860160,https://twitter.com/Jasamsdestiny/status/10231...,129877411,2018-07-28 08:07:37+00:00,Sat Jul 28 08:07:37 +0000 2018,#MendocinoComplex #RiverFire #Ranchfire #Mendo...,0,,


### Run Dataframe through Vectorizer function to output word counts

In [173]:

# Word totals for the trusted handles (good_info) and for everyone else (bad_info)
good_info, bad_info = frequency_words(tweets, good_handles)

# single-column table of the words not used by the trusted handles
word_count = bad_info.to_frame(name='count')

# to look up one term: word_count.loc['arson']

word_count

Unnamed: 0,count
australia,2190
relief,2027
crisis,1358
au,1262
australian,1116
...,...
macfarlane,1
macens,1
maccaheraldsun,1
macarthur,1
