# 2.1 Data Cleaning and NLP

## Combine dataframes

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction import stop_words 
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 

In [2]:
# combine data 
campfire = pd.read_csv('../data/campfire_data.csv')
carrfire = pd.read_csv('../data/carrfire_data.csv')
hurricane = pd.read_csv('../data/hurricane_data.csv')

In [3]:
# remove iirlated column in campfire
campfire.drop(columns='Unnamed: 0', inplace=True)

In [4]:
# combine all disasters together
disaster = pd.concat([campfire, carrfire, hurricane])
disaster.head()

Unnamed: 0,created_at,favorite_count,hashtags,latitute,longitute,retweet_count,text
0,Wed Jan 30 23:59:00 +0000 2019,0,[],,,0,RT @ActionNewsNow: The Butte Strong Fund will ...
1,Wed Jan 30 23:55:51 +0000 2019,0,['PGE'],,,0,RT @mgafni: Talk over #PGE replicating San Die...
2,Wed Jan 30 23:52:52 +0000 2019,0,[],,,0,Federal judge asks PG&amp;E: Should I 'let you...
3,Wed Jan 30 23:51:19 +0000 2019,0,[],,,0,RT @pbartolone: Will it get harder and harder ...
4,Wed Jan 30 23:50:17 +0000 2019,0,"['CampFire', 'CampFirePets', 'Paradise', 'Para...",,,0,RT @BCWildfireToday: LOOKING FOR A LOST #CampF...


In [5]:
# drop duplicate rows based on text
disaster.drop_duplicates(subset ="text", keep = False, inplace = True)

In [6]:
# 2384 unique tweets
len(disaster)

2384

In [7]:
#store deduped rows as a dataframe
%store disaster

Stored 'disaster' (DataFrame)


## Pull words in tweets

In [8]:
# use regularexpression to do lemmatize
# use countvectorizer to tokenize, lemmatize, and exclude stopwords 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        tokenizer = RegexpTokenizer('(?u)\\b\\w\\w+\\b')
        return [self.wnl.lemmatize(t, 'v') for t in tokenizer.tokenize(doc)]

In [9]:
# pull words using countvectorizer
vectorizer = CountVectorizer(tokenizer = LemmaTokenizer(),
                            preprocessor = None,
                            stop_words = 'english',
                            max_features = 1500,
                            ngram_range= (1,2),
                            analyzer = 'word', 
                            min_df=3) 

disaster_words=disaster['text']
disaster_words = vectorizer.fit_transform(disaster_words)
# convert into a dataframe
disaster_words= pd.DataFrame(disaster_words.toarray(), columns=vectorizer.get_feature_names())

  'stop_words.' % sorted(inconsistent))


In [10]:
#store words after countvectorizer as a dataframe
%store disaster_words

Stored 'disaster_words' (DataFrame)


## Count Word Frequency

In [11]:
# check words and word_count as a reference for us to create disaster related corpus
disaster_word_count = pd.DataFrame(disaster_words.sum(), index=vectorizer.get_feature_names(), columns=['word_count']).sort_values(by='word_count', ascending=False)
disaster_word_count = disaster_word_count.reset_index()


In [12]:
# remove some useless word
self_defined_stop_words = ['https', 'rt', 'amp','hurricaneharvey https','link','bio','prop','cosplay',
                           'prop cosplay','coffee','jewelry','anime','cosplay anime','coffee prop',
                           'victim jewelry','jewelry coffee','propmaster','anime propmaster',
                           'beer','ebay','ebay link','link bio','bio carrfire', 'propmaster beer',
                           'people','today','carrfire https','make','wine','just','beer wine','come',
                           'click','like','work','link carrfire','wine dab','dab weed','weed','look'
                          ]
                           

disaster_word_count = disaster_word_count[~disaster_word_count['index'].isin(self_defined_stop_words)]

In [13]:
# removed some top unrelated words
# export to csv for a clear view of related words
disaster_word_count.to_csv('../data/tweets_word_count.csv', index=False)