## Using Labeled Data

In [33]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mintaekhong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Import labeled data from CrisisLexT6

In [2]:
sandy_hurricane = pd.read_csv('./CrisisLexT6/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv')
alberta_floods = pd.read_csv('./CrisisLexT6/2013_Alberta_Floods/2013_Alberta_Floods-ontopic_offtopic.csv')
boston_bombing = pd.read_csv('./CrisisLexT6/2013_Boston_Bombings/2013_Boston_Bombings-ontopic_offtopic.csv')
oklahoma_tornado = pd.read_csv('./CrisisLexT6/2013_Oklahoma_Tornado/2013_Oklahoma_Tornado-ontopic_offtopic.csv')
queensland_flood = pd.read_csv('./CrisisLexT6/2013_Queensland_Floods/2013_Queensland_Floods-ontopic_offtopic.csv')
texas_explosion = pd.read_csv('./CrisisLexT6/2013_West_Texas_Explosion/2013_West_Texas_Explosion-ontopic_offtopic.csv')

In [3]:
sandy_hurricane.head()

Unnamed: 0,tweet id,tweet,label
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,off-topic
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,on-topic
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,off-topic
3,'263422851133079552',@taos you never got that magnificent case of B...,off-topic
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",off-topic


In [4]:
sandy_hurricane.columns = ['id', 'text', 'on_topic']

In [5]:
sandy_hurricane.head()

Unnamed: 0,id,text,on_topic
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,off-topic
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,on-topic
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,off-topic
3,'263422851133079552',@taos you never got that magnificent case of B...,off-topic
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",off-topic


Create dummy columns: 1 for on-topic (a.k.a. tweet related to a disaster) and 0 for off-topic

In [6]:
on_topic_dum = pd.get_dummies(sandy_hurricane['on_topic'], drop_first=True)

In [7]:
on_topic_dum.head()

Unnamed: 0,on-topic
0,0
1,1
2,0
3,0
4,0


Drop column on_topic from original dataframe

In [8]:
sandy_hurricane.drop('on_topic', axis = 1, inplace = True)

Merge original dataframe with newly created dummy column dataframe

In [9]:
sandy_hurricane_df = pd.merge(sandy_hurricane, on_topic_dum, how='left', on=sandy_hurricane.index)

In [10]:
sandy_hurricane_df.head()

Unnamed: 0,key_0,id,text,on-topic
0,0,'262596552399396864',I've got enough candles to supply a Mexican fa...,0
1,1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,1
2,2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,0
3,3,'263422851133079552',@taos you never got that magnificent case of B...,0
4,4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",0


Drop miscellaneous 'key_0' column

In [11]:
sandy_hurricane_df.drop('key_0', axis = 1, inplace = True)

In [12]:
sandy_hurricane_df.head()

Unnamed: 0,id,text,on-topic
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,0
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,1
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,0
3,'263422851133079552',@taos you never got that magnificent case of B...,0
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",0


### Repeat above processes for other labeled datasets:

#### alberta_floods

In [13]:
alberta_floods.columns = ['id', 'text', 'on_topic']

In [14]:
alberta_floods_on_topic_dum = pd.get_dummies(alberta_floods['on_topic'], drop_first=True)

In [15]:
alberta_floods.drop('on_topic', axis = 1, inplace = True)

In [16]:
alberta_floods_df = pd.merge(alberta_floods, alberta_floods_on_topic_dum, how='left', on=alberta_floods.index)

In [17]:
alberta_floods_df.drop('key_0', axis = 1, inplace = True)

In [18]:
alberta_floods_df.head()

Unnamed: 0,id,text,on-topic
0,'348351442404376578',@Jay1972Jay Nope. Mid 80's. It's off Metallica...,0
1,'348167215536803841',Nothing like a :16 second downpour to give us ...,0
2,'348644655786778624',@NelsonTagoona so glad that you missed the flo...,1
3,'350519668815036416',"Party hard , suns down , still warm , lovin li...",0
4,'351446519733432320',@Exclusionzone if you compare yourself to wate...,0


#### boston bombing

In [19]:
boston_bombing.columns = ['id', 'text', 'on_topic']
boston_bombing_on_topic_dum = pd.get_dummies(boston_bombing['on_topic'], drop_first=True)
boston_bombing.drop('on_topic', axis = 1, inplace = True)
boston_bombing_df = pd.merge(boston_bombing, boston_bombing_on_topic_dum, how='left', on=boston_bombing.index)

In [20]:
boston_bombing_df.drop('key_0', axis = 1, inplace = True)

In [21]:
boston_bombing_df.head()

Unnamed: 0,id,text,on-topic
0,'325208201740029952',Funny how I got twenty minutes of sleep last n...,0
1,'325301650791215106',I just wanna go outside,0
2,'325182966332530688',This is fucking crazy. Suspect # 2 has like 2 ...,1
3,'324654539850539009',"@J_Adams91 oops, someone doesn't like to be to...",0
4,'323630018536275968',Let me love you @GavinQuin you know who else too,0


#### oklahoma_tornado

In [22]:
oklahoma_tornado.columns = ['id', 'text', 'on_topic']
oklahoma_tornado_on_topic_dum = pd.get_dummies(oklahoma_tornado['on_topic'], drop_first=True)
oklahoma_tornado.drop('on_topic', axis = 1, inplace = True)
oklahoma_tornado_df = pd.merge(oklahoma_tornado, oklahoma_tornado_on_topic_dum, how='left', on=oklahoma_tornado.index)
oklahoma_tornado_df.drop('key_0', axis = 1, inplace = True)

In [23]:
oklahoma_tornado_df.head()

Unnamed: 0,id,text,on-topic
0,'336908711324962817',@HeatleyJheat44 its barley even raining where ...,1
1,'337052158035890176',Sorry I can't do anything right.,0
2,'339338021751103488',@mrwendell29: @BradSowderWX says we have the ...,1
3,'336339509077762051',#honestyhour I like to wear half split running...,0
4,'337734129972035584',I'm too stressed to have a good summer,0


#### queensland_flood

In [24]:
queensland_flood.columns = ['id', 'text', 'on_topic']
queensland_flood_on_topic_dum = pd.get_dummies(queensland_flood['on_topic'], drop_first=True)
queensland_flood.drop('on_topic', axis = 1, inplace = True)
queensland_flood_df = pd.merge(queensland_flood, queensland_flood_on_topic_dum, how='left', on=queensland_flood.index)
queensland_flood_df.drop('key_0', axis = 1, inplace = True)

In [25]:
queensland_flood_df.head()

Unnamed: 0,id,text,on-topic
0,'296728042179534848',"@MarkSDobson I always thought that, big lad ai...",0
1,'296085045645570048',@thamonstar a lot of water moving around and a...,1
2,'296811076400603136',Craig Thompson to be extradited to Victoria on...,0
3,'295357934387486720',"Sunshine state, sort your shit out.",0
4,'296390762210398210',@MarkPhilippi yeah I saw it. He's a wanker. Pa...,0


#### Texas Explosion

In [26]:
texas_explosion.columns = ['id', 'text', 'on_topic']
texas_explosion_on_topic_dum = pd.get_dummies(texas_explosion['on_topic'], drop_first=True)
texas_explosion.drop('on_topic', axis = 1, inplace = True)
texas_explosion_df = pd.merge(texas_explosion, texas_explosion_on_topic_dum, how='left', on=texas_explosion.index)
texas_explosion_df.drop('key_0', axis = 1, inplace = True)

In [27]:
texas_explosion_df.head()

Unnamed: 0,id,text,on-topic
0,'325478991496630272',"@iAmNotA_Dyke ..nah, I'm fine with it.",0
1,'325422389854883840',Oomf still on that bullshit from lastnight,0
2,'325057083135361025',-NO Handouts Here I Work Hard For Mines !,0
3,'324742043211558912',@theatticdemos @youranonnews fertilizer plant ...,1
4,'325471165026025474',@_SayMyNameLOUD: Got ate out by a dog !! #Cryi...,0


Combine all processed dataframes into one comprehensive dataframe with all tweets related to appropriate disaster

In [28]:
final_labeled_df = pd.concat([sandy_hurricane_df, 
                             alberta_floods_df, 
                             boston_bombing_df, 
                             oklahoma_tornado_df, 
                             queensland_flood_df, 
                             texas_explosion_df])

In [29]:
final_labeled_df.shape

(60082, 3)

### Preprocessing

Use regex to clean out tweets:

In [30]:
import re

def processTweet(tweet):
    #Convert to lower case
    tweet = tweet.lower()
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet) ##may need to eliminate web address entirely
    #Eliminate hastags
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #Remove @ signs
    tweet = re.sub('@', '', tweet)
    return tweet

In [31]:
tweets = [i for i in final_labeled_df['text']]

In [32]:
final_labeled_df['processed'] = [processTweet(i) for i in tweets]

Tokenize each tweet

In [34]:
tokenizer = RegexpTokenizer(r'\w+')

In [35]:
final_labeled_df['clean_processed'] = final_labeled_df['processed'].map(lambda x: tokenizer.tokenize(x))

Lemmatize the tokenized words

In [36]:
lemmatizer = WordNetLemmatizer()

In [37]:
final_labeled_df['lemm_clean_processed'] = final_labeled_df['clean_processed'].map(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x]))

In [38]:
final_labeled_df.head()

Unnamed: 0,id,text,on-topic,processed,clean_processed,lemm_clean_processed
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,0,i've got enough candles to supply a mexican fa...,"[i, ve, got, enough, candles, to, supply, a, m...",i ve got enough candle to supply a mexican family
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,1,sandy be soooo mad that she be shattering our ...,"[sandy, be, soooo, mad, that, she, be, shatter...",sandy be soooo mad that she be shattering our ...
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,0,ibexgirl thankfully hurricane waugh played it ...,"[ibexgirl, thankfully, hurricane, waugh, playe...",ibexgirl thankfully hurricane waugh played it ...
3,'263422851133079552',@taos you never got that magnificent case of B...,0,taos you never got that magnificent case of bu...,"[taos, you, never, got, that, magnificent, cas...",tao you never got that magnificent case of bur...
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",0,"i'm at mad river bar &amp; grille (new york, n...","[i, m, at, mad, river, bar, amp, grille, new, ...",i m at mad river bar amp grille new york ny URL


Utilize tfidf to vectorize tweets with each row representing a document/tweet and column representing a dimension
- stop_words of 'english' : To filter out commonly used words in english
- min_df of 25: vectorizes words that are a part of 25 documents in the corpus.
- max_df of 1.0: refers to the percentage of documents within a corpus that a word occurs in.

In [39]:
tfidf = TfidfVectorizer(ngram_range=(1,2), 
                        stop_words = 'english', 
                        min_df = 25, 
                        max_df = 1.0)

df_tfidf = pd.SparseDataFrame(tfidf.fit_transform(final_labeled_df['lemm_clean_processed']),
                        columns = tfidf.get_feature_names())

In [40]:
df_tfidf.shape

(60082, 3676)

In [41]:
df_tfidf.fillna(0, inplace = True)

In [42]:
df_tfidf.head()

Unnamed: 0,00,00 humidity,000,000 help,000 home,000 people,10,10 30,10 donation,10 online,...,yycflood relief,yycflood url,yycflood yyc,yycflood yychelps,yycfloods,yychelps,yychelps yycflood,zero,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
