# Preprocessing

In [36]:
import re

import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
# ENGLISH_STOP_WORDS is taken from the public `text` module: the
# `sklearn.feature_extraction.stop_words` module path was deprecated in
# scikit-learn 0.22 and later removed, while `text` exports the same name in
# both old and new releases.
from sklearn.feature_extraction.text import (
    ENGLISH_STOP_WORDS,
    CountVectorizer,
    TfidfVectorizer,
)
from sklearn.metrics.pairwise import cosine_similarity

# WordNetLemmatizer needs the WordNet corpus to be available locally.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/connie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Import Labeled Data

In [2]:
# Root folder of the CrisisLexT6 collection; it holds one sub-folder per event.
CRISISLEX_DIR = '../Mike/CrisisLexT6'


def load_crisis_tweets(event):
    """Read the labeled on-topic/off-topic tweet CSV for one event.

    Parameters
    ----------
    event : str
        Event folder name, e.g. ``'2012_Sandy_Hurricane'``. The CSV is read
        from ``<CRISISLEX_DIR>/<event>/<event>-ontopic_offtopic.csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents exactly as parsed by ``pd.read_csv`` with defaults.
    """
    csv_path = '{root}/{event}/{event}-ontopic_offtopic.csv'.format(
        root=CRISISLEX_DIR, event=event)
    return pd.read_csv(csv_path)


sandy_hurricane = load_crisis_tweets('2012_Sandy_Hurricane')
alberta_floods = load_crisis_tweets('2013_Alberta_Floods')
boston_bombing = load_crisis_tweets('2013_Boston_Bombings')
oklahoma_tornado = load_crisis_tweets('2013_Oklahoma_Tornado')
queensland_flood = load_crisis_tweets('2013_Queensland_Floods')
texas_explosion = load_crisis_tweets('2013_West_Texas_Explosion')

### Sandy Hurricane

In [3]:
# Preview the first rows of the raw Sandy data to inspect its columns.
sandy_hurricane.head()

Unnamed: 0,tweet id,tweet,label
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,off-topic
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,on-topic
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,off-topic
3,'263422851133079552',@taos you never got that magnificent case of B...,off-topic
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",off-topic


#### Rename Columns

In [4]:
# Map the raw CSV headers (note the leading spaces in ' tweet' and ' label')
# to the short column names used throughout the notebook.
sandy_hurricane.rename(
    columns={
        'tweet id': 'id',
        ' tweet': 'text',
        ' label': 'label',
    },
    inplace=True,
)

#### One-Hot Encoding

One-hot encode column `label` and save the result as `sandy_hurricane_df`. The resulting `label_on-topic` column is coded as:
-  1 = on-topic, tweet is related to a disaster
-  0 = off-topic, tweet is *not* related to a disaster

In [5]:
# Turn `label` into dummy columns; drop_first=True discards the first level,
# leaving the single `label_on-topic` indicator.
sandy_hurricane_df = pd.get_dummies(sandy_hurricane, columns=['label'], drop_first=True)

### Add `type` Column

Add a column to indicate the type of disaster it is. This will be necessary when combining all dataframes later on.

In [6]:
# Tag every Sandy tweet with its disaster category.
sandy_hurricane_df = sandy_hurricane_df.assign(type='hurricane')

### Repeat for all other labeled datasets:

### Alberta Floods

In [7]:
# Map the raw CSV headers (with their leading spaces) to the short names.
alberta_floods.rename(
    columns={
        'tweet id': 'id',
        ' tweet': 'text',
        ' label': 'label',
    },
    inplace=True,
)

In [8]:
# Dummy-encode `label`; drop_first=True keeps one indicator column only.
alberta_floods_df = pd.get_dummies(alberta_floods, columns=['label'], drop_first=True)

In [9]:
# Tag every Alberta tweet with its disaster category.
alberta_floods_df = alberta_floods_df.assign(type='flood')

### Boston Bombing

In [10]:
# Map the raw CSV headers (with their leading spaces) to the short names.
boston_bombing.rename(
    columns={
        'tweet id': 'id',
        ' tweet': 'text',
        ' label': 'label',
    },
    inplace=True,
)

In [11]:
# Dummy-encode `label`; drop_first=True keeps one indicator column only.
boston_bombing_df = pd.get_dummies(boston_bombing, columns=['label'], drop_first=True)

In [12]:
# Tag every Boston tweet with its disaster category.
boston_bombing_df = boston_bombing_df.assign(type='bombing')

### Oklahoma Tornado

In [13]:
# Map the raw CSV headers (with their leading spaces) to the short names.
oklahoma_tornado.rename(
    columns={
        'tweet id': 'id',
        ' tweet': 'text',
        ' label': 'label',
    },
    inplace=True,
)

In [14]:
# Dummy-encode `label`; drop_first=True keeps one indicator column only.
oklahoma_tornado_df = pd.get_dummies(oklahoma_tornado, columns=['label'], drop_first=True)

In [15]:
# Tag every Oklahoma tweet with its disaster category.
oklahoma_tornado_df = oklahoma_tornado_df.assign(type='tornado')

### Queensland Flood

In [16]:
# Map the raw CSV headers (with their leading spaces) to the short names.
queensland_flood.rename(
    columns={
        'tweet id': 'id',
        ' tweet': 'text',
        ' label': 'label',
    },
    inplace=True,
)

In [17]:
# Dummy-encode `label`; drop_first=True keeps one indicator column only.
queensland_flood_df = pd.get_dummies(queensland_flood, columns=['label'], drop_first=True)

In [18]:
# Tag every Queensland tweet with its disaster category.
queensland_flood_df = queensland_flood_df.assign(type='flood')

### Texas Explosion

In [19]:
# Map the raw CSV headers (with their leading spaces) to the short names.
texas_explosion.rename(
    columns={
        'tweet id': 'id',
        ' tweet': 'text',
        ' label': 'label',
    },
    inplace=True,
)

In [20]:
# Dummy-encode `label`; drop_first=True keeps one indicator column only.
texas_explosion_df = pd.get_dummies(texas_explosion, columns=['label'], drop_first=True)

In [21]:
# Tag every West Texas tweet with its disaster category.
texas_explosion_df = texas_explosion_df.assign(type='explosion')

### Combine DataFrames

Combine all processed dataframes into one comprehensive dataframe with each tweet related to its respective disaster.

In [22]:
# Stack the per-event frames into one labeled corpus.
# ignore_index=True gives the result a fresh 0..N-1 index; without it every
# source frame contributes its own 0..n-1 labels and the combined index
# contains duplicates, which makes label-based alignment ambiguous.
# NOTE(review): boston_bombing_df and texas_explosion_df are prepared above
# but are not part of this list -- confirm that excluding them is intentional.
final_labeled_df = pd.concat(
    [
        sandy_hurricane_df,
        alberta_floods_df,
        oklahoma_tornado_df,
        queensland_flood_df,
    ],
    ignore_index=True,
)

In [23]:
# (number of rows, number of columns) of the combined frame.
final_labeled_df.shape

(40064, 4)

We have a total of 40,064 observations and 4 columns (`id`, `text`, `label_on-topic`, and `type`).

In [24]:
# Preview the first rows of the combined frame.
final_labeled_df.head()

Unnamed: 0,id,text,label_on-topic,type
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,0,hurricane
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,1,hurricane
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,0,hurricane
3,'263422851133079552',@taos you never got that magnificent case of B...,0,hurricane
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",0,hurricane


## Preprocessing

### Clean Text

In order to remove any text that will only contribute noise to our model, we will define a function that uses regular expressions to replace certain patterns:

-  **Convert all text to lower case**
-  **Remove additional whitespace**
-  **Remove links:** 
    -  Links starting with `www.`, `http://`, or `https://` are replaced with the placeholder token `URL`. Each link is most likely unique to its tweet and won't provide any information about the content overall.
-  **Eliminate hashtags**
-  **Remove `@`** 

In [25]:
def processTweet(tweet):
    """Normalize the raw text of a tweet before tokenization.

    The steps are applied in this order:

    1. lower-case the whole string;
    2. collapse every run of whitespace into a single space;
    3. replace links starting with ``www.`` or ``http(s)://`` with the
       placeholder ``URL`` (inserted after lower-casing, so it stays
       upper-case);
    4. strip the ``#`` from hashtags while keeping the tag text;
    5. delete every ``@`` character.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    """
    # Raw string literals keep `\s` and `\.` as regex escapes; in a plain
    # string they are invalid escape sequences and trigger warnings.
    tweet = tweet.lower()
    tweet = re.sub(r'[\s]+', ' ', tweet)
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    tweet = re.sub(r'@', '', tweet)
    return tweet

In [26]:
# Clean every raw tweet and keep the result next to the original text.
final_labeled_df['processed'] = final_labeled_df['text'].map(processTweet)

### Tokenize

We instantiate the tokenizer with the regular expression `r'\w+'` as its search pattern. `\w+` matches runs of letters, digits, and underscores, so punctuation and other symbols are dropped while numbers are kept as tokens.

In [27]:
# Tokens are maximal runs of word characters (letters, digits, underscore).
tokenizer = RegexpTokenizer(pattern=r'\w+')

We tokenize the `processed` column and create a new column (`tokenized`) for our results.

In [28]:
# Split each cleaned tweet into a list of tokens.
final_labeled_df['tokenized'] = final_labeled_df['processed'].map(tokenizer.tokenize)

### Lemmatize

We will lemmatize our data in an attempt to return the base form of each word.

In [29]:
# WordNet-based lemmatizer, reused for every token in the lemmatize step.
lemmatizer = WordNetLemmatizer()

We lemmatize the tokenized words in `tokenized` and join them to represent one string again. This is shown in the `lemmatized` column.

In [30]:
# Lemmatize token by token, then glue the tokens back into one
# space-separated string per tweet.
final_labeled_df['lemmatized'] = final_labeled_df['tokenized'].map(
    lambda tokens: ' '.join(lemmatizer.lemmatize(token) for token in tokens)
)

In [31]:
# Preview the frame with the new processed / tokenized / lemmatized columns.
final_labeled_df.head()

Unnamed: 0,id,text,label_on-topic,type,processed,tokenized,lemmatized
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,0,hurricane,i've got enough candles to supply a mexican fa...,"[i, ve, got, enough, candles, to, supply, a, m...",i ve got enough candle to supply a mexican family
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,1,hurricane,sandy be soooo mad that she be shattering our ...,"[sandy, be, soooo, mad, that, she, be, shatter...",sandy be soooo mad that she be shattering our ...
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,0,hurricane,ibexgirl thankfully hurricane waugh played it ...,"[ibexgirl, thankfully, hurricane, waugh, playe...",ibexgirl thankfully hurricane waugh played it ...
3,'263422851133079552',@taos you never got that magnificent case of B...,0,hurricane,taos you never got that magnificent case of bu...,"[taos, you, never, got, that, magnificent, cas...",tao you never got that magnificent case of bur...
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",0,hurricane,"i'm at mad river bar &amp; grille (new york, n...","[i, m, at, mad, river, bar, amp, grille, new, ...",i m at mad river bar amp grille new york ny URL


### TF-IDF

We want to use TF-IDF to determine which words are most discriminating between tweets. Words that occur frequently across tweets are penalized, and rare words are given more influence in our model.

We instantiate TfidfVectorizer() and set:
-  `ngram_range = (1, 2)`: Set an upper and lower bound of 1 and 2. Word sequence will contain at least 1 and up to 2 words.
-  `stop_words='english'`: Filter out commonly used words in English
-  `min_df = 25`: Ignore terms that occur in fewer than 25 documents of the corpus
-  `max_df = 1.0`: There is no maximum threshold since terms cannot have a document frequency greater than `100%`. 

We then create a dataframe, `tfidf_df`, with one row per tweet and one column per term (unigram or bigram), holding that term's TF-IDF weight.

In [32]:
# Unigram + bigram TF-IDF over the lemmatized tweets; a term must appear in
# at least 25 tweets to be kept, and English stop words are filtered out.
tfidf = TfidfVectorizer(ngram_range=(1, 2),
                        stop_words='english',
                        min_df=25,
                        max_df=1.0)

tfidf_matrix = tfidf.fit_transform(final_labeled_df['lemmatized'])

# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; fall back to the old name on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# `pd.SparseDataFrame` was removed in pandas 1.0. The `.sparse` accessor
# (pandas >= 0.25) builds a DataFrame with sparse columns straight from the
# scipy matrix; entries that are not stored read as 0.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=feature_names)

In [33]:
# Replace any missing entries with 0 so every cell holds a numeric weight.
tfidf_df.fillna(value=0, inplace=True)

In [34]:
# Preview the first rows of the term-weight matrix.
tfidf_df.head()

Unnamed: 0,00,00 humidity,000,000 help,000 home,000 people,10,10 donation,10 online,100,...,yycflood abflood,yycflood relief,yycflood url,yycflood yyc,yycflood yychelps,yycfloods,yychelps,yychelps yycflood,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# (number of tweets, number of retained terms).
tfidf_df.shape

(40064, 2599)

## Singular Value Decomposition (02_EDA_and_Cleaning)