# Disaster Tweets Classifier

In [None]:
# Install all dependencies required for the notebook, as listed in requirements.txt.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

# Data Exploration & Pre-processing

## Data exploration

In [92]:
import pandas as pd
import numpy as np

# Load the training set; the CSV's 'id' column becomes the row index
df = pd.read_csv('data/train.csv', index_col='id')
# Preview the first rows (columns: keyword, location, text, target)
df.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [93]:
df.shape # (rows, columns): 7613 rows, with 4 columns (keyword, location, text, target)

(7613, 4)

In [94]:
# Class balance of the label column:
# target 1 refers to a disaster tweet, 0 to a tweet that is not about a disaster
df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [95]:
# Checking for completeness of data: count the missing (NaN) values per column.
# Each line counts NaNs in exactly the column named in its message.
print(f"{df['keyword'].isna().sum()} rows have no keywords")
print(f"{df['location'].isna().sum()} rows have no location")
print(f"{df['text'].isna().sum()} rows have no text")
print(f"{df['target'].isna().sum()} rows have no target")

61 rows have no keywords
2533 rows have no location
0 rows have no text
0 rows have no target


In [96]:
# Frequency of each keyword value.
# Note that some keywords are phrases, with '%20' (a URL-encoded space) standing in for a space
df['keyword'].value_counts() 

fatalities               45
armageddon               42
deluge                   42
damage                   41
body%20bags              41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [97]:
# Frequency of each location string.
# Note that some entries are not real locations, like 'World Wide!!' and 'a feminist, modernist hag.'
df['location'].value_counts() 

USA                            104
New York                        71
United States                   50
London                          45
Canada                          29
                              ... 
Port Charlotte, FL               1
Dimapur                          1
Orbost, Victoria, Australia      1
(he/him)                         1
Chicago, IL                      1
Name: location, Length: 3341, dtype: int64

## Preprocessing

In [None]:
# download spaCy model for American English
!python3 -m spacy download en_core_web_sm

In [137]:
import spacy 
import en_core_web_sm
# Load the en_core_web_sm pipeline; `nlp` is the pipeline object used by the cells below
nlp = en_core_web_sm.load()

## Modifying spaCy's tokenizer

In [134]:
# How does the stock spaCy pipeline handle numbers, contractions, #hashtags and @mentions?
s = "2020 can't get any worse #ihate2020 @bestfriend"
doc = nlp(s)

# One row per token: surface form, lemma, and whether it is a stop word
print("Token\t\tLemma\t\tStopword")
print(40 * "=")
for tok in doc:
    print("{}\t\t{}\t\t{}".format(tok, tok.lemma_, tok.is_stop))

Token		Lemma		Stopword
2020		2020		False
ca		can		True
n't		not		True
get		get		True
any		any		True
worse		bad		False
#		#		False
ihate2020		ihate2020		False
@bestfriend		@bestfriend		False


In [150]:
# Contractions are split into lemmas
# Numbers are their own features
# @mentions are maintained as a token
# We want to also keep #hashtags as a token, so we will modify the spaCy model's token_match

import re

# Retrieve the default token-matching regex pattern.
# It is read from nlp.Defaults (not from nlp.tokenizer), so re-running this cell
# does not stack the hashtag pattern on top of itself.
re_token_match = spacy.tokenizer._get_regex_pattern(nlp.Defaults.token_match)

# Add #hashtag pattern.
# Raw f-string: `\w` must reach the regex engine as written; in a non-raw literal
# it is an invalid string escape and triggers a warning.
re_token_match = rf"({re_token_match}|#\w+)"
nlp.tokenizer.token_match = re.compile(re_token_match).match

# Now let's try again
s = "2020 can't get any worse #ihate2020 @bestfriend"
doc = nlp(s)

# Let's look at the lemmas and is stopword of each token
print("Token\t\tLemma\t\tStopword")
print("="*40)
for token in doc:
    print(f"{token}\t\t{token.lemma_}\t\t{token.is_stop}")

Token		Lemma		Stopword
2020		2020		False
ca		can		True
n't		not		True
get		get		True
any		any		True
worse		bad		False
#ihate2020		#ihate2020		False
@bestfriend		@bestfriend		False


## Pre-processing a single tweet

In [153]:
# Running vocabulary: every lemma (word) encountered so far
features = set()

# Run one original tweet through our modified spaCy model
s = df.loc[1,'text']
print(f"Original tweet: {s}")

# Normalise case, then build the spaCy doc
s = s.lower()
doc = nlp(s)

# Per-token table: surface form, lemma, and whether it is a stop word
print("Token\t\tLemma\t\tStopword")
print(40 * "=")
for tok in doc:
    print("{}\t\t{}\t\t{}".format(tok, tok.lemma_, tok.is_stop))
lemmas = [tok.lemma_ for tok in doc]

# Fold this tweet's lemmas into the vocabulary
features.update(lemmas)

# Bag of words: every known feature starts at zero, then each lemma of the tweet is counted
freq = {str(word): 0 for word in features}
for lemma in lemmas:
    freq[str(lemma)] += 1

print(f"Bag of words for the tweet: {freq}")

Original tweet: our deeds are the reason of this #earthquake may allah forgive us all
Token		Lemma		Stopword
our		-PRON-		True
deeds		deed		False
are		be		True
the		the		True
reason		reason		False
of		of		True
this		this		True
#earthquake		#earthquake		False
may		may		True
allah		allah		False
forgive		forgive		False
us		-PRON-		True
all		all		True
Bag of words for the tweet: {'this': 1, 'allah': 1, 'forgive': 1, 'of': 1, '-PRON-': 2, '#earthquake': 1, 'all': 1, 'be': 1, 'reason': 1, 'may': 1, 'deed': 1, 'the': 1}


## Preprocessing all data

In [None]:
# Now that we've preprocessed a single tweet, we can create a pre-process function for each tweet
def preprocess(s, nlp, features, verbose=False):
    """Lemmatise one tweet and build its bag of words over the running vocabulary.

    Parameters
    ----------
    s : str
        Tweet text. It is lower-cased before being handed to ``nlp``.
    nlp : callable
        Called as ``nlp(text)``; must return an iterable of tokens that expose
        ``lemma_`` and ``is_stop`` (e.g. a loaded spaCy pipeline).
    features : set
        Vocabulary collected so far. It is extended IN PLACE with the lemmas
        of this tweet, and the same object is returned.
    verbose : bool, default False
        If True, print a Token / Lemma / Stopword table for the tweet.
        Off by default so that running this over a whole dataset does not
        flood the notebook output.

    Returns
    -------
    tuple
        ``(features, freq)`` where ``freq`` maps ``str(feature)`` to the number
        of times that lemma occurs in this tweet (0 for features not present).
    """
    # To lowercase, then create the doc; materialise the tokens once so the
    # result of nlp() is only iterated a single time
    tokens = list(nlp(s.lower()))

    if verbose:
        # Debug view of the lemma and stop-word flag of each token
        print("Token\t\tLemma\t\tStopword")
        print("="*40)
        for token in tokens:
            print(f"{token}\t\t{token.lemma_}\t\t{token.is_stop}")

    lemmas = [token.lemma_ for token in tokens]

    # Union between lemmas and our features set (mutates the caller's set)
    features |= set(lemmas)

    # Constructing a bag of words for the tweet: zero for every known feature,
    # then count the lemmas of this tweet (all of them are in `features` by now)
    freq = {str(word): 0 for word in features}
    for lemma in lemmas:
        freq[str(lemma)] += 1

    return features, freq

In [154]:
# Work on an independent copy so that preprocessing never alters the raw `df`
# (a plain assignment would only bind a second name to the same DataFrame)
preprocess_df = df.copy()
features = set() # set containing all words seen so far

In [101]:
# Create the dataframe for the bag of words representation: one row per tweet,
# indexed by a positional 'id' running from 0 to len(preprocess_df) - 1.
# set_index returns a new frame rather than modifying in place, so build the
# indexed frame in one expression and bind it to `bow`.
bow = pd.DataFrame({'id': range(len(preprocess_df))}).set_index('id')
bow

0
1
2
3
4
...
7608
7609
7610
7611
7612


In [102]:
# to lower case (vectorised string method instead of a per-row Python lambda)
preprocess_df['text'] = preprocess_df['text'].str.lower()

# create documents
# NOTE: work in progress - only the first tweet is processed for now
for i in range(0,1):
    doc = nlp(preprocess_df['text'].iloc[i])
    #TO DO: build bag of words
    # Union the tweet's lemma strings into features. Token objects are tied to
    # their Doc, so adding them directly would not deduplicate words across tweets.
    features |= {token.lemma_ for token in doc}

## Saving pre-processed data for collaborators

## Splitting into training and validation data

## Saving .csv files for training and validation sets