In [1]:
# Import libraries
import os

import numpy as np
import pandas as pd

# gensim
import gensim

In [2]:
# Read the training set, the test set and the sample submission file, in that order.
df_train, df_test, df_sample = map(
    pd.read_csv,
    ("data/train.csv", "data/test.csv", "data/sample_submission.csv"),
)

## Data Exploration and Preparation

In [3]:
# Report the dimensions and column names of both data sets.
for caption, frame in (("Training data", df_train), ("Test data", df_test)):
    print(caption, frame.shape)
    print(frame.columns)

Training data (7613, 5)
Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')
Test data (3263, 4)
Index(['id', 'keyword', 'location', 'text'], dtype='object')


## Tokenise sentences

In [4]:
from tweet_func import preprocess

# Run `preprocess` over every raw tweet; the results are collected into plain
# Python lists, one entry per tweet, in the same order as the DataFrame rows.
X_train = list(map(preprocess, df_train["text"]))
X_test = list(map(preprocess, df_test["text"]))

In [5]:
# Show one raw tweet together with its label, then the tokenised version of it
sample_idx = 2
raw_tweet = df_train["text"][sample_idx]
sample_label = df_train["target"][sample_idx]
print(raw_tweet, "- Output Label: ", sample_label)
print(X_train[sample_idx])

All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected - Output Label:  1
['residents', 'asked', 'place', 'notified', 'officers', 'evacuation', 'shelter', 'place', 'orders', 'expected']


## Make new columns with tokenised sentences

In [6]:
def add_token_column(frame, tokens, column='token_sentence'):
    """Store `tokens` as the first column of `frame` (modifies `frame` in place).

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that receives the new column.
    tokens : list
        One entry per row of `frame`, in row order.
    column : str
        Name of the column to create.

    Raises
    ------
    ValueError
        Raised by pandas if `tokens` does not have one entry per row.
    """
    # Drop a column left over from a previous run so the cell can be re-executed;
    # DataFrame.insert refuses to add a column name that already exists.
    if column in frame.columns:
        del frame[column]
    # Inserting at position 0 creates the column directly in first place, so no
    # placeholder column or pop/re-insert step is needed.
    frame.insert(0, column, tokens)


add_token_column(df_train, X_train)
add_token_column(df_test, X_test)

In [7]:
# From here on the features are the tokenised-sentence columns (pandas Series)
# and the training labels are the `target` column.
X_train, y_train = df_train['token_sentence'], df_train['target']
X_test = df_test['token_sentence']

## Import word vectors

In [8]:
# Pre-trained Google News word2vec embeddings, stored in binary word2vec format.
# The file location can be overridden with the WORD2VEC_PATH environment variable
# so the notebook is not tied to one machine; the original local path is kept as
# the fallback so existing setups keep working.
# To experiment with other embeddings, point WORD2VEC_PATH at another file in the
# same binary word2vec format. Later cells pass 300 to `average_vecs`, so a
# replacement presumably needs to be 300-dimensional as well (confirm against
# `average_vecs`).
WORD2VEC_PATH = os.environ.get(
    "WORD2VEC_PATH",
    '/Users/ektornikolinakos/working/vscode/comply/models/GoogleNews-vectors-negative300.bin.gz',
)
Model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

## Further tweet cleaning
There are a handful of sentences that have only one tokenized word and have a positive label. I don't see how these would help the model identify disaster tweets, although if we had a larger dataset I could imagine a few cases where tweets containing words such as "help", "disaster" or "armageddon" would be labeled as positives. Because these sentences do not provide useful information, I have decided to remove them; if we had a larger dataset I would remove only the empty tweets.

In [9]:
# Print every tokenised tweet that has fewer than two tokens and a positive label
targets = df_train['target']
for position, tokens in enumerate(df_train['token_sentence']):
    is_short = len(tokens) < 2
    if is_short and targets[position] == 1:
        print(tokens)

['crash']
['jorrynja']
[]
['prob']
['http']


In [10]:
from tweet_func import filter_docs


def _has_fewer_than_two_tokens(text):
    """Predicate handed to `filter_docs`: true for tweets with 0 or 1 tokens."""
    return len(text) < 2


# Intended to remove tweets with 1 or 0 words.
# NOTE(review): `filter_docs` lives in tweet_func and is not visible here. Its
# return value is discarded and X_train / y_train are never rebound, so confirm
# that it filters in place - otherwise later cells still use the unfiltered data.
tweets, labels = [X_train], [y_train]
filter_docs(tweets, labels, _has_fewer_than_two_tokens)

51 rows removed


In [11]:
from tweet_func import filter_docs, has_vector_representation


def _has_vectors_in_model(text):
    """Predicate handed to `filter_docs`: delegates to `has_vector_representation`."""
    return has_vector_representation(Model, text)


# Second filtering pass over the same `tweets` / `labels` containers, this time
# based on whether the tokens are found in the loaded Google News vectors.
# NOTE(review): as with the previous pass, the return value is not captured -
# confirm that `filter_docs` works in place.
filter_docs(tweets, labels, _has_vectors_in_model)

3 rows removed


## Word experimentations

In [None]:
# Stemming
from tweet_func import stemming
stem_train = [stemming(sentence) for sentence in X_train]
stem_test = [stemming(sentence) for sentence in X_test]
# No validation split (`X_valid`) is created in the cells above, so referencing
# it directly raises NameError on a fresh kernel. Stem it only if it has been
# defined; otherwise `stem_valid` is an empty list.
stem_valid = [stemming(sentence) for sentence in globals().get("X_valid", [])]

In [None]:
# Averaged word vectors for the stemmed tokens (this cell does no lemmatization)
from tweet_func import average_vecs


def _average_stemmed(token_lists):
    """Apply `average_vecs` (with `Model` and 300) to each token list, in order."""
    return [average_vecs(tokens, Model, 300) for tokens in token_lists]


trainVecs = _average_stemmed(stem_train)
testVecs = _average_stemmed(stem_test)
validVecs = _average_stemmed(stem_valid)

## Average vectors for each tweet

In [12]:
# Build one averaged vector per tokenised tweet
from tweet_func import average_vecs

# Third argument passed to `average_vecs`; presumably the embedding
# dimensionality of the loaded vectors - confirm against `average_vecs`.
EMBEDDING_DIM = 300
trainVecs = [average_vecs(tokens, Model, EMBEDDING_DIM) for tokens in X_train]
testVecs = [average_vecs(tokens, Model, EMBEDDING_DIM) for tokens in X_test]

## Decision Tree

In [14]:
# Baseline decision tree, used to check how the submission workflow behaves
from sklearn.tree import DecisionTreeClassifier

# `fit` returns the fitted estimator, so construction and training can be chained.
clf = DecisionTreeClassifier(random_state=42).fit(trainVecs, y_train)
y_pred = clf.predict(testVecs)

**F1 score (Decision Tree):** 0.68617

## Random Forest with random grid search

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score

# `distributions` maps each random-forest hyperparameter to its candidate values.
# The randomised search samples 10 combinations of them (n_iter=10) and scores
# every candidate with 3-fold cross-validation using the F1 metric.

random_forest = RandomForestClassifier(random_state=42)
distributions = {
    "n_estimators": range(100, 2000, 200),
    "max_depth": range(5, 100, 5),
}
clf = RandomizedSearchCV(
    estimator=random_forest,
    param_distributions=distributions,
    n_iter=10,
    cv=3,
    scoring="f1",
    random_state=42,
)
clf.fit(trainVecs, y_train)

# Predict the test set with the fitted search object
y_pred = clf.predict(testVecs)

**F1 score (Random Forest with randomised search):** 0.79037

## Deep Learning - Feed forward neural network

## Submission area

In [30]:
# Write the most recent predictions (`y_pred`) into the `target` column of the
# sample-submission frame; this modifies `df_sample` in place.
# NOTE(review): assumes the rows of df_sample are in the same order as df_test,
# since y_pred was predicted from the test vectors - TODO confirm.
df_sample["target"] = y_pred
df_sample.shape  # displayed as a quick check of the frame's dimensions

(3263, 2)

In [31]:
# Save the submission frame to submission.csv, leaving out the DataFrame index.
df_sample.to_csv("submission.csv", index=False)