In [None]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')

In [40]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 200)
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from Help_Funs import count_chars, count_words, count_capital_chars, count_capital_words, count_sent, count_unique_words, count_stopwords, count_hashtags 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

## Defining files names
file_key_1 = 'NLP-Disaster-Tweets/train.csv'
file_key_2 = 'NLP-Disaster-Tweets/test.csv'
file_key_3 = 'NLP-Disaster-Tweets/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
sample = pd.read_csv(file_content_stream_3)

# Basic Exploration

In [6]:
train['target'].value_counts() / train.shape[0]

0    0.57034
1    0.42966
Name: target, dtype: float64

In [13]:
train['keyword'].value_counts()

fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [14]:
test['keyword'].value_counts()

deluged               23
demolished            22
rubble                22
first%20responders    21
seismic               21
                      ..
threat                 5
fatalities             5
forest%20fire          5
inundation             4
epicentre              1
Name: keyword, Length: 221, dtype: int64

In [16]:
np.isin(test['keyword'].unique(), train['keyword'].unique())

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [22]:
test['keyword'].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [23]:
train['keyword'].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [26]:
train['location'].value_counts()

USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: location, Length: 3341, dtype: int64

In [27]:
test['location'].value_counts()

New York                  38
USA                       37
Worldwide                 16
United States             15
London                    13
                          ..
Medford, NJ                1
Quezon City                1
LanÌ¼s                     1
USA,Washington,Seattle     1
Brussels, Belgium          1
Name: location, Length: 1602, dtype: int64

In [28]:
np.isin(test['location'].unique(), train['location'].unique())

array([False,  True, False, ..., False,  True, False])

# Basic Feature Engineering 

In [41]:
train = train[['keyword', 'text', 'target']]
test = test[['id', 'keyword', 'text']]

train['char_count'] = train['text'].apply(lambda x: count_chars(x))
train['word_count'] = train['text'].apply(lambda x: count_words(x))
train['sent_count'] = train['text'].apply(lambda x: count_sent(x))
train['capital_char_count'] = train['text'].apply(lambda x: count_capital_chars(x))
train['capital_word_count'] = train['text'].apply(lambda x: count_capital_words(x))
# train['quoted_word_count'] = train['text'].apply(lambda x: count_words_in_quotes(x))
train['stopword_count'] = train['text'].apply(lambda x: count_stopwords(x))
train['unique_word_count'] = train['text'].apply(lambda x: count_unique_words(x))
                                                 
test['char_count'] = test['text'].apply(lambda x: count_chars(x))
test['word_count'] = test['text'].apply(lambda x: count_words(x))
test['sent_count'] = test['text'].apply(lambda x: count_sent(x))
test['capital_char_count'] = test['text'].apply(lambda x: count_capital_chars(x))
test['capital_word_count'] = test['text'].apply(lambda x: count_capital_words(x))
# test['quoted_word_count'] = test['text'].apply(lambda x: count_words_in_quotes(x))
test['stopword_count'] = test['text'].apply(lambda x: count_stopwords(x))
test['unique_word_count'] = test['text'].apply(lambda x: count_unique_words(x))
                                                 
## Average word length
train['avg_wordlength'] = train['char_count'] / train['word_count']
test['avg_wordlength'] = test['char_count'] / test['word_count']

## Average sentence lenght
train['avg_sentlength'] = train['word_count'] / train['sent_count']
test['avg_sentlength'] = test['word_count'] / test['sent_count']

## Unique words vs count words
train['unique_vs_words'] = train['unique_word_count'] / train['word_count']
test['unique_vs_words'] = test['unique_word_count'] / test['word_count']

## stopwords vs count words
train['stopwords_vs_words'] = train['stopword_count'] / train['word_count']
test['stopwords_vs_words'] = test['stopword_count'] / test['word_count']

# Baseline Model: Random Forest

In [42]:
## Defining input and target variables 
X = train.drop(columns = ['keyword', 'text', 'target'], axis = 1)
Y = train['target']

RF_param_grid = {'n_estimators': [100, 300, 500],
                 'max_features': [3, 4, 5],
                 'max_depth': [3, 5, 7],
                 'min_samples_split': [5, 7, 9],
                 'min_samples_leaf': [5, 7, 9]}
        
## Running leave-one-out cross validation 
RF_grid_search = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 5, scoring = 'f1', n_jobs = -1, verbose = 3).fit(X, Y)

## Printing the best hyper-parameter combination
print(RF_grid_search.best_params_)

## Extraciting the best model 
RF_md = RF_grid_search.best_estimator_

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV 2/5] END max_depth=3, max_features=3, min_samples_leaf=5, min_samples_split=5, n_estimators=300;, score=0.551 total time=   1.9s
[CV 1/5] END max_depth=3, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=100;, score=0.497 total time=   0.5s
[CV 4/5] END max_depth=3, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=100;, score=0.597 total time=   0.5s
[CV 2/5] END max_depth=3, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=300;, score=0.549 total time=   1.5s
[CV 4/5] END max_depth=3, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=500;, score=0.591 total time=   3.1s
[CV 1/5] END max_depth=3, max_features=3, min_samples_leaf=7, min_samples_split=5, n_estimators=100;, score=0.501 total time=   0.5s
[CV 3/5] END max_depth=3, max_features=3, min_samples_leaf=7, min_samples_split=5, n_estimators=100;, score=0.575 total time=   0.5s
[CV 1

In [45]:
## Printing the best hyper-parameter combination
print(RF_grid_search.best_params_)

## Printing best F1-score
print(RF_grid_search.best_score_)

{'max_depth': 7, 'max_features': 5, 'min_samples_leaf': 7, 'min_samples_split': 9, 'n_estimators': 100}
0.5777967006001864


In [46]:
## Extraciting the best model 
RF_md = RF_grid_search.best_estimator_

## Predicting on train to estimate cutoff based on precision-recall curve
RF_pred = RF_md.predict_proba(X)[:, 1]

RF_pred

array([0.31321896, 0.18545234, 0.41631674, ..., 0.49013988, 0.46691624,
       0.64951801])

In [47]:
RF_md.predict(X)

array([0, 0, 0, ..., 0, 0, 1])