### Import libraries

In [703]:
import numpy as np
import pandas as pd
import pickle

import torch
import transformers

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

### User settings

In [704]:
PATH = "C:/Users/giuma/Dropbox/Web and Social media Analytics Project/"

do_FE = False # flag for the Feature Extraction process (set False when .pkl file can be imported)

### Import Cleaned Dataset

In [710]:
df_train = pd.read_pickle("train_nlp_disaster.pkl")
df_test = pd.read_pickle("test_nlp_disaster.pkl")

Wall time: 14 ms


In [711]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target,target_relabeled,text_cleaned
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,1,Our Deeds are the Reason of this # earthquake...
1,4,,,Forest fire near La Ronge Sask. Canada,1,1,Forest fire near La Ronge Sask . Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,1,All residents asked to ' shelter in place ' ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,1,"13,000 people receive # wildfires evacuation ..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,1,Just got sent this photo from Ruby # Alaska a...


### Feature Extraction

Use of DistilBERT (lighter version of BERT) for approaching Transfer Learning

In [6]:
# Load pretrained model/tokenizer for DistilBERT:
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = transformers.DistilBertModel.from_pretrained('distilbert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




In [86]:
def prepare_for_classifier(text, column = None):
    """
        Args:
        
        text: may be either a DataFrame (containing multiple tweets) or a string (single tweet)
    """
    
    ### Tokenization ###
    if ((isinstance(text, pd.DataFrame)) | (isinstance(text, pd.Series)))\
        & (column is not None):
        
        tokenized = text[column].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
        
    elif isinstance(text, str):
        
        tokenized = tokenizer.encode(text, add_special_tokens=True)
        tokenized = pd.Series([tokenized])  
    else:
        print('NOT RECOGNIZED DATA TYPE')
    
    # Output of tokenization is a pd.Series, with len equal to the number of tweets.
    # Each element of the series is a list which contains the tokenized tweet.
    
    
    ### Padding ###
    max_len = tokenized.apply(lambda x: len(x)).max()
    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    
    
    ### Masking ###
    attention_mask = np.where(padded != 0 , 1, 0)
    attention_mask.shape
    
    # This mask contains 1 where values of the padded tokenized series is different from 0 (i.e. where no padding was applied)
    # and 0 elsewhere.
    
    
    ### Features extraction ###
    input_ids = torch.LongTensor(padded) #.tensor(padded)
    attention_mask = torch.LongTensor(attention_mask) #.tensor(attention_mask)

    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask = attention_mask) # very long computation!!!
    
    features = last_hidden_states[0][:,0,:].numpy()
    
    # [:,0,:] -> allows to take only the features devoted to a classification task
    
    return features

In [714]:
if do_FE:
    # Extract features for classification (very heavy computation!!!)
    features = prepare_for_classifier(df_train, 'text_cleaned')

    # Save to pickle
    with open('features.pkl', 'wb') as handle:
        pickle.dump(features, handle, protocol = 4)

else:
    # Import pickle with features
    with open('features.pkl', 'rb') as handle:
        features = pickle.load(handle)


## Classification models

The derived features are used in order to train a series of classification models, which are finally ensembled by a stacking classifier (Supervised Learning)

In [715]:
labels = df_train['target_relabeled']

In [716]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=42)

In [717]:
print(f"Features Train: {X_train.shape}")
print(f"Target Train: {y_train.shape}\n")

print(f"Features Test: {X_test.shape}")
print(f"Target Test: {y_test.shape}")

Features Train: (5709, 768)
Target Train: (5709,)

Features Test: (1904, 768)
Target Test: (1904,)


### Random Forest Classifier

In [96]:
model_rf = RandomForestClassifier()
parameters = {
    'bootstrap': [True],
    'ccp_alpha': [0.01, 0.001],
    'criterion': ["entropy"],
    'max_depth': [5,6,7,8,9,10],
    'max_features': [70],
    'min_samples_leaf': [1],
    'min_samples_split': [2],
    'class_weight': ['balanced'],
    'n_estimators': [100]
}

In [97]:
gs = GridSearchCV(model_rf, parameters, cv=3, scoring = 'f1', verbose=10, n_jobs=-1, refit=True)

In [98]:
gs = gs.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   51.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   51.3s finished


In [99]:
print("** GRIDSEARCH RESULTS **")
print(f"Best score: {gs.best_score_} using {gs.best_params_}")

** GRIDSEARCH RESULTS **
Best score: 0.7275985008312129 using {'bootstrap': True, 'ccp_alpha': 0.01, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 10, 'max_features': 60, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [101]:
best_model = gs.best_estimator_
y_pred = best_model.predict(X_test)

In [102]:
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

In [172]:
print(f"** F1 Score **")
print(f"Train: {f1_score(y_train, y_pred_train):0.2f}")
print(f"Test: {f1_score(y_test, y_pred_test):0.2f}\n")

print(f"** Accuracy **")
print(f"Baseline - all zeroes: {accuracy_score(y_test, np.zeros(y_test.shape[0])):0.2f}")
print(f"Train: {accuracy_score(y_train, y_pred_train):0.2f}")
print(f"Test: {accuracy_score(y_test, y_pred_test):0.2f}")

** F1 Score **
Train: 0.74
Test: 0.72

** Accuracy **
Baseline - all zeroes: 0.57
Train: 0.79
Test: 0.78


### XGB Classifier

In [284]:
xgb_train = X_train
xgb_val = X_test[:952]
xgb_test = X_test[952:]

xgb_y_train = y_train
xgb_y_validation = y_test[:952]
xgb_y_test = y_test[952:]

In [292]:
xgb_model = XGBClassifier(learning_rate=0.1,
                          n_estimators=140,
                          max_depth=5,
                          reg_alpha=0.1,
                          reg_lambda=0.3,
                          min_child_weight=3,
                          gamma=0.2,
                          subsample=0.6,
                          colsample_bytree=1.0,
                          objective='binary:logistic',
                          nthread=4,
                          scale_pos_weight=1,
                          seed=42)
xgb_model.fit(xgb_train, 
              xgb_y_train,
              eval_set=[(xgb_train, xgb_y_train), (xgb_val, xgb_y_validation)], 
              verbose=20, 
              early_stopping_rounds=200)

[0]	validation_0-error:0.21300	validation_1-error:0.25735
Multiple eval metrics have been passed: 'validation_1-error' will be used for early stopping.

Will train until validation_1-error hasn't improved in 200 rounds.
[20]	validation_0-error:0.13558	validation_1-error:0.20168
[40]	validation_0-error:0.10194	validation_1-error:0.19748
[60]	validation_0-error:0.07637	validation_1-error:0.19958
[80]	validation_0-error:0.05027	validation_1-error:0.19013
[100]	validation_0-error:0.03678	validation_1-error:0.17752
[120]	validation_0-error:0.02470	validation_1-error:0.18698
[139]	validation_0-error:0.01822	validation_1-error:0.18802


XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=3, missing=nan, monotone_constraints=None,
              n_estimators=140, n_jobs=4, nthread=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0.1,
              reg_lambda=0.3, scale_pos_weight=1, seed=42, subsample=0.6,
              tree_method=None, validate_parameters=False, verbosity=None)

In [293]:
xgb_train_pred = xgb_model.predict(xgb_train)
xgb_val_pred = xgb_model.predict(xgb_val)
xgb_test_pred = xgb_model.predict(xgb_test)

In [294]:
print(f'>>> XGB Model:')
print(f'Train f1: {f1_score(xgb_y_train, xgb_train_pred)}')
print(f'Validation f1: {f1_score(xgb_y_validation, xgb_val_pred)}')
print(f'Test f1: {f1_score(xgb_y_test, xgb_test_pred)}\n')

>>> XGB Model:
Train f1: 0.9560669456066946
Validation f1: 0.7773386034255599
Test f1: 0.7643979057591624



### Ridge Classifier

In [331]:
ridge_clf = RidgeClassifier()
ridge_params = {'alpha': np.arange(7,15),
                'class_weight': [None],
                'copy_X': [True],
                'fit_intercept': [True],
                'max_iter': [None],
                'normalize': [False],
                'random_state': [42],
                'solver': ['auto'],
                'tol': [0.001]}

In [359]:
ridge_gs = GridSearchCV(ridge_clf, ridge_params, cv=10, scoring = 'f1', verbose=10, n_jobs=-1, refit=True)

In [363]:
ridge_gs = ridge_gs.fit(X_train, y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   17.2s finished


In [364]:
print("** GRIDSEARCH RESULTS **")
print(f"Best score: {ridge_gs.best_score_} using {ridge_gs.best_params_}")

** GRIDSEARCH RESULTS **
Best score: 0.7678875580941108 using {'alpha': 9, 'class_weight': None, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'normalize': False, 'random_state': 42, 'solver': 'auto', 'tol': 0.001}


In [335]:
best_ridge = ridge_gs.best_estimator_

In [336]:
y_pred_train = best_ridge.predict(X_train)
y_pred_test = best_ridge.predict(X_test)

In [339]:
print(f"** F1 Score **")
print(f"Train: {f1_score(y_train, y_pred_train):0.3f}")
print(f"Test: {f1_score(y_test, y_pred_test):0.3f}\n")

print(f"** Accuracy **")
print(f"Baseline - all zeroes: {accuracy_score(y_test, np.zeros(y_test.shape[0])):0.3f}")
print(f"Train: {accuracy_score(y_train, y_pred_train):0.3f}")
print(f"Test: {accuracy_score(y_test, y_pred_test):0.3f}")

** F1 Score **
Train: 0.807
Test: 0.775

** Accuracy **
Baseline - all zeroes: 0.574
Train: 0.847
Test: 0.820


### Naive Bayes

In [None]:
gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)

In [342]:
y_pred_train = gnb_clf.predict(X_train)
y_pred_test = gnb_clf.predict(X_test)

In [343]:
print(f"** F1 Score **")
print(f"Train: {f1_score(y_train, y_pred_train):0.3f}")
print(f"Test: {f1_score(y_test, y_pred_test):0.3f}\n")

print(f"** Accuracy **")
print(f"Baseline - all zeroes: {accuracy_score(y_test, np.zeros(y_test.shape[0])):0.3f}")
print(f"Train: {accuracy_score(y_train, y_pred_train):0.3f}")
print(f"Test: {accuracy_score(y_test, y_pred_test):0.3f}")

** F1 Score **
Train: 0.696
Test: 0.715

** Accuracy **
Baseline - all zeroes: 0.574
Train: 0.751
Test: 0.767


### Support Vector Classifier

In [361]:
svc_clf = SVC()
svc_params = {'C': [0.001, 0.01, 0.1, 1],
              'class_weight': [None],
              'coef0': [0.0],
              'degree': [3],
              'gamma': ['scale'],
              'kernel': ['rbf', 'linear'],
              'max_iter': [-1],
              'probability': [False],
              'random_state': [42],
              'shrinking': [True],
              'tol': [0.001],
              'verbose': [False]}

In [362]:
svc_gs = GridSearchCV(svc_clf, svc_params, cv=3, scoring = 'f1', verbose=10, n_jobs=-1, refit=True)

In [365]:
svc_gs = svc_gs.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  20 out of  24 | elapsed:  4.1min remaining:   49.0s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.5min finished


In [369]:
print("** GRIDSEARCH RESULTS **")
print(f"Best score: {svc_gs.best_score_} using {svc_gs.best_params_}")

** GRIDSEARCH RESULTS **
Best score: 0.7603890665338128 using {'C': 0.1, 'class_weight': None, 'coef0': 0.0, 'degree': 3, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [370]:
best_svc = svc_gs.best_estimator_

In [371]:
y_pred_train = best_svc.predict(X_train)
y_pred_test = best_svc.predict(X_test)

In [372]:
print(f"** F1 Score **")
print(f"Train: {f1_score(y_train, y_pred_train):0.3f}")
print(f"Test: {f1_score(y_test, y_pred_test):0.3f}\n")

print(f"** Accuracy **")
print(f"Baseline - all zeroes: {accuracy_score(y_test, np.zeros(y_test.shape[0])):0.3f}")
print(f"Train: {accuracy_score(y_train, y_pred_train):0.3f}")
print(f"Test: {accuracy_score(y_test, y_pred_test):0.3f}")

** F1 Score **
Train: 0.801
Test: 0.781

** Accuracy **
Baseline - all zeroes: 0.574
Train: 0.844
Test: 0.828


### Ensemble - Stacking

In [373]:
estimators = [
    ('ridge', best_ridge),
    ('gnb', gnb_clf),
    ('svc', best_svc)
]
stacking = StackingClassifier(estimators=estimators,
                              final_estimator=LogisticRegression())

In [374]:
stacking.fit(X_train, y_train)

StackingClassifier(cv=None,
                   estimators=[('ridge',
                                RidgeClassifier(alpha=9, class_weight=None,
                                                copy_X=True, fit_intercept=True,
                                                max_iter=None, normalize=False,
                                                random_state=42, solver='auto',
                                                tol=0.001)),
                               ('gnb',
                                GaussianNB(priors=None, var_smoothing=1e-09)),
                               ('svc',
                                SVC(C=0.1, break_ties=False, cache_size=200,
                                    class_weight=None, coef0=0.0,
                                    decision_function_shape='ovr...
                                    shrinking=True, tol=0.001,
                                    verbose=False))],
                   final_estimator=LogisticRegression(C=1.0, class_w

In [375]:
y_pred_train = stacking.predict(X_train)
y_pred_test = stacking.predict(X_test)

In [377]:
print(f"** F1 Score **")
print(f"Train: {f1_score(y_train, y_pred_train):0.2f}")
print(f"Test: {f1_score(y_test, y_pred_test):0.2f}\n")

print(f"** Accuracy **")
print(f"Baseline - all zeroes: {accuracy_score(y_test, np.zeros(y_test.shape[0])):0.2f}")
print(f"Train: {accuracy_score(y_train, y_pred_train):0.2f}")
print(f"Test: {accuracy_score(y_test, y_pred_test):0.2f}")

** F1 Score **
Train: 0.81
Test: 0.79

** Accuracy **
Baseline - all zeroes: 0.57
Train: 0.85
Test: 0.83


### Saving the model

In [382]:
with open('stacking_model.pkl', 'wb') as handle:
    pickle.dump(stacking, handle, protocol = 4)