# Introduction

**This notebook linear model part is based on the tutorial notebook**

https://www.kaggle.com/philculliton/nlp-getting-started-tutorial

**The sections for RidgeClassifier and XGBClassifier do not contribute to the final score but they are alternative models for this problem**

**The BERT section contributes to the final score**

In [1]:
import os
import numpy as np
import pandas as pd
import string
import re
from collections import Counter
from sklearn import feature_extraction, linear_model, model_selection, preprocessing, decomposition
import xgboost as xgb 

import IPython
import contractions
from datetime import datetime

In [2]:
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')

In [5]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
print(len(train_df), len(test_df))

7613 3263


In [101]:
tmp = train_df.apply(lambda x: [x["text"]] + [x["keyword"]] if pd.notnull(x["keyword"]) else x["clean_text"], axis=1)
tmp.loc[tmp.apply(len) == 2].tail().to_list()

[["#FX #forex #trading Cramer: Iger's 3 words that wrecked Disney's stock http://t.co/7enNulLKzM",
  'wrecked'],
 ['@engineshed Great atmosphere at the British Lion gig tonight. Hearing is wrecked. http://t.co/oMNBAtJEAO',
  'wrecked'],
 ["Cramer: Iger's 3 words that wrecked Disney's stock - CNBC http://t.co/N6RBnHMTD4",
  'wrecked'],
 ['siren', 'everywhere'],
 ['omg', 'earthquake']]

# Text preprocessing for linear models

In [129]:
def clean_text(df, col='text', normalize='lemmatize', preserve_case=False, stopwords=True, add_kw=False, add_loc=False):
    twt = nltk.tokenize.TweetTokenizer(preserve_case=preserve_case, strip_handles=True, reduce_len=True)
    stop = nltk.corpus.stopwords.words("english")
    # string operations
    clean_text = df.text
    clean_text = clean_text.str.replace("%20", " ")

    clean_text = clean_text.apply(twt.tokenize)
    # tokens operations
    clean_text = clean_text.apply(lambda l: [word for word in l if "http" not in word])
    clean_text = clean_text.apply(lambda l: " ".join([contractions.fix(word) for word in l]).split())
    clean_text = clean_text.apply(lambda l: [re.sub(r'[^A-Za-z]', '', word) for word in l])
    clean_text = clean_text.apply(lambda l: [word for word in l if word != ""])

    if stopwords:
        clean_text = clean_text.apply(lambda l: [word for word in l if word not in stop])
    if normalize == 'lemmatize':
        lemmatizer = nltk.stem.WordNetLemmatizer()
        clean_text = clean_text.apply(lambda l: [lemmatizer.lemmatize(word) for word in l])
    if normalize == 'stem':
        stemmer = nltk.stem.PorterStemmer()
        clean_text = clean_text.apply(lambda l: [stemmer.stem(word) for word in l])
    
    df["clean_text"] = clean_text
    if add_kw:
        df["clean_text"] = df.apply(lambda x: x["clean_text"] + [x["keyword"]] if pd.notnull(x["keyword"]) else x["clean_text"], axis=1)

    df["clean_text"] = df["clean_text"].apply(lambda l: " ".join(l))
    return df

train_df = clean_text(train_df, add_kw=True, add_loc=False)
test_df = clean_text(test_df, add_kw=True, add_loc=False)

In [133]:
disaster_text = [word for ls in train_df.loc[train_df.target==1].clean_text.str.split().to_list() if ls is not None for word in ls]
non_disaster_text = [word for ls in train_df.loc[train_df.target==0].clean_text.str.split().to_list() if ls is not None for word in ls]

In [134]:
disaster_common = pd.DataFrame(Counter(disaster_text).most_common(20))
non_disaster_common = pd.DataFrame(Counter(non_disaster_text).most_common(20))
pd.concat([disaster_common, non_disaster_common], axis=1)

Unnamed: 0,0,1,0.1,1.1
0,fire,279,like,255
1,news,144,get,185
2,disaster,136,new,170
3,police,128,would,144
4,via,121,one,137
5,california,115,body,117
6,people,114,fire,115
7,storm,111,want,111
8,suicide,110,going,107
9,wildfire,108,u,105


In [137]:
display(train_df.loc[train_df.clean_text == ""])
display(test_df.loc[test_df.clean_text == ""])

Unnamed: 0,id,keyword,location,text,target,clean_text


Unnamed: 0,id,keyword,location,text,clean_text
13,43,,,What if?!,


In [138]:
display(train_df.sample(frac=1).head(10))
display(test_df.sample(frac=1).head(10))

Unnamed: 0,id,keyword,location,text,target,clean_text
769,1113,blew%20up,USA/SO FLORIDA via BROOKLYN NY,The 1st time someone blew up my phone 30 times...,0,st time someone blew phone time would blocked ...
2217,3174,deluge,eARth 3,the fifth pre-dynastic #king in the legendary ...,0,fifth predynastic king legendary period deluge...
4795,6821,loud%20bang,"Wandsworth, London",@SW_Trains strange loud impact bang noises und...,0,strange loud impact bang noise train epsom arr...
117,170,aftershock,dope show,@KJForDays I'm seeing them and Issues at after...,0,seeing issue aftershock aftershock
631,911,bioterrorism,"Budapest, Hungary",How about a book describing the future of ther...,0,book describing future therapy technology spor...
3412,4883,explode,"Oklahoma City, OK",my brain id about to explode lmao,0,brain id explode lmao explode
5558,7933,rainstorm,,Robot_Rainstorm: We have two vacancies on the ...,0,robotrainstorm two vacancy castle fantasy foot...
4844,6898,mass%20murder,Auckland,Hiroshima: 70 years since the worst mass murde...,1,hiroshima year since worst mass murder human h...
6640,9508,terrorist,Iraq|Afghanistan| RSA |Baghdad,Seek help warra #MetroFmTalk,0,seek help warra metrofmtalk terrorist
5454,7780,police,"Mesa, AZ",@ArizonaDOT Price Rd North bound closed from U...,1,price rd north bound closed university rio sal...


Unnamed: 0,id,keyword,location,text,clean_text
1920,6476,injured,| INDIA |,RT- Udhampur terror attack: Militants attack p...,rt udhampur terror attack militant attack poli...
757,2472,collided,,@mollywood I agree! I didn't know you had move...,agree know moved marketplace woke report thoug...
313,1019,blazing,"Montreal,QC",#website #hosting Get blazing speeds professio...,website hosting get blazing speed professional...
220,711,attacked,1937 Germany,Christian Attacked by Muslims at the Temple Mo...,christian attacked muslim temple mount waving ...
859,2814,cyclone,"Quezon City, PHILIPPINES",SEVERE WEATHER BULLETIN No. 6\nFOR: TYPHOON Û...,severe weather bulletin typhoon hannaph soudel...
1832,6189,hijacker,Over the Hills and Far Away,Remove the http://t.co/zmoKZZf4qp and Linkury ...,remove linkury browser hijacker pita hijack br...
202,650,attack,Maryland,End the Innovation Catch-22: Reduce the Attack...,end innovation catch reduce attack surface ind...
2896,9588,thunder,Journey,I need a thunder buddy. ????,need thunder buddy thunder
3242,10792,wrecked,"Plymouth, England",Almost *wrecked* my van the other day because ...,almost wrecked van day guy yeah brake also car...
315,1027,blazing,Your screen,S3XLEAK!!!\nPh0tos of 19yrs old Ash@wo lady in...,sxleak phtos yr old ash wo lady festac town de...


# Count and Vectorize approach (1-gram)

In [139]:
feature_col = "clean_text"

count_vectorizer = feature_extraction.text.CountVectorizer()
count_vectorizer_sw = feature_extraction.text.CountVectorizer()
tfidf = feature_extraction.text.TfidfVectorizer()
LSA = decomposition.TruncatedSVD(n_components=100)

## let's get counts for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:50])
example_train_vectors_sw = count_vectorizer_sw.fit_transform(train_df[feature_col][0:50])
example_tfidf = tfidf.fit_transform(train_df[feature_col][0:50])
example_tfidf_lsa = LSA.fit_transform(example_tfidf)

In [9]:
## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print('No cleaning')
print(example_train_vectors[0].todense().shape)
# print(example_train_vectors[0].todense())
print('Cleaned')
print(example_train_vectors_sw[0].todense().shape)
# print(example_train_vectors_sw[0].todense())
print('TF-IDF cleaned')
print(example_tfidf[0].todense().shape)
# print(example_tfidf[0].todense())
print('TF-IDF + LSA cleaned')
print(example_tfidf_lsa[0].shape)
# print(example_tfidf_lsa[0])

No cleaning
(1, 505)
Cleaned
(1, 350)
TF-IDF cleaned
(1, 350)
TF-IDF + LSA cleaned
(50,)


In [140]:
train_vectors = count_vectorizer.fit_transform(train_df[feature_col])
train_vectors_sw = count_vectorizer_sw.fit_transform(train_df[feature_col])
train_tfidf = tfidf.fit_transform(train_df[feature_col])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df[feature_col])
test_vectors_sw = count_vectorizer_sw.transform(test_df[feature_col])
test_tfidf = tfidf.transform(test_df[feature_col])

In [141]:
train_tfidf_lsa = LSA.fit_transform(train_tfidf)
test_tfidf_lsa = LSA.transform(test_tfidf)

# **Linear Model: Ridge Classifier**

In [160]:
clf = linear_model.RidgeClassifier(class_weight='balanced')
ridge_params = {
    "alpha": np.linspace(1e-5, 10, 1000),
}
ridge_rscv = model_selection.RandomizedSearchCV(clf, ridge_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, n_iter=500)
ridge_rscv_lsa = model_selection.RandomizedSearchCV(clf, ridge_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, n_iter=500)

In [161]:
search = ridge_rscv.fit(train_tfidf, train_df["target"])
search_lsa = ridge_rscv_lsa.fit(train_tfidf_lsa, train_df["target"])

print("Best RidgeClassifier TF-IDF")
print(search.best_score_)
print(search.best_params_)
print("Best RidgeClassifier TF-IDF LSA")
print(search_lsa.best_score_)
print(search_lsa.best_params_)

Best RidgeClassifier TF-IDF
0.5707416575235432
{'alpha': 9.1992}
Best RidgeClassifier TF-IDF LSA
0.5935989390810642
{'alpha': 0.6106199999999999}


In [162]:
scores_tfidf = model_selection.cross_validate(clf, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

scores_tfidf_lsa = model_selection.cross_validate(clf, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

print("RidgeClassifier TF-IDF F1:              ", np.mean(scores_tfidf['test_f1']))
print('RidgeClassifier TF-IDF & LSA F1:        ', np.mean(scores_tfidf_lsa['test_f1']))
print("RidgeClassifier TF-IDF Precision:       ", np.mean(scores_tfidf['test_precision']))
print('RidgeClassifier TF-IDF & LSA Precision: ', np.mean(scores_tfidf_lsa['test_precision']))
print('RidgeClassifier TF-IDF Recall:          ', np.mean(scores_tfidf['test_recall']))
print('RidgeClassifier TF-IDF & LSA Recall:    ', np.mean(scores_tfidf_lsa['test_recall']))

RidgeClassifier TF-IDF F1:               0.5601837450851008
RidgeClassifier TF-IDF & LSA F1:         0.5882112343508724
RidgeClassifier TF-IDF Precision:        0.5887007448455814
RidgeClassifier TF-IDF & LSA Precision:  0.6096595652069612
RidgeClassifier TF-IDF Recall:           0.5487905315498284
RidgeClassifier TF-IDF & LSA Recall:     0.5811854238158601


In [163]:
scores_tfidf = model_selection.cross_validate(ridge_rscv.best_estimator_, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])
scores_tfidf_lsa = model_selection.cross_validate(ridge_rscv_lsa.best_estimator_, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

print("RidgeClassifier TF-IDF F1:              ", np.mean(scores_tfidf['test_f1']))
print('RidgeClassifier TF-IDF & LSA F1:        ', np.mean(scores_tfidf_lsa['test_f1']))
print("RidgeClassifier TF-IDF Precision:       ", np.mean(scores_tfidf['test_precision']))
print('RidgeClassifier TF-IDF & LSA Precision: ', np.mean(scores_tfidf_lsa['test_precision']))
print('RidgeClassifier TF-IDF Recall:          ', np.mean(scores_tfidf['test_recall']))
print('RidgeClassifier TF-IDF & LSA Recall:    ', np.mean(scores_tfidf_lsa['test_recall']))

RidgeClassifier TF-IDF F1:               0.5707416575235432
RidgeClassifier TF-IDF & LSA F1:         0.5935989390810642
RidgeClassifier TF-IDF Precision:        0.6431236917691825
RidgeClassifier TF-IDF & LSA Precision:  0.6065785449418388
RidgeClassifier TF-IDF Recall:           0.5252431309382076
RidgeClassifier TF-IDF & LSA Recall:     0.5943343371384551


# **GBDT: XGB Classifier**
Turns out not as good as ridge

In [166]:
xgb_clf = xgb.XGBClassifier(random_state=765)
xgb_params = {
    "max_depth": [i for i in range(4, 14)],
    "min_child_weight": np.linspace(0.25, 0.45, 100),
    "gamma": np.linspace(0, 0.015, 1000),
    "learning_rate": np.linspace(0.2, 0.5, 100),
}
xgb_rscv = model_selection.RandomizedSearchCV(xgb_clf, xgb_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, verbose=2)
xgb_rscv_lsa = model_selection.RandomizedSearchCV(xgb_clf, xgb_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, verbose=2)

In [167]:
search = xgb_rscv.fit(train_tfidf, train_df["target"])
search_lsa = xgb_rscv_lsa.fit(train_tfidf_lsa, train_df["target"])

print("Best XGBClassifier TF-IDF")
print(search.best_score_)
print(search.best_params_)
print("Best XGBClassifier TF-IDF LSA")
print(search_lsa.best_score_)
print(search_lsa.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  if is_sparse(data):


[CV] END gamma=0.007357357357357357, learning_rate=0.4878787878787879, max_depth=7, min_child_weight=0.3893939393939394; total time=   0.6s


  if is_sparse(data):


[CV] END gamma=0.007357357357357357, learning_rate=0.4878787878787879, max_depth=7, min_child_weight=0.3893939393939394; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.007357357357357357, learning_rate=0.4878787878787879, max_depth=7, min_child_weight=0.3893939393939394; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.007357357357357357, learning_rate=0.4878787878787879, max_depth=7, min_child_weight=0.3893939393939394; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.007357357357357357, learning_rate=0.4878787878787879, max_depth=7, min_child_weight=0.3893939393939394; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.00487987987987988, learning_rate=0.3090909090909091, max_depth=9, min_child_weight=0.28434343434343434; total time=   0.6s


  if is_sparse(data):


[CV] END gamma=0.00487987987987988, learning_rate=0.3090909090909091, max_depth=9, min_child_weight=0.28434343434343434; total time=   0.6s


  if is_sparse(data):


[CV] END gamma=0.00487987987987988, learning_rate=0.3090909090909091, max_depth=9, min_child_weight=0.28434343434343434; total time=   0.6s


  if is_sparse(data):


[CV] END gamma=0.00487987987987988, learning_rate=0.3090909090909091, max_depth=9, min_child_weight=0.28434343434343434; total time=   0.6s


  if is_sparse(data):


[CV] END gamma=0.00487987987987988, learning_rate=0.3090909090909091, max_depth=9, min_child_weight=0.28434343434343434; total time=   0.6s


  if is_sparse(data):


[CV] END gamma=0.0015015015015015015, learning_rate=0.3878787878787879, max_depth=11, min_child_weight=0.44393939393939397; total time=   0.7s


  if is_sparse(data):


[CV] END gamma=0.0015015015015015015, learning_rate=0.3878787878787879, max_depth=11, min_child_weight=0.44393939393939397; total time=   0.7s


  if is_sparse(data):


[CV] END gamma=0.0015015015015015015, learning_rate=0.3878787878787879, max_depth=11, min_child_weight=0.44393939393939397; total time=   0.7s


  if is_sparse(data):


[CV] END gamma=0.0015015015015015015, learning_rate=0.3878787878787879, max_depth=11, min_child_weight=0.44393939393939397; total time=   0.7s


  if is_sparse(data):


[CV] END gamma=0.0015015015015015015, learning_rate=0.3878787878787879, max_depth=11, min_child_weight=0.44393939393939397; total time=   0.7s


  if is_sparse(data):


[CV] END gamma=0.00996996996996997, learning_rate=0.2363636363636364, max_depth=6, min_child_weight=0.2601010101010101; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.00996996996996997, learning_rate=0.2363636363636364, max_depth=6, min_child_weight=0.2601010101010101; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.00996996996996997, learning_rate=0.2363636363636364, max_depth=6, min_child_weight=0.2601010101010101; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.00996996996996997, learning_rate=0.2363636363636364, max_depth=6, min_child_weight=0.2601010101010101; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.00996996996996997, learning_rate=0.2363636363636364, max_depth=6, min_child_weight=0.2601010101010101; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.002747747747747748, learning_rate=0.4696969696969697, max_depth=6, min_child_weight=0.445959595959596; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.002747747747747748, learning_rate=0.4696969696969697, max_depth=6, min_child_weight=0.445959595959596; total time=   0.4s


  if is_sparse(data):


[CV] END gamma=0.002747747747747748, learning_rate=0.4696969696969697, max_depth=6, min_child_weight=0.445959595959596; total time=   0.4s


  if is_sparse(data):


[CV] END gamma=0.002747747747747748, learning_rate=0.4696969696969697, max_depth=6, min_child_weight=0.445959595959596; total time=   0.4s


  if is_sparse(data):


[CV] END gamma=0.002747747747747748, learning_rate=0.4696969696969697, max_depth=6, min_child_weight=0.445959595959596; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.01493993993993994, learning_rate=0.2606060606060606, max_depth=6, min_child_weight=0.4217171717171717; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.01493993993993994, learning_rate=0.2606060606060606, max_depth=6, min_child_weight=0.4217171717171717; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.01493993993993994, learning_rate=0.2606060606060606, max_depth=6, min_child_weight=0.4217171717171717; total time=   0.4s


  if is_sparse(data):


[CV] END gamma=0.01493993993993994, learning_rate=0.2606060606060606, max_depth=6, min_child_weight=0.4217171717171717; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.01493993993993994, learning_rate=0.2606060606060606, max_depth=6, min_child_weight=0.4217171717171717; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.0017717717717717718, learning_rate=0.3060606060606061, max_depth=7, min_child_weight=0.3207070707070707; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.0017717717717717718, learning_rate=0.3060606060606061, max_depth=7, min_child_weight=0.3207070707070707; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.0017717717717717718, learning_rate=0.3060606060606061, max_depth=7, min_child_weight=0.3207070707070707; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.0017717717717717718, learning_rate=0.3060606060606061, max_depth=7, min_child_weight=0.3207070707070707; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.0017717717717717718, learning_rate=0.3060606060606061, max_depth=7, min_child_weight=0.3207070707070707; total time=   0.5s


  if is_sparse(data):


[CV] END gamma=0.005255255255255255, learning_rate=0.4363636363636364, max_depth=5, min_child_weight=0.32474747474747473; total time=   0.4s


  if is_sparse(data):


[CV] END gamma=0.005255255255255255, learning_rate=0.4363636363636364, max_depth=5, min_child_weight=0.32474747474747473; total time=   0.4s


  if is_sparse(data):


[CV] END gamma=0.005255255255255255, learning_rate=0.4363636363636364, max_depth=5, min_child_weight=0.32474747474747473; total time=   0.4s


  if is_sparse(data):


[CV] END gamma=0.005255255255255255, learning_rate=0.4363636363636364, max_depth=5, min_child_weight=0.32474747474747473; total time=   0.4s


  if is_sparse(data):


[CV] END gamma=0.005255255255255255, learning_rate=0.4363636363636364, max_depth=5, min_child_weight=0.32474747474747473; total time=   0.4s


  if is_sparse(data):


[CV] END gamma=0.012042042042042041, learning_rate=0.26666666666666666, max_depth=13, min_child_weight=0.38737373737373737; total time=   0.8s


  if is_sparse(data):


[CV] END gamma=0.012042042042042041, learning_rate=0.26666666666666666, max_depth=13, min_child_weight=0.38737373737373737; total time=   0.8s


  if is_sparse(data):


[CV] END gamma=0.012042042042042041, learning_rate=0.26666666666666666, max_depth=13, min_child_weight=0.38737373737373737; total time=   0.8s


  if is_sparse(data):


[CV] END gamma=0.012042042042042041, learning_rate=0.26666666666666666, max_depth=13, min_child_weight=0.38737373737373737; total time=   0.8s


  if is_sparse(data):


[CV] END gamma=0.012042042042042041, learning_rate=0.26666666666666666, max_depth=13, min_child_weight=0.38737373737373737; total time=   0.8s


  if is_sparse(data):


[CV] END gamma=0.009429429429429429, learning_rate=0.2545454545454546, max_depth=11, min_child_weight=0.4116161616161616; total time=   0.7s


  if is_sparse(data):


[CV] END gamma=0.009429429429429429, learning_rate=0.2545454545454546, max_depth=11, min_child_weight=0.4116161616161616; total time=   0.7s


  if is_sparse(data):


[CV] END gamma=0.009429429429429429, learning_rate=0.2545454545454546, max_depth=11, min_child_weight=0.4116161616161616; total time=   0.7s


  if is_sparse(data):


[CV] END gamma=0.009429429429429429, learning_rate=0.2545454545454546, max_depth=11, min_child_weight=0.4116161616161616; total time=   0.7s


  if is_sparse(data):


[CV] END gamma=0.009429429429429429, learning_rate=0.2545454545454546, max_depth=11, min_child_weight=0.4116161616161616; total time=   0.7s


  if is_sparse(data):


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  if is_sparse(data):


[CV] END gamma=0.012687687687687688, learning_rate=0.3424242424242424, max_depth=6, min_child_weight=0.4015151515151515; total time=   1.5s


  if is_sparse(data):


[CV] END gamma=0.012687687687687688, learning_rate=0.3424242424242424, max_depth=6, min_child_weight=0.4015151515151515; total time=   1.4s


  if is_sparse(data):


[CV] END gamma=0.012687687687687688, learning_rate=0.3424242424242424, max_depth=6, min_child_weight=0.4015151515151515; total time=   1.2s


  if is_sparse(data):


[CV] END gamma=0.012687687687687688, learning_rate=0.3424242424242424, max_depth=6, min_child_weight=0.4015151515151515; total time=   1.4s


  if is_sparse(data):


[CV] END gamma=0.012687687687687688, learning_rate=0.3424242424242424, max_depth=6, min_child_weight=0.4015151515151515; total time=   1.3s


  if is_sparse(data):


[CV] END gamma=0.003228228228228228, learning_rate=0.24545454545454548, max_depth=8, min_child_weight=0.353030303030303; total time=   1.8s


  if is_sparse(data):


[CV] END gamma=0.003228228228228228, learning_rate=0.24545454545454548, max_depth=8, min_child_weight=0.353030303030303; total time=   1.8s


  if is_sparse(data):


[CV] END gamma=0.003228228228228228, learning_rate=0.24545454545454548, max_depth=8, min_child_weight=0.353030303030303; total time=   1.8s


  if is_sparse(data):


[CV] END gamma=0.003228228228228228, learning_rate=0.24545454545454548, max_depth=8, min_child_weight=0.353030303030303; total time=   1.8s


  if is_sparse(data):


[CV] END gamma=0.003228228228228228, learning_rate=0.24545454545454548, max_depth=8, min_child_weight=0.353030303030303; total time=   1.8s


  if is_sparse(data):


[CV] END gamma=0.009954954954954954, learning_rate=0.42727272727272725, max_depth=10, min_child_weight=0.35505050505050506; total time=   2.2s


  if is_sparse(data):


[CV] END gamma=0.009954954954954954, learning_rate=0.42727272727272725, max_depth=10, min_child_weight=0.35505050505050506; total time=   2.2s


  if is_sparse(data):


[CV] END gamma=0.009954954954954954, learning_rate=0.42727272727272725, max_depth=10, min_child_weight=0.35505050505050506; total time=   2.2s


  if is_sparse(data):


[CV] END gamma=0.009954954954954954, learning_rate=0.42727272727272725, max_depth=10, min_child_weight=0.35505050505050506; total time=   2.1s


  if is_sparse(data):


[CV] END gamma=0.009954954954954954, learning_rate=0.42727272727272725, max_depth=10, min_child_weight=0.35505050505050506; total time=   2.1s


  if is_sparse(data):


[CV] END gamma=0.006486486486486486, learning_rate=0.4909090909090909, max_depth=13, min_child_weight=0.30252525252525253; total time=   2.4s


  if is_sparse(data):


[CV] END gamma=0.006486486486486486, learning_rate=0.4909090909090909, max_depth=13, min_child_weight=0.30252525252525253; total time=   2.4s


  if is_sparse(data):


[CV] END gamma=0.006486486486486486, learning_rate=0.4909090909090909, max_depth=13, min_child_weight=0.30252525252525253; total time=   2.4s


  if is_sparse(data):


[CV] END gamma=0.006486486486486486, learning_rate=0.4909090909090909, max_depth=13, min_child_weight=0.30252525252525253; total time=   2.5s


  if is_sparse(data):


[CV] END gamma=0.006486486486486486, learning_rate=0.4909090909090909, max_depth=13, min_child_weight=0.30252525252525253; total time=   2.6s


  if is_sparse(data):


[CV] END gamma=0.013798798798798799, learning_rate=0.39090909090909093, max_depth=5, min_child_weight=0.3368686868686869; total time=   1.0s


  if is_sparse(data):


[CV] END gamma=0.013798798798798799, learning_rate=0.39090909090909093, max_depth=5, min_child_weight=0.3368686868686869; total time=   1.0s


  if is_sparse(data):


[CV] END gamma=0.013798798798798799, learning_rate=0.39090909090909093, max_depth=5, min_child_weight=0.3368686868686869; total time=   1.0s


  if is_sparse(data):


[CV] END gamma=0.013798798798798799, learning_rate=0.39090909090909093, max_depth=5, min_child_weight=0.3368686868686869; total time=   1.1s


  if is_sparse(data):


[CV] END gamma=0.013798798798798799, learning_rate=0.39090909090909093, max_depth=5, min_child_weight=0.3368686868686869; total time=   1.0s


  if is_sparse(data):


[CV] END gamma=0.002882882882882883, learning_rate=0.3090909090909091, max_depth=10, min_child_weight=0.38737373737373737; total time=   2.2s


  if is_sparse(data):


[CV] END gamma=0.002882882882882883, learning_rate=0.3090909090909091, max_depth=10, min_child_weight=0.38737373737373737; total time=   2.2s


  if is_sparse(data):


[CV] END gamma=0.002882882882882883, learning_rate=0.3090909090909091, max_depth=10, min_child_weight=0.38737373737373737; total time=   2.2s


  if is_sparse(data):


[CV] END gamma=0.002882882882882883, learning_rate=0.3090909090909091, max_depth=10, min_child_weight=0.38737373737373737; total time=   2.2s


  if is_sparse(data):


[CV] END gamma=0.002882882882882883, learning_rate=0.3090909090909091, max_depth=10, min_child_weight=0.38737373737373737; total time=   2.3s


  if is_sparse(data):


[CV] END gamma=0.007462462462462462, learning_rate=0.4696969696969697, max_depth=13, min_child_weight=0.2904040404040404; total time=   2.6s


  if is_sparse(data):


[CV] END gamma=0.007462462462462462, learning_rate=0.4696969696969697, max_depth=13, min_child_weight=0.2904040404040404; total time=   2.4s


  if is_sparse(data):


[CV] END gamma=0.007462462462462462, learning_rate=0.4696969696969697, max_depth=13, min_child_weight=0.2904040404040404; total time=   2.5s


  if is_sparse(data):


[CV] END gamma=0.007462462462462462, learning_rate=0.4696969696969697, max_depth=13, min_child_weight=0.2904040404040404; total time=   2.8s


  if is_sparse(data):


[CV] END gamma=0.007462462462462462, learning_rate=0.4696969696969697, max_depth=13, min_child_weight=0.2904040404040404; total time=   2.6s


  if is_sparse(data):


[CV] END gamma=0.00575075075075075, learning_rate=0.36969696969696975, max_depth=7, min_child_weight=0.447979797979798; total time=   1.9s


  if is_sparse(data):


[CV] END gamma=0.00575075075075075, learning_rate=0.36969696969696975, max_depth=7, min_child_weight=0.447979797979798; total time=   1.9s


  if is_sparse(data):


[CV] END gamma=0.00575075075075075, learning_rate=0.36969696969696975, max_depth=7, min_child_weight=0.447979797979798; total time=   1.9s


  if is_sparse(data):


[CV] END gamma=0.00575075075075075, learning_rate=0.36969696969696975, max_depth=7, min_child_weight=0.447979797979798; total time=   1.9s


  if is_sparse(data):


[CV] END gamma=0.00575075075075075, learning_rate=0.36969696969696975, max_depth=7, min_child_weight=0.447979797979798; total time=   1.9s


  if is_sparse(data):


[CV] END gamma=0.013978978978978979, learning_rate=0.4666666666666667, max_depth=7, min_child_weight=0.2722222222222222; total time=   2.0s


  if is_sparse(data):


[CV] END gamma=0.013978978978978979, learning_rate=0.4666666666666667, max_depth=7, min_child_weight=0.2722222222222222; total time=   1.8s


  if is_sparse(data):


[CV] END gamma=0.013978978978978979, learning_rate=0.4666666666666667, max_depth=7, min_child_weight=0.2722222222222222; total time=   1.8s


  if is_sparse(data):


[CV] END gamma=0.013978978978978979, learning_rate=0.4666666666666667, max_depth=7, min_child_weight=0.2722222222222222; total time=   1.7s


  if is_sparse(data):


[CV] END gamma=0.013978978978978979, learning_rate=0.4666666666666667, max_depth=7, min_child_weight=0.2722222222222222; total time=   1.9s


  if is_sparse(data):


[CV] END gamma=0.010975975975975976, learning_rate=0.5, max_depth=10, min_child_weight=0.3065656565656566; total time=   2.4s


  if is_sparse(data):


[CV] END gamma=0.010975975975975976, learning_rate=0.5, max_depth=10, min_child_weight=0.3065656565656566; total time=   2.4s


  if is_sparse(data):


[CV] END gamma=0.010975975975975976, learning_rate=0.5, max_depth=10, min_child_weight=0.3065656565656566; total time=   2.3s


  if is_sparse(data):


[CV] END gamma=0.010975975975975976, learning_rate=0.5, max_depth=10, min_child_weight=0.3065656565656566; total time=   2.4s


  if is_sparse(data):


[CV] END gamma=0.010975975975975976, learning_rate=0.5, max_depth=10, min_child_weight=0.3065656565656566; total time=   2.5s


  if is_sparse(data):


Best XGBClassifier TF-IDF
0.5104798476101745
{'min_child_weight': 0.44393939393939397, 'max_depth': 11, 'learning_rate': 0.3878787878787879, 'gamma': 0.0015015015015015015}
Best XGBClassifier TF-IDF LSA
0.5700782456085994
{'min_child_weight': 0.2904040404040404, 'max_depth': 13, 'learning_rate': 0.4696969696969697, 'gamma': 0.007462462462462462}


In [168]:
scores_tfidf = model_selection.cross_validate(xgb_clf, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])
scores_tfidf_lsa = model_selection.cross_validate(xgb_clf, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

print("XGBClassifier TF-IDF F1:              ", np.mean(scores_tfidf['test_f1']))
print('XGBClassifier TF-IDF & LSA F1:        ', np.mean(scores_tfidf_lsa['test_f1']))
print("XGBClassifier TF-IDF Precision:       ", np.mean(scores_tfidf['test_precision']))
print('XGBClassifier TF-IDF & LSA Precision: ', np.mean(scores_tfidf_lsa['test_precision']))
print('XGBClassifier TF-IDF Recall:          ', np.mean(scores_tfidf['test_recall']))
print('XGBClassifier TF-IDF & LSA Recall:    ', np.mean(scores_tfidf_lsa['test_recall']))

  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


XGBClassifier TF-IDF F1:               0.4629093293776617
XGBClassifier TF-IDF & LSA F1:         0.5625011165953928
XGBClassifier TF-IDF Precision:        0.6224881825507008
XGBClassifier TF-IDF & LSA Precision:  0.5502636467615366
XGBClassifier TF-IDF Recall:           0.37664915843779906
XGBClassifier TF-IDF & LSA Recall:     0.5830240212900063


In [169]:
scores_tfidf = model_selection.cross_validate(xgb_rscv.best_estimator_, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])
scores_tfidf_lsa = model_selection.cross_validate(xgb_rscv_lsa.best_estimator_, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

print("Best XGBClassifier TF-IDF F1:              ", np.mean(scores_tfidf['test_f1']))
print('Best XGBClassifier TF-IDF & LSA F1:        ', np.mean(scores_tfidf_lsa['test_f1']))
print("Best XGBClassifier TF-IDF Precision:       ", np.mean(scores_tfidf['test_precision']))
print('Best XGBClassifier TF-IDF & LSA Precision: ', np.mean(scores_tfidf_lsa['test_precision']))
print('Best XGBClassifier TF-IDF Recall:          ', np.mean(scores_tfidf['test_recall']))
print('Best XGBClassifier TF-IDF & LSA Recall:    ', np.mean(scores_tfidf_lsa['test_recall']))

  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


Best XGBClassifier TF-IDF F1:               0.5104798476101745
Best XGBClassifier TF-IDF & LSA F1:         0.5700782456085994
Best XGBClassifier TF-IDF Precision:        0.6163552611731725
Best XGBClassifier TF-IDF & LSA Precision:  0.548976024995403
Best XGBClassifier TF-IDF Recall:           0.44818731470457784
Best XGBClassifier TF-IDF & LSA Recall:     0.5992170320050423


# **MAIN CONTENT: BERT**

**Note that this is not the best version, fine tunings and validation may help obtain a better score**

In [170]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [171]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Adding additional tokens for masking URLs and usernames in tweets

In [172]:
bert_tokenizer.add_special_tokens({'additional_special_tokens': ['[LINK]', '[USER]']})
bert_tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[LINK]', '[USER]']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	30522: AddedToken("[LINK]", rstrip=True, lstrip=True, single_word=False, no

As BERT is able to read complete passages and learn from the context, too much text preprocessing may not be beneficial.

Some minor preprocessing with URLs, @usernames, and #hashtag, as they may be tokenized weirdly and the token make no sense

*Note: The BERT model still did pretty good without the above processing*

Now tokenize the data

In [200]:
def bert_tokenize(df, tokenizer=bert_tokenizer, max_seq_len=100):
    input_sequences = []
    # The attention mask is an optional argument used when batching sequences together.
    # The attention mask is a binary tensor indicating the position of the padded indices so that the model does not attend to them.
    attention_masks = []
    
    # some very minor text processing, try to keep the text as close as original
    bert_text = df['text'].str.replace("\n", " ").str.replace("#", "").str.split(" ")
    bert_text = bert_text.apply(lambda l: [word if "http" not in word else "[LINK]" for word in l])
    bert_text = bert_text.apply(lambda l: [word if "@" not in word else "[USER]" for word in l])
    bert_text = bert_text.apply(lambda l: " ".join(l))

    for text in bert_text.to_list():
        sequence_dict = tokenizer.encode_plus(text, padding="max_length", max_length=max_seq_len)
        input_ids = sequence_dict['input_ids']
        att_mask = sequence_dict['attention_mask']
        input_sequences.append(input_ids)
        attention_masks.append(att_mask)
    return input_sequences, attention_masks, df

train_X, train_att, train_df = bert_tokenize(train_df)
train_y = train_df['target'].values
test_X, test_att, test_df = bert_tokenize(test_df)

In [201]:
# Checking the tokenized format
print(train_X[0])
print(train_att[0])
print(test_X[0])
print(test_att[0])

[101, 2256, 15616, 2024, 1996, 3114, 1997, 2023, 8372, 2089, 16455, 9641, 2149, 2035, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 2074, 3047, 1037, 6659, 2482, 5823, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0,

Forming dataset

In [202]:
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device = torch.device("cpu")
print(device)

cpu


In [203]:
train_X = torch.tensor(train_X, device=device)
train_y = torch.tensor(train_y, device=device)
train_att = torch.tensor(train_att, device=device)
test_X = torch.tensor(test_X, device=device)
test_att = torch.tensor(test_att, device=device)

In [204]:
batch_size = 32
train_data = torch.utils.data.TensorDataset(train_X, train_att, train_y)
train_sampler = torch.utils.data.RandomSampler(train_data)
train_dataloader = torch.utils.data.DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = torch.utils.data.TensorDataset(test_X, test_att)
test_sampler = torch.utils.data.SequentialSampler(test_data)
test_dataloader = torch.utils.data.DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

**Pretrained model from bert-base-uncased**

resize_token_embeddings is required as we have added new special tokens

In [205]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.resize_token_embeddings(len(bert_tokenizer))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(30524, 768)

In [206]:
model.to(device)
IPython.display.clear_output()

In [207]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
loss_fct = torch.nn.NLLLoss()



Define train and test functions

In [208]:
def train(epoch):
    t0 = datetime.now()
    model.train()
    for i, batch in enumerate(train_dataloader, start=1):
        # batch = tuple(t.to(device) for t in batch)
        inputs, att_masks, labels = batch
        model.zero_grad()  
        
        logits = model(inputs, attention_mask=att_masks)
        outputs = F.log_softmax(logits[0], dim=1)
        
        loss = loss_fct(outputs.view(-1, 2), labels.view(-1))
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        
        if i % 20 == 0:
            print('Train Epoch: {} [{}/{} ({:.0%})] - Elapsed: {}  |  Loss: {:.4f}'.format(
                epoch, i * len(inputs), len(train_dataloader.dataset),
                 i / len(train_dataloader), datetime.now() - t0, loss.item()
            ))

In [209]:
num_epoch = 1
for epoch in range(num_epoch):
    train(epoch)



**Generating predictions for test data**

In [210]:
def predict(text):
    # pre-process text
    input_ = torch.tensor(bert_tokenizer.encode(text)).unsqueeze(0).to(device)
    logits = model.eval()(input_ids=input_)[0]
    pred = F.softmax(logits, dim=1)[0]
    return pred

In [211]:
predictions = []
for text in test_df.text:
    prob = predict(text)
    pred = np.argmax(prob.cpu().detach().numpy())
    predictions.append(pred)

# Submission

In [212]:
sample_submission = pd.read_csv("sample_submission.csv")

In [213]:
# train_prediction = ridge_rscv.best_estimator_.predict(train_tfidf)
# train_df['pred_target'] = train_prediction

# ridge with rscv
# sample_submission["target"] = ridge_rscv.best_estimator_.predict(test_tfidf)

# bert
sample_submission["target"] = predictions

In [214]:
# clean_text_wc = train_df.clean_text.str.count(' ').add(1)
# short_text_incorrect = train_df.loc[(clean_text_wc < 5) & (train_df.target != train_df.pred_target), :]
# (short_text_incorrect.target == 1).sum(), (short_text_incorrect.target == 0).sum()

In [215]:
# display(sample_submission.head(30))
# display(test_df['text'].head(30))
pd.merge(sample_submission, test_df, on=['id']).sample(frac=1).head(10)

Unnamed: 0,id,target,keyword,location,text,clean_text
866,2840,0,cyclone,"Manchester, England",raleigh cyclone 15 gear mountain bike 26'': ht...,raleigh cyclone gear mountain bike sportinggoo...
2734,9108,1,suicide%20bomb,,reaad/ plsss Pic of 16yr old PKK suicide bombe...,reaad plsss pic yr old pkk suicide bomber deto...
1886,6357,1,hostages,China,#hot C-130 specially modified to land in a st...,hot c specially modified land stadium rescue h...
1195,3930,1,devastated,Dhaka,Obama Declares Disaster for Typhoon-Devastated...,obama declares disaster typhoondevastated saip...
2910,9633,1,thunderstorm,"Florence, South Carolina",GSP issues STRONG THUNDERSTORM WILL IMPACT POR...,gsp issue strong thunderstorm impact portion n...
1609,5429,1,first%20responders,,What I'll miss the most is that very rare occa...,miss rare occasion first responder scene diffe...
2007,6755,1,lightning,NWA & River Valley,It doesn't get any closer. Heavy rain just bar...,get closer heavy rain barely missed festival l...
3242,10792,0,wrecked,"Plymouth, England",Almost *wrecked* my van the other day because ...,almost wrecked van day guy yeah brake also car...
2782,9243,1,sunk,"Sussex, UK",According to the tabloids Cilla could've been ...,according tabloid cilla could saved sense tita...
422,1366,0,blown%20up,,We are now up to run no. 24 in the singles. Th...,run single rain blown blown%20up


In [216]:
sample_submission.to_csv("submission.csv", index=False)