# Introduction

**This notebook linear model part is based on the tutorial notebook**

https://www.kaggle.com/philculliton/nlp-getting-started-tutorial

**The sections for RidgeClassifier and XGBClassifier do not contribute to the final score but they are alternative models for this problem**

**The BERT section contributes to the final score**

In [1]:
import os
import numpy as np
import pandas as pd
import string
import re
from collections import Counter
from sklearn import feature_extraction, linear_model, model_selection, preprocessing, decomposition
import xgboost as xgb 

import IPython
import contractions
from datetime import datetime

In [2]:
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')

In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print(len(train_df), len(test_df))

7613 3263


# Text preprocessing for linear models

In [4]:
twt = nltk.tokenize.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
stop = nltk.corpus.stopwords.words("english") + list(string.punctuation)
stemmer = nltk.stem.PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
# print(stop)

def clean_text(df, col='text', normalize='lemmatize', stopwords=True, add_keyword=False, fill_empty='NULL', drop_dupe=False, shuffle=False):
    cleaned_text, pos, neg = [], [], []
    text_df = df[col]
    if drop_dupe:
        df = df.drop_duplicates(col)
    
    try: 
        targets = df.target
    except:
        targets = -np.ones(len(df))
        
    if add_keyword:
        df.keyword = df.keyword.str.replace("%20", " ").fillna("")
        text_df = df.text + " " + df.keyword
    
    for (target, text) in zip(targets, text_df):
#         print(text)
        text = text.lower().split(" ")
        text = [word for word in text if "http" not in word]
        text = [contractions.fix(word) for word in text]
        text = " ".join(text).lower()
        text = re.sub(r'\d+|#', '', text)
        text = twt.tokenize(text)
        if stopwords:
            text = [word for word in text if word not in stop]
        text = [word for word in text if word not in ["rt", "û_", "amp", "ûª", "ûªs", "ûò", "ûï", "ûó", "åè", "ìñ1", "\x89", "...", "..", "via"]]
        if normalize == 'lemmatize':
            text = [lemmatizer.lemmatize(word) for word in text]
        if normalize == 'stem':
            text = [stemmer.stem(word) for word in text]
            
        if target == 1: 
            pos.append(text)
        if target == 0: 
            neg.append(text)
        text = " ".join(text)
        cleaned_text.append(text)
#         print(text)
        
    df["clean_text"] = cleaned_text
    df.clean_text = df.clean_text.replace("%20", " ")
    if fill_empty != False:
        df.loc[df.clean_text.str.len() == 0, 'clean_text'] = fill_empty
    if shuffle:
        df = df.sample(frac=1)
    
    return pos, neg, df
        
pos_text, neg_text, train_df = clean_text(train_df, add_keyword=False, drop_dupe=True, shuffle=True)
_, _, test_df = clean_text(test_df, add_keyword=False)
pos_text = [item for sublist in pos_text for item in sublist]
neg_text = [item for sublist in neg_text for item in sublist]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["clean_text"] = cleaned_text
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.clean_text = df.clean_text.replace("%20", " ")


In [5]:
pos_common = pd.DataFrame(Counter(pos_text).most_common(20))
neg_common = pd.DataFrame(Counter(neg_text).most_common(20))
pd.concat([pos_common, neg_common], axis=1)

Unnamed: 0,0,1,0.1,1.1
0,fire,193,like,203
1,like,145,get,171
2,people,114,fire,159
3,news,99,new,129
4,new,97,body,118
5,building,87,one,115
6,one,86,would,114
7,get,80,time,109
8,disaster,78,video,101
9,flood,76,people,99


In [6]:
display(train_df.loc[(train_df.clean_text == "NULL"), :])
display(test_df.loc[(test_df.clean_text == "NULL"), :])

Unnamed: 0,id,keyword,location,text,target,clean_text
4573,6501,injuries,Toronto,Peel police say male cyclist struck near South...,1,
6866,9838,trauma,"Nashville, TN",Esteemed journalist recalls tragic effects of ...,1,
6694,9591,thunder,,My brother is crying cause the thunder lmao,0,
6702,9600,thunder,"Macon, GA",#thunder outside my house this afternoon #gawx...,1,
6718,9620,thunderstorm,"El Dorado, Arkansas",NWS has Continued a Severe Thunderstorm Warnin...,1,
6697,9594,thunder,,thunder is legit,0,
6720,9627,thunderstorm,"Asheville, NC",iNWS Alert SPSGSP from 8/5/2015 10:40 PM to 11...,1,
6723,9634,thunderstorm,73101,Severe Weather Statement issued August 05 at 9...,1,
6726,9639,thunderstorm,Oklahoma City,Severe Thunderstorm Warning for Oklahoma Count...,1,
3977,5653,flooding,"Ocean City, NJ",Residents in the central part of Ocean City he...,1,


Unnamed: 0,id,keyword,location,text,clean_text
13,43,,,What if?!,
2853,9461,terrorism,,Truth...\nhttps://t.co/GLzggDjQeH\n#News\n#BBC...,
2860,9478,terrorism,,Truth...\nhttps://t.co/Kix1j4ZyGx\n#News\n#BBC...,
2863,9487,terrorism,,Truth...\nhttps://t.co/n1K5nlib9X\n#News\n#BBC...,


In [7]:
display(train_df.sample(frac=1).head(10))
display(test_df.sample(frac=1).head(10))

Unnamed: 0,id,keyword,location,text,target,clean_text
3921,5577,flood,New York,Spot Flood Combo 53inch 300W Curved Cree LED W...,0,selmo catching flame going witnessing slaughter
4922,7010,mayhem,,Mayhem is beautiful,0,debatequestionswewanttohear saudi arabia israe...
7596,10851,,,RT @LivingSafely: #NWS issues Severe #Thunders...,1,wreck road blockage woodward avenue northbound...
1194,1720,bridge%20collapse,Mumbai,@ameenshaikh3 sir i just only wanted to make a...,1,ash australia collapse trent bridge among wors...
514,740,attacked,"SÌ£o Paulo SP, Brasil",Christian Attacked by Muslims at the Temple Mo...,1,christian attacked muslim temple mount waving ...
2499,3589,desolate,Macclesfield,@booksbyRoger TY for the follow Go To http://t...,0,green line derailment concern track look like ...
6147,8768,siren,,I added a video to a @YouTube playlist http://...,0,gaping sinkhole open brooklyn new york
1578,2279,cliff%20fall,"Abuja, Nigeria",When you're in deep sleep and then you dream y...,0,go ibiza pop ah pill get drunk fall cliff real...
1273,1835,burned,Alabama,Alton brown just did a livestream and he burne...,0,fire burning pendleton burned acre smoke repor...
419,608,arsonist,ss,@Casper_rmg u on dick,0,minor citation possesion decriminalized substa...


Unnamed: 0,id,keyword,location,text,clean_text
2370,7922,rainstorm,,@Robot_Rainstorm I'm interested. Is it throug...,interested yahoo
1547,5167,fatalities,"Greenville, SC",Highway Patrol reports uptick in statewide ped...,highway patrol report uptick statewide pedestr...
2163,7243,natural%20disaster,,TRAIN ACCIDENT IN HARDA (M.P.) IS NOT NATURAL ...,train accident harda p natural disaster show f...
1254,4125,drought,,Thought it was a drought!,thought drought
2878,9530,terrorist,Bharat.,I always wonder why and how media is capable t...,always wonder medium capable meet get detail t...
2661,8889,smoke,ig: j.nessaa,smoke me out,smoke
913,3002,dead,"Colwyn Bay, Wales",WHAT AN INCREDIBLE CHARACTER MY HEART IS BROKE...,incredible character heart broken actually dea...
635,2072,casualty,,ÛÏ@MacCocktail: 'The first casualty of war is...,first casualty war truth hiram johnson died da...
3124,10356,weapons,,Aug 3 1915ÛÓKILL 10000 WITH ROCKS.; Italians ...,aug ûókill rock italian make good use nature's...
27,90,ablaze,"121 N La Salle St, Suite 500",'Burning Rahm': Let's hope City Hall builds a ...,burning rahm let u hope city hall build giant ...


# Count and Vectorize approach (1-gram)

In [8]:
feature_col = "clean_text"

count_vectorizer = feature_extraction.text.CountVectorizer()
count_vectorizer_sw = feature_extraction.text.CountVectorizer()
tfidf = feature_extraction.text.TfidfVectorizer()
LSA = decomposition.TruncatedSVD(n_components=100)

## let's get counts for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:50])
example_train_vectors_sw = count_vectorizer_sw.fit_transform(train_df[feature_col][0:50])
example_tfidf = tfidf.fit_transform(train_df[feature_col][0:50])
example_tfidf_lsa = LSA.fit_transform(example_tfidf)

In [9]:
## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print('No cleaning')
print(example_train_vectors[0].todense().shape)
# print(example_train_vectors[0].todense())
print('Cleaned')
print(example_train_vectors_sw[0].todense().shape)
# print(example_train_vectors_sw[0].todense())
print('TF-IDF cleaned')
print(example_tfidf[0].todense().shape)
# print(example_tfidf[0].todense())
print('TF-IDF + LSA cleaned')
print(example_tfidf_lsa[0].shape)
# print(example_tfidf_lsa[0])

No cleaning
(1, 505)
Cleaned
(1, 350)
TF-IDF cleaned
(1, 350)
TF-IDF + LSA cleaned
(50,)


In [10]:
train_vectors = count_vectorizer.fit_transform(train_df[feature_col])
train_vectors_sw = count_vectorizer_sw.fit_transform(train_df[feature_col])
train_tfidf = tfidf.fit_transform(train_df[feature_col])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df[feature_col])
test_vectors_sw = count_vectorizer_sw.transform(test_df[feature_col])
test_tfidf = tfidf.transform(test_df[feature_col])

In [11]:
train_tfidf_lsa = LSA.fit_transform(train_tfidf)
test_tfidf_lsa = LSA.transform(test_tfidf)

# **Linear Model: Ridge Classifier**

In [12]:
# clf = linear_model.RidgeClassifier(class_weight='balanced')
# ridge_params = {
#     "alpha": np.linspace(0, 2, 100),
#     "tol": np.linspace(1e-5, 1e-1, 2000)
# }
# ridge_rscv = model_selection.RandomizedSearchCV(clf, ridge_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, n_iter=100)
# ridge_rscv_lsa = model_selection.RandomizedSearchCV(clf, ridge_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, n_iter=100)

In [13]:
# search = ridge_rscv.fit(train_tfidf, train_df["target"])
# search_lsa = ridge_rscv_lsa.fit(train_tfidf_lsa, train_df["target"])

# print("Best RidgeClassifier TF-IDF")
# print(search.best_score_)
# print(search.best_params_)
# print("Best RidgeClassifier TF-IDF LSA")
# print(search_lsa.best_score_)
# print(search_lsa.best_params_)

In [14]:
# scores_tfidf = model_selection.cross_validate(clf, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# scores_tfidf_lsa = model_selection.cross_validate(clf, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# print("RidgeClassifier TF-IDF F1:              ", scores_tfidf['test_f1'])
# print('RidgeClassifier TF-IDF & LSA F1:        ', scores_tfidf_lsa['test_f1'])
# print("RidgeClassifier TF-IDF Precision:       ", scores_tfidf['test_precision'])
# print('RidgeClassifier TF-IDF & LSA Precision: ', scores_tfidf_lsa['test_precision'])
# print('RidgeClassifier TF-IDF Recall:          ',  scores_tfidf['test_recall'])
# print('RidgeClassifier TF-IDF & LSA Recall:    ', scores_tfidf_lsa['test_recall'])

In [15]:
# scores_tfidf = model_selection.cross_validate(ridge_rscv.best_estimator_, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# scores_tfidf_lsa = model_selection.cross_validate(ridge_rscv_lsa.best_estimator_, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# print("Best RidgeClassifier TF-IDF F1:              ", scores_tfidf['test_f1'])
# print('Best RidgeClassifier TF-IDF & LSA F1:        ', scores_tfidf_lsa['test_f1'])
# print("Best RidgeClassifier TF-IDF Precision:       ", scores_tfidf['test_precision'])
# print('Best RidgeClassifier TF-IDF & LSA Precision: ', scores_tfidf_lsa['test_precision'])
# print('Best RidgeClassifier TF-IDF Recall:          ',  scores_tfidf['test_recall'])
# print('Best RidgeClassifier TF-IDF & LSA Recall:    ', scores_tfidf_lsa['test_recall'])

# **GBDT: XGB Classifier**
Turns out not as good as ridge

In [16]:
# xgb_clf = xgb.XGBClassifier(random_state=765, tree_method='gpu_hist', predictor='gpu_predictor')
# xgb_params = {
#     "max_depth": [i for i in range(4, 14)],
#     "min_child_weight": np.linspace(0.25, 0.45, 100),
#     "gamma": np.linspace(0, 0.015, 1000),
#     "learning_rate": np.linspace(0.2, 0.5, 100),
# }
# xgb_rscv = model_selection.RandomizedSearchCV(xgb_clf, xgb_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, verbose=2)
# xgb_rscv_lsa = model_selection.RandomizedSearchCV(xgb_clf, xgb_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, verbose=2)

In [17]:
# search = xgb_rscv.fit(train_tfidf, train_df["target"])
# search_lsa = xgb_rscv_lsa.fit(train_tfidf_lsa, train_df["target"])
# IPython.display.clear_output()
# print("Best XGBClassifier TF-IDF")
# print(search.best_score_)
# print(search.best_params_)
# print("Best XGBClassifier TF-IDF LSA")
# print(search_lsa.best_score_)
# print(search_lsa.best_params_)

In [18]:
# scores_tfidf = model_selection.cross_validate(xgb_clf, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])
# scores_tfidf_lsa = model_selection.cross_validate(xgb_clf, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# print("XGBClassifier TF-IDF F1:              ", scores_tfidf['test_f1'])
# print('XGBClassifier TF-IDF & LSA F1:        ', scores_tfidf_lsa['test_f1'])
# print("XGBClassifier TF-IDF Precision:       ", scores_tfidf['test_precision'])
# print('XGBClassifier TF-IDF & LSA Precision: ', scores_tfidf_lsa['test_precision'])
# print('XGBClassifier TF-IDF Recall:          ',  scores_tfidf['test_recall'])
# print('XGBClassifier TF-IDF & LSA Recall:    ', scores_tfidf_lsa['test_recall'])

In [19]:
# scores_tfidf = model_selection.cross_validate(xgb_rscv.best_estimator_, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])
# scores_tfidf_lsa = model_selection.cross_validate(xgb_rscv_lsa.best_estimator_, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# print("Best XGBClassifier TF-IDF F1:              ", scores_tfidf['test_f1'])
# print('Best XGBClassifier TF-IDF & LSA F1:        ', scores_tfidf_lsa['test_f1'])
# print("Best XGBClassifier TF-IDF Precision:       ", scores_tfidf['test_precision'])
# print('Best XGBClassifier TF-IDF & LSA Precision: ', scores_tfidf_lsa['test_precision'])
# print('Best XGBClassifier TF-IDF Recall:          ',  scores_tfidf['test_recall'])
# print('Best XGBClassifier TF-IDF & LSA Recall:    ', scores_tfidf_lsa['test_recall'])

# **MAIN CONTENT: BERT**

**Note that this is not the best version, fine tunings and validation may help obtain a better score**

In [20]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [21]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Adding additional tokens for masking URLs and usernames in tweets

In [22]:
bert_tokenizer.add_special_tokens({'additional_special_tokens': ['[LINK]', '[USER]']})
bert_tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[LINK]', '[USER]']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	30522: AddedToken("[LINK]", rstrip=True, lstrip=True, single_word=False, no

As BERT is able to read complete passages and learn from the context, too much text preprocessing may not be beneficial.

Some minor preprocessing with URLs, @usernames, and #hashtag, as they may be tokenized weirdly and the token make no sense

*Note: The BERT model still did pretty good without the above processing*

Now tokenize the data

In [23]:
def bert_tokenize(df, tokenizer=bert_tokenizer, max_seq_len = 100):
    input_sequences = []
    # The attention mask is an optional argument used when batching sequences together.
    # The attention mask is a binary tensor indicating the position of the padded indices so that the model does not attend to them.
    attention_masks = []
    bert_text = []
    
    # some very minor text processing, try to keep the text as close as original
    for i, text in enumerate(df['text']):
#         print(i, text)
        text = text.replace("\n", " ").split(" ")
        text = [word if "http" not in word else "[LINK]" for word in text]
        text = [word if "@" not in word else "[USER]" for word in text]
        text = " ".join(text)
        text = re.sub(r'#', '', text)
        bert_text.append(text)
        
#         print(i, text)
        sequence_dict = tokenizer.encode_plus(text, max_length=max_seq_len, pad_to_max_length=True)
        input_ids = sequence_dict['input_ids']
        att_mask = sequence_dict['attention_mask']
#         print(i, tokenizer.tokenize(text))
        input_sequences.append(input_ids)
        attention_masks.append(att_mask)
    
    df['bert_text'] = bert_text
    return input_sequences, attention_masks, df

train_X, train_att, train_df = bert_tokenize(train_df)
train_y = train_df['target'].values
test_X, test_att, test_df = bert_tokenize(test_df)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [24]:
# Checking the tokenized format
print(train_X[0])
print(train_att[0])
print(test_X[0])
print(test_att[0])

[101, 11503, 11733, 1005, 1055, 2725, 1037, 3582, 11867, 9910, 3892, 10047, 7491, 1997, 8404, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 2074, 3047, 1037, 6659, 2482, 5823, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 

Forming dataset

In [25]:
# device = torch.device("mps" if torch.backends.mps.is_available()  else "cpu")
device = torch.device("cpu")
print(device)

cpu


In [26]:
train_X = torch.tensor(train_X, device=device)
train_y = torch.tensor(train_y, device=device)
train_att = torch.tensor(train_att, device=device)
test_X = torch.tensor(test_X, device=device)
test_att = torch.tensor(test_att, device=device)

In [27]:
batch_size = 32
train_data = torch.utils.data.TensorDataset(train_X, train_att, train_y)
train_sampler = torch.utils.data.RandomSampler(train_data)
train_dataloader = torch.utils.data.DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = torch.utils.data.TensorDataset(test_X, test_att)
test_sampler = torch.utils.data.SequentialSampler(test_data)
test_dataloader = torch.utils.data.DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

**Pretrained model from bert-base-uncased**

resize_token_embeddings is required as we have added new special tokens

In [28]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.resize_token_embeddings(len(bert_tokenizer))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(30524, 768)

In [29]:
model.to(device)
IPython.display.clear_output()

In [30]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
loss_fct = torch.nn.NLLLoss()



Define train and test functions

In [31]:
def train(epoch):
    t0 = datetime.now()
    model.train()
    for i, batch in enumerate(train_dataloader, start=1):
        # batch = tuple(t.to(device) for t in batch)
        inputs, att_masks, labels = batch
        model.zero_grad()  
        
        logits = model(inputs, attention_mask=att_masks)
        outputs = F.log_softmax(logits[0], dim=1)
        
        loss = loss_fct(outputs.view(-1, 2), labels.view(-1))
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        
        if i % 20 == 0:
            print('Train Epoch: {} [{}/{} ({:.0%})] - Elapsed: {}  |  Loss: {:.4f}'.format(
                epoch, i * len(inputs), len(train_dataloader.dataset),
                 i / len(train_dataloader), datetime.now() - t0, loss.item()
            ))

In [32]:
# def test():
#     t0 = datetime.now()
#     model.eval()
#     test_loss, test_acc = 0, 0
#     for batch in test_dataloader:
#         batch = tuple(t.to(device) for t in batch)
#         inputs, att_masks, labels = batch
#         with torch.no_grad():
#             logits = model(inputs, attention_mask=att_masks)
#             outputs = F.log_softmax(logits[0], dim=1)
            
#             loss = loss_fct(outputs.view(-1, 2), labels.view(-1))

#         test_loss += loss.item()
#         outputs = outputs.detach().cpu().numpy()

#         pred = np.argmax(outputs, axis=1)
#         labels = labels.cpu().numpy()
        
#         test_acc += accuracy_score(pred, labels)

#     test_loss /= len(test_dataloader)
#     test_acc /= len(test_dataloader)
#     print('\nTest set: Loss: {:.4f}, Accuracy: {:.1%} - Elapsed: {}\n'.format(
#         test_loss, test_acc, datetime.now() - t0
#     ))

In [33]:
num_epoch = 1
for epoch in range(num_epoch):
    train(epoch)
#     test()



**Generating predictions for test data**

In [34]:
def predict(text):
    # pre-process text
    input_ = torch.tensor(bert_tokenizer.encode(text)).unsqueeze(0).to(device)
    logits = model.eval()(input_ids=input_)[0]
    pred = F.softmax(logits, dim=1)[0]
    return pred

In [35]:
predictions = []
for text in test_df.text:
    prob = predict(text)
    pred = np.argmax(prob.cpu().detach().numpy())
    predictions.append(pred)

# Submission

In [37]:
sample_submission = pd.read_csv("sample_submission.csv")

In [38]:
# train_prediction = ridge_rscv.best_estimator_.predict(train_tfidf)
# train_df['pred_target'] = train_prediction

# ridge with rscv
# sample_submission["target"] = ridge_rscv.best_estimator_.predict(test_tfidf)

# bert
sample_submission["target"] = predictions

In [39]:
# clean_text_wc = train_df.clean_text.str.count(' ').add(1)
# short_text_incorrect = train_df.loc[(clean_text_wc < 5) & (train_df.target != train_df.pred_target), :]
# (short_text_incorrect.target == 1).sum(), (short_text_incorrect.target == 0).sum()

In [40]:
# display(sample_submission.head(30))
# display(test_df['text'].head(30))
pd.merge(sample_submission, test_df, on=['id']).sample(frac=1).head(10)

Unnamed: 0,id,target,keyword,location,text,clean_text,bert_text
528,1732,1,buildings%20burning,,@CTVKathyLe and in other news: don't run into ...,news run burning building,[USER] and in other news: don't run into burni...
1324,4369,1,earthquake,"Sydney, New South Wales",#Children traumatised after the Nepal earthqua...,child traumatised nepal earthquake educated co...,Children traumatised after the Nepal earthquak...
805,2646,0,crashed,USA,Website Malfunctioning? PHP Scripts not workin...,website malfunctioning php script working data...,Website Malfunctioning? PHP Scripts not workin...
3022,9975,1,tsunami,,When tsunami says your order will take 40 minu...,tsunami say order take minute placed order way,When tsunami says your order will take 40 minu...
3219,10699,0,wreck,"Jackson, MS",Now that IÛªve figured out how to get my musi...,ûªve figured get music rental car take night d...,Now that IÛªve figured out how to get my musi...
2088,7011,0,mayhem,"Hollywood, Ca",Wed Aug 8 ! #Mayhem @ Avalon ! 19+ Event ! J...,wed aug mayhem avalon event july aug bdays fre...,Wed Aug 8 ! Mayhem [USER] Avalon ! 19+ Event...
641,2092,0,casualty,,charlie from casualty at the ashes https://t.c...,charlie casualty ash,charlie from casualty at the ashes [LINK]
205,668,0,attack,Worldwide,Cooper the Super Pooper. The hero dog who save...,cooper super pooper hero dog saved drowning de...,Cooper the Super Pooper. The hero dog who save...
2968,9821,1,trauma,"Nashville, TN",Esteemed journalist recalls tragic effects of ...,esteemed journalist recall tragic effect unadd...,Esteemed journalist recalls tragic effects of ...
623,2030,1,casualties,Worldwide,Warfighting Robots Could Reduce Civilian Casua...,warfighting robot could reduce civilian casual...,Warfighting Robots Could Reduce Civilian Casua...


In [41]:
sample_submission.to_csv("submission.csv", index=False)