# Summary

The goal of this project is to build a supervised deep learning model for sequential data that predicts whether a tweet describes a real disaster. Such a model is useful for organizations that actively monitor tweets to detect disasters in near real time.

In [2]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import random
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
import torchmetrics

# Import tqdm for progress bar
from tqdm.notebook import tqdm

In [3]:
def set_seed(seed=42):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random`` module, NumPy and PyTorch (including the CUDA
    generator), puts cuDNN into deterministic mode and exports
    ``PYTHONHASHSEED``. The aim is REPRODUCIBILITY: the same results on
    every run.
    """
    # Export a fixed hash seed (stored as a string in the environment).
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Seed each library's generator with the same value.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # The cuDNN backend needs two extra switches: turn off the autotuner
    # and request deterministic algorithms.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed()

In [4]:
# Number of cross-validation folds (passed to KFold as n_splits).
KFOLD = 5

# Path of the pretrained GloVe vectors file read when building the embedding index.
GLOVE_FILE = '/kaggle/input/glove6b/glove.6B.200d.txt'


# Load data

In [5]:
# Load the cleaned train/test DataFrames produced by the EDA/cleaning notebook.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load these files if their source is trusted.
# `with` guarantees the file handles are closed once the data is loaded.
with open("/kaggle/input/nlp-disastertweets-eda-cleaning/train_cleaned_df.pkl", "rb") as train_file:
    train_df = pickle.load(train_file)
with open("/kaggle/input/nlp-disastertweets-eda-cleaning/test_cleaned_df.pkl", "rb") as test_file:
    test_df = pickle.load(test_file)

# Tokenize the sentences

Here I simply split the text into words, since the cleaning has already been done.

In [6]:
def tokenize(text):
    """Lower-case ``text`` and return it as a flat list of tokens.

    The text is first split on whitespace. Each chunk is then split so that
    runs of the letters a-z are separated from everything else,
    e.g. "100year" -> ["100", "year"]. Empty pieces are dropped.
    """
    letter_runs = re.compile(r'([a-z]+)')
    tokens = []
    for chunk in text.lower().split():
        # The capturing group keeps the letter runs in the split result.
        tokens.extend(piece for piece in letter_runs.split(chunk) if piece)
    return tokens

# Tokenize the text column of both frames and store the token lists in a new column.
train_df['text_token'] = train_df['text'].apply(tokenize)
test_df['text_token'] = test_df['text'].apply(tokenize)
train_df.head()

Unnamed: 0,id,keyword,location,text,target,text_token
0,1,,,Our Deeds are the Reason of this earthquake Ma...,1,"[our, deeds, are, the, reason, of, this, earth..."
1,4,,,Forest fire near La Ronge Sask Canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to shelter in place are be...,1,"[all, residents, asked, to, shelter, in, place..."
3,6,,,13000 people receive wildfires evacuation orde...,1,"[13000, people, receive, wildfires, evacuation..."
4,7,,,Just got sent this photo from Ruby Alaska as s...,1,"[just, got, sent, this, photo, from, ruby, ala..."


# Word embedding
Before we can train a model, we need to embed the words into dense vectors that serve as the input of the recurrent neural network.
Here we use GloVe embeddings with the pretrained data set from https://www.kaggle.com/datasets/yesornope/glove6b

In [7]:
%%time

# Build the word -> vector lookup table from GLOVE_FILE.
print("Loading ", GLOVE_FILE)

embedding_index = {}
if not GLOVE_FILE.endswith("pkl"):
    # Text format: every line holds a word followed by its coefficients.
    with open(GLOVE_FILE, encoding='utf-8') as glove_file:
        for row in glove_file:
            fields = row.split()
            embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')
else:
    # A file ending in "pkl" is loaded as-is through NumPy with pickling allowed.
    embedding_index = np.load(GLOVE_FILE, allow_pickle=True)

Loading  /kaggle/input/glove6b/glove.6B.200d.txt
CPU times: user 14.2 s, sys: 893 ms, total: 15.1 s
Wall time: 19.1 s


In [None]:
# max_len is the input sequence length: the longest tokenized training text.
# Texts are padded up to it or truncated down to it (truncation is possible in test data).
max_len = max(map(len, train_df['text_token']))
print("max text token length:", max_len)

In [None]:
def prepare_data_embedding(data_tokens, max_len=max_len, embedding_index=embedding_index):
    """Turn tokenized texts into a zero-padded array of word embeddings.

    Parameters
    ----------
    data_tokens : sequence of token lists
        One list of string tokens per sample.
    max_len : int
        Number of token positions kept per sample. Longer samples are
        truncated; shorter ones are left zero-padded at the end.
    embedding_index : mapping of str -> 1-D array
        Word-to-vector lookup. All vectors are expected to share the length
        of the first one.

    Returns
    -------
    output : np.ndarray of shape (n_samples, max_len, embedding_dim)
        Embedded samples; positions without a known embedding stay all-zero.
    missing_embedding_words : dict of str -> int
        How often each token without an embedding was encountered.

    Raises
    ------
    ValueError
        If ``embedding_index`` is empty, so the embedding size is unknown.
    """
    n_samples = len(data_tokens)
    print("Input data tokens shape:", getattr(data_tokens, "shape", (n_samples,)))

    first_vector = next(iter(embedding_index.values()), None)
    if first_vector is None:
        raise ValueError("embedding_index is empty; cannot determine the embedding dimension")
    embedding_dim = first_vector.shape[0]

    missing_embedding_count = 0
    total_words_count = 0
    missing_embedding_words = {}
    output = np.zeros((n_samples, max_len, embedding_dim))
    print("Output shape:", output.shape)

    # Iterate by position so the input does not have to support integer indexing.
    for i, sentence_tokens in enumerate(data_tokens):
        # Slicing truncates samples that are longer than max_len.
        for j, word in enumerate(sentence_tokens[:max_len]):
            total_words_count += 1
            if word in embedding_index:
                output[i, j, :] = embedding_index[word]
            else:
                missing_embedding_words[word] = missing_embedding_words.get(word, 0) + 1
                missing_embedding_count += 1

    # Guard against inputs without any token, which would divide by zero.
    if total_words_count:
        coverage = round(100*(total_words_count-missing_embedding_count)/total_words_count, 1)
    else:
        coverage = 0.0
    print(f"Embedding coverage: {coverage}%")
    print(f"Missing embedding unique words: {len(missing_embedding_words)}")
    return output, missing_embedding_words

# Embed the training texts; y holds the matching values of the "target" column.
X, missing_embedding_words_train = prepare_data_embedding(train_df['text_token'].values)
y = train_df["target"].values

# Embed the test texts with the same max_len and embedding index (the function defaults).
X_test, missing_embedding_words_test = prepare_data_embedding(test_df['text_token'].values)

In [None]:
# Sanity check: X and y should have the same number of samples (first dimension).
print("X shape:", X.shape)
print("y shape:", y.shape)

# Create train/val data

In [None]:


kfold = KFold(n_splits=KFOLD, random_state=42, shuffle=True)

# Materialise the (train, val) index pairs once, report their sizes, then
# unzip them into the two per-fold index lists used by the training loop.
fold_splits = list(kfold.split(X))
for fold_idx, (train_index, val_index) in enumerate(fold_splits):
    print("fold", fold_idx, "train:", len(train_index), "val:", len(val_index))

folds_idx_train = [train_part for train_part, _ in fold_splits]
folds_idx_val = [val_part for _, val_part in fold_splits]


# Create torch Dataset
A `Dataset` can be wrapped by a `DataLoader` to feed batches to the model.

In [None]:
class NLPDs(Dataset):
    """Dataset over pre-embedded samples ``X`` with optional labels ``y``.

    Each item is a dict with float tensors under the keys ``'X'`` and
    ``'y'``. When no labels were supplied, ``'y'`` is an empty tensor.
    """

    def __init__(self, X, y=None):
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        features = torch.tensor(self.X[index], dtype=torch.float)
        if self.y is None:
            # Unlabelled data (e.g. the test set): placeholder empty tensor.
            label = torch.tensor([])
        else:
            label = torch.tensor(self.y[index], dtype=torch.float)
        return dict(X=features, y=label)

# Model selection
Since this is an exercise on RNNs, I compared a plain RNN with an LSTM to see whether they perform differently.
After testing both, it turned out that the LSTM performs much better than the plain RNN. To keep the notebook concise, I haven't kept the RNN code and only show the best model.

In [None]:
# Size of one word vector, taken from the first entry of the embedding index.
embedding_dim = next(iter(embedding_index.values())).shape[0]


class RNNClassifier(nn.Module):
    """LSTM encoder followed by a three-layer fully connected head.

    The final hidden state of the LSTM (all layers and directions) is
    flattened per sample and passed through the head, which ends in a single
    output unit with no activation, i.e. one raw score per sample.

    ``vocab_size`` is accepted but not used by the model.
    """

    def __init__(self,
                 embedding_dim=embedding_dim,
                 rnn_hidden_dim=128,
                 rnn_bidirectional=False,
                 rnn_num_layers=1,
                 fc_hidden_1_dim=128,
                 fc_hidden_2_dim=128,
                 vocab_size=max_len,
                 dropout_p=0):
        super().__init__()
        self.dropout_p = dropout_p
        self.rnn = nn.LSTM(
            embedding_dim,
            rnn_hidden_dim,
            bidirectional=rnn_bidirectional,
            num_layers=rnn_num_layers,
            batch_first=True,
        )

        # One hidden vector per layer and per direction gets flattened in forward().
        directions = 2 if rnn_bidirectional else 1
        rnn_features = rnn_hidden_dim * rnn_num_layers * directions

        head_layers = [
            nn.Dropout(p=dropout_p),
            nn.Linear(rnn_features, fc_hidden_1_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(fc_hidden_1_dim, fc_hidden_2_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(fc_hidden_2_dim, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, X):
        # Keep only the final hidden state; per-step outputs and the cell state are dropped.
        _, (hidden, _) = self.rnn(X)

        # Move the batch dimension first, then merge the remaining dimensions per sample.
        per_sample = torch.flatten(torch.transpose(hidden, 0, 1), start_dim=1)
        return self.classifier(per_sample)


# Train the model

## Hyper parameters tuning

Hyperparameter tuning is very important: it determines both the training speed and the performance of the model.
For my model, I tuned the following parameters: learning rate, learning rate scheduler, optimizer, model size (LSTM hidden layer size, fully connected hidden layer size, etc.), and regularization parameters such as dropout rate and weight_decay.

In order to keep the notebook concise, I didn't keep the code and result for all the different values of parameters, but put the final optimized values instead.

In [None]:
import copy
from IPython.display import display, clear_output
from torchmetrics.classification import BinaryAccuracy, BinaryF1Score

@torch.inference_mode()
def validate(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Returns a tuple ``(loss, accuracy, f1)``: the unweighted
    BCE-with-logits loss averaged over the batches, followed by the binary
    accuracy and binary F1 score accumulated over all batches.
    """
    model.eval()
    criterion = nn.BCEWithLogitsLoss()
    acc_metric = BinaryAccuracy()
    f1_metric = BinaryF1Score()

    loss_sum = 0
    for batch in val_loader:
        inputs, targets = batch["X"], batch["y"]

        # Drop the trailing size-1 output dimension so logits line up with the targets.
        logits = torch.squeeze(model(inputs), dim=1)
        loss_sum += criterion(logits, targets).item()

        # The metrics are updated with sigmoid outputs rather than raw logits.
        probabilities = F.sigmoid(logits)
        acc_metric(probabilities, targets)
        f1_metric(probabilities, targets)

    mean_loss = loss_sum/(len(val_loader))
    return mean_loss, acc_metric.compute().item(), f1_metric.compute().item()


def train(model, train_loader, fold_idx, epochs, loss_function, optimizer, scheduler, val_loader=None):
    """Train ``model`` for ``epochs`` epochs and checkpoint the best epoch.

    After every epoch the model is evaluated with ``validate``. Whenever the
    validation F1 improves (and always after the first epoch), the weights
    are saved to ``fold_{fold_idx}_best_model.bin`` in the current working
    directory.

    Parameters
    ----------
    model : nn.Module
        Model to optimise; it must return one logit per sample.
    train_loader : iterable of dict
        Yields batches with the keys ``"X"`` and ``"y"``.
    fold_idx : int
        Fold number, used in log lines and in the checkpoint file name.
    epochs : int
        Number of passes over ``train_loader``.
    loss_function : callable
        Loss applied to ``(logits, targets)``.
    optimizer, scheduler
        Optimizer stepped once per batch and LR scheduler stepped once per epoch.
    val_loader : iterable of dict, optional
        Validation batches. When omitted, the notebook-level ``val_loader``
        is used, which is what callers relied on before this parameter existed.

    Returns
    -------
    best_val_f1 : float
        Highest validation F1 seen over the epochs.
    history : dict of str -> list
        Per-epoch train/val loss, accuracy and F1.
    """
    if val_loader is None:
        # Backward compatibility: fall back to the global the old code read implicitly.
        try:
            val_loader = globals()["val_loader"]
        except KeyError:
            raise NameError("name 'val_loader' is not defined") from None

    history = {
        "train_loss": [],
        "train_acc": [],
        "train_f1": [],
        "val_loss": [],
        "val_acc": [],
        "val_f1": []
    }
    best_val_f1 = 0
    checkpoint_saved = False

    for epoch in range(epochs):
        model.train()

        total_loss = 0
        num_steps = 0
        train_acc = BinaryAccuracy()
        train_f1 = BinaryF1Score()

        for data in train_loader:
            optimizer.zero_grad()
            X, y = data["X"], data["y"]

            output = torch.squeeze(model(X), dim=1)
            loss = loss_function(output, y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_steps += 1

            # The metrics only need values, so detach from the autograd graph.
            probabilities = torch.sigmoid(output.detach())
            train_acc(probabilities, y)
            train_f1(probabilities, y)

        # Epoch-level figures are computed once here instead of on every step.
        current_train_loss = total_loss / num_steps if num_steps else float("nan")
        current_train_acc = train_acc.compute().item()
        current_train_f1 = train_f1.compute().item()
        # Read the learning rate used during this epoch before the scheduler advances it.
        current_lr = round(scheduler.get_last_lr()[0], 5)

        scheduler.step()

        val_loss, val_acc, val_f1 = validate(model, val_loader)

        # Always checkpoint the first epoch so that a checkpoint exists even
        # when the validation F1 never rises above 0.
        if not checkpoint_saved or val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), f"fold_{fold_idx}_best_model.bin")
            checkpoint_saved = True

        print(f"Fold {fold_idx} Epoch {epoch}: "
              f"loss={round(current_train_loss, 5)}, "
              f"val_loss={round(val_loss, 5)}, "
              f"acc={round(current_train_acc, 5)}, "
              f"val_acc={round(val_acc, 5)}, "
              f"f1={round(current_train_f1, 5)}, "
              f"val_f1={round(val_f1, 5)}, "
              f"lr={round(current_lr, 5)}"
             )
        history["train_loss"].append(current_train_loss)
        history["train_acc"].append(current_train_acc)
        history["train_f1"].append(current_train_f1)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)
        history["val_f1"].append(val_f1)

    print("Fold", fold_idx, "best val F1 score:", best_val_f1)
    return best_val_f1, history

In [None]:
# Delete everything under /kaggle/working (the directory the inference cell later loads fold checkpoints from).
! rm -rf /kaggle/working/*

In [None]:
EPOCHS = 10
LR = 1e-2
MIN_LR = 1e-4

# Train one model per fold and accumulate each fold's best validation F1.
total_f1 = 0
for fold_idx, (train_idx, val_idx) in enumerate(zip(folds_idx_train, folds_idx_val)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    train_ds = NLPDs(X_train, y_train)
    val_ds = NLPDs(X_val, y_val)

    train_loader = DataLoader(
        train_ds, batch_size=64, num_workers=2, shuffle=True, pin_memory=True, drop_last=False
    )
    # train() reads this notebook-level val_loader when validating.
    val_loader = DataLoader(
        val_ds, batch_size=32, num_workers=2, shuffle=False, pin_memory=True, drop_last=False
    )

    # Ratio of negative to positive training labels, used to weight the positive class in the loss.
    negatives = np.sum(1-y_train)
    positives = np.sum(y_train)
    pos_weights = negatives / positives
    print("pos_weights:", pos_weights)

    model = RNNClassifier(
        rnn_bidirectional=True,
        rnn_num_layers=2,
        rnn_hidden_dim=32,
        fc_hidden_1_dim=64,
        fc_hidden_2_dim=64,
        dropout_p=0.8,
    )

    loss_function = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weights))

    optimizer = optim.Adam(
        model.parameters(), lr=LR, betas=(0.9, 0.999), eps=1.0*1e-8, weight_decay=2e-5
    )
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=MIN_LR)

    # history is overwritten on every fold, so afterwards it holds the last fold only.
    best_val_f1, history = train(model, train_loader, fold_idx, EPOCHS, loss_function, optimizer, scheduler)
    total_f1 += best_val_f1


print("Average val F1 score:", total_f1/len(folds_idx_train))

## Plot the metrics of the last fold

In [None]:
def plot_train_history(history, metric="acc"):
    """Plot the per-epoch training and validation curves of one metric.

    ``history`` must contain the keys ``f"train_{metric}"`` and
    ``f"val_{metric}"``; the training curve is drawn first.
    """
    for split in ("train", "val"):
        plt.plot(history[f"{split}_{metric}"])
    plt.title(f"{metric}")
    plt.xlabel('epoch')
    plt.ylabel(metric)
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

# One figure per tracked metric, in the order F1, accuracy, loss.
for metric_name in ("f1", "acc", "loss"):
    plot_train_history(history, metric_name)

# Inference on test data

## Load models

Ensemble models from each fold and average the results.

In [None]:
def load_weights(model, weights_path):
    """Load a saved state dict into ``model`` if the file exists.

    Parameters
    ----------
    model : nn.Module
        Model whose parameters are overwritten in place.
    weights_path : str
        Path of a file written with ``torch.save(model.state_dict(), ...)``.

    Returns
    -------
    bool
        ``True`` when the weights were loaded, ``False`` when
        ``weights_path`` is not an existing file (the model is left untouched).
    """
    if not os.path.isfile(weights_path):
        print("No previous weights available at", weights_path)
        return False

    # Map tensors to CPU while reading so that a checkpoint saved from a GPU
    # also loads on a CPU-only machine; load_state_dict then copies the
    # values into the model's own parameters.
    state_dict = torch.load(weights_path, map_location="cpu")
    model.load_state_dict(state_dict)
    print("Loaded weights from", weights_path)
    return True

In [None]:

# Rebuild one model per fold with the training architecture and load its best
# checkpoint. A fold whose checkpoint is missing is skipped, so that a
# randomly initialised model never takes part in the ensemble.
models = []
for fold_idx in range(len(folds_idx_train)):
    model = RNNClassifier(
        rnn_bidirectional=True,
        rnn_num_layers=2,
        rnn_hidden_dim=32,
        fc_hidden_1_dim=64,
        fc_hidden_2_dim=64,
    )
    if load_weights(model, f"/kaggle/working/fold_{fold_idx}_best_model.bin"):
        models.append(model)
    else:
        print("Skipping fold", fold_idx, "because its weights could not be loaded")

if not models:
    raise RuntimeError("No fold checkpoint could be loaded; run the training cell first.")

In [None]:
# Unlabelled test dataset (no y); shuffle=False keeps the batches in the order of X_test.
test_ds = NLPDs(X_test)
test_loader = DataLoader(test_ds, batch_size=32, num_workers=2, shuffle=False, pin_memory=True, drop_last=False)

In [None]:
@torch.inference_mode()
def infer_by_one_model(model, test_loader):
    """Run ``model`` over ``test_loader`` and return per-sample probabilities.

    The model emits one raw logit per sample. The logit is passed through a
    sigmoid here, so the returned values lie in [0, 1] and can be thresholded
    at 0.5, in the same way ``validate`` applies a sigmoid before scoring.

    Parameters
    ----------
    model : nn.Module
        Trained classifier returning logits with a trailing size-1 dimension.
    test_loader : iterable of dict
        Yields batches with the key ``"X"``.

    Returns
    -------
    np.ndarray
        1-D array of probabilities, in the order the loader yields the samples.
    """
    model.eval()
    pred_list = []

    # Wrapping the loader lets tqdm advance and close the bar by itself.
    for data in tqdm(test_loader, total=len(test_loader), desc="Test", leave=True):
        logits = torch.squeeze(model(data["X"]), dim=1)
        probabilities = torch.sigmoid(logits)
        pred_list.append(probabilities.detach().cpu().numpy())

    return np.concatenate(pred_list)

# One prediction array per ensemble member, in the same order as `models`.
models_results = [infer_by_one_model(member, test_loader) for member in models]


In [None]:
# Ensemble: element-wise mean of the per-model outputs returned by infer_by_one_model.
prob = np.sum(np.stack(models_results), axis=0) / len(models)

In [None]:
# Store the ensemble output and derive the 0/1 label by thresholding it at 0.5.
test_df["target_prob"] = prob
test_df["target"] = (test_df["target_prob"] >= 0.5).astype(int)

# Only a cell's last expression is displayed, so the count is printed explicitly.
print("Predicted positives:", int((test_df["target"] == 1).sum()))
test_df.head()

In [None]:
smpl_sub = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

# Keep the ids (and their order) from the sample file and attach the predicted target by id.
predictions = test_df[["id","target"]]
sub = smpl_sub[["id"]].merge(predictions, on="id", how="left")

sub.to_csv("submission.csv", index=False)

sub.head()