# Summary

The goal of this project is to build a supervised deep learning model for sequential data that predicts whether a tweet refers to a real disaster. Such a model is useful for organizations that actively monitor tweets in order to detect disasters in near real time.

In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import string
import pickle
import time
import gc

from sklearn.model_selection import KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
import torchmetrics

# Import tqdm for progress bar
from tqdm.notebook import tqdm

import transformers
from transformers import BertTokenizer, BertModel, BertConfig

In [None]:
from torch import cuda

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("device:", device)

In [None]:
def set_seed(seed=42):
    '''Seed every random number generator used in this notebook.

    Seeds NumPy, Python's ``random`` module and PyTorch, and switches cuDNN
    to its deterministic mode so that repeated runs give the same results.

    Args:
        seed: Integer seed applied to every generator.
    '''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this call is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE: the hash seed of the running interpreter is fixed at startup, so
    # this only affects Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed()

In [None]:
# Number of cross-validation folds (passed to KFold as n_splits).
KFOLD = 5


# Load data

In [None]:

# Load the cleaned train/test frames written by the EDA/cleaning notebook.
# NOTE(security): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
with open("/kaggle/input/nlp-disastertweets-eda-cleaning/train_cleaned_df.pkl", "rb") as _pkl_file:
    train_df = pickle.load(_pkl_file)
with open("/kaggle/input/nlp-disastertweets-eda-cleaning/test_cleaned_df.pkl", "rb") as _pkl_file:
    test_df = pickle.load(_pkl_file)


In [None]:
# Quick sanity check: report the shape of both frames.
for _split_name, _split_df in (("train", train_df), ("test", test_df)):
    print(f"{_split_name} data shape:", _split_df.shape)

In [None]:
# Display the first rows of the training frame for a visual check.
train_df.head()

# Tokenize the sentences

The text has already been cleaned, so here each tweet is simply tokenized with the pretrained BERT tokenizer.

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

def tokenize(text, max_length=30):
    """Encode one text with the BERT tokenizer into fixed-length model inputs.

    The text is truncated or padded to exactly ``max_length`` tokens
    (special tokens included).

    Args:
        text: The string to encode.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        A NumPy array of shape ``(3, max_length)`` whose rows are, in order,
        ``input_ids``, ``token_type_ids`` and ``attention_mask``.
    """
    output = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_token_type_ids=True,
        # Requested explicitly because the mask is read from the output below.
        return_attention_mask=True,
    )
    return np.array([output["input_ids"], output["token_type_ids"], output["attention_mask"]])

In [None]:
# Register `progress_apply` on pandas objects so tokenization shows a progress bar.
tqdm.pandas()
# Tokenize every training tweet.
X_origin = np.array(train_df['text'].progress_apply(tokenize))


In [None]:
# Insert an axis at position 1; the dataset class picks one entry along this
# axis per sample (only a single variant exists here).
X = np.stack((X_origin,), axis=1)
X.shape

In [None]:
# Tokenize the test tweets, then add an axis at position 1 so the array has
# the layout the dataset class expects.
X_test = np.expand_dims(
    np.array(test_df['text'].progress_apply(tokenize)),
    axis=1,
)
X_test.shape

In [None]:
# Training labels as a NumPy array (the 'target' column of the training frame).
y = train_df['target'].values

# Create train/val data

In [None]:

# Shuffled K-fold split with a fixed seed so the folds are reproducible.
kfold = KFold(n_splits=KFOLD, shuffle=True, random_state=42)

# Per-fold row indices, consumed later by the training loop.
folds_idx_train, folds_idx_val = [], []

for fold_idx, fold_split in enumerate(kfold.split(X)):
    train_index, val_index = fold_split
    print("fold", fold_idx, "train:", len(train_index), "val:", len(val_index))
    folds_idx_train.append(train_index)
    folds_idx_val.append(val_index)


# Create torch Dataset
A `Dataset` wraps the tokenized arrays so that a `DataLoader` can batch them as input for the model.

In [None]:
class NLPDs(Dataset):
    """Map-style dataset over pre-tokenized samples with optional labels.

    ``X`` is indexed as ``X[sample][variant]``: axis 1 holds one or more
    variants (augmentations) of each sample, and one of them is drawn at
    random on every access. ``y`` may be omitted for unlabelled data.
    """

    def __init__(self, X, y=None):
        self.X, self.y = X, y
        print("NLPDs X.shape:", X.shape)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        # Draw one of the stored variants of this sample at random.
        variant = random.randrange(0, self.X.shape[1])
        features = torch.tensor(self.X[index][variant], dtype=torch.long)

        # Unlabelled data yields an empty tensor so batches keep the same keys.
        if self.y is None:
            label = torch.tensor([])
        else:
            label = torch.tensor(self.y[index], dtype=torch.float)

        return {'X': features, 'y': label}

# Model selection
Since this is an exercise on RNNs, I compared a plain RNN with an LSTM to see whether they perform differently.
After testing both, the LSTM turned out to perform much better than the RNN. To keep the notebook concise, I have not kept the RNN code and only show the best model.

In [None]:

class BertClassifier(nn.Module):
    """BERT encoder followed by one or more dropout/MLP heads.

    The hidden state of the first token of ``bert-large-uncased`` is fed to
    ``multi_dropout_sample_n`` independent heads; their outputs are averaged.
    The model returns raw logits (no sigmoid is applied).

    Args:
        fc_hidden_1_dim: Width of the first fully connected hidden layer.
        fc_hidden_2_dim: Width of the second fully connected hidden layer.
        dropout_p_1: Dropout probability applied to the encoder features.
        dropout_p_2: Dropout probability applied after the first hidden layer.
        dropout_p_3: Dropout probability applied after the second hidden layer.
        multi_dropout_sample_n: Number of parallel heads to average.
    """

    def __init__(
        self,
        fc_hidden_1_dim=128,
        fc_hidden_2_dim=128,
        dropout_p_1=0.9,
        dropout_p_2=0.8,
        dropout_p_3=0.7,
        multi_dropout_sample_n=8,
    ):
        super().__init__()
        self.multi_dropout_sample_n = multi_dropout_sample_n

        self.bert = transformers.BertModel.from_pretrained('bert-large-uncased')

        # Read the encoder width from its config instead of hard-coding it.
        bert_features = self.bert.config.hidden_size

        self.classifiers = nn.ModuleList(
            nn.Sequential(
                nn.Dropout(p=dropout_p_1),
                nn.Linear(bert_features, fc_hidden_1_dim),
                nn.ReLU(),
                # Previously this layer reused dropout_p_1, which left
                # dropout_p_2 without any effect.
                nn.Dropout(p=dropout_p_2),
                nn.Linear(fc_hidden_1_dim, fc_hidden_2_dim),
                nn.ReLU(),
                nn.Dropout(p=dropout_p_3),
                nn.Linear(fc_hidden_2_dim, 1),
            )
            for _ in range(multi_dropout_sample_n)
        )

    def forward(self, X):
        """Compute one logit per sample.

        Args:
            X: Integer tensor indexed as ``(batch, 3, seq_len)`` whose three
                channels are input_ids, token_type_ids and attention_mask.

        Returns:
            Tensor of shape ``(batch, 1)`` holding the head-averaged logits.
        """
        input_ids, token_type_ids, attention_mask = X[:, 0, :], X[:, 1, :], X[:, 2, :]

        out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True).last_hidden_state
        # Keep only the hidden state of the first token of every sequence.
        out = out[:, 0, :]

        # Multi-sample dropout: every head sees the same features under its
        # own dropout masks, and the resulting logits are averaged.
        multi_dropout_out = [classifier(out) for classifier in self.classifiers]

        out = torch.mean(torch.stack(multi_dropout_out), dim=0)
        return out


# Train the model

## Hyper parameters tuning

Hyperparameter tuning is very important: it determines both the training speed and the performance of the model.
For this model I tuned the following parameters: learning rate, learning rate scheduler, optimizer, model size (LSTM hidden layer size, fully connected hidden layer size, etc.), and regularization parameters such as dropout rate and weight_decay.

To keep the notebook concise, I did not keep the code and results for every parameter value that was tried; only the final tuned values are shown.

In [None]:
import copy
from IPython.display import display, clear_output
from torchmetrics.classification import BinaryAccuracy, BinaryF1Score

@torch.inference_mode()
def validate(model, val_loader, show_progress):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    The loss is an unweighted ``BCEWithLogitsLoss`` averaged over batches;
    accuracy and F1 are accumulated over the whole loader from the
    sigmoid of the model output.

    Args:
        model: Module returning logits of shape ``(batch, 1)``.
        val_loader: Iterable of batches with keys ``"X"`` and ``"y"``.
        show_progress: Whether to display a tqdm progress bar.

    Returns:
        Tuple ``(mean_loss, accuracy, f1)`` of Python floats. ``mean_loss``
        is NaN when the loader yields no batches.
    """
    progress_bar = None
    if show_progress:
        progress_bar = tqdm(total=len(val_loader), desc=f"Validate", leave=False)

    total_loss = 0
    loss_function = nn.BCEWithLogitsLoss().to(device)
    val_acc = BinaryAccuracy().to(device)
    val_f1 = BinaryF1Score().to(device)
    model.eval()

    for step, data in enumerate(val_loader):

        X, y = data["X"].to(device, dtype=torch.long), data["y"].to(device, dtype=torch.float)

        output = torch.squeeze(model(X), dim=1)
        total_loss += loss_function(output, y).item()

        probs = torch.sigmoid(output)
        val_acc.update(probs, y)
        val_f1.update(probs, y)

        # No explicit "move back to CPU" step: `.to("cpu")` only creates a
        # discarded copy, and the batch tensors are released when `X` and
        # `y` are rebound on the next iteration.

        if progress_bar is not None:
            progress_bar.set_postfix(
                val_loss=(total_loss/(step+1)),
                val_acc=val_acc.compute().item(),
                val_f1=val_f1.compute().item(),
            )
            progress_bar.update()

    if progress_bar is not None:
        progress_bar.close()

    # Guard against an empty loader instead of raising ZeroDivisionError.
    n_batches = len(val_loader)
    mean_loss = total_loss / n_batches if n_batches else float("nan")
    return mean_loss, val_acc.compute().item(), val_f1.compute().item()


def train(model, train_loader, fold_idx, epochs, loss_function, optimizer, scheduler, show_progress=False, val_loader=None):
    """Train ``model`` for one fold and save its best checkpoints.

    After every epoch the model is validated. The weights with the highest
    validation F1 and those with the lowest validation loss are kept and
    written to ``fold_{fold_idx}_best_f1_model.bin`` and
    ``fold_{fold_idx}_best_loss_model.bin`` when training finishes.

    Args:
        model: Module returning logits of shape ``(batch, 1)``.
        train_loader: Iterable of training batches with keys ``"X"``/``"y"``.
        fold_idx: Fold number, used in log lines and checkpoint file names.
        epochs: Number of epochs to run.
        loss_function: Callable ``(logits, targets) -> loss tensor``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per epoch.
        show_progress: Whether to display tqdm progress bars.
        val_loader: Validation batches. When omitted, the module-level
            ``val_loader`` is used, which is how this function previously
            obtained it.

    Returns:
        Tuple ``(best_val_f1, best_val_loss, history)`` where ``history``
        maps metric names to per-epoch lists.

    Raises:
        NameError: If ``val_loader`` is neither passed nor defined at module
            level.
    """
    if val_loader is None:
        # Backward compatibility for callers that rely on the global.
        try:
            val_loader = globals()["val_loader"]
        except KeyError:
            raise NameError(
                "train() needs a validation loader: pass val_loader=... "
                "or define a module-level `val_loader`"
            ) from None

    fold_start_time = time.time()

    history = {
        "train_loss": [],
        "train_acc": [],
        "train_f1": [],
        "val_loss": [],
        "val_acc": [],
        "val_f1": []
    }
    best_val_f1 = 0
    best_val_loss = np.inf
    # Initialised up front so saving cannot hit an undefined name when no
    # epoch improves on the starting values (F1 stuck at 0, NaN loss, or
    # epochs == 0).
    best_val_f1_model_wts = None
    best_val_loss_model_wts = None

    for epoch in range(epochs):
        epoch_start_time = time.time()
        model.train()
        progress_bar = None
        if show_progress:
            progress_bar = tqdm(total=len(train_loader), desc=f"Fold {fold_idx} Epoch {epoch}", leave=False)

        total_loss = 0
        train_acc = BinaryAccuracy().to(device)
        train_f1 = BinaryF1Score().to(device)

        for step, data in enumerate(train_loader):

            optimizer.zero_grad()

            X, y = data["X"].to(device, dtype=torch.long), data["y"].to(device, dtype=torch.float)
            output = torch.squeeze(model(X), dim=1)
            loss = loss_function(output, y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Metrics only need values, so detach from the autograd graph.
            probs = torch.sigmoid(output.detach())
            train_acc.update(probs, y)
            train_f1.update(probs, y)

            # Running metrics are computed per step only when displayed;
            # otherwise they are computed once per epoch below.
            if progress_bar is not None:
                progress_bar.set_postfix(
                    loss=(total_loss/(step+1)),
                    acc=train_acc.compute().item(),
                    f1=train_f1.compute().item(),
                )
                progress_bar.update()

        if progress_bar is not None:
            progress_bar.close()

        n_steps = len(train_loader)
        current_train_loss = total_loss / n_steps if n_steps else float("nan")
        current_train_acc = train_acc.compute().item()
        current_train_f1 = train_f1.compute().item()

        scheduler.step()

        val_loss, val_acc, val_f1 = validate(model, val_loader, show_progress)

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_val_f1_model_wts = copy.deepcopy(model.state_dict())

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_loss_model_wts = copy.deepcopy(model.state_dict())

        current_lr = round(scheduler.get_last_lr()[0], 8)
        print(f"Fold {fold_idx} Epoch {epoch}: "
              f"{round(time.time()-epoch_start_time, 1)}s, "
              f"loss={round(current_train_loss, 5)}, "
              f"val_loss={round(val_loss, 5)}, "
              f"acc={round(current_train_acc, 5)}, "
              f"val_acc={round(val_acc, 5)}, "
              f"f1={round(current_train_f1, 5)}, "
              f"val_f1={round(val_f1, 5)}, "
              f"lr={current_lr}"
             )
        history["train_loss"].append(current_train_loss)
        history["train_acc"].append(current_train_acc)
        history["train_f1"].append(current_train_f1)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)
        history["val_f1"].append(val_f1)

    # Fall back to the final weights when no epoch ever counted as "best".
    if best_val_f1_model_wts is None:
        best_val_f1_model_wts = copy.deepcopy(model.state_dict())
    if best_val_loss_model_wts is None:
        best_val_loss_model_wts = copy.deepcopy(model.state_dict())

    torch.save(best_val_f1_model_wts, f"fold_{fold_idx}_best_f1_model.bin")
    torch.save(best_val_loss_model_wts, f"fold_{fold_idx}_best_loss_model.bin")

    print("Fold", fold_idx, "best val F1 score:", best_val_f1, ", best val loss:", best_val_loss, ", duration:", round(time.time()-fold_start_time, 1), "s")
    return best_val_f1, best_val_loss, history


# Relocate the tensors held in an optimizer's state to the CPU so that they
# no longer occupy GPU memory.
def optimizer_to_cpu(optimizer):
    """Move every tensor in ``optimizer.state`` (and its gradient) to the CPU.

    State entries may be tensors themselves or dicts of values; non-tensor
    values are left untouched. The optimizer is modified in place.
    """
    target = "cpu"

    def _relocate(tensor):
        # Rebind storage in place so existing references stay valid.
        tensor.data = tensor.data.to(target)
        if tensor._grad is not None:
            tensor._grad.data = tensor._grad.data.to(target)

    for entry in optimizer.state.values():
        if isinstance(entry, torch.Tensor):
            # Top-level tensors are not expected, but are handled anyway.
            _relocate(entry)
        elif isinstance(entry, dict):
            for value in entry.values():
                if isinstance(value, torch.Tensor):
                    _relocate(value)

In [None]:
# Shell escape: delete everything under ./working (relative to the current directory).
# NOTE(review): checkpoints below are saved to the current directory, not to ./working — confirm this is the intended path.
! rm -rf working/*

In [None]:
# Training schedule: epoch count and the range of the cosine-annealed learning rate.
EPOCHS = 8
LR = 1e-5
MIN_LR = 1e-6

# Widths of the two fully connected hidden layers of the classifier head.
FC_HIDDEN_1_DIM = 256
FC_HIDDEN_2_DIM = 128

# Running sums of every fold's best validation F1 / loss, averaged at the end.
total_f1 = 0
total_loss = 0
for fold_idx, (train_idx, val_idx) in enumerate(zip(folds_idx_train, folds_idx_val)):

    X_train, y_train = X[train_idx], y[train_idx]
    # Validation uses only the first variant along axis 1 (no augmentation).
    X_val, y_val = X[val_idx][:, 0:1], y[val_idx]

    train_ds = NLPDs(X_train, y_train)
    val_ds = NLPDs(X_val, y_val)

    loader_kwargs = dict(batch_size=128, num_workers=2, pin_memory=True, drop_last=False)
    train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
    # `train` picks up `val_loader` from module scope, so keep this name.
    val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

    # Ratio of negative to positive labels, used to re-weight the positive class.
    pos_weights = np.sum(1-y_train) / np.sum(y_train)
    print("pos_weights:", pos_weights)

    model = BertClassifier(
        fc_hidden_1_dim=FC_HIDDEN_1_DIM,
        fc_hidden_2_dim=FC_HIDDEN_2_DIM,
        dropout_p_1=0.5,
        dropout_p_2=0.5,
        dropout_p_3=0.5,
        multi_dropout_sample_n=1,
    )
    model.to(device)

    loss_function = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weights)).to(device)
    optimizer = optim.Adam(
        model.parameters(),
        lr=LR,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=1e-5,
    )
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=MIN_LR)

    best_val_f1, best_val_loss, history = train(
        model, train_loader, fold_idx, EPOCHS, loss_function, optimizer, scheduler,
        show_progress=False,
    )

    total_f1 += best_val_f1
    total_loss += best_val_loss

    # Free GPU/CPU memory before the next fold builds a new model.
    model.to("cpu")
    optimizer_to_cpu(optimizer)
    del model
    del optimizer
    torch.cuda.empty_cache()
    gc.collect()


n_folds = len(folds_idx_train)
print("Average val F1 score:", round(total_f1/n_folds, 5), "Average val loss:", round(total_loss/n_folds, 5))

In [None]:
# Release gpu/cpu memory left over from training.
# `model` / `optimizer` normally no longer exist here because the training
# loop deletes them, so only that expected NameError is ignored; any other
# failure is allowed to surface instead of being silently swallowed.
try:
    model.to("cpu")
    del model
except NameError:
    pass

try:
    optimizer_to_cpu(optimizer)
    del optimizer
except NameError:
    pass

# Always hand cached blocks back, even when there was nothing to delete.
torch.cuda.empty_cache()
gc.collect()

## Plot the metrics of the last fold

In [None]:
def plot_train_history(history, metric="acc"):
    """Plot the per-epoch training and validation curves of one metric.

    Args:
        history: Mapping that contains the sequences ``train_{metric}`` and
            ``val_{metric}``.
        metric: Metric suffix to plot, e.g. ``"acc"``, ``"f1"`` or ``"loss"``.

    Raises:
        KeyError: If either series is missing from ``history``.
    """
    # Look each series up once and plot exactly what was looked up.
    train_metric = history[f"train_{metric}"]
    val_metric = history[f"val_{metric}"]
    plt.plot(train_metric)
    plt.plot(val_metric)
    plt.title(f"{metric}")
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

plot_train_history(history, "f1")
plot_train_history(history, "acc")
plot_train_history(history, "loss")

# Inference on test data

## Load models

Ensemble models from each fold and average the results.

In [None]:
def load_weights(model, weights_path):
    """Load a saved state dict into ``model`` if the file exists.

    Args:
        model: Module whose parameters are overwritten in place.
        weights_path: Path of a file written with ``torch.save(state_dict)``.

    Returns:
        ``True`` when the weights were loaded, ``False`` when no file exists
        at ``weights_path``.
    """
    if not os.path.isfile(weights_path):
        print("No previous weights available at", weights_path)
        return False

    # Deserialize onto the CPU: load_state_dict copies the values into the
    # model's existing parameters on whatever device they live, so this also
    # works on CPU-only machines and avoids a second copy on the GPU.
    state_dict = torch.load(weights_path, map_location="cpu")
    model.load_state_dict(state_dict)
    print("Loaded weights from", weights_path)
    return True

In [None]:

# One model per fold, restored from that fold's best-F1 checkpoint; folds
# without a checkpoint file are skipped.
best_f1_models = []
for fold_idx, _ in enumerate(folds_idx_train):
    model = BertClassifier(
        fc_hidden_1_dim=FC_HIDDEN_1_DIM,
        fc_hidden_2_dim=FC_HIDDEN_2_DIM,
        dropout_p_1=0.95,
        dropout_p_2=0.8,
        dropout_p_3=0.6,
        multi_dropout_sample_n=1,
    ).to(device)
    loaded = load_weights(model, f"fold_{fold_idx}_best_f1_model.bin")
    if not loaded:
        continue
    best_f1_models.append(model)

In [None]:
# Unlabelled dataset over the tokenized test tweets. shuffle=False keeps the
# batch order aligned with the rows of X_test.
test_ds = NLPDs(X_test)
test_loader = DataLoader(
    test_ds,
    batch_size=32,
    shuffle=False,
    drop_last=False,
    num_workers=2,
    pin_memory=True,
)

In [None]:
@torch.inference_mode()
def infer_by_one_model(model, test_loader):
    """Predict the positive-class probability of every sample in the loader.

    Args:
        model: Module returning logits of shape ``(batch, 1)``.
        test_loader: Iterable of batches with key ``"X"``.

    Returns:
        1-D NumPy array of probabilities in ``[0, 1]``, in loader order.
        Empty when the loader yields no batches.
    """
    progress_bar = tqdm(total=len(test_loader), desc=f"Test", leave=True)

    model.eval()
    pred_list = []

    for data in test_loader:
        X = data["X"].to(device, dtype=torch.long)
        logits = torch.squeeze(model(X), dim=1)
        # The model emits raw logits, but callers average the result as a
        # probability and threshold it at 0.5, so apply the sigmoid here.
        pred_list.append(torch.sigmoid(logits).cpu().numpy())
        progress_bar.update()

    progress_bar.close()

    # np.concatenate raises on an empty list; return an empty array instead.
    if not pred_list:
        return np.array([], dtype=np.float32)
    return np.concatenate(pred_list)


## Generate result by best val f1 model

In [None]:

# Score the test set with every best-F1 fold model.
models_results = []
for model in best_f1_models:
    fold_pred = infer_by_one_model(model, test_loader)
    models_results.append(fold_pred)

# Ensemble: average the per-model scores element-wise.
stacked_results = np.stack(models_results)
prob = np.sum(stacked_results, axis=0) / len(best_f1_models)

test_df["target_prob"] = prob
# Label as positive when the averaged score reaches 0.5.
test_df["target"] = test_df["target_prob"].apply(lambda p: int(p >= 0.5))
# Number of predicted positives (not displayed, since it is not the cell's last expression).
sum(test_df["target"] == 1)

smpl_sub = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

# Follow the id order of the sample submission.
sub = smpl_sub[["id"]].merge(test_df[["id", "target"]], on="id", how="left")

sub.to_csv("best_f1_submission.csv", index=False)

sub.head(10)

In [None]:
# Move every best-F1 model off the GPU and drop the loop's reference to it.
for model in best_f1_models:
    model.cpu()
    del model

# Drop the list itself, then return cached GPU blocks and collect garbage.
del best_f1_models
torch.cuda.empty_cache()
gc.collect()

## Generate result by best val loss model

In [None]:
# One model per fold, restored from that fold's best-loss checkpoint; folds
# without a checkpoint file are skipped.
best_loss_models = []
for fold_idx, _ in enumerate(folds_idx_train):
    model = BertClassifier(
        fc_hidden_1_dim=FC_HIDDEN_1_DIM,
        fc_hidden_2_dim=FC_HIDDEN_2_DIM,
        dropout_p_1=0.95,
        dropout_p_2=0.8,
        dropout_p_3=0.6,
        multi_dropout_sample_n=1,
    ).to(device)
    loaded = load_weights(model, f"fold_{fold_idx}_best_loss_model.bin")
    if not loaded:
        continue
    best_loss_models.append(model)

In [None]:

# Score the test set with every best-loss fold model.
models_results = []
for model in best_loss_models:
    fold_pred = infer_by_one_model(model, test_loader)
    models_results.append(fold_pred)

# Ensemble: average the per-model scores element-wise.
stacked_results = np.stack(models_results)
prob = np.sum(stacked_results, axis=0) / len(best_loss_models)

test_df["target_prob"] = prob
# Label as positive when the averaged score reaches 0.5.
test_df["target"] = test_df["target_prob"].apply(lambda p: int(p >= 0.5))
# Number of predicted positives (not displayed, since it is not the cell's last expression).
sum(test_df["target"] == 1)

smpl_sub = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

# Follow the id order of the sample submission.
sub = smpl_sub[["id"]].merge(test_df[["id", "target"]], on="id", how="left")

sub.to_csv("best_loss_submission.csv", index=False)

sub.head(10)

In [None]:
# Move every best-loss model off the GPU and drop the loop's reference to it.
for model in best_loss_models:
    model.cpu()
    del model

# Drop the list itself, then return cached GPU blocks and collect garbage.
del best_loss_models
torch.cuda.empty_cache()
gc.collect()