This repository contains a solution for the Kaggle challenge "Natural Language Processing with Disaster Tweets". The task is to predict whether a tweet refers to a real disaster or not.

Challenge link: https://www.kaggle.com/competitions/nlp-getting-started/data

In [None]:
import torch
!pip install scikit-learn pandas re tqdm numpy
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Device in use: {device}")
import os
import sklearn
import pandas
import re
import tqdm
import numpy as np
import torch.utils.data as data_utils
from tqdm import tqdm
!pip install boto3 sentencepiece sacremoses transformers alive-progress
from alive_progress import alive_bar
import sys
sys.stdout.isatty()
import boto3
import requests
import regex
import sentencepiece
import sacremoses
import transformers
import random
RANDOM_SEED=0
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
MODEL_CONFIG="bert-base-cased"

In order to add input from kaggle, use:

In [None]:

# Kaggle environment: list every file found under the input directory and
# record the locations of the competition CSV files.
import os

import numpy as np
import pandas as pd

for found_file in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(found_file)

train_path = "/kaggle/input/nlp-data/train.csv"
test_path = "/kaggle/input/nlp-data/test.csv"
submission_path = "/kaggle/input/nlp-data/sample_submission.csv"


When the notebook is run on Google Colab, the data can be loaded from Google Drive with:

In [None]:
# Google Colab cell: uses the `google.colab` package to make Google Drive
# available under the given mount point.
from google.colab import drive
## Mount google drive
drive.mount('/content/drive/')

Unzip the data!

In [None]:
# Extract the competition archive from the mounted Drive into the current
# working directory (shell command run through the notebook `!` magic).
!unzip /content/drive/MyDrive/Projects/Kaggle-Disaster-Tweets/data/nlp-getting-started.zip
# Relative paths: these rebind the same names a previously executed cell may
# have set to other locations.
train_path="train.csv"
test_path="test.csv"
submission_path = "sample_submission.csv"

### Load data

The data (training set and test set) is loaded with pandas.

In [None]:
# Load data
from sklearn.utils import shuffle

# Read both CSV splits in one go.
df_train, df_test = (pandas.read_csv(csv_path) for csv_path in (train_path, test_path))
# Placeholder label column so that helpers reading a "target" column can be
# applied to the test frame as well.
df_test["target"] = 0

print(f"Number of samples for training: {len(df_train)}")
print(f"Number of samples for testing: {len(df_test)}")

# Quick look at the schema and the first rows of the training frame.
print("Training data structure:")
print(df_train.keys())
print(df_train.head())

# ChatGPT prompting

Because this is an NLP classification problem, we can use ChatGPT to perform the classification. To do so, we need to write specific prompts that give ChatGPT enough context to decide whether a text talks about a disaster or not.

In [None]:
# Normal classification prompt
# Zero-shot template: `{query}` is the only placeholder and is filled with the
# tweet text via `prompt.format(query=...)`. The model is asked to reply with
# a single integer (1 = disaster/emergency, 0 = not).
prompt = "You are a tweet analyst in order to monitor possible emergencies is posted online like accidents (car accidents, airplane accidents, train wrecks or any type of accident), natural disasters (for example: earthquakes, typhoon, tsunamis, storm damage, fire...etc), crimes (like homicides, killings, bombing, terrorism, casualties), war, scandals....etc.  It’s not always clear whether a tweet´s words are actually referring to a disaster that happened or is happening. ANSWER ONLY WITH ONE INT VALUE: 1 (if the tweet speaks about a disaster or emergency) OR 0 (if not)!!!!!. DO NOT ANSWER WITH MORE THAN ONE INT VALUE!!!! TEXT: {query} YOUR RESPONSE: "


In [None]:
# Few shot learning prompt
# Same task description followed by four labelled example tweets
# ("TEXT: ... DISASTER: <0|1>") before the `{query}` placeholder.
# Assigning to `prompt` again replaces any template bound to that name by an
# earlier cell, so whichever prompt cell ran last is the one that gets used.
prompt = "You are a tweet analyst in order to monitor possible emergencies is posted online like fire, car or airplane accidents, earthquakes, tsunamis, homicides, bombing, war, storm damage....etc.  It’s not always clear whether a tweet´s words are actually referring to a disaster that happened or is happening. ANSWER ONLY WITH ONE INT VALUE: 1 (if the tweet speaks about a disaster) OR 0 (if not)!!!!!. DO NOT ANSWER WITH MORE THAN ONE INT VALUE!!!! TEXT: On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE. DISASTER: 0. TEXT: Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all. DISASTER:1. TEXT: I'm on top of the hill and I can see a fire in the woods... DISASTER: 1 TEXT: Jays rocking #MLB @JoeyBats19 just bombed one out of Rogers Centre. Play-offs r ahead for The #BlueJays - Bell Moseby and Barfield r back! DISASTER: 0 TEXT: {query} DISASTER: "


Use openai library and pass key for making calls to ChatGPT with a specific prompt.

In [None]:
!pip install openai cohere tiktoken
import openai
import csv
openai.api_key = "KEY"  # https://platform.openai.com/account/api-keys


fieldnames = ["id","text", "pred"]

while True:
  csv_filename = 'drive/MyDrive/chat_gpt_predictions.csv'
  predictions = []
  if os.path.exists(csv_filename):
      predictions = pandas.read_csv(csv_filename, index_col=0)
      predictions = predictions.index
  else:
      with open(csv_filename, 'w') as csvfile:
        fieldnames = ["id","text", "pred"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

  try:
    with open(csv_filename, 'a') as csvfile:
        fieldnames = ["id","text", "pred"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        with alive_bar(int(len(df_test)-len(predictions)), force_tty=True) as bar:
          for index, row in df_test.iterrows():
              bar()
              index = df_test.loc[index,"id"]
              if index in predictions:
                continue
              content = prompt.format(query=row["text"])

              messages = [{"role": "system",
                  "content": "You are an useful tweet analysist."}, {"role": "user", "content": content}]

              response = openai.ChatCompletion.create(
                  model="gpt-3.5-turbo", messages=messages, max_tokens=1)  #  Max tokens to 1 for just one token response

              response_content = response.choices[0].message.content
              writer.writerow({"id":index, "text":row["text"], "pred":response_content})
          break
  except Exception as exception:
    print(exception)


In [None]:
# Submission
# Build the submission file from the saved ChatGPT answers.
# The prediction cell writes to the Drive path; the local file name is still
# preferred when it exists so that a manually copied file keeps working.
chat_gpt_preds_path = "chat_gpt_predictions.csv"
if not os.path.exists(chat_gpt_preds_path):
    chat_gpt_preds_path = 'drive/MyDrive/chat_gpt_predictions.csv'
chat_gpt_preds = pandas.read_csv(chat_gpt_preds_path, index_col=0)
df_submission = pandas.read_csv(submission_path, index_col=0)
# The model is limited to one output token but is not guaranteed to answer
# with a digit; anything that is not numeric is treated as class 0 instead of
# aborting the whole submission.
parsed_preds = pandas.to_numeric(chat_gpt_preds["pred"], errors="coerce").fillna(0).astype(int)
df_submission.loc[chat_gpt_preds.index, "target"] = parsed_preds
df_submission.to_csv("submission.csv")

In [None]:
# TODO: Test with google bard or another LLM

# Data Preprocessing




It's important to shuffle the data so that the training algorithm does not see consecutive similar samples:

In [None]:
# Randomly reorder the training rows; the fixed random_state makes the
# permutation repeatable between runs.
df_train = shuffle(df_train, random_state=RANDOM_SEED)

### Data augmentation

To add more variety to the data, reduce overfitting and improve generalization, it is possible to apply data augmentation. Because we are handling text, the NLPAug library is used. With this library, multiple operations can be applied to text: translating it to another language and back to the original one, replacing words with synonyms or antonyms, adding lexical or grammatical errors, etc.

In [None]:
!pip install nlpaug
import nlpaug.flow as naf
import nlpaug.augmenter.word as naw

flow = naf.Sequential([
    naw.BackTranslationAug(device="cuda"),
    naw.SynonymAug(aug_p=0.3)
])

print(f"Number of samples for training before aug: {len(df_train)}")
percent_samples_aug = 0.2
print(int(len(df_train) * percent_samples_aug))
df_aug = df_train.iloc[0:int(len(df_train) * percent_samples_aug), :].copy()
for text_idx in tqdm(range(len(df_aug["text"]))):
  text = df_aug.iloc[text_idx]["text"]
  aug_text = flow.augment(text)
  df_aug.iloc[text_idx]["text"] = aug_text

print(f"Number of samples for training after aug: {len(df_train)}")

df_aug.to_csv("train_aug.csv", index=False)


In [None]:
# Reload the augmented rows from the CSV written by the augmentation cell,
# so the (slow) augmentation does not have to be repeated in every session.
df_aug = pandas.read_csv("train_aug.csv")

In [None]:
# Append the augmented rows to the training frame; ignore_index=True gives the
# result a fresh 0..n-1 index. Then shuffle again so original and augmented
# rows are interleaved.
df_train = pandas.concat([df_train, df_aug], ignore_index=True)
df_train = shuffle(df_train, random_state=RANDOM_SEED)

### Self-made preprocessing

**IN PROGRESS**

In [None]:
## Data preprocessing
def df_text_preprocessing(df):
  """Add a "words" column holding the unique words of each text.

  Every entry of ``df["text"]`` is lower-cased, stripped of the punctuation
  characters ``@ # ' . , ! ? -`` and split on whitespace. Duplicate words are
  dropped while the order of first appearance is kept.

  Args:
    df: mapping-like object (e.g. a DataFrame) with a "text" entry that
      yields strings.

  Returns:
    The same ``df`` object, modified in place with the new "words" entry.
  """
  sentences = [re.sub("[@#'.,!?-]", '', text.lower()) for text in df["text"]]
  print(sentences[0:5])
  # Split into words first: joining the characters of the sentence (as
  # `" ".join(sentence)` does on a string) would yield unique characters.
  words = [list(dict.fromkeys(sentence.split())) for sentence in sentences]
  df["words"] = words
  return df

In [None]:
# Apply the preprocessing and print one example next to its source text.
# Both lookups use the same key (2) on their respective columns, so the two
# printed values belong to the same row.
df_train = df_text_preprocessing(df_train)
print(df_train["words"][2])
print(df_train["text"][2])

### Preprocessing BERT with Hugging Face



One option for doing NLP classification (or any general AI task) is to fine-tune a large, generic model on the specific task. BERT is one of these large models; it was introduced in the paper "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding". On one hand, it is possible to encode the texts with BERT and train classical machine learning algorithms on these encodings. On the other hand, the whole Transformer can be fine-tuned on the task-specific data.

In [None]:
from transformers import BertModel, BertTokenizer

In [None]:
def df_tokenize_bert(texts, tokenizer, max_length=None):
  """Tokenize texts into fixed-length id and attention-mask arrays.

  Args:
    texts: iterable of texts; every item is converted with ``str()``.
    tokenizer: object exposing ``encode_plus`` whose result has ``input_ids``
      and ``attention_mask`` attributes.
    max_length: number of tokens per row. When ``None`` the length (in
      characters) of the longest text is used.

  Returns:
    Tuple ``(tokens, attention_mask, max_length)``: an int32 array of token
    ids, a boolean array of the same shape and the length actually used.
  """
  texts = [str(text) for text in texts]
  if max_length is None:
    max_length_info = max([len(text) for text in texts])
    print(f"Max length info of: {max_length_info}")
    max_length = max_length_info
  tokens = []
  attention_mask = []
  # TODO: use batch_encode_plus for faster extraction
  for text in texts:
    # truncation=True keeps every row at exactly max_length tokens; without it
    # a longer text would produce a longer row and the rows could no longer be
    # stacked into a rectangular array.
    encoding = tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_length, padding='max_length', truncation=True)
    tokens.append(encoding.input_ids)
    attention_mask.append(encoding.attention_mask)

  return np.int32(tokens), np.array(attention_mask, dtype=bool), max_length

In [None]:
def df_predict_encodings(tokens, attention_mask, targets, model, batch_size=8, output_layer="pooler_output"):
  """Run ``model`` over all token rows and collect one encoding per row.

  Args:
    tokens: array of token ids, one row per text.
    attention_mask: array aligned with ``tokens``.
    targets: labels aligned with ``tokens``; they are only carried along in
      the dataset and are not used for the encodings.
    model: module called as ``model(token_batch, mask_batch)`` whose output
      exposes the attribute named by ``output_layer``.
    batch_size: number of rows per forward pass.
    output_layer: name of the output attribute to collect.

  Returns:
    float32 array with one encoding per input row, in input order.
  """
  tokens_tensor = data_utils.TensorDataset(torch.tensor(tokens), torch.ByteTensor(attention_mask),torch.IntTensor(targets))
  predict_loader = data_utils.DataLoader(dataset = tokens_tensor, batch_size = batch_size, shuffle = False)  # For preprocessing
  encodings = []
  model = model.to(device)
  # Feature extraction must be deterministic: make sure dropout is disabled.
  model.eval()
  # Ceiling division: the loader also yields the final, partially filled batch.
  num_batches = (len(tokens) + batch_size - 1) // batch_size
  with alive_bar(num_batches) as bar:
    with torch.no_grad():
      for batch_tokens, batch_mask, _ in predict_loader:
        batch_tokens = batch_tokens.to(device)
        batch_mask = batch_mask.to(device)
        outputs = model(batch_tokens, batch_mask)
        batch_encodings = getattr(outputs, output_layer).cpu() # Get encodings
        encodings.extend(np.float32(batch_encodings))
        bar()

  return np.float32(encodings)

In [None]:
def load_bert_encodings(df, key="text", max_length=None, model_config="bert-base-uncased", batch_size=256, output_layer="pooler_output"):
  """Encode one column of ``df`` with a pretrained BERT model.

  Loads the tokenizer and model named by ``model_config``, tokenizes
  ``df[key]`` and returns the encodings taken from ``output_layer`` together
  with the token length that was used. ``df["target"]`` is passed along to
  the encoding helper.
  """
  bert_tokenizer = BertTokenizer.from_pretrained(model_config)
  bert_model = BertModel.from_pretrained(model_config)

  token_ids, mask, max_length = df_tokenize_bert(
      df[key], bert_tokenizer, max_length=max_length
  )
  print(f"Length tokens in use {len(token_ids[0])}")

  encodings = df_predict_encodings(
      token_ids, mask, df["target"], bert_model, batch_size, output_layer
  )
  print(f"Number of encondings: {len(encodings)}")
  print(f"Encondings shape: {encodings.shape}")

  return encodings, max_length

Load the training encodings!

In [None]:
# Bert encodings
# Encode the tweet text of every training row (fixed length of 300 tokens)
# and keep the length that was used so the test set can reuse it.
X_train, max_length = load_bert_encodings(df_train, key="text", max_length=300, model_config=MODEL_CONFIG, batch_size=256, output_layer="pooler_output")
# Labels, row-aligned with X_train.
Y_train = df_train["target"]

Another useful feature from the training data is the keyword. This keyword can also be encoded and used.

In [None]:
# Encode the "keyword" column with the same settings as the tweet text.
# NOTE(review): the tokenizing helper converts every value with str(), so a
# missing keyword would be encoded as its string form - confirm this is wanted.
X_train_keyword, max_length = load_bert_encodings(df_train, key="keyword", max_length=300, model_config=MODEL_CONFIG, batch_size=256, output_layer="pooler_output")

In [None]:
# Encode the test tweets (default key="text") with the token length used for
# the training data; the returned length is not needed again.
X_test, _ = load_bert_encodings(df_test, max_length=max_length, model_config=MODEL_CONFIG, batch_size=32, output_layer="pooler_output")


In [None]:
# Encode the test "keyword" column with the same token length.
X_test_keyword, _ = load_bert_encodings(df_test, key="keyword",max_length=max_length, model_config=MODEL_CONFIG, batch_size=32, output_layer="pooler_output")

In [None]:
# Work on copies so the encodings computed above stay untouched and the
# following cells can be re-run without recomputing them.
X_train_features = X_train.copy()
X_train_keyword_features = X_train_keyword.copy()
X_test_features = X_test.copy()
X_test_keyword_features = X_test_keyword.copy()
Y_train_features = Y_train.copy()

#### Join text and keyword features

Join these features into one encoding!

In [None]:
# Concatenate text and keyword encodings along the last axis, so every sample
# gets one feature vector containing both.
# Re-running this cell alone appends the keyword features a second time.
print(f"X_train shape before: {X_train_features.shape}")
X_train_features = np.concatenate((X_train_features, X_train_keyword_features), axis=-1)
X_test_features = np.concatenate((X_test_features, X_test_keyword_features), axis=-1)
print(f"X_train shape after: {X_train_features.shape}")

## Train and val split

In every machine learning problem, the data is split into a training set and a test set. In addition, a separate validation set can be held out and evaluated before the test set. For example, in deep learning the validation set can be used for early stopping (training stops when the model no longer improves on the validation set).

In [None]:
# `train_test_split` is only referenced by the commented-out alternative below.
from sklearn.model_selection import train_test_split

# Positional hold-out: the first NUM_SAMPLES_VALIDATION samples form the
# validation set, all remaining samples are used for training.
NUM_SAMPLES_VALIDATION = 500
X_val_split = X_train_features[0:NUM_SAMPLES_VALIDATION]
Y_val_split = Y_train_features[0:NUM_SAMPLES_VALIDATION]
X_train_split = X_train_features[NUM_SAMPLES_VALIDATION:]
Y_train_split = Y_train_features[NUM_SAMPLES_VALIDATION:]
# X_train_split, X_val_split, Y_train_split, Y_val_split = train_test_split(X_train, Y_train, test_size=0.10, random_state=RANDOM_SEED)

# Finetuning

## SMOTE, RandomUnderSampler...etc

The SMOTE technique synthetically augments the under-represented class of the training set. The synthetic samples are created by interpolating between existing real samples of that class.

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Class re-balancing of the training split only (the validation split keeps
# its original distribution). Supported strategies:
#   "over"  - oversample with SMOTE
#   "under" - randomly undersample the majority class
#   "both"  - SMOTE up to a 0.85 ratio, then undersample the majority class
sampling_strategy = "over"

if sampling_strategy == "over":
  pipeline = SMOTE(random_state=RANDOM_SEED)
elif sampling_strategy == "under":
  pipeline = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED)
elif sampling_strategy == "both":
  over = SMOTE(sampling_strategy=0.85, random_state=RANDOM_SEED)
  under = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED)

  steps = [('o', over), ('u', under)]
  pipeline = Pipeline(steps=steps)
else:
  # Fail with a clear message instead of silently reusing a `pipeline` left
  # over from a previous run (or hitting a NameError).
  raise ValueError(f"Unknown sampling_strategy: {sampling_strategy!r} (expected 'over', 'under' or 'both')")

X_train_split, Y_train_split = pipeline.fit_resample(X_train_split, Y_train_split)

## PCA

Principal Component Analysis (PCA) can be useful to reduce the high dimensionality of the features: the data is projected onto a smaller number of uncorrelated components that retain most of its variance.

In [None]:
from sklearn.decomposition import PCA
# Fit the 64-component projection on the training split only, then apply the
# same fitted projection to the validation and test features.
# The inputs are overwritten in place, so re-running this cell alone would
# feed already-reduced features back into PCA.
pca = PCA(n_components=64)
pca.fit(X_train_split)
X_train_split = pca.transform(X_train_split)
X_val_split = pca.transform(X_val_split)
X_test_features = pca.transform(X_test_features)

## SVM

Support Vector Machines (SVM) is one of the most used machine learning classification algorithms.

In [None]:
from sklearn import svm

# Support-vector classifier with scikit-learn's default settings.
clf = svm.SVC()
clf.fit(X_train_split, Y_train_split)

# Y_pred holds validation predictions, Y_test_pred test predictions. Every
# classifier cell writes to these same two names, so the last one run wins.
# NOTE(review): the submission cell reads `Y_pred_test`, not `Y_test_pred`.
Y_pred = clf.predict(X_val_split)
Y_test_pred = clf.predict(X_test_features)

## KNN


K-Nearest Neighbors (KNN) is a classification algorithm which checks the K most nearest neighbors of a specific data point in order to find the most probable class it represents. 

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbours classifier fitted on the training split.
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(X_train_split, Y_train_split)

# Overwrites Y_pred (validation) and Y_test_pred (test) from earlier cells.
Y_pred = knn_classifier.predict(X_val_split)
Y_test_pred = knn_classifier.predict(X_test_features)

## Random forests

A random forest builds many decision trees on random subsets of the data and features, and combines their votes to classify each sample.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings. No random_state is passed here, unlike
# the other seeded steps of the notebook.
clf = RandomForestClassifier()
clf.fit(X_train_split, Y_train_split)

# Overwrites Y_pred (validation) and Y_test_pred (test) from earlier cells.
Y_pred = clf.predict(X_val_split)
Y_test_pred = clf.predict(X_test_features)

# Networks

## MLPClassifier



The Multi-Layer Perceptron (MLP) learns the best weights in order to fit the data with a loss function.

In [None]:
from sklearn.neural_network import MLPClassifier

# Seeded multi-layer perceptron with logistic activations, mini-batches of 128
# and scikit-learn's built-in early stopping (patience of 200 iterations, at
# most 1000 iterations).
clf = MLPClassifier(random_state=RANDOM_SEED, max_iter=1000, activation="logistic",learning_rate="adaptive", batch_size=128, early_stopping=True, verbose=True, n_iter_no_change=200)
clf.fit(X_train_split, Y_train_split)
# Overwrites Y_pred (validation) and Y_test_pred (test) from earlier cells.
Y_pred = clf.predict(X_val_split)
Y_test_pred = clf.predict(X_test_features)

## BERT Finetuning

### Custom Dataset

In [None]:
# Hyper-parameters of the BERT fine-tuning run.
MAX_LEN = 200            # token length passed to CustomDataset as max_len
TRAIN_BATCH_SIZE = 8     # batch size of the training DataLoader
VALID_BATCH_SIZE = 4     # batch size of the validation and test DataLoaders
EPOCHS = 30              # upper bound on epochs passed to train()
LEARNING_RATE = 1e-05    # learning rate of the Adam optimizer

In [None]:
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.utils.class_weight import compute_class_weight
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one tweet per item for BERT fine-tuning.

    Args:
        dataframe: frame with a ``text`` and a ``target`` column.
        tokenizer: tokenizer exposing ``encode_plus``.
        max_len: exact number of tokens of every returned item (shorter texts
            are padded, longer ones truncated).
        class_weights: stored on the instance; see the note in ``__init__``.

    Each item is a dict with the tensors ``ids``, ``mask``,
    ``token_type_ids`` (long), ``targets`` (float, shape ``(1,)``) and
    ``weight`` (float scalar, the "balanced" class weight of the label).
    """

    def __init__(self, dataframe, tokenizer, max_len, class_weights=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.idxs = dataframe.index
        self.targets = self.data.target
        self.max_len = max_len
        # NOTE(review): this flag is stored but never consulted; the balanced
        # weights below are always computed and returned with every item.
        self.class_weights = class_weights
        self.weights_per_class = compute_class_weight(class_weight="balanced",y=self.targets, classes=np.unique(self.targets))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Translate the position into the frame's index label, which is what
        # the text/target lookups below are keyed by.
        idx = self.idxs[index]
        text = str(self.text[idx])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        # padding="max_length" replaces the deprecated pad_to_max_length flag;
        # truncation=True guarantees that no item is longer than max_len, so
        # the default collate function can always stack a batch.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        # The label value is used directly as position into the weight array.
        weight = torch.tensor(self.weights_per_class[self.targets[idx]], dtype=torch.float)
        targets = torch.tensor([self.targets[idx]], dtype=torch.float)
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': targets,
            'weight': weight
        }

In [None]:
from transformers import BertModel, BertTokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_CONFIG)
# Only the columns the dataset reads are kept.
new_df = df_train[['text', 'target']].copy()
NUM_SAMPLES_VALIDATION = 500

# Positional split: the first NUM_SAMPLES_VALIDATION rows are held out for
# validation, the remaining rows are used for training.
training_df = new_df[NUM_SAMPLES_VALIDATION:]
validation_df = new_df[0: NUM_SAMPLES_VALIDATION]

# At this point the *_loader names hold CustomDataset objects; they are
# rebound to DataLoader objects at the end of the cell.
training_loader = CustomDataset(training_df, tokenizer, max_len=MAX_LEN)
validation_loader = CustomDataset(validation_df, tokenizer, max_len=MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
# Shuffling is enabled for the validation loader as well.
valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_loader, **train_params)
validation_loader = DataLoader(validation_loader, **valid_params)


### Model definition

In [None]:
class BERTClass(torch.nn.Module):
    """Binary classifier head on top of a BERT-style encoder.

    Args:
        model: encoder module. Called with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``return_dict=False`` it must return a
            tuple whose second element is the pooled output.

    The pooled output goes through dropout and two linear layers; the result
    is squashed with a sigmoid, so ``forward`` returns probabilities of shape
    ``(batch, 1)``.
    """

    def __init__(self, model):
        super(BERTClass, self).__init__()
        self.l1 = model
        self.l2 = torch.nn.Dropout(0.3)
        # Take the encoder width from its config when available so encoders of
        # another size work too; 768 (the previous fixed value) is the fallback.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 768)
        self.l3 = torch.nn.Linear(hidden_size,64)
        self.l4 = torch.nn.Linear(64,1)

    def forward(self, ids, mask, token_type_ids):
        # The first element of the encoder output is not used.
        _, output_1= self.l1(input_ids=ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output_3 = self.l3(output_2)
        output = self.l4(output_3)
        activation = torch.sigmoid(output)
        return activation

# Wrap the pretrained encoder named by MODEL_CONFIG in the classifier and move
# all of its parameters to the selected device.
model = BERTClass(model=BertModel.from_pretrained(MODEL_CONFIG))
model.to(device)

### Training Loop

In [None]:
def loss_fn(outputs, targets,weights=None):
    """Mean binary cross-entropy with optional per-sample weights.

    Args:
        outputs: predicted probabilities.
        targets: ground-truth labels, same shape as ``outputs``.
        weights: optional tensor with one weight per sample. ``None`` gives
            the plain (unweighted) mean.

    Returns:
        Scalar tensor with the (weighted) mean loss.
    """
    losses = torch.nn.BCELoss(reduction='none')(outputs, targets)
    if weights is None:
        return torch.mean(losses)
    # Align the weights with the per-sample losses before multiplying. A
    # (batch,) weight vector times (batch, 1) losses would otherwise broadcast
    # to a (batch, batch) matrix and pair every weight with every loss.
    if weights.shape != losses.shape and weights.numel() == losses.numel():
        weights = weights.reshape(losses.shape)
    return torch.mean(weights*losses)
# Adam over every parameter of `model` (encoder and classifier layers alike).
# This object is read as a global by the training helpers.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)


In [None]:
def forward(data, model):
    """Compute the weighted loss of ``model`` on one batch.

    ``data`` is a batch dict with the keys 'ids', 'mask', 'token_type_ids',
    'targets' and 'weight'. All tensors are moved to the global ``device``.
    As a side effect the gradients held by the global ``optimizer`` are reset.
    """
    # Integer model inputs, in the order the model expects them.
    model_inputs = [
        data[name].to(device, dtype = torch.long)
        for name in ('ids', 'mask', 'token_type_ids')
    ]
    targets = data['targets'].to(device, dtype = torch.float)
    weights = data['weight'].to(device, dtype = torch.float)

    outputs = model(*model_inputs)
    optimizer.zero_grad()
    return loss_fn(outputs, targets, weights)

def train(epochs, model, training_loader, validation_loader, early_stopping=False, n_iter_no_change=5):
    """Fine-tune ``model`` and return it with the best weights loaded.

    Args:
        epochs: maximum number of passes over ``training_loader``.
        model: module to train (updated in place).
        training_loader: iterable of batches accepted by ``forward``.
        validation_loader: iterable of batches used for early stopping.
        early_stopping: when True, the validation loss is computed after every
            epoch, the best weights are saved to the file "bert_finetuned"
            and training stops after ``n_iter_no_change`` epochs without
            improvement.
        n_iter_no_change: early-stopping patience in epochs.

    Returns:
        ``model``. With early stopping it carries the weights of the epoch
        with the lowest validation loss, otherwise the final weights.
    """
    min_val_loss = float("inf")
    # Snapshot of the best weights. Keeping a reference to `model` itself would
    # not work: it is the same object that keeps being trained.
    best_state = None
    n_iter_no_change_count = 0
    for epoch in range(epochs):
      model.train()
      total_loss = 0
      num_iterations = 0
      for data in training_loader:
          optimizer.zero_grad()
          loss = forward(data, model)
          loss.backward()
          optimizer.step()
          total_loss += loss.item()
          num_iterations += 1
          print(f'Epoch: {epoch}, Loss:  {total_loss/num_iterations}')
      if early_stopping:
        total_val_loss = 0
        num_val_iterations = 0
        print("Validating...")
        # Validation: dropout off, no gradient bookkeeping.
        model.eval()
        with torch.no_grad():
          for data in validation_loader:
              total_val_loss += forward(data, model).item()
              num_val_iterations += 1
        # max(..., 1) guards against an empty loader.
        val_loss = total_val_loss / max(num_val_iterations, 1)
        print(f"Val_loss: {val_loss}, MIN_VAL_LOSS: {min_val_loss}")
        if val_loss < min_val_loss:
            min_val_loss = val_loss
            n_iter_no_change_count = 0
            print(f"Validation loss has improved to {val_loss}!")
            torch.save(model.state_dict(), "bert_finetuned")
            # Independent CPU copy, so later updates cannot alter the snapshot.
            best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
        else:
          n_iter_no_change_count += 1
        print(f'Epoch: {epoch}, Loss:  {total_loss/max(num_iterations, 1)}, Validation loss: {val_loss}')
        if n_iter_no_change_count >= n_iter_no_change:
            break
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

In [None]:
# Fine-tune for at most EPOCHS epochs with early stopping (patience of 5
# epochs) and rebind `model` to whatever train() returns.
model = train(EPOCHS, model, training_loader, validation_loader, early_stopping=True, n_iter_no_change=5)

In [None]:
# Dataset over the test frame; it reads the "text" and "target" columns of
# df_test. The name is rebound to a DataLoader below.
testing_loader = CustomDataset(df_test, tokenizer, max_len=MAX_LEN)
# shuffle=False keeps the predictions in the row order of df_test.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }
testing_loader = DataLoader(testing_loader, **test_params)

In [None]:
def predict(model, testing_loader):
    """Return the predicted class (0 or 1) for every sample of the loader.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)`` that
            returns probabilities of shape ``(batch, 1)``.
        testing_loader: iterable of batch dicts with the keys 'ids', 'mask'
            and 'token_type_ids'.

    Returns:
        List of ints in loader order; 1 where the probability exceeds 0.5.
    """
    # Inference mode: without this, dropout stays active whenever the model
    # was last put in training mode and the predictions become random.
    model.eval()
    fin_outputs=[]
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            fin_outputs.extend((outputs.cpu().detach().numpy() > 0.5)[:,0].astype(int).tolist() )
    return fin_outputs

In [None]:
# Test-set predictions of the fine-tuned model; this is the variable the
# submission cell writes into the "target" column.
Y_pred_test = predict(model, testing_loader)

# Testing

In [None]:
from sklearn.metrics import precision_recall_fscore_support
def eval_test(y_true, y_pred):
  """Print and return precision, recall and F1 for binary predictions.

  The scores come from ``precision_recall_fscore_support`` with
  ``average="binary"``; they are printed as percentages rounded to two
  decimals and returned unrounded as ``(precision, recall, fscore)``.
  """
  scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
  # The fourth value (support) is not needed.
  precission, recall, fscore = scores[0], scores[1], scores[2]

  print(f"Population {len(y_true)}")
  print(f"F1-score\t| Precission\t| Recall")
  print(f"{(fscore*100).round(2)}%\t\t| {(precission*100).round(2)}%\t| {(recall*100).round(2)}%")

  return precission, recall, fscore

In [None]:
# Score the validation predictions of whichever classifier cell last assigned
# Y_pred against the held-out validation labels.
precission, recall, fscore = eval_test(Y_val_split.values, Y_pred)

## Save model

In [None]:
import pickle
# Persist the fitted PCA projection. The context manager makes sure the file
# is flushed and closed even if pickling fails.
with open("pca.sav", 'wb') as pca_file:
    pickle.dump(pca, pca_file)
# Reload clf
# loaded_model = pickle.load(open("model.sav", 'rb'))

## Submit



In [None]:
# Fill the sample submission with the predictions stored in Y_pred_test and
# write it without the pandas index column.
# NOTE(review): the classical-model cells store their test predictions in
# `Y_test_pred`; assign that to `Y_pred_test` first to submit one of them.
df_submission = pandas.read_csv(submission_path)
df_submission.loc[:,"target"] = Y_pred_test
df_submission.to_csv("submission.csv", index=False)