# Importing the Libraries

In [10]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # to hide irrelevant environment warnings
import warnings
warnings.filterwarnings('ignore') # to hide irrelevant python warnings

from transformers import logging as hf_logging 
hf_logging.set_verbosity_error() # to hide irrelevant transformers warnings

import pandas as pd # for data loading and data analysis
import numpy as np # for matrix and linear algebra operations
import re # regular expression
import matplotlib.pyplot as plt # visualization
import seaborn as sns # advanced visualization
from sklearn.model_selection import train_test_split # for data splitting
import tensorflow as tf # importing tensorflow library
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification # for implementing ROBERTa
from transformers import AlbertTokenizerFast, TFAlbertForSequenceClassification # for implementing AlBert
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification # for implementing DistilBert
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report # evaluation metrics
from sklearn.metrics import precision_score, recall_score, f1_score # evaluation metrics
import random
import matplotlib
matplotlib.style.use('ggplot')

ModuleNotFoundError: No module named 'tensorflow'

# Loading the Datasets

In [None]:
# loading training set
train_df = pd.read_csv('train.csv')
train_df.head() # showing first 5 rows

In [None]:
train_df.info() # overview of dataset

In [None]:
# loading test set
test_df = pd.read_csv('test.csv')
test_df.head() # showing first 5 rows

In [None]:
test_df.info() # overview of dataset

# Text Preprocessing

Label Arrangement:

0 -> not a disaster

1 -> disaster

In [None]:
# Bar chart of how many training tweets carry each target value
label_counts = train_df['target'].value_counts()
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(label_counts.index, label_counts, color='maroon', width=0.1)
ax.set_title('Label Proportion In Training Set')
ax.set_xlabel("Label")
ax.set_ylabel("Count")
ax.grid(True)
# name the two classes on the x-axis: 0 -> not a disaster, 1 -> disaster
ax.set_xticks([0, 1])
ax.set_xticklabels(['not a disaster', 'disaster'])
plt.show()

In [None]:
label_counts

In [None]:
# Shuffle the training rows, then extract tweet texts and their labels.
# The fixed seed makes the shuffle reproducible, so the positional 80/20
# train/validation split taken from `tweets`/`y` is the same on every run
# and reported metrics can be compared between runs.
train_df = train_df.sample(frac=1, random_state=42)
tweets = train_df['text'].tolist()  # tweet texts, in shuffled order
y = train_df['target'].tolist()  # labels, aligned index-for-index with `tweets`

In [None]:
# Labels as a float32 NumPy array
y = np.asarray(y).astype('float32')

In [None]:
def preprocess_tweets(tweets):
    """Strip t.co links and rewrite a few HTML entities in each tweet.

    The following substitutions are applied to every tweet, in this order:

    1. ``http://t.co/<id>`` and ``https://t.co/<id>`` short links are removed.
    2. ``&lt;`` is replaced by ``lt`` and ``&gt;`` by ``gt``.
    3. ``&amp;`` is replaced by ``&``.

    Parameters
    ----------

    tweets : list
          list containing sequences of text in the form of human tweets

    Returns
    -------

    new_tweets : list
              list containing preprocessed sequences of text
    """
    # The dot is escaped so only the literal host "t.co" matches; an unescaped
    # '.' would also strip look-alikes such as "https://tXco/abc".
    link_pattern = re.compile(r'https?://t\.co/\w+')

    def _clean(text):
        text = link_pattern.sub('', text)
        # NOTE(review): &lt;/&gt; become the words "lt"/"gt", not '<'/'>'.
        # Kept as-is to preserve existing behaviour -- confirm this is intended.
        text = text.replace('&lt;', 'lt').replace('&gt;', 'gt')
        # &amp; is handled last so a doubly escaped "&amp;lt;" is unescaped
        # exactly once (to "&lt;") rather than being rewritten further.
        return text.replace('&amp;', '&')

    return [_clean(t) for t in tweets]

tweets = preprocess_tweets(tweets) # applying preprocessing to tweets

# ROBERTa

In [None]:
model_name = 'roberta-base' # official name of roberta model
roberta_tokenizer = RobertaTokenizerFast.from_pretrained(model_name) # applying tokenization to roberta
roberta_seq = TFRobertaForSequenceClassification.from_pretrained(model_name) # implementing roberta on sequence classification

In [None]:
roberta_seq.summary() # summary of the initialized model

In [None]:
# Longest tokenized tweet across train + test; used as the padding length when tokenizing
combined_text = pd.concat([train_df, test_df], axis=0)['text']
all_tweets = preprocess_tweets(list(combined_text))
max_len = max(len(ids) for ids in roberta_tokenizer(all_tweets)['input_ids'])

In [None]:
# 80/20 positional split: the first 80% of tweets train the model, the rest validate it
length = len(tweets)
split_at = int(length * 0.8)
X_train, X_val = tweets[:split_at], tweets[split_at:]
y_train, y_val = y[:split_at], y[split_at:]
real_tweets = list(X_val)  # keep the raw validation text before X_val is replaced by tokens
# every sequence is padded to max_len and returned as TensorFlow tensors
pad_kwargs = dict(padding='max_length', max_length=max_len, return_tensors='tf')
X_train = roberta_tokenizer(X_train, **pad_kwargs)
X_val = roberta_tokenizer(X_val, **pad_kwargs)

In [None]:
batch_size = 8  # number of examples per batch
# (features dict, label) pairs, grouped into batches
train_dataset = tf.data.Dataset.from_tensor_slices((dict(X_train), y_train)).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(X_val), y_val)).batch(batch_size)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-6) # applying Adam optimizer to neural network
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) # applying loss function
roberta_seq.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) # compiling roberta model with loss, optimizer and metric

In [None]:
checkpoint = './roberta_checkpoint' # directory name where model weights will be stored
callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint, 
                                                         monitor='val_accuracy',
                                                         mode='max',
                                                         save_weights_only=True, 
                                                         save_best_only=True)

In [None]:
# training the model
roberta_history = roberta_seq.fit(train_dataset, epochs=3, validation_data=val_dataset, callbacks=[callback_checkpoint])

In [None]:
roberta_seq.load_weights(checkpoint) # loading weights from file

In [None]:
# Predict on the validation set; the class is the arg-max over outputs[0]
# (presumably the per-class logits -- the loss is built with from_logits=True)
outputs = roberta_seq.predict(val_dataset, verbose=False)
roberta_y_pred = np.argmax(outputs[0], axis=1)
roberta_predictions = ['Yes' if val != 0 else 'No' for val in roberta_y_pred.tolist()]

In [None]:
roberta_accuracy = round(accuracy_score(y_val, roberta_y_pred), 2)
print('Accuracy: {}'.format(roberta_accuracy))

In [None]:
print('Classification report:')
print(classification_report(y_val, roberta_y_pred, labels=[0, 1], target_names=['not a disaster','disaster']))

In [None]:
# precision, recall and F1 on the validation set, each rounded to 2 decimals
roberta_precision, roberta_recall, roberta_f1 = (
    round(metric(y_val, roberta_y_pred), 2)
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Confusion-matrix heatmap for the validation set (columns = true label, rows = predicted).
# The class list is fixed up front and used for both the matrix and the tick labels, so the
# plot stays a correctly labelled 2x2 grid even if the model never predicts one of the classes
# (deriving tick labels from np.unique(predictions) would then yield too few labels).
class_ids = [0, 1]  # 0 -> not a disaster, 1 -> disaster
plt.rcParams.update({'font.size': 15})
plt.figure(figsize=(16, 8))
cm = confusion_matrix(y_val, roberta_y_pred, labels=class_ids)
# cm is indexed [true, predicted]; transpose so predictions run down the y-axis
sns.heatmap(cm.T, square=True, annot=True, fmt='d', cbar=False, xticklabels=class_ids, yticklabels=class_ids)
plt.title('\nConfusion Matrix')
plt.xlabel('True')
plt.ylabel('Predicted')
plt.show()

In [None]:
# Preprocess and tokenize the test tweets (padded to max_len), then batch them
tweets_test = preprocess_tweets(list(test_df['text']))
X_test = roberta_tokenizer(tweets_test, padding='max_length', max_length=max_len, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices(dict(X_test)).batch(batch_size)

In [None]:
# Predicted class for every test tweet: arg-max over the first model output
outputs_test = roberta_seq.predict(test_dataset, verbose=False)
y_pred_test = np.argmax(outputs_test[0], axis=1)

In [None]:
# Write the (id, predicted target) pairs to a CSV file
results = pd.DataFrame({'id': test_df['id'], 'target': y_pred_test})
results.to_csv('./roberta_submission.csv', index=False)

# AlBert

In [None]:
model_name = 'albert-base-v2'
albert_tokenizer = AlbertTokenizerFast.from_pretrained(model_name)
albert_seq = TFAlbertForSequenceClassification.from_pretrained(model_name)

In [None]:
albert_seq.summary()

In [None]:
# Longest tokenized tweet across train + test; used as the padding length when tokenizing
combined_text = pd.concat([train_df, test_df], axis=0)['text']
all_tweets = preprocess_tweets(list(combined_text))
max_len = max(len(ids) for ids in albert_tokenizer(all_tweets)['input_ids'])

In [None]:
# 80/20 positional split: the first 80% of tweets train the model, the rest validate it
length = len(tweets)
split_at = int(length * 0.8)
X_train, X_val = tweets[:split_at], tweets[split_at:]
y_train, y_val = y[:split_at], y[split_at:]
# every sequence is padded to max_len and returned as TensorFlow tensors
pad_kwargs = dict(padding='max_length', max_length=max_len, return_tensors='tf')
X_train = albert_tokenizer(X_train, **pad_kwargs)
X_val = albert_tokenizer(X_val, **pad_kwargs)

In [None]:
batch_size = 8  # number of examples per batch
# (features dict, label) pairs, grouped into batches
train_dataset = tf.data.Dataset.from_tensor_slices((dict(X_train), y_train)).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(X_val), y_val)).batch(batch_size)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
albert_seq.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
checkpoint = './albert_checkpoint'
callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint,
                                              monitor='val_accuracy',
                                              mode='max',
                                              save_weights_only=True,
                                              save_best_only=True)

In [None]:
# training the model
albert_history = albert_seq.fit(train_dataset, validation_data=val_dataset, epochs=3, callbacks=[callback_checkpoint])

In [None]:
albert_seq.load_weights(checkpoint)

In [None]:
# Predict on the validation set; the class is the arg-max over outputs[0]
# (presumably the per-class logits -- the loss is built with from_logits=True)
outputs = albert_seq.predict(val_dataset, verbose=False)
albert_y_pred = np.argmax(outputs[0], axis=1)
albert_predictions = ['Yes' if val != 0 else 'No' for val in albert_y_pred.tolist()]

In [None]:
albert_accuracy = round(accuracy_score(y_val, albert_y_pred), 2)
print('Accuracy: {}'.format(albert_accuracy))

In [None]:
print('Classification report:')
print(classification_report(y_val, albert_y_pred, labels=[0, 1], target_names=['not a disaster','disaster']))

In [None]:
# precision, recall and F1 on the validation set, each rounded to 2 decimals
albert_precision, albert_recall, albert_f1 = (
    round(metric(y_val, albert_y_pred), 2)
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Confusion-matrix heatmap for the validation set (columns = true label, rows = predicted).
# The class list is fixed up front and used for both the matrix and the tick labels, so the
# plot stays a correctly labelled 2x2 grid even if the model never predicts one of the classes
# (deriving tick labels from np.unique(predictions) would then yield too few labels).
class_ids = [0, 1]  # 0 -> not a disaster, 1 -> disaster
plt.rcParams.update({'font.size': 15})
plt.figure(figsize=(16, 8))
cm = confusion_matrix(y_val, albert_y_pred, labels=class_ids)
# cm is indexed [true, predicted]; transpose so predictions run down the y-axis
sns.heatmap(cm.T, square=True, annot=True, fmt='d', cbar=False, xticklabels=class_ids, yticklabels=class_ids)
plt.title('\nConfusion Matrix')
plt.xlabel('True')
plt.ylabel('Predicted')
plt.show()

In [None]:
# Preprocess and tokenize the test tweets (padded to max_len), then batch them
tweets_test = preprocess_tweets(list(test_df['text']))
X_test = albert_tokenizer(tweets_test, padding='max_length', max_length=max_len, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices(dict(X_test)).batch(batch_size)

In [None]:
# Predicted class for every test tweet: arg-max over the first model output
outputs_test = albert_seq.predict(test_dataset, verbose=False)
y_pred_test = np.argmax(outputs_test[0], axis=1)

In [None]:
# Write the (id, predicted target) pairs to a CSV file
results = pd.DataFrame({'id': test_df['id'], 'target': y_pred_test})
results.to_csv('./albert_submission.csv', index=False)

# DistilBert

In [None]:
model_name = 'distilbert-base-uncased'
distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
distilbert_seq = TFDistilBertForSequenceClassification.from_pretrained(model_name)

In [None]:
distilbert_seq.summary()

In [None]:
# Longest tokenized tweet across train + test; used as the padding length when tokenizing
combined_text = pd.concat([train_df, test_df], axis=0)['text']
all_tweets = preprocess_tweets(list(combined_text))
max_len = max(len(ids) for ids in distilbert_tokenizer(all_tweets)['input_ids'])

In [None]:
# 80/20 positional split: the first 80% of tweets train the model, the rest validate it
length = len(tweets)
split_at = int(length * 0.8)
X_train, X_val = tweets[:split_at], tweets[split_at:]
y_train, y_val = y[:split_at], y[split_at:]
# every sequence is padded to max_len and returned as TensorFlow tensors
pad_kwargs = dict(padding='max_length', max_length=max_len, return_tensors='tf')
X_train = distilbert_tokenizer(X_train, **pad_kwargs)
X_val = distilbert_tokenizer(X_val, **pad_kwargs)

In [None]:
batch_size = 8  # number of examples per batch
# (features dict, label) pairs, grouped into batches
train_dataset = tf.data.Dataset.from_tensor_slices((dict(X_train), y_train)).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(X_val), y_val)).batch(batch_size)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
distilbert_seq.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
checkpoint = './distilbert_checkpoint'
callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint, 
                                                    monitor='val_accuracy',
                                                    mode='max',
                                                    save_weights_only=True,
                                                    save_best_only=True)

In [None]:
# training the model
distilbert_history = distilbert_seq.fit(train_dataset, validation_data=val_dataset, epochs=3, callbacks=[callback_checkpoint])

In [None]:
distilbert_seq.load_weights(checkpoint)

In [None]:
# Predict on the validation set; the class is the arg-max over outputs[0]
# (presumably the per-class logits -- the loss is built with from_logits=True)
outputs = distilbert_seq.predict(val_dataset, verbose=False)
distilbert_y_pred = np.argmax(outputs[0], axis=1)
distilbert_predictions = ['Yes' if val != 0 else 'No' for val in distilbert_y_pred.tolist()]

In [None]:
distilbert_accuracy = round(accuracy_score(y_val, distilbert_y_pred), 2)
print('Accuracy: {}'.format(distilbert_accuracy))

In [None]:
print('Classification report:')
print(classification_report(y_val, distilbert_y_pred, labels=[0, 1], target_names=['not a disaster','disaster']))

In [None]:
# precision, recall and F1 on the validation set, each rounded to 2 decimals
distilbert_precision, distilbert_recall, distilbert_f1 = (
    round(metric(y_val, distilbert_y_pred), 2)
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Confusion-matrix heatmap for the validation set (columns = true label, rows = predicted).
# The class list is fixed up front and used for both the matrix and the tick labels, so the
# plot stays a correctly labelled 2x2 grid even if the model never predicts one of the classes
# (deriving tick labels from np.unique(predictions) would then yield too few labels).
class_ids = [0, 1]  # 0 -> not a disaster, 1 -> disaster
plt.rcParams.update({'font.size': 15})
plt.figure(figsize=(16, 8))
cm = confusion_matrix(y_val, distilbert_y_pred, labels=class_ids)
# cm is indexed [true, predicted]; transpose so predictions run down the y-axis
sns.heatmap(cm.T, square=True, annot=True, fmt='d', cbar=False, xticklabels=class_ids, yticklabels=class_ids)
plt.title('\nConfusion Matrix')
plt.xlabel('True')
plt.ylabel('Predicted')
plt.show()

In [None]:
# Preprocess and tokenize the test tweets (padded to max_len), then batch them
tweets_test = preprocess_tweets(list(test_df['text']))
X_test = distilbert_tokenizer(tweets_test, padding='max_length', max_length=max_len, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices(dict(X_test)).batch(batch_size)

In [None]:
# Predicted class for every test tweet: arg-max over the first model output
outputs_test = distilbert_seq.predict(test_dataset, verbose=False)
y_pred_test = np.argmax(outputs_test[0], axis=1)

In [None]:
# Write the (id, predicted target) pairs to a CSV file
results = pd.DataFrame({'id': test_df['id'], 'target': y_pred_test})
results.to_csv('./distilbert_submission.csv', index=False)

In [None]:
# Line plot of per-epoch validation accuracy for the three models.
plt.figure(figsize=(16, 8))
roberta_acc_history = roberta_history.history['val_accuracy']
albert_acc_history = albert_history.history['val_accuracy']
distilbert_acc_history = distilbert_history.history['val_accuracy']
curves = (
    ('ROBERTa', roberta_acc_history),
    ('AlBert', albert_acc_history),
    ('DistilBert', distilbert_acc_history),
)
for model_label, acc_history in curves:
    # The x-axis is derived from the recorded history instead of a hard-coded
    # range(1, 4), so the plot keeps working when the epoch count passed to
    # fit() changes.
    epochs = range(1, len(acc_history) + 1)
    plt.plot(epochs, acc_history, label=model_label)
plt.title('Validation Accuracies Of ROBERTa, AlBert, and DistilBert During Each Epoch')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Comparison of Results Of All Models

In [None]:
index = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
columns = ['ROBERTa', 'AlBert', 'DistilBert']
roberta = [roberta_accuracy, roberta_precision, roberta_recall, roberta_f1]
albert = [albert_accuracy, albert_precision, albert_recall, albert_f1]
distilbert = [distilbert_accuracy, distilbert_precision, distilbert_recall, distilbert_f1]

In [None]:
# One column per model (names taken from `columns`), one row per metric in `index`
dict_ = dict(zip(columns, (roberta, albert, distilbert)))
results_df = pd.DataFrame(dict_, index=index).round(2)

In [None]:
results_df

In [None]:
# Table of validation tweets with their true labels and the three models' predictions,
# with the rows put in random order
new_y_val = ['Yes' if val != 0 else 'No' for val in y_val.tolist()]
labels_dict = dict(zip(
    ['Tweet', 'Real Label', 'ROBERTa', 'AlBert', 'DistilBert'],
    [real_tweets, new_y_val, roberta_predictions, albert_predictions, distilbert_predictions],
))
labels_df = pd.DataFrame(labels_dict).sample(frac=1).reset_index(drop=True)

In [None]:
labels_df.head()