# Problem

The challenge is to train a model that can identify whether a tweet is about a disaster or not. To get the data ready, some basic NLP preprocessing is applied, such as lowercasing and removing numbers and punctuation. The text is then tokenized, converted into integer sequences, and padded to ensure a uniform input length.

In [None]:
import pandas as pd

# Read the competition CSVs (train first, then test) into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

# Sanity check: print the top rows of each frame, one after the other
print(train_df.head(), test_df.head(), sep='\n')


# EDA of the Data

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns

# Build one long string per class for the word clouds.
# The raw `train_df` is used here: `train_df_filtered` is only created in a
# later cell, so referencing it at this point raises NameError when the
# notebook is run top to bottom. `train_df` has the same 'text' and 'target'
# columns that the plots below need.
disaster_tweets = " ".join(train_df[train_df['target'] == 1]['text'])
nondisaster_tweets = " ".join(train_df[train_df['target'] == 0]['text'])

# WordCloud for disaster tweets
disaster_wordcloud = WordCloud(width=800, height=400, max_words=100, background_color="white").generate(disaster_tweets)

# WordCloud for non-disaster tweets
nondisaster_wordcloud = WordCloud(width=800, height=400, max_words=100, background_color="white").generate(nondisaster_tweets)

# Plot the two word clouds side by side
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.imshow(disaster_wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("WordCloud for Disaster Tweets", fontsize=16)

plt.subplot(1, 2, 2)
plt.imshow(nondisaster_wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("WordCloud for Non-Disaster Tweets", fontsize=16)

plt.tight_layout()
plt.show()

# Count plot of the number of tweets per target class
plt.figure(figsize=(8, 6))
sns.countplot(x=train_df['target'])
plt.title('Tweet Count Distribution by Target', fontsize=16)
plt.xlabel('Target (0 = Non-disaster, 1 = Disaster)', fontsize=12)
plt.ylabel('Tweet Count', fontsize=12)
plt.show()


# Data Structure

The data has `id`, `keyword`, `location`, and `text` columns (plus `target` in the training set). Since the `keyword` and `location` columns contain nulls, they are dropped, so only the `id`, `text`, and `target` columns are used for training.

In [None]:
# Keep only the columns used downstream: 'id', 'text' and (train only) 'target'.
# `.copy()` makes each result an independent DataFrame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning or depend on whether the
# selection happened to be a view of the original frame.
train_df_filtered = train_df[['id', 'text', 'target']].copy()
test_df_filtered = test_df[['id', 'text']].copy()

# Display the first few rows of the filtered DataFrames to verify
print(train_df_filtered.head())
print(test_df_filtered.head())


# Text Preprocessing Steps

In [None]:
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to clean the text
def clean_text(text):
    """Lowercase *text* and keep only ASCII letters and whitespace.

    Digits, punctuation and every other non-letter, non-whitespace character
    are deleted (not replaced by a space), so surrounding whitespace is left
    exactly as it was.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # A single pass is enough: the character class already excludes digits,
    # so a separate digit-removal step would never find anything to remove.
    return re.sub(r'[^a-zA-Z\s]', '', text.lower())

# Add a cleaned-text column to both frames
train_df_filtered['text_cleaned'] = train_df_filtered['text'].map(clean_text)
test_df_filtered['text_cleaned'] = test_df_filtered['text'].map(clean_text)

# Fit the word index on the training text only; the tokenizer is created with
# a 10,000-word limit and '<OOV>' as its out-of-vocabulary token
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df_filtered['text_cleaned'])

# Fixed sequence length used for both datasets (a tunable choice)
max_length = 100

# Training set: words -> integer ids -> sequences padded at the end to max_length
train_sequences = tokenizer.texts_to_sequences(train_df_filtered['text_cleaned'])
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')

# Test set: same transformation, reusing the tokenizer fitted on the training text
test_sequences = tokenizer.texts_to_sequences(test_df_filtered['text_cleaned'])
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Show the first five padded training sequences
print(train_padded[:5])


In [None]:
# Show how many training tweets fall into each target class
print(train_df_filtered['target'].value_counts())


# RNN Model

We try this approach first, since RNNs are well suited to sequential tasks such as text processing.

In [None]:
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.optimizers import Adam

# Model dimensions
vocab_size = 10000  # Same value as the tokenizer's num_words parameter
embedding_dim = 16
max_length = 100  # Same value as the max_length used for padding

# Assemble the network one layer at a time
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, embeddings_initializer=HeNormal()))
# Bidirectional LSTM that returns only its final output; L2 penalty on the kernel
model.add(Bidirectional(LSTM(64, return_sequences=False, kernel_initializer=HeNormal(), kernel_regularizer=l2(0.01))))
model.add(Dropout(0.5))
model.add(BatchNormalization())
# Hidden dense layer, L2-regularized
model.add(Dense(32, activation='relu', kernel_initializer=HeNormal(), kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
# Single sigmoid unit for the binary target, L2-regularized
model.add(Dense(1, activation='sigmoid', kernel_initializer=HeNormal(), kernel_regularizer=l2(0.01)))

# Adam with a learning rate of 1e-5; this is a tunable value and can be raised
# if training converges too slowly
optimizer = Adam(learning_rate=0.00001)

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as a metric
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Print the layer-by-layer summary
model.summary()


In [None]:
# List the distinct label values present in the training targets
print(train_df_filtered['target'].unique())


In [None]:
import numpy as np  # Add this line to import numpy
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

# Hold out 20% of the padded sequences for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    train_padded,
    train_df_filtered['target'],
    test_size=0.2,
    random_state=42,
)

# Labels as plain numpy arrays
y_train, y_val = np.array(y_train), np.array(y_val)

# 'balanced' class weights, computed from the labels of the full training frame
class_weights_array = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df_filtered['target']),
    y=train_df_filtered['target'],
)

# Map each label (0 and 1) to its weight
class_weights = {label: class_weights_array[label] for label in (0, 1)}

# Stop once val_loss has not improved for 2 epochs and roll back to the best weights
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# Fit the model; batch size is a tunable choice
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=35,
    batch_size=64,
    class_weight=class_weights,
    callbacks=[early_stop],
    verbose=2,
)



# Results

The RNN model performed quite well and achieved good validation accuracy. Training was capped at 35 epochs, since beyond that point the validation accuracy decreased while the training accuracy kept increasing.

In [None]:
import matplotlib.pyplot as plt

# One figure per metric (accuracy first, then loss), each comparing the
# training curve with the validation curve across epochs
for _train_key, _val_key, _train_legend, _val_legend, _y_label, _heading in (
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
     'Accuracy', 'Training and Validation Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Loss', 'Training and Validation Loss'),
):
    plt.plot(history.history[_train_key], label=_train_legend)
    plt.plot(history.history[_val_key], label=_val_legend)
    plt.xlabel('Epochs')
    plt.ylabel(_y_label)
    plt.title(_heading)
    plt.legend()
    plt.show()

In [None]:
# Tokenize and pad the test data.
# This repeats the same calls made in the preprocessing cell, so the values are
# unchanged if that cell has already run; it makes this cell self-contained.
test_sequences = tokenizer.texts_to_sequences(test_df_filtered['text_cleaned'])  # 'text_cleaned' must have been produced by the same cleaning as the training text
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Predict using the trained model (the final layer is a single sigmoid unit)
predictions = model.predict(test_padded)

# Convert predictions to binary output: strictly greater than 0.5 -> 1, otherwise 0
predicted_labels = (predictions > 0.5).astype(int)  # Threshold of 0.5 for the binary decision


In [None]:
# Pair every test id with its predicted label; ravel() collapses the
# prediction array to one dimension so it forms a single column
submission_df = pd.DataFrame(
    {
        'id': test_df_filtered['id'],
        'target': predicted_labels.ravel(),
    }
)

# Write the submission CSV without the index column
submission_df.to_csv('submission.csv', index=False)

print("Submission file created successfully!")


# Exploring with DistilBERT

In [None]:
import pandas as pd

# Read the competition CSVs (train first, then test) into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

# Sanity check: print the top rows of each frame, one after the other
print(train_df.head(), test_df.head(), sep='\n')


In [None]:
# Keep only the columns used downstream: 'id', 'text' and (train only) 'target'.
# `.copy()` makes each result an independent DataFrame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning or depend on whether the
# selection happened to be a view of the original frame.
train_df_filtered = train_df[['id', 'text', 'target']].copy()
test_df_filtered = test_df[['id', 'text']].copy()

# Display the first few rows of the filtered DataFrames to verify
print(train_df_filtered.head())
print(test_df_filtered.head())


In [None]:
import keras_nlp
import keras
from tensorflow.keras.optimizers import Adam

# Define some hyperparameters
preset = "distil_bert_base_en_uncased"  # KerasNLP preset name passed to from_preset below
sequence_length = 160  # Handed to the preprocessor as its sequence_length
BATCH_SIZE = 16  # Used as batch_size when the classifier is fitted
EPOCHS = 3  # Used as epochs when the classifier is fitted

# Load a DistilBERT preprocessor with a sequence length of 160
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    sequence_length=sequence_length,
    name="preprocessor_4_tweets"
)

In [None]:
# Function to preprocess text
import re

def preprocess_text(text):
    """Normalize a tweet for modelling.

    Steps, in order: lowercase; delete URLs (tokens starting with ``http`` or
    ``www``); delete @mentions in full and strip the ``#`` from hashtags
    (the hashtag word is kept); delete every remaining character that is not
    an ASCII letter, digit or whitespace; collapse whitespace runs to a single
    space and trim both ends.

    Args:
        text: The raw tweet string.

    Returns:
        The normalized string (possibly empty).
    """
    text = text.lower()
    # `http\S+` also covers https links, so no separate alternative is needed
    text = re.sub(r'http\S+|www\S+', '', text)
    # `@\w+` removes the whole handle. The earlier pattern `\@w+` only matched
    # "@" followed by literal "w" characters, so handles were never removed.
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Add a normalized-text column to both the training and the test frame
train_df_filtered['text_cleaned'] = train_df_filtered['text'].map(preprocess_text)
test_df_filtered['text_cleaned'] = test_df_filtered['text'].map(preprocess_text)

# Function to tokenize the text using DistilBERT tokenizer
def encode_data(texts, tokenizer, max_length=128):
    """Encode a column of texts into token ids and attention masks.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series of strings).
        tokenizer: Object exposing ``batch_encode_plus``; presumably a
            Hugging Face tokenizer — confirm against the caller.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A ``(input_ids, attention_mask)`` pair taken from the encoder output.
    """
    encoding_options = {
        'add_special_tokens': True,      # request the model's special tokens
        'max_length': max_length,        # target sequence length
        'padding': 'max_length',         # pad shorter sequences up to max_length
        'truncation': True,              # cut longer sequences down to max_length
        'return_attention_mask': True,   # also return the attention mask
        'return_tensors': 'tf',          # ask for TensorFlow tensors
    }
    batch = tokenizer.batch_encode_plus(texts.tolist(), **encoding_options)
    return batch['input_ids'], batch['attention_mask']

# Tokenize the training and test data.
# The global `tokenizer` at this point is the Keras word-level Tokenizer fitted
# for the RNN, which has no `batch_encode_plus` method, so passing it to
# encode_data raises AttributeError. Load the Hugging Face DistilBERT tokenizer
# under its own name instead (leaving the RNN tokenizer untouched).
from transformers import DistilBertTokenizer

distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

X_train_input_ids, X_train_attention_mask = encode_data(train_df_filtered['text_cleaned'], distilbert_tokenizer)
X_test_input_ids, X_test_attention_mask = encode_data(test_df_filtered['text_cleaned'], distilbert_tokenizer)

# Display the shape of the tokenized data to verify
print(X_train_input_ids.shape)
print(X_test_input_ids.shape)


In [None]:
# Re-derive the labels from the same frame that supplies the texts, so row i of
# 'text_cleaned' is paired with its own target. The previous `y_train` is the
# shuffled 80% training split made for the RNN; truncating the frame to
# len(y_train) made the lengths agree but paired texts with the wrong labels
# and discarded the remaining rows.
y_train = train_df_filtered['target'].to_numpy()


In [None]:
# Load the DistilBERT classifier with 2 output classes for binary classification.
# The preprocessor is attached to the model, so fit/predict are later called
# with text columns directly.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    preprocessor=preprocessor,
    num_classes=2
)

# Display model summary
classifier.summary()

# Compile the model.
# NOTE(review): the loss comes from the standalone `keras` package while Adam is
# imported from `tensorflow.keras` — confirm both resolve to the same Keras
# version in this environment.
classifier.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),  # Integer labels; model outputs are treated as logits
    optimizer=Adam(learning_rate=1e-5),
    metrics=["accuracy"]
)

In [None]:
# Sanity check: the number of cleaned texts and the number of labels in y_train
# must be equal before they are split together
print(len(train_df_filtered['text_cleaned']))
print(len(y_train))


In [None]:
from sklearn.model_selection import train_test_split

# Split texts and labels together, taking both from the same frame so every
# text keeps its own label. The `y_train` left over from the RNN section is a
# shuffled subset and does not line up row-for-row with `train_df_filtered`,
# so it must not be used as the label source here.
X_train_texts, X_val_texts, y_train, y_val = train_test_split(
    train_df_filtered['text_cleaned'],
    train_df_filtered['target'].to_numpy(),
    test_size=0.2,
    random_state=42,
)

# Train on the cleaned text directly; tokenization is handled by the
# preprocessor attached to the classifier
history = classifier.fit(
    x=X_train_texts,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val_texts, y_val)  # Validation split taken from the training set
)


In [None]:
import matplotlib.pyplot as plt

# One figure per metric (accuracy first, then loss), each comparing the
# training curve with the validation curve across epochs
for _train_key, _val_key, _train_legend, _val_legend, _y_label, _heading in (
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
     'Accuracy', 'Training and Validation Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Loss', 'Training and Validation Loss'),
):
    plt.plot(history.history[_train_key], label=_train_legend)
    plt.plot(history.history[_val_key], label=_val_legend)
    plt.title(_heading)
    plt.xlabel('Epochs')
    plt.ylabel(_y_label)
    plt.legend()
    plt.show()


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Score the validation texts and take the highest-scoring class as the label
y_val_pred = classifier.predict(X_val_texts)
y_val_pred_labels = y_val_pred.argmax(axis=1)

# Confusion matrix of true labels against predicted labels
conf_matrix = confusion_matrix(y_val, y_val_pred_labels)

# Annotated heatmap of the confusion matrix
_, _cm_axes = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=_cm_axes)
_cm_axes.set_title('Confusion Matrix')
_cm_axes.set_xlabel('Predicted Labels')
_cm_axes.set_ylabel('True Labels')
plt.show()


In [None]:
# Score the cleaned test texts and take the highest-scoring class as the label
y_test_pred = classifier.predict(test_df_filtered['text_cleaned'])
y_test_pred_labels = y_test_pred.argmax(axis=1)

# New frame holding the test ids plus a 'target' column with the predictions
submission_df = test_df_filtered[['id']].assign(target=y_test_pred_labels)

# Write the submission CSV without the index column
submission_df.to_csv('submission.csv', index=False)


# Exploring with pre-trained BERT model for binary classification

In [None]:
# Keep only the columns used downstream: 'id', 'text' and (train only) 'target'.
# `.copy()` makes each result an independent DataFrame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning. (This step was pasted
# twice in a row; running it once gives the same frames.)
train_df_filtered = train_df[['id', 'text', 'target']].copy()
test_df_filtered = test_df[['id', 'text']].copy()

# Display the first few rows of the filtered DataFrames to verify
print(train_df_filtered.head())
print(test_df_filtered.head())
# Function to preprocess text
import re

def preprocess_text(text):
    """Normalize a tweet for modelling.

    Steps, in order: lowercase; delete URLs (tokens starting with ``http`` or
    ``www``); delete @mentions in full and strip the ``#`` from hashtags
    (the hashtag word is kept); delete every remaining character that is not
    an ASCII letter, digit or whitespace; collapse whitespace runs to a single
    space and trim both ends.

    Args:
        text: The raw tweet string.

    Returns:
        The normalized string (possibly empty).
    """
    text = text.lower()
    # `http\S+` also covers https links, so no separate alternative is needed
    text = re.sub(r'http\S+|www\S+', '', text)
    # `@\w+` removes the whole handle. The earlier pattern `\@w+` only matched
    # "@" followed by literal "w" characters, so handles were never removed.
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Add a normalized-text column to both the training and the test frame
train_df_filtered['text_cleaned'] = train_df_filtered['text'].map(preprocess_text)
test_df_filtered['text_cleaned'] = test_df_filtered['text'].map(preprocess_text)

# Function to tokenize the text using DistilBERT tokenizer
def encode_data(texts, tokenizer, max_length=128):
    """Encode a column of texts into token ids and attention masks.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series of strings).
        tokenizer: Object exposing ``batch_encode_plus``; presumably a
            Hugging Face tokenizer — confirm against the caller.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A ``(input_ids, attention_mask)`` pair taken from the encoder output.
    """
    encoding_options = {
        'add_special_tokens': True,      # request the model's special tokens
        'max_length': max_length,        # target sequence length
        'padding': 'max_length',         # pad shorter sequences up to max_length
        'truncation': True,              # cut longer sequences down to max_length
        'return_attention_mask': True,   # also return the attention mask
        'return_tensors': 'tf',          # ask for TensorFlow tensors
    }
    batch = tokenizer.batch_encode_plus(texts.tolist(), **encoding_options)
    return batch['input_ids'], batch['attention_mask']

# Tokenize the training and test data.
# The global `tokenizer` is the Keras word-level Tokenizer fitted for the RNN,
# which has no `batch_encode_plus` method, so passing it to encode_data raises
# AttributeError. Load the WordPiece tokenizer that belongs to the
# 'bert-base-uncased' checkpoint fine-tuned below, under its own name.
from transformers import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

X_train_input_ids, X_train_attention_mask = encode_data(train_df_filtered['text_cleaned'], bert_tokenizer)
X_test_input_ids, X_test_attention_mask = encode_data(test_df_filtered['text_cleaned'], bert_tokenizer)

# Display the shape of the tokenized data to verify
print(X_train_input_ids.shape)
print(X_test_input_ids.shape)


In [None]:
from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf

# Load the pre-trained 'bert-base-uncased' checkpoint with a 2-label classification head.
# NOTE(review): the training cell further down loads this same checkpoint into
# `bert_model` again, so this load looks redundant — confirm before removing.
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

# train_test_split is given numpy arrays, so convert the encoded tensors first;
# ids, masks and labels are split in one call so they stay aligned
(
    X_train_input_ids_np,
    X_val_input_ids_np,
    X_train_attention_mask_np,
    X_val_attention_mask_np,
    y_train,
    y_val,
) = train_test_split(
    X_train_input_ids.numpy(),
    X_train_attention_mask.numpy(),
    train_df_filtered['target'],
    test_size=0.2,
    random_state=42,
)

# Labels: pandas Series -> TensorFlow tensors
y_train = tf.convert_to_tensor(y_train.values)
y_val = tf.convert_to_tensor(y_val.values)

# Inputs: numpy arrays -> TensorFlow tensors (training pair, then validation pair)
X_train_input_ids = tf.convert_to_tensor(X_train_input_ids_np)
X_train_attention_mask = tf.convert_to_tensor(X_train_attention_mask_np)
X_val_input_ids = tf.convert_to_tensor(X_val_input_ids_np)
X_val_attention_mask = tf.convert_to_tensor(X_val_attention_mask_np)


In [None]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification
import numpy as np

# Load the pre-trained BERT model for sequence classification
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define optimizer and loss function (the model returns raw logits)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Prepare datasets: training batches are shuffled, validation batches are not
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': X_train_input_ids, 'attention_mask': X_train_attention_mask},
    y_train
)).shuffle(100).batch(16)

val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': X_val_input_ids, 'attention_mask': X_val_attention_mask},
    y_val
)).batch(16)

# Training loop
epochs = 3

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    for step, (inputs, labels) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            logits = bert_model(inputs, training=True).logits
            loss = loss_fn(labels, logits)

        # Backpropagation
        gradients = tape.gradient(loss, bert_model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, bert_model.trainable_variables))

        if step % 50 == 0:
            print(f'Step {step}, Loss: {loss.numpy()}')

    # Validation.
    # Metrics are accumulated per sample rather than averaged per batch: the
    # final batch is usually shorter than 16, and a plain mean of batch means
    # would give its few samples the same weight as a full batch.
    val_loss_sum = 0.0
    val_correct = 0
    val_seen = 0
    for inputs, labels in val_dataset:
        val_logits = bert_model(inputs, training=False).logits
        batch_count = int(labels.shape[0])

        # loss_fn returns the batch mean, so scale it back to a batch total
        val_loss_sum += float(loss_fn(labels, val_logits).numpy()) * batch_count

        # Count correct predictions in this batch
        predictions = np.argmax(val_logits.numpy(), axis=-1)
        val_correct += int(np.sum(predictions == labels.numpy()))
        val_seen += batch_count

    # max(..., 1) avoids a ZeroDivisionError if the validation set is empty
    val_loss = val_loss_sum / max(val_seen, 1)
    val_acc = val_correct / max(val_seen, 1)
    print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_acc}')

# Save the fine-tuned model so it can be reloaded later
bert_model.save_pretrained('./my_bert_model')


In [None]:
from transformers import TFBertForSequenceClassification
import numpy as np

# Load the saved model
loaded_model = TFBertForSequenceClassification.from_pretrained('./my_bert_model')

# Prepare validation dataset
val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': X_val_input_ids, 'attention_mask': X_val_attention_mask},
    y_val
)).batch(16)

# Evaluate on the validation set.
# Metrics are accumulated per sample rather than averaged per batch: the final
# batch is usually shorter than 16, and a plain mean of batch means would give
# its few samples the same weight as a full batch.
val_loss_sum = 0.0
val_correct = 0
val_seen = 0
for inputs, labels in val_dataset:
    val_logits = loaded_model(inputs, training=False).logits
    batch_count = int(labels.shape[0])

    # loss_fn (defined in the training cell) returns the batch mean, so scale
    # it back to a batch total
    val_loss_sum += float(loss_fn(labels, val_logits).numpy()) * batch_count

    # Count correct predictions in this batch
    predictions = np.argmax(val_logits.numpy(), axis=-1)
    val_correct += int(np.sum(predictions == labels.numpy()))
    val_seen += batch_count

# max(..., 1) avoids a ZeroDivisionError if the validation set is empty
val_loss = val_loss_sum / max(val_seen, 1)
val_acc = val_correct / max(val_seen, 1)

# Print the final validation loss and accuracy
print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_acc}')

# Optionally, you can plot the metrics like before if you have metrics for all epochs


In [None]:
import matplotlib.pyplot as plt

# Per-epoch validation metrics copied by hand from the training logs
val_loss_values = [0.3915, 0.4275, 0.4970]
val_acc_values = [0.8340, 0.8385, 0.8314]

# One figure per metric (loss first, then accuracy), with one marker per epoch
for _values, _legend, _heading, _y_label in (
    (val_loss_values, 'Validation Loss', 'Validation Loss Over Epochs', 'Loss'),
    (val_acc_values, 'Validation Accuracy', 'Validation Accuracy Over Epochs', 'Accuracy'),
):
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, 4), _values, label=_legend, marker='o')
    plt.title(_heading)
    plt.xlabel('Epochs')
    plt.ylabel(_y_label)
    plt.xticks(range(1, 4))
    plt.legend()
    plt.show()


In [None]:
# Batch the encoded test inputs (ids and attention masks) for inference
test_dataset = tf.data.Dataset.from_tensor_slices(
    {'input_ids': X_test_input_ids, 'attention_mask': X_test_attention_mask}
).batch(16)

# Run the fine-tuned model over the test set and keep the logits
test_logits = bert_model.predict(test_dataset).logits

# The index of the larger logit is the predicted class (0 or 1)
test_predictions = tf.argmax(test_logits, axis=-1).numpy()

# Pair every test id with its predicted label
submission_df = pd.DataFrame(
    {
        'id': test_df_filtered['id'],
        'target': test_predictions,
    }
)

# Write the submission CSV without the index column
submission_df.to_csv('submission.csv', index=False)
print("Submission file created successfully!")


# Conclusions

The pretrained BERT model performed the best and also achieved the highest submission score. The bidirectional nature of BERT can help achieve better results compared to the sequential approach of RNNs. That said, the RNN model also performed quite well and could have been improved further.
