<a href="https://colab.research.google.com/github/christinajoslin/nlp-disaster-tweets-classifier/blob/main/NLP_Disaster_Tweets_using_HuggingFace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the 'datasets' library (if not already installed)
# !pip install datasets

# Import the Hugging Face Datasets library
from datasets import Dataset  # For handling datasets compatible with Hugging Face's Trainer
import pandas as pd  # For creating and manipulating data in tabular form
import numpy as np  # For handling numerical operations and predictions

# Importing necessary modules for NLP tasks using Hugging Face Transformers
from transformers import AutoTokenizer  # Tokenizer to preprocess text for the model
from transformers import TrainingArguments  # Defines training configurations (e.g., learning rate, batch size)
from transformers import Trainer  # Simplifies training, evaluation, and prediction
from transformers import AutoModelForSequenceClassification, DistilBertConfig  # Pre-trained model and configuration for text classification tasks
from transformers import EarlyStoppingCallback  # To stop training if no improvement is seen for a number of evaluations
from transformers import TrainerCallback, TrainerState, TrainerControl  # For custom training callbacks

# Import sklearn metrics for evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support  # Functions for computing evaluation metrics


In [None]:
# Colab helper used to attach Google Drive to this runtime
from google.colab import drive

# Detach any existing mount first so the mount below starts clean.
# A ValueError here is deliberately ignored (original intent: the drive was
# simply not mounted yet), and the success message is only printed when the
# unmount call returned normally.
try:
    drive.flush_and_unmount()
except ValueError:
    pass
else:
    print('Drive unmounted successfully')

# Attach Google Drive at /content/drive; force_remount=True asks for a fresh
# mount even when one already exists.
drive.mount('/content/drive', force_remount=True)


Drive unmounted
Mounted at /content/drive


In [None]:
# Read the training and test CSV files from the mounted Drive folder into
# pandas DataFrames. The tuple order fixes the unpacking: the first path
# becomes train_set, the second becomes test_set.
train_set, test_set = map(
    pd.read_csv,
    (
        "/content/drive/MyDrive/Colab Notebooks/NLP Disaster Tweets/train.csv",
        "/content/drive/MyDrive/Colab Notebooks/NLP Disaster Tweets/test.csv",
    ),
)

In [None]:
# Narrow both frames down to the columns used by the rest of the notebook.

# Training set: rename 'target' to 'label' (the column name the Trainer/model
# pipeline below is fed as the supervised target — this is unrelated to the
# tokenizer), then keep 'id', 'text' and 'label' in that order.
train_set = train_set.rename(columns={'target': 'label'})[['id', 'text', 'label']]

# Test set: there is no label column to keep, only 'id' and 'text'.
test_set = test_set[['id', 'text']]

In [None]:
# Wrap each pandas DataFrame in a Hugging Face `Dataset`, the container the
# tokenization `.map()` calls and the Trainer operate on further down.
# Conversion order is train first, then test.
hf_train, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_set, test_set)
)

In [None]:
# Hold out 10% of the labelled data as a development (dev) set.
# The split is random with a fixed seed (42) so it is reproducible; it is not
# stratified by label.
train_dev_split = hf_train.train_test_split(seed=42, test_size=0.1)

# `train_test_split` names its two parts "train" and "test"; the "test" part
# is what this notebook uses as the dev set.
train_dataset, dev_dataset = (
    train_dev_split[part] for part in ("train", "test")
)

In [None]:
# Checkpoint whose tokenizer is used to preprocess the tweets
model_name = 'distilbert-base-uncased'

# Tokenizer matching that checkpoint; turns raw text into model inputs
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_sentiment_analysis(examples):
    """Tokenize the 'text' field of a batch of examples.

    Intended to be passed to `Dataset.map(..., batched=True)`.

    Args:
        examples: Mapping with a "text" entry holding the raw text(s).

    Returns:
        The tokenizer output for those texts, truncated and padded so every
        sequence is exactly 128 tokens long.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        max_length=128,        # fixed sequence length
        padding="max_length",  # pad shorter sequences up to max_length
        truncation=True,       # cut longer sequences down to max_length
    )
    return encoded

In [None]:
# Tokenize the training, test and development (dev) datasets.
#
# Every split goes through the same `.map()` call:
# - `tokenize_sentiment_analysis` is the function applied to the data
# - `batched=True` hands the function many examples per call
# - `batch_size=100` sets how many examples make up each of those calls
#   (adjustable based on available memory and dataset size)
#
# The splits are processed in the order train -> test -> dev, and the results
# are unpacked in that same order.
tokenized_train, tokenized_test, tokenized_dev = (
    split.map(tokenize_sentiment_analysis, batched=True, batch_size=100)
    for split in (train_dataset, test_dataset, dev_dataset)
)

Map:   0%|          | 0/6851 [00:00<?, ? examples/s]

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

In [None]:
# Define training arguments for the Trainer
# These arguments control how training, evaluation, checkpointing and logging are carried out.

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/NLP Disaster Tweets/",
    # Directory where model checkpoints and outputs are written.
    # This must be the ABSOLUTE path of the mounted Drive folder (the same folder the
    # CSVs are read from). The previous value started with "./content/...", which is
    # resolved relative to the current working directory and therefore did not point
    # at the Drive mount at /content/drive.
    # NOTE(review): one checkpoint is saved per epoch (see save_strategy); consider
    # `save_total_limit` if Drive space is a concern.

    run_name="disaster_tweets_1",
    # A name to identify this training run. Useful for tracking experiments in tools like TensorBoard or wandb (if enabled).

    logging_strategy="steps",
    # Log training metrics at a fixed interval of optimizer steps.

    logging_steps=50,
    # Interval (in steps) between training-metric logs.

    evaluation_strategy="epoch",
    # Evaluate on the eval dataset at the end of each epoch.
    # NOTE(review): newer transformers releases appear to rename this argument to
    # `eval_strategy` — confirm against the installed version before upgrading.

    save_strategy="epoch",
    # Save a checkpoint at the end of each epoch (matches the evaluation schedule,
    # which load_best_model_at_end relies on).

    learning_rate=3e-5,
    # Initial learning rate for the optimizer. A small value is common for fine-tuning pre-trained models.

    per_device_train_batch_size=16,
    # Number of examples per device in each training batch. Adjust based on available GPU/CPU memory.

    per_device_eval_batch_size=16,
    # Number of examples per device in each evaluation batch.

    num_train_epochs=10,
    # Upper bound on the number of epochs; training can end earlier if an
    # early-stopping callback is attached to the Trainer.

    weight_decay=0.05,
    # Weight-decay strength applied by the optimizer; penalizes large weights to reduce overfitting.

    logging_dir="./logs",
    # Directory to save logs for TensorBoard or other visualization tools.

    load_best_model_at_end=True,
    # After training, reload the checkpoint that achieved the best value of metric_for_best_model.

    metric_for_best_model="accuracy",
    # Metric used to pick the "best" checkpoint; must be a key returned by compute_metrics.

    report_to="none"
    # Disables integration with logging tools like wandb or TensorBoard. Set to "wandb" or "tensorboard" to enable them.
)




In [None]:
# Load the pre-trained checkpoint that will be fine-tuned on the disaster tweets.

# Checkpoint to start from. Per its name it is DistilBERT already fine-tuned on
# SST-2 sentiment data. Note that this rebinds `model_name`, which an earlier
# cell set to the tokenizer's checkpoint.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the checkpoint's configuration with `dropout` overridden to 0.2
# (extra regularization to reduce overfitting). Keyword arguments that name a
# configuration attribute replace the value loaded from the checkpoint.
config = DistilBertConfig.from_pretrained(model_name, dropout=0.2)

# Instantiate the sequence-classification model from the checkpoint using the
# adjusted configuration. No `num_labels` override is passed: the number of
# output classes comes from the checkpoint's own configuration.
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

# Metric function handed to the Trainer; it is invoked on every evaluation.
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` — the raw model outputs and the
            true class labels.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision/recall/F1 are computed with ``average="binary"``; when a
        score would require dividing by zero it is reported as 1
        (``zero_division=1``).
    """
    logits, labels = eval_pred

    # The predicted class is the index of the largest logit in each row
    predicted_classes = np.argmax(logits, axis=-1)

    # sklearn returns (precision, recall, f1, support); support is not needed
    prf_scores = precision_recall_fscore_support(
        labels,
        predicted_classes,
        average="binary",
        zero_division=1,
    )
    precision, recall, f1 = prf_scores[:3]

    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }


In [None]:
# Initialize the Hugging Face Trainer and begin model training
# The Trainer class handles training, evaluation, and prediction processes, simplifying the workflow.

# Define a custom callback for debugging
class DebugCallback(TrainerCallback):
    """
    Custom callback that prints the training loss whenever the Trainer logs it.

    The loss is read from the `logs` payload of the `on_log` event, so each
    printed value belongs to the step it is printed for, and the print cadence
    follows the Trainer's own logging interval (`logging_steps`) instead of a
    second hard-coded step count.

    An earlier version hooked `on_step_end` and read `state.log_history[-1]`.
    The run's captured output (first print at step 100, no print at some
    multiples of 50) is consistent with that entry not yet containing the
    current step's loss when the hook ran, and with it holding evaluation
    metrics (no 'loss' key) right after an evaluation.
    """
    def on_log(self, args, state: TrainerState, control: TrainerControl, logs=None, **kwargs):
        """Print the training loss contained in a freshly emitted log record.

        Args:
            args: Training arguments passed by the Trainer (unused).
            state: Trainer state; only `global_step` is read.
            control: Trainer control object (unused, left unmodified).
            logs: Dict of values being logged; may be None or lack 'loss'
                (records without a 'loss' key are skipped).
            **kwargs: Remaining event arguments supplied by the Trainer (unused).
        """
        # Only training-loss records carry a plain 'loss' key
        if logs and 'loss' in logs:
            print(f"Step {state.global_step}: Training Loss = {logs['loss']:.4f}")

# Assemble the Trainer from the pieces prepared in the earlier cells.
trainer = Trainer(
    # Model to fine-tune and the training configuration defined above
    model=model,
    args=training_args,
    # Tokenized splits: one to train on, one to evaluate on during training
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    # Tokenizer associated with the run
    # NOTE(review): the warning captured under this cell presumably concerns
    # this argument being deprecated in newer transformers — confirm.
    tokenizer=tokenizer,
    # Produces accuracy / F1 / precision / recall at each evaluation
    compute_metrics=compute_metrics,
    # Progress printing, plus early stopping with a patience of 3 evaluations
    callbacks=[DebugCallback(), EarlyStoppingCallback(early_stopping_patience=3)],
)

  trainer = Trainer(


In [None]:
# Run fine-tuning with the Trainer configured above.
# Per the training arguments, the dev set is evaluated and a checkpoint is saved
# at the end of every epoch; the attached EarlyStoppingCallback (patience 3) can
# end training before `num_train_epochs` is reached.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3948,0.393071,0.821522,0.783439,0.806557,0.76161
2,0.3279,0.415477,0.838583,0.8,0.842466,0.76161
3,0.2698,0.662148,0.772966,0.755994,0.694301,0.829721
4,0.2024,0.604489,0.804462,0.768992,0.770186,0.767802
5,0.1316,0.889676,0.804462,0.766091,0.77707,0.755418


Step 100: Training Loss = 0.7705
Step 150: Training Loss = 0.4913
Step 200: Training Loss = 0.4448
Step 250: Training Loss = 0.4342
Step 300: Training Loss = 0.4172
Step 350: Training Loss = 0.4159
Step 400: Training Loss = 0.3940
Step 500: Training Loss = 0.3965
Step 550: Training Loss = 0.3130
Step 600: Training Loss = 0.3313
Step 650: Training Loss = 0.3274
Step 700: Training Loss = 0.3310
Step 750: Training Loss = 0.3041
Step 800: Training Loss = 0.2946
Step 850: Training Loss = 0.3364
Step 950: Training Loss = 0.2423
Step 1000: Training Loss = 0.2200
Step 1050: Training Loss = 0.1923
Step 1100: Training Loss = 0.2912
Step 1150: Training Loss = 0.2614
Step 1200: Training Loss = 0.2368
Step 1250: Training Loss = 0.2388
Step 1350: Training Loss = 0.2537
Step 1400: Training Loss = 0.1553
Step 1450: Training Loss = 0.1797
Step 1500: Training Loss = 0.1537
Step 1550: Training Loss = 0.1812
Step 1600: Training Loss = 0.1981
Step 1650: Training Loss = 0.1338
Step 1700: Training Loss = 0.1

TrainOutput(global_step=2145, training_loss=0.2671485843080463, metrics={'train_runtime': 209.5445, 'train_samples_per_second': 326.947, 'train_steps_per_second': 20.473, 'total_flos': 1134417685240320.0, 'train_loss': 0.2671485843080463, 'epoch': 5.0})

In [None]:
# Score the fine-tuned model on the development (dev) set and report
# accuracy and F1
dev_results = trainer.evaluate(eval_dataset=tokenized_dev)
for caption, metric_key in (
    ("Dev set Accuracy:", 'eval_accuracy'),
    ("Dev set F1 Score:", 'eval_f1'),
):
    print(caption, dev_results[metric_key])

# Score the same model on the training set; comparing these numbers with the
# dev-set numbers above shows how far the model has fit the training data
train_results = trainer.evaluate(eval_dataset=tokenized_train)
for caption, metric_key in (
    ("Train set Accuracy:", 'eval_accuracy'),
    ("Train set F1 Score:", 'eval_f1'),
):
    print(caption, train_results[metric_key])



Dev set Accuracy: 0.8385826771653543
Dev set F1 Score: 0.8
Train set Accuracy: 0.9325645891110786
Train set F1 Score: 0.9176764076977905


In [None]:
# Export the model's test-set predictions as a CSV with the columns 'id' and 'target'

# Run inference on the tokenized test set
predictions = trainer.predict(test_dataset=tokenized_test)

# The predicted class for each row is the index of its largest raw output
final_predictions = np.argmax(predictions.predictions, axis=1)

# Row identifiers carried through from the original test file
ids = tokenized_test["id"]

# Pair every id with its predicted class
results_df = pd.DataFrame({"id": ids, "target": final_predictions})

# Save next to the input data on Drive, without the DataFrame index column
results_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/NLP Disaster Tweets/predictions.csv",
    index=False,
)