In [None]:
# Installing required libraries for the project
!pip install datasets evaluate transformers[torch] scikit-learn matplotlib seaborn -U

# Importing essential libraries
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import matplotlib.pyplot as plt
import seaborn as sns
import evaluate

# Check Transformers library version
# (`pip show` prints the installed package's metadata, including its version
# and install location, so the run can be tied to a specific release.)
!pip show transformers

# Step 1: Load the Pidgin-English dataset
# The train / dev / test splits each live in their own tab-separated file.
train_data, dev_data, test_data = [
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('Data/pcm_train.tsv', 'Data/pcm_dev.tsv', 'Data/pcm_test.tsv')
]

# Sentiment names -> integer class ids, applied in place to every split
label_to_id = {'positive': 0, 'neutral': 1, 'negative': 2}
for split_frame in (train_data, dev_data, test_data):
    split_frame['label'] = split_frame['label'].map(label_to_id)

# Wrap each dataframe as a Hugging Face Dataset
train_ds, dev_ds, test_ds = map(Dataset.from_pandas, (train_data, dev_data, test_data))

# Step 2: Initialize the BERT tokenizer
# The cased checkpoint is used so that capitalisation in the tweets is kept.
pidgin_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Step 3: Tokenize the dataset
def prepare_tokenized_data(examples):
    """
    Encode a batch of examples for sentiment classification.

    Reads the 'tweet' column of `examples` and runs it through
    `pidgin_tokenizer`, truncating and padding every sequence to
    exactly 256 tokens. Returns the tokenizer's output unchanged.
    """
    tweets = examples['tweet']
    return pidgin_tokenizer(
        tweets,
        padding='max_length',
        truncation=True,
        max_length=256,
    )

# Apply tokenization to every split, in batches
train_ds, dev_ds, test_ds = [
    split_ds.map(prepare_tokenized_data, batched=True)
    for split_ds in (train_ds, dev_ds, test_ds)
]

# Step 4: Load evaluation metrics
# NOTE(review): within this cell these two evaluators are only created;
# `evaluate_metrics` computes its scores with scikit-learn instead.
# NOTE(review): presumably the F1 averaging mode is selected at compute time
# (an `average=` argument) rather than through `config_name` - confirm against
# the `evaluate` documentation before relying on 'weighted' here.
f1_evaluator = evaluate.load("f1", config_name='weighted')
accuracy_evaluator = evaluate.load("accuracy")

# Define the metrics computation function
def evaluate_metrics(predictions):
    """
    Score model outputs against the reference labels.

    `predictions` must expose `.predictions` (per-class scores, reduced to a
    class id with an argmax over axis 1) and `.label_ids` (the true classes).

    Returns a dict with the keys 'accuracy', 'f1_micro', 'f1_macro' and
    'f1_weighted'.
    """
    predicted_ids = np.argmax(predictions.predictions, axis=1)
    true_ids = predictions.label_ids

    scores = {'accuracy': accuracy_score(true_ids, predicted_ids)}
    f1_variants = (
        ('f1_micro', 'micro'),
        ('f1_macro', 'macro'),
        ('f1_weighted', 'weighted'),
    )
    for metric_key, averaging in f1_variants:
        scores[metric_key] = f1_score(true_ids, predicted_ids, average=averaging)
    return scores

# Step 5: Baseline model
# A majority-class DummyClassifier provides the reference scores that the
# fine-tuned model is compared against.
baseline_model = DummyClassifier(strategy="most_frequent").fit(
    train_data['tweet'], train_data['label']
)
baseline_preds = baseline_model.predict(test_data['tweet'])

# Evaluate baseline performance on the test split
baseline_accuracy = accuracy_score(test_data['label'], baseline_preds)
baseline_f1_weighted = f1_score(test_data['label'], baseline_preds, average='weighted')
for caption, score in (
    ("Baseline Model Accuracy:", baseline_accuracy),
    ("Baseline Model F1 (Weighted):", baseline_f1_weighted),
):
    print(caption, score)

# Step 6: Load the BERT model
# Using BERT instead of DistilBERT for classification.
# The size of the classification head and the id<->name tables are derived
# from `label_to_id`, so they cannot drift from the label encoding and the
# saved model config reports sentiment names instead of generic LABEL_n ids.
pidgin_model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(label_to_id),
    id2label={class_id: name for name, class_id in label_to_id.items()},
    label2id=label_to_id,
)

# Step 7: Configure training arguments
# Evaluation and checkpointing both happen once per epoch; checkpoints go to
# ./model_results and logs to ./training_logs.
training_parameters = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in the installed release.
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir='./training_logs',
    logging_steps=100,
    do_train=True,
    do_eval=True,
    output_dir='./model_results',
    overwrite_output_dir=True,
)

# Step 8: Train the model
# The dev split is used for the per-epoch evaluation; `evaluate_metrics`
# supplies accuracy and the F1 variants reported at each evaluation.
pidgin_trainer = Trainer(
    model=pidgin_model,
    args=training_parameters,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # triggers a FutureWarning on the installed transformers release.
    processing_class=pidgin_tokenizer,
    compute_metrics=evaluate_metrics,
)

# Start training
pidgin_trainer.train()

# Step 9: Evaluate on the test dataset
# `Trainer.predict` is used rather than `Trainer.evaluate`: `evaluate` returns
# only a metrics dict, whereas `predict` also returns the raw per-class scores
# and the label ids that the confusion matrix needs.
test_results = pidgin_trainer.predict(test_ds)
test_predictions = np.argmax(test_results.predictions, axis=1)
test_labels = test_results.label_ids

# Metric names are the keys returned by `evaluate_metrics`, which `predict`
# prefixes with "test_" by default.
print("Test Set Accuracy:", test_results.metrics["test_accuracy"])
print("Test Set Weighted F1:", test_results.metrics["test_f1_weighted"])

# Step 10: Confusion matrix visualization
# The class ids are passed explicitly so the matrix always has one row and
# column per sentiment (even if a class never occurs in the test labels or
# predictions), and the axes are labelled with the sentiment names in the
# same order as their ids in `label_to_id`.
sentiment_names = list(label_to_id)
conf_matrix = confusion_matrix(
    test_labels, test_predictions, labels=list(label_to_id.values())
)

plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap="Blues",
    fmt='g',
    xticklabels=sentiment_names,
    yticklabels=sentiment_names,
)
plt.xlabel('Predicted Sentiment')
plt.ylabel('True Sentiment')
plt.title('Confusion Matrix for Pidgin-English Sentiment Analysis')
plt.show()


zsh:1: no matches found: transformers[torch]
Name: transformers
Version: 4.46.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /opt/anaconda3/envs/nlp_env/lib/python3.8/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


Map:   0%|          | 0/5121 [00:00<?, ? examples/s]

Map:   0%|          | 0/1281 [00:00<?, ? examples/s]

Map:   0%|          | 0/4154 [00:00<?, ? examples/s]

Baseline Model Accuracy: 0.5599422243620606
Baseline Model F1 (Weighted): 0.40198321415622007


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  pidgin_trainer = Trainer(


Epoch,Training Loss,Validation Loss
