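#### Description: This notebook fine-tunes a multilingual BERT model (`bert-base-multilingual-cased`) for three-class (positive / neutral / negative) sentiment classification of tweets.
#### It uses the Transformers, Datasets, Evaluate, and Accelerate libraries for model training and evaluation, and scikit-learn for a most-frequent-class baseline and for metrics.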

## 1. Importing Packages

In [1]:
# Installing necessary libraries
!pip install datasets evaluate accelerate -U
!pip install transformers[torch]
!pip install scikit-learn
!pip install matplotlib
!pip install seaborn

# Importing essential libraries for NLP tasks
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import accelerate
import evaluate
import numpy as np


# Checking the installed transformers package (version, install location, dependencies)
!pip show transformers


zsh:1: no matches found: transformers[torch]
Name: transformers
Version: 4.46.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /opt/anaconda3/envs/nlp_env/lib/python3.8/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


## 2. Data Loading and Label Conversion

In [2]:

# Loading training, development, and test datasets from TSV files
# Importing pandas for data manipulation
import pandas as pd

# Loading training, development, and test datasets from TSV files
train_df = pd.read_csv('Data/pcm_train.tsv', sep='\t')
dev_df = pd.read_csv('Data/pcm_dev.tsv', sep='\t')
test_df = pd.read_csv('Data/pcm_test.tsv', sep='\t')

# Mapping textual labels to integer class ids for consistency across all splits
label_mapping = {'positive': 0, 'neutral': 1, 'negative': 2}

for split_name, split_df in (('train', train_df), ('dev', dev_df), ('test', test_df)):
    # Series.map() silently turns any value missing from the mapping into NaN, which
    # would produce float labels; fail fast with a clear message instead.
    unknown_labels = set(split_df['label'].unique()) - set(label_mapping)
    if unknown_labels:
        raise ValueError(
            f"Unexpected label value(s) in the {split_name} split: "
            f"{sorted(map(str, unknown_labels))}; expected one of {list(label_mapping)}"
        )
    split_df['label'] = split_df['label'].map(label_mapping)

## 3. Dataset Conversion and Tokenization

In [3]:
# Wrapping each pandas split in a Hugging Face `datasets.Dataset`
train_dataset, dev_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df, test_df)
)

# Loading the tokenizer of the multilingual cased BERT checkpoint; casing is preserved.
# (A DistilBERT tokenizer, 'distilbert-base-uncased', was the earlier alternative.)
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased',
    do_lower_case=False,
)



# Tokenization helper applied to every dataset split
def tokenize_function(examples):
    """Tokenize the 'tweet' field of a batch of examples.

    Every sequence is padded to, and truncated at, 128 tokens so that all
    encoded examples have the same length.

    Args:
        examples: A batch from a dataset split; its 'tweet' entry is passed
            to the module-level `tokenizer`.

    Returns:
        The tokenizer's output for the batch.
    """
    tweets = examples['tweet']
    encoded = tokenizer(tweets, padding='max_length', truncation=True, max_length=128)
    return encoded

# Tokenizing all three splits in batched mode (train first, then dev, then test)
train_dataset, dev_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, dev_dataset, test_dataset)
)

Map:   0%|          | 0/5121 [00:00<?, ? examples/s]

Map:   0%|          | 0/1281 [00:00<?, ? examples/s]

Map:   0%|          | 0/4154 [00:00<?, ? examples/s]

## 4. Metrics Computation

In [4]:
# Loading Hugging Face `evaluate` metric objects (F1 and accuracy).
# NOTE(review): compute_metrics below scores with scikit-learn's accuracy_score / f1_score
# and does not reference these two objects — confirm whether they are still needed.
f1_metric = evaluate.load("f1", config_name='weighted')
accuracy_metric = evaluate.load("accuracy")

# Metric callback handed to the Trainer
def compute_metrics(p):
    """Compute accuracy and F1 scores for one evaluation pass.

    Args:
        p: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (the reference labels).

    Returns:
        dict with the keys 'accuracy', 'f1_micro', 'f1_macro' and 'f1_weighted'.
    """
    gold = p.label_ids
    # The predicted class is the index of the highest score in each row
    predicted = np.argmax(p.predictions, axis=1)

    def _f1(average):
        return f1_score(gold, predicted, average=average)

    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1_micro': _f1('micro'),
        'f1_macro': _f1('macro'),
        'f1_weighted': _f1('weighted'),
    }

## 5. Dummy Classifier (Most-Frequent Class) Initialization and Testing

In [5]:
# Features are the raw tweets; targets are the integer-encoded labels
X_train, y_train = train_df['tweet'], train_df['label']
X_test, y_test = test_df['tweet'], test_df['label']

# Dummy classifier baseline (strategy="most_frequent"): fit on the training split,
# then predict for the test split
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
y_pred = dummy_clf.predict(X_test)

## 6. Dummy Classifier Evaluation

In [6]:
# Scoring the baseline predictions against the test labels:
# plain accuracy plus the weighted-average F1
accuracy, weighted_f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print("Accuracy:", accuracy)
print("Weighted F1 Score:", weighted_f1)

Accuracy: 0.5599422243620606
Weighted F1 Score: 0.40198321415622007


## 7. Model Initialization

In [7]:
# Initializing multilingual cased BERT with a 3-way sequence-classification head
# (earlier alternative: DistilBertForSequenceClassification with 'distilbert-base-uncased', num_labels=3)
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 8. Training Configuration Completion and Trainer Initialization

In [8]:
# # Completing the training arguments configuration
# training_args = TrainingArguments(
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=4,
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     save_strategy="epoch",
#     logging_dir='./logs',
#     logging_steps=100,
#     do_train=True,
#     do_eval=True,
#     output_dir='./results',
#     overwrite_output_dir=True,
#     push_to_hub=False,
# )

# # Initialize Trainer with compute metrics function
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=dev_dataset,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )



# Updated Training Configuration
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=4,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     eval_strategy="epoch",  # Changed from evaluation_strategy
#     save_strategy="epoch",
#     logging_dir='./logs',
#     logging_steps=100,
#     do_train=True,
#     do_eval=True,
#     overwrite_output_dir=True,
#     push_to_hub=False,
#     # Additional parameters for better performance
#     gradient_accumulation_steps=2,
#     warmup_steps=500
# )

# Training configuration tuned for running on CPU
training_args = TrainingArguments(
    # --- outputs and checkpoints ---
    output_dir='./results',
    overwrite_output_dir=True,
    save_strategy="epoch",
    save_total_limit=1,                 # keep only the most recent checkpoint
    push_to_hub=False,
    # --- optimisation ---
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,      # small per-step batch for CPU
    gradient_accumulation_steps=8,      # gradients accumulated over 8 batches of 4
    # --- evaluation ---
    per_device_eval_batch_size=4,       # small batch for CPU
    eval_strategy="no",                 # no evaluation during training, to save time
    do_train=True,
    do_eval=True,
    # --- logging ---
    logging_dir='./logs',
    logging_steps=20,                   # frequent progress updates
    report_to="none",                   # turn off wandb reporting
    # --- data loading ---
    dataloader_num_workers=0,           # CPU setting
)

# Trainer wiring: model, configuration, train/dev splits and the metric callback
# (no tokenizer is passed to the Trainer)
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

## 9. Model Training

In [None]:
# Initiating the training process with the Trainer configured in the previous section
trainer.train()

Step,Training Loss


## 10. Model Evaluation

In [None]:

# Scoring the fine-tuned model on the test set.
# (To score the development set instead, pass dev_dataset to trainer.predict.)
#
# Trainer.evaluate() returns only a dict of metrics, so it has no `.predictions` or
# `.label_ids`. Trainer.predict() is used instead: its output carries the raw model
# outputs and the reference labels (needed for the confusion matrix) plus the metrics.
test_output = trainer.predict(test_dataset, metric_key_prefix="eval")
predicted_labels = test_output.predictions.argmax(-1)
true_labels = test_output.label_ids

# Metrics dict: the keys are those returned by compute_metrics, prefixed with "eval_"
results = test_output.metrics

# Printing evaluation results
print("Weighted F1 on Test Set:", results["eval_f1_weighted"])
print("Accuracy on Test Set:", results["eval_accuracy"])

## 11. Plotting Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Class names ordered by their integer id, so rows/columns follow label_mapping
class_names = sorted(label_mapping, key=label_mapping.get)
class_ids = [label_mapping[name] for name in class_names]

# Generating the confusion matrix from the test-set labels and predictions.
# Passing `labels` pins the matrix to one row/column per class in a fixed order,
# even if some class never appears among the true or predicted labels.
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=class_ids)

# Plotting the confusion matrix with class names on both axes
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='g',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix on Test Set')
plt.show()
