# **Preprocessing**

In [1]:
!pip install datasets transformers scikit-learn torch

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Do

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, EarlyStoppingCallback, Trainer
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn

In [5]:
# Read the comment/sentiment CSV. The single file is registered under the
# split name "data", which is then pulled out as a plain Dataset.
path = "comments_256.csv"
raw_splits = load_dataset("csv", data_files={"data": path})
dataset = raw_splits["data"]

In [6]:
# Hold out 10% of the rows for validation; the fixed seed makes the split reproducible.
# The split result gets its own name so `dataset` keeps referring to the full,
# unsplit data and this cell can be re-run without error.
splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = splits["train"]
val_dataset = splits["test"]

In [7]:
# Show the training split's summary (column names and row count) as a sanity check.
train_dataset

Dataset({
    features: ['Comment', 'Sentiment'],
    num_rows: 73943
})

In [8]:
# Integer class id assigned to each sentiment string in the "Sentiment" column.
# Defined once at module level instead of being rebuilt for every row.
SENTIMENT_TO_ID = {"Negative": 0, "Neutral": 1, "Positive": 2}


def map_sentiment(example):
    """Attach an integer ``label`` to a single dataset row.

    Args:
        example: Mapping for one row; must contain a ``"Sentiment"`` entry
            holding one of ``"Negative"``, ``"Neutral"`` or ``"Positive"``.

    Returns:
        The same ``example`` object, with ``example["label"]`` set to 0, 1 or 2.

    Raises:
        KeyError: If ``"Sentiment"`` is missing, or its value is not one of the
            three known sentiment strings (the message lists the valid values).
    """
    sentiment = example["Sentiment"]
    try:
        example["label"] = SENTIMENT_TO_ID[sentiment]
    except KeyError:
        # Re-raise with context: a bare KeyError('positive') deep inside
        # Dataset.map is hard to trace back to a dirty CSV value.
        raise KeyError(
            f"Unexpected sentiment {sentiment!r}; expected one of {sorted(SENTIMENT_TO_ID)}"
        ) from None
    return example

In [9]:
# Add the integer "label" column to both splits (train first, then validation).
train_dataset, val_dataset = (
    split.map(map_sentiment) for split in (train_dataset, val_dataset)
)

In [10]:
# Hugging Face Hub checkpoint id; the same name is reused later to load the model weights.
model_name = "AmaanP314/youtube-xlm-roberta-base-sentiment-multilingual"
# Tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
def tokenize_function(examples):
    """Tokenize the ``"Comment"`` field of a batch of rows.

    Every comment is truncated and padded to exactly 256 tokens, so all
    encodings share one fixed length.

    Args:
        examples: Batch of rows; ``examples["Comment"]`` holds the texts.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    comments = examples["Comment"]
    encoding_options = dict(truncation=True, padding="max_length", max_length=256)
    return tokenizer(comments, **encoding_options)

In [12]:
# Tokenize both splits, handing rows to tokenize_function in batches.
train_dataset, val_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
]

Map:   0%|          | 0/8216 [00:00<?, ? examples/s]

In [13]:
# Expose only the model inputs and the target as torch tensors on both splits.
columns_to_keep = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, val_dataset):
    split.set_format(type="torch", columns=columns_to_keep)

In [14]:
# Number of target classes (the sentiment mapping above uses ids 0, 1 and 2).
num_labels = 3
# Load the checkpoint with a sequence-classification head of that size.
# NOTE(review): the checkpoint name suggests it is already a sentiment classifier —
# confirm its existing class order matches Negative=0 / Neutral=1 / Positive=2.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# **Fine-tuning**


In [15]:
class CustomTrainer(Trainer):
    """Trainer variant whose loss is cross-entropy with label smoothing of 0.1."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the label-smoothed cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch mapping of model inputs; must contain ``"labels"``.
                The mapping itself is not modified.
            return_outputs: When True, return ``(loss, outputs)`` instead of
                just ``loss``.
            **kwargs: Extra keyword arguments passed by the Trainer (for
                example ``num_items_in_batch`` in recent transformers
                releases); accepted for compatibility and ignored.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` when
            ``return_outputs`` is True.

        Raises:
            ValueError: If the batch carries no ``"labels"`` entry.
        """
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError("CustomTrainer.compute_loss requires 'labels' in the batch inputs.")

        # Run the forward pass WITHOUT labels. Otherwise the model also computes
        # its own (unsmoothed) loss, which is wasted work, and for tuple outputs
        # element 0 would be that loss rather than the logits.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]

        loss_fct = nn.CrossEntropyLoss(label_smoothing=0.1)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [15]:
pip install 'accelerate>=0.26.0'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate>=0.26.0
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.6.0-py3-none-any.whl (354 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.6.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
from transformers import TrainingArguments

# Training configuration.
# Effective train batch per device = per_device_train_batch_size *
# gradient_accumulation_steps = 32 * 4 = 128 samples per optimizer step.
training_args = TrainingArguments(
    output_dir="./result",
    eval_strategy="steps",
    eval_steps=125,                  # evaluate every 125 optimizer steps
    save_steps=125,                  # checkpoint on the same cadence as evaluation
    save_total_limit=2,              # cap checkpoints on disk; the best one is still retained
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=1e-5,
    weight_decay=0.05,
    gradient_accumulation_steps=4,
    fp16=True,                       # half-precision mixed training
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="loss",    # explicit: this is also what early stopping monitors
    warmup_steps=500,
    report_to="none",
)


In [17]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            arg-max of the logits over the last axis.

    Returns:
        ``{"accuracy": <accuracy_score of labels vs. predicted classes>}``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted_classes)
    return {"accuracy": accuracy}

In [18]:
# Early-stopping callback configured with a patience of 5.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

# Wire model, arguments, data splits, metric function and callback together.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [19]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
125,0.7637,0.735453,0.729674
250,0.7289,0.711021,0.74574
375,0.7155,0.698165,0.757181
500,0.6912,0.700488,0.755842
625,0.6851,0.682057,0.767892
750,0.6606,0.689664,0.766188
875,0.6464,0.683783,0.766796
1000,0.6542,0.667605,0.774464
1125,0.6501,0.660234,0.780428
1250,0.6374,0.672968,0.778116


TrainOutput(global_step=1731, training_loss=0.6625320582221795, metrics={'train_runtime': 1732.4042, 'train_samples_per_second': 128.047, 'train_steps_per_second': 0.999, 'total_flos': 2.9137969219451904e+16, 'train_loss': 0.6625320582221795, 'epoch': 2.99524015577672})

In [20]:
# Write the fine-tuned model and its tokenizer into one shared directory.
export_dir = "./youtube_sentiment_model_telugu"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

('./youtube_sentiment_model_telugu/tokenizer_config.json',
 './youtube_sentiment_model_telugu/special_tokens_map.json',
 './youtube_sentiment_model_telugu/tokenizer.json')