In [None]:
pip install torch  transformers pandas scikit-learn

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.metrics import classification_report
import torch


In [17]:
# Locations of the train/test CSV files (plain strings; no interpolation needed).
train_file = "/content/train.csv"
test_file = "/content/test.csv"

# Load the datasets; both files are decoded as latin1.
train_df = pd.read_csv(train_file, encoding='latin1')
test_df = pd.read_csv(test_file, encoding='latin1')

# Pick CUDA when available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by the other cells visible here — confirm it is still needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained tokenizer that matches the roberta-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [18]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    Args:
        examples: Mapping that provides the text(s) under the key ``"text"``.

    Returns:
        Whatever the tokenizer returns when called with truncation enabled
        and ``padding="max_length"``.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# The CSVs are expected to provide the text in a 'text' column and the
# sentiment string in a 'sentiment' column; the latter is renamed to 'label'.
# If the column names are different, adjust the mapping accordingly.
train_df = train_df.rename(columns={"sentiment": "label"})
test_df = test_df.rename(columns={"sentiment": "label"})

# Keep only the two columns used downstream. `.copy()` yields independent
# frames, so assigning to 'label' later does not operate on a slice of the
# original frame.
train_df = train_df[['text', 'label']].copy()
test_df = test_df[['text', 'label']].copy()

In [19]:
# Preview the first five training rows as the cell's displayed result.
train_df.head(n=5)

Unnamed: 0,text,label
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [20]:
# Integer class id for each sentiment string; adjust to match your actual labels.
label_map = {"positive": 2, "neutral": 1, "negative": 0}

# Encode both splits the same way: values missing from label_map become -1.
for frame in (train_df, test_df):
    frame['label'] = frame['label'].map(label_map).fillna(-1).astype(int)

In [21]:
# Keep only the rows whose label is not the -1 placeholder.
train_df = train_df[train_df['label'].ne(-1)]
test_df = test_df[test_df['label'].ne(-1)]

In [22]:
# Remove rows that contain any missing value before tokenizing.
train_df = train_df.dropna()
test_df = test_df.dropna()

# Tokenize each split with truncation and padding enabled and max_length=128.
train_texts = train_df['text'].tolist()
test_texts = test_df['text'].tolist()
train_encodings = tokenizer(train_texts, max_length=128, padding=True, truncation=True)
test_encodings = tokenizer(test_texts, max_length=128, padding=True, truncation=True)

In [23]:
class SentimentDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name to an indexable collection of
            per-example values (one entry per example).
        labels: Indexable collection of labels, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for feature_name, values in self.encodings.items():
            sample[feature_name] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and label column in a SentimentDataset.
train_dataset = SentimentDataset(
    encodings=train_encodings,
    labels=list(train_df['label']),
)
test_dataset = SentimentDataset(
    encodings=test_encodings,
    labels=list(test_df['label']),
)


In [24]:
# Load roberta-base with a sequence-classification head sized for three labels.
model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=3,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Fine-tuning hyperparameters; checkpoints go to ./results and logs to ./logs,
# and external experiment reporting is switched off.
# NOTE(review): the recorded trainer.train() output reports 'epoch': 5.0, which
# does not match num_train_epochs=10 below — confirm which setting produced the
# reported metrics.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=50,
    weight_decay=0.001,
    logging_dir="./logs",
    report_to="none"
)


# Bundle the model, arguments, and datasets into a Trainer.
# NOTE(review): no evaluation schedule is set above, so eval_dataset is
# presumably only used when evaluation is requested explicitly — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated for Trainer.__init__; `processing_class` is
    # the replacement keyword.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [35]:
# Run fine-tuning; the value returned by train() is displayed as the cell result.
trainer.train()


Step,Training Loss
500,0.4587
1000,0.6035
1500,0.5792
2000,0.5309
2500,0.4714
3000,0.4469
3500,0.4561
4000,0.3589
4500,0.3602
5000,0.3528


TrainOutput(global_step=8590, training_loss=0.3699308122272958, metrics={'train_runtime': 3148.1517, 'train_samples_per_second': 43.645, 'train_steps_per_second': 2.729, 'total_flos': 7908202661587200.0, 'train_loss': 0.3699308122272958, 'epoch': 5.0})

In [37]:
import numpy as np

# Run inference on the test split. classification_report is already imported
# in the notebook's import cell.
predictions = trainer.predict(test_dataset)

# The prediction output already carries the reference labels, so there is no
# need to iterate the dataset (and build tensors for every example) again.
y_true = predictions.label_ids
# Predicted class = index of the largest score along the last axis.
y_pred = np.argmax(predictions.predictions, axis=-1)

print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

           0       0.77      0.81      0.79      1001
           1       0.76      0.75      0.75      1430
           2       0.85      0.83      0.84      1103

    accuracy                           0.79      3534
   macro avg       0.79      0.80      0.79      3534
weighted avg       0.79      0.79      0.79      3534

