In [1]:
!pip install transformers
!pip install sklearn



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, http

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
# from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch


In [35]:

# Location of the input CSV on the mounted Google Drive; replace this with the
# path to your own file. Later cells read the "sentiment_polarity" and
# "processed_text" columns from it.
csv_path = "/content/drive/MyDrive/all.csv"


# Load the whole CSV into a DataFrame.
data = pd.read_csv(csv_path)

# Map a numeric sentiment polarity to a categorical sentiment label
def label_sentiment(polarity):
    """Translate a numeric polarity score into a sentiment label.

    Returns "positive" for values above zero, "negative" for values below
    zero, and "neutral" for anything else (zero, or a value such as NaN that
    compares as neither greater nor less than zero).
    """
    if polarity > 0:
        return "positive"
    if polarity < 0:
        return "negative"
    return "neutral"

# Derive a categorical "sentiment_label" column by applying label_sentiment to
# every value of "sentiment_polarity". This adds the column to `data` in place.
data["sentiment_label"] = data["sentiment_polarity"].apply(label_sentiment)

# Plain Python lists of the model inputs and their string labels, in row order.
# NOTE(review): presumably "processed_text" has no missing values; a NaN entry
# would reach the tokenizer as a float rather than a string — confirm against
# the CSV.
texts = data["processed_text"].tolist()
sentiments = data["sentiment_label"].tolist()

# Tokenizer for the "microsoft/deberta-base" checkpoint. encode() reads this
# module-level name.
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")




In [36]:
def encode(texts, labels, max_length=64):
    """Tokenize texts and convert sentiment labels into model-ready tensors.

    Args:
        texts: Iterable of strings to tokenize with the module-level
            ``tokenizer``.
        labels: Iterable of sentiment names ("positive", "negative" or
            "neutral"), one per text.
        max_length: Number of tokens every sequence is truncated or padded
            to. Defaults to 64.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``. The first two are the
        tokenizer's "input_ids" and "attention_mask" tensors, with one row
        per text and ``max_length`` columns. ``labels`` is a 1-D
        ``torch.long`` tensor of class indices (positive=0, negative=1,
        neutral=2).

    Raises:
        ValueError: If ``texts`` is empty or its length differs from that of
            ``labels``.
        KeyError: If a label is not one of the three known names.
    """
    # Mapping from label name to class index.
    label_map = {"positive": 0, "negative": 1, "neutral": 2}

    # Materialize the inputs so they can be measured and handed to the
    # tokenizer as a plain list of strings.
    texts = list(texts)
    labels = list(labels)
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )
    if not texts:
        raise ValueError("encode() requires at least one text")

    # Class indices are integer targets, so they are stored as torch.long:
    # that is the dtype classification losses such as CrossEntropyLoss expect.
    # Converting labels first also fails fast on an unknown label, before the
    # comparatively expensive tokenization.
    label_ids = torch.tensor([label_map[label] for label in labels], dtype=torch.long)

    # Tokenize the whole batch in one call. Every sequence is truncated or
    # padded to exactly max_length, so the results are rectangular tensors.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    return encoded["input_ids"], encoded["attention_mask"], label_ids




In [37]:
# Split texts and labels into train and test portions with a fixed seed.
# NOTE(review): test_size=0.7 holds out 70% of the rows for evaluation and
# trains on only 30% — confirm this is intended (0.3 is the more usual choice).
# The split is also not stratified by label.
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, sentiments, test_size=0.7, random_state=42)


In [38]:
# Tokenize each split and convert its labels to tensors.
# Note: train_labels / test_labels are rebound here from lists of label
# strings to tensors, so re-running this cell on its own would pass tensors
# where encode() expects label strings. Re-run the split cell first.
train_input_ids, train_attention_masks, train_labels = encode(train_texts, train_labels)
test_input_ids, test_attention_masks, test_labels = encode(test_texts, test_labels)


In [39]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset over pre-encoded inputs.

    Holds three parallel, index-aligned collections and serves one example at
    a time as a dict keyed by "input_ids", "attention_mask" and "labels".
    """

    def __init__(self, input_ids, attention_masks, labels):
        # Stored exactly as given; nothing is copied or validated.
        self.labels = labels
        self.attention_masks = attention_masks
        self.input_ids = input_ids

    def __len__(self):
        # The number of examples is taken from input_ids alone.
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Pick the idx-th entry from each collection.
        example = dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_masks[idx],
            labels=self.labels[idx],
        )
        return example


In [40]:
# Wrap the encoded tensors of each split in a CustomDataset; these are the
# objects handed to the Trainer as train_dataset and eval_dataset.
train_dataset = CustomDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = CustomDataset(test_input_ids, test_attention_masks, test_labels)


In [41]:
from transformers import TrainingArguments

# Hyperparameters and output locations for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"  # Disable external reporting integrations; the training loss is still logged every logging_steps steps
)
# Load the pretrained DeBERTa checkpoint with a sequence-classification head
# sized for the three sentiment classes.
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=3)  # Use num_labels=3 as you have 3 classes


Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.d

In [42]:


from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute weighted classification metrics for an evaluation run.

    Args:
        pred: Object exposing ``label_ids`` (the true class indices) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        Dict with keys "eval_accuracy", "eval_precision", "eval_recall" and
        "eval_f1". Precision, recall and F1 use weighted averaging with
        ``zero_division=0``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element returned by the helper is not used.
    prec, rec, f_score, _unused = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )

    metrics = {"eval_accuracy": accuracy_score(y_true, y_pred)}
    metrics["eval_precision"] = prec
    metrics["eval_recall"] = rec
    metrics["eval_f1"] = f_score
    return metrics




In [43]:
from transformers import Trainer

# Bundle the model, training configuration, datasets and metric function.
# test_dataset serves as the evaluation set, and compute_metrics is supplied
# as the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Run the training loop configured by training_args.
trainer.train()




Step,Training Loss
10,1.0861
20,1.0717
30,1.0865
40,1.0631
50,1.0098
60,1.0091
70,1.0139
80,1.0929
90,0.9821
100,0.9582


TrainOutput(global_step=4260, training_loss=0.1834370637730916, metrics={'train_runtime': 738.757, 'train_samples_per_second': 46.05, 'train_steps_per_second': 5.766, 'total_flos': 1303816336704000.0, 'train_loss': 0.1834370637730916, 'epoch': 10.0})

In [44]:

# Evaluate on the eval_dataset given to the Trainer (test_dataset) and pull
# the individual metrics out of the returned dict by their "eval_*" keys.
eval_results = trainer.evaluate()
accuracy = eval_results["eval_accuracy"]
precision = eval_results["eval_precision"]
recall = eval_results["eval_recall"]
f1_score = eval_results["eval_f1"]


# Report each metric to four decimal places.
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")




Precision: 0.9494
Recall: 0.9494
F1 Score: 0.9489
Accuracy: 0.9494
