In [1]:
!pip install sentencepiece
!pip install transformers
!pip install transformers accelerate
!pip install torch==1.11.00.676617

[31mERROR: Could not find a version that satisfies the requirement torch==1.11.00.676617 (from versions: 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1)[0m[31m
[0m[31mERROR: No matching distribution found for torch==1.11.00.676617[0m[31m
[0m

In [2]:
import pandas as pd
from sklearn import preprocessing
from sklearn.utils import shuffle
import torch.nn as nn
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import os


In [3]:
# Mount Google Drive so the dataset paths under /content/drive used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Dataframe preparation
# Load the training corpus from the mounted Drive.
df = pd.read_csv('/content/drive/My Drive/Backtranslation/Mizo/MIZO_to_DE.csv')

# Replace the raw 'Sentiment' values with integer class ids; `le` keeps the
# raw-label -> id mapping in `le.classes_`.
le = preprocessing.LabelEncoder()
df['Sentiment'] = le.fit_transform(df.Sentiment.values)

# Shuffle the rows with a fixed seed so this step yields the same row order
# on every run instead of a fresh random permutation.
df = shuffle(df, random_state=42)

In [5]:
# Hold out 20% of the rows for validation. The seed keeps the split itself
# from adding run-to-run variation.
# Note: despite the "*_labels_str" names, these labels are the integer ids
# produced by the LabelEncoder during dataframe preparation.
train_texts, val_texts, train_labels_str, val_labels_str = train_test_split(
    list(df['Text']),
    list(df['Sentiment']),
    test_size=.2,
    random_state=42,
)

In [6]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import torch

# Pretrained SentencePiece tokenizer matching the 'xlm-roberta-base' checkpoint.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Encode the training split first, then the validation split, with identical
# settings: over-long inputs are truncated and each split is padded.
train_encodings, val_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]



In [7]:
class XLMR_Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            collection (one entry per example).
        labels: Indexable collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [8]:
# The labels produced by the split are already integer class ids (they were
# encoded when the dataframe was prepared), so they are used as-is.
# Calling `le.fit_transform` on them again would refit the encoder on integer
# ids and throw away its raw-label -> id mapping, and `le.transform` on the
# validation labels would raise if a class were missing from the training split.
train_labels = list(train_labels_str)
val_labels = list(val_labels_str)

In [9]:
# Wrap each tokenized split together with its labels as a torch Dataset.
train_dataset = XLMR_Dataset(encodings=train_encodings, labels=train_labels)
val_dataset = XLMR_Dataset(encodings=val_encodings, labels=val_labels)

In [10]:
class XLMRClassifier(nn.Module):
    """XLM-RoBERTa encoder followed by dropout and a linear classification head.

    The hidden state at position 0 of each sequence is used as the sequence
    representation that feeds the classifier.

    Args:
        model_name: Identifier passed to ``XLMRobertaModel.from_pretrained``.
        num_labels: Number of output classes (width of the logits).
    """

    def __init__(self, model_name, num_labels):
        super(XLMRClassifier, self).__init__()
        self.num_labels = num_labels
        self.base_model = XLMRobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.base_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional target class ids; when given, the cross-entropy
                loss is computed as well.

        Returns:
            ``(loss, logits)`` when ``labels`` is provided, otherwise ``logits``.
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Take the first position of the last hidden layer as the pooled representation.
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is None:
            return logits

        # Flatten with the head's actual output width. A hard-coded class count
        # here silently mis-shapes the logits whenever num_labels != 2.
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, logits)

In [11]:
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernels synchronously,
# so device-side errors are reported at the call that caused them.
os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

In [12]:
model_name = 'xlm-roberta-base'
# Size the classification head from the classes the label encoder has seen
# instead of hard-coding the class count.
num_labels = len(le.classes_)  # Number of sentiment labels
xlmr_model = XLMRClassifier(model_name, num_labels)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [13]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation run.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class id).

    Returns:
        dict with the keys 'Accuracy', 'F1', 'Precision' and 'Recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Use macro averaging: with micro averaging on single-label classification,
    # precision, recall and F1 all equal accuracy, so the four reported numbers
    # would carry no extra information. zero_division=0 scores a class that is
    # never predicted as 0 without emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'Accuracy': acc, 'F1': f1, 'Precision': precision, 'Recall': recall}

In [None]:
# Training configuration: evaluate on the validation set every 10 steps and
# reload the best checkpoint once training finishes.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=15,
    per_device_eval_batch_size=15,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True,
    # NOTE(review): newer transformers releases spell this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    eval_steps=10
)

trainer = Trainer(
    model=xlmr_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Without this hook the Trainer reports only the loss; `compute_metrics`
    # was defined earlier in the notebook but never wired in.
    compute_metrics=compute_metrics,
    # Stop once 3 consecutive evaluations fail to improve the tracked metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# Final validation-set evaluation.
metrics = trainer.evaluate()
print(metrics)

In [None]:
# Held-out test set, stored as an Excel workbook on the mounted Drive.
test_path = "/content/drive/My Drive/SoftwareProject/Test_data_Chungli_ao.xlsx"
test_df = pd.read_excel(test_path)

In [None]:
test_texts = list(test_df['Text'])

# Encode the test labels with the mapping the encoder already holds whenever it
# recognises every test label. Refitting on the test set alone can assign
# different integer ids to the same classes (e.g. when the test set lacks a
# class), which would silently corrupt the reported test metrics.
raw_test_labels = test_df.Sentiment.values
if set(raw_test_labels).issubset(le.classes_):
    test_df['Sentiment'] = le.transform(raw_test_labels)
else:
    # The encoder does not know these labels (for instance because it was last
    # fit on integer ids rather than the raw labels): fit on the test labels.
    test_df['Sentiment'] = le.fit_transform(raw_test_labels)
test_labels = test_df['Sentiment']

In [None]:
# Tokenize the test sentences with the same truncation/padding settings used
# for the other splits, then pair them with their labels.
test_encodings = tokenizer(test_texts, padding=True, truncation=True)
test_dataset = XLMR_Dataset(encodings=test_encodings, labels=test_labels)

In [None]:
# Score the trained model on the test dataset instead of the validation set.
test_metrics = trainer.evaluate(eval_dataset=test_dataset)

In [None]:
# Show the metrics dictionary returned by the test-set evaluation.
print(test_metrics)