In [95]:
# Here we are  mounting the Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [96]:
# we are importing pandas library.
import pandas as pd

In [97]:
#Reading the xlsx file using pandas
df = pd.read_excel("/content/drive/MyDrive/Data_DL_project/Spanish_data_sets/train_spanish2_dataframe.xlsx")

In [98]:
#we are printing the df.
df

Unnamed: 0,text,label
0,Esta película es una pérdida de tiempo total,negative
1,Esta película es una verdadera obra de arte de...,positive
2,el guioncillo es de ser lo menos,negative
3,cuida muchos detalles,positive
4,Esta película es visualmente impresionante,positive
...,...,...
318,Esta película es un drama conmovedor que te to...,positive
319,Una historia deliciosa,positive
320,llena de acción y efectos especiales asombrosos.,positive
321,Esta pelicula es un mito,positive


In [99]:
# we are converting the labels to numbers using fit transform.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['labels'] = le.fit_transform(df.label.values)

In [100]:
# we are printing the head of the df
df.head()

Unnamed: 0,text,label,labels
0,Esta película es una pérdida de tiempo total,negative,0
1,Esta película es una verdadera obra de arte de...,positive,1
2,el guioncillo es de ser lo menos,negative,0
3,cuida muchos detalles,positive,1
4,Esta película es visualmente impresionante,positive,1


In [101]:
#we are counting the number of positive and negative labels in df.
positive_labels = (df['labels'] == 1).sum()
negative_labels = (df['labels'] == 0).sum()

In [102]:
# we are printing the number of positive and negative labels in the df.
print(f"Number of positive labels: {positive_labels}")
print(f"Number of negative labels: {negative_labels}")

Number of positive labels: 161
Number of negative labels: 162


In [103]:
#Shuffling the data using the sklearn.
from sklearn.utils import shuffle
df = shuffle(df)

In [104]:
#seeing the shuffeled data in df
df

Unnamed: 0,text,label,labels
173,desde el principio ya pintaba mal,negative,0
132,decir eso no resta misterio al bodrio este,negative,0
197,soy un gran fan,positive,1
9,el mercado del cine no deberia seguir,negative,0
104,la pelicula se hace pesada en algunos tramos,negative,0
...,...,...,...
188,Ha sido sencillamente agradable,positive,1
71,creo que no puede ser mas frustrante,negative,0
106,Puro y duro entretenimiento,positive,1
270,Yo no se de que va Spielberg pero a este paso ...,negative,0


In [105]:
#Using the train test split of sk learn to split the data in train and validation sets.
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(list(df['text']), list(df['labels']), test_size=.2)

In [106]:
#Installing sentencepiece
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [107]:
#Installing transformers.
!pip install transformers==4.6.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [108]:
#Installing torch.
!pip install torch==1.7.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement torch==1.7.0 (from versions: 1.7.1, 1.8.0, 1.8.1, 1.9.0, 1.9.1, 1.10.0, 1.10.1, 1.10.2, 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 2.0.0)[0m[31m
[0m[31mERROR: No matching distribution found for torch==1.7.0[0m[31m
[0m

In [109]:

from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import torch
# here we load the pre-trained XLM-RoBERTa tokenizer for the 'xlm-roberta-base' model.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
#It loads the pre-trained XLM-RoBERTa model for sequence classification with two output labels.
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base',num_labels=2)
#print(model)
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

# Here this tokenizes the validation texts using the same pre-trained tokenizer as used for the training texts.
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [110]:
# It takes in tokenized encodings and labels as arguments. 
# It creates two instances of this class, train_dataset and val_dataset, for training and validation.
class spanish_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = spanish_Dataset(train_encodings, train_labels)
val_dataset = spanish_Dataset(val_encodings, val_labels)

In [111]:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {'Accuracy': acc,
            'F1': f1,
            'Precision': precision,
            'Recall': recall
           }

In [112]:
# Here we are importing the EarlyStoppingCallback, IntervalStrategy, Trainer, and TrainingArguments classes from the transformers library.
from transformers import EarlyStoppingCallback, IntervalStrategy,Trainer, TrainingArguments

In [113]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    do_eval=True,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    #EarlyStoppingCallback = EarlyStoppingCallback(early_stopping_patience=3)
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset= val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


In [114]:
# training and validation datasets and training arguments using trainer model.
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=165, training_loss=0.7077827684807055, metrics={'train_runtime': 38.7397, 'train_samples_per_second': 4.259, 'total_flos': 64562092189200.0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': -768008192, 'init_mem_gpu_alloc_delta': 1112779776, 'init_mem_cpu_peaked_delta': 768008192, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 53248, 'train_mem_gpu_alloc_delta': 3344160256, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 769659392})

In [115]:
#Evaluating metrics on validation dataset.
metrics=trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))


In [116]:
#printing metrics on validation data.
metrics

{'eval_loss': 0.668229341506958,
 'eval_Accuracy': 0.6461538461538462,
 'eval_F1': 0.39252336448598135,
 'eval_Precision': 0.3230769230769231,
 'eval_Recall': 0.5,
 'eval_runtime': 0.4746,
 'eval_samples_per_second': 136.943,
 'epoch': 5.0,
 'eval_mem_cpu_alloc_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_peaked_delta': 16048640}

In [117]:
# loading the test set.
test_df = pd.read_excel("/content/drive/MyDrive/Data_DL_project/Spanish_data_sets/test_spanish2_dataframe.xlsx")

In [118]:
#length of test set.
len(test_df)

76

In [119]:
# Count the number of positive and negative labels in test set
positive_labels = (test_df['label'] == "positive").sum()
negative_labels = (test_df['label'] == "negative").sum()

In [120]:
#Printing the number of positive and negative labels in test set.
print(f"Number of positive labels: {positive_labels}")
print(f"Number of negative labels: {negative_labels}")

Number of positive labels: 38
Number of negative labels: 38


In [121]:

test_texts = list(test_df['text'])

In [122]:
# here the code converts the labels of a test dataset into numerical values.
test_labels = le.transform(test_df['label'])

In [123]:

test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = spanish_Dataset(test_encodings, test_labels)

In [124]:
# here we are evaluating a model using the Trainer object trainer on a test dataset.
test_metrics = trainer.evaluate(test_dataset)

In [125]:
#Printing test_metrics of test set.
test_metrics

{'eval_loss': 0.6828727126121521,
 'eval_Accuracy': 0.5394736842105263,
 'eval_F1': 0.4155130740496594,
 'eval_Precision': 0.7602739726027397,
 'eval_Recall': 0.5394736842105263,
 'eval_runtime': 0.4173,
 'eval_samples_per_second': 182.122,
 'epoch': 5.0,
 'eval_mem_cpu_alloc_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_peaked_delta': 7040000}