In [4]:
!pip install datasets transformers torch
!apt-get install git-lfs
from google.colab import files
uploaded = files.upload()
from sklearn.metrics import f1_score, accuracy_score
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


Saving data.csv to data (1).csv


In [6]:
#Loading the data
data = pd.read_csv('data.csv')

In [7]:
#Normalizing labels to the range [0, 100]
data['Score'] = data['Score'].apply(lambda x: int((x / data['Score'].max()) * 100))

In [8]:
#Converting DataFrame to Dataset
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.2)
dataset_dict = DatasetDict({
    'train': dataset['train'],
    'validation': dataset['test']
})

In [11]:
#tokenizing
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

def preprocess_function(examples):
    result = tokenizer(examples['Text'], truncation=True, padding=True)
    result['labels'] = examples['Score']
    return result

tokenized_train = dataset_dict['train'].map(preprocess_function, batched=True, remove_columns=['Text', 'Suggestion', 'Score'])
tokenized_validation = dataset_dict['validation'].map(preprocess_function, batched=True, remove_columns=['Text', 'Suggestion', 'Score'])

Map:   0%|          | 0/2608 [00:00<?, ? examples/s]

Map:   0%|          | 0/653 [00:00<?, ? examples/s]

In [12]:
#Checking label values
print(f"Unique labels in train set: {set(tokenized_train['labels'])}")
print(f"Unique labels in validation set: {set(tokenized_validation['labels'])}")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Unique labels in train set: {0, 4, 8, 10, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 90, 92, 94, 96, 98, 100}
Unique labels in validation set: {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 82, 84, 88, 90, 92, 94, 96, 100}


In [13]:
#Defining the model with the number of main labels
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=101)

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": acc, "f1": f1}


In [9]:
#Defining TrainingArguments
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        #Calculating model output and standard loss function
        outputs = model(**inputs)
        loss = outputs.loss
        logits = outputs.logits

        # Calculating L1 penalty
        l1_lambda =5e-9
        l1_norm = sum(p.abs().sum() for p in model.parameters())
        loss = loss + l1_lambda * l1_norm

        return (loss, outputs) if return_outputs else loss

# Configuring TrainingArguments
training_args = TrainingArguments(
    output_dir='./output',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=10,
    save_steps=10,
    save_total_limit=2
)


trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    data_collator=data_collator,
    compute_metrics=lambda eval_pred: {
        "accuracy": np.mean(np.argmax(eval_pred.predictions, axis=1) == eval_pred.label_ids),
        "f1": np.mean(
            (np.argmax(eval_pred.predictions, axis=1) == eval_pred.label_ids).astype(int)
        )
    }
)
# training model
trainer.train()
# saving model
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,3.0254,3.024453,0.154671,0.154671
2,2.7788,2.848267,0.171516,0.171516
3,2.8785,2.788878,0.17611,0.17611
4,2.7136,2.747259,0.194487,0.194487
5,2.6036,2.706863,0.20827,0.20827
6,2.6228,2.693949,0.199081,0.199081
7,2.4373,2.692162,0.188361,0.188361
8,2.4252,2.729591,0.17611,0.17611
9,2.403,2.706671,0.183767,0.183767
10,2.3292,2.712035,0.188361,0.188361


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [10]:
from transformers import pipeline

#Creating a pipeline for sentiment analysis using the trained model
sentiment_model = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

#Using the model to predict the sentiment of input sentences
print(sentiment_model(['خیلی افتضاحه', 'عالی']))



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'LABEL_100', 'score': 0.29095637798309326}, {'label': 'LABEL_100', 'score': 0.3948359787464142}]
