In [None]:
# Install the Python dependencies into the environment of the running kernel
# (%pip targets the kernel's interpreter, unlike a bare shell `pip`).
%pip install datasets transformers torch
# git-lfs is used when uploading the large model files to the Hub.
# -y answers the confirmation prompt, which a notebook cell cannot do interactively.
!apt-get install -y git-lfs
# Upload the training CSV from the local machine through the Colab file picker.
# `uploaded` holds whatever files.upload() returns; later cells read the file
# from disk by name instead of using this variable.
from google.colab import files
uploaded = files.upload()
# NOTE(review): the second `transformers` import below repeats every name from the
# first one and additionally brings in DataCollatorWithPadding.
from sklearn.metrics import f1_score, accuracy_score
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub (interactive widget) so that the
# trained model can be pushed later in the notebook.
notebook_login()

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


Saving data.csv to data.csv


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Read the uploaded CSV into a DataFrame.
# Later cells use its 'Text', 'Suggestion' and 'Score' columns.
csv_path = 'data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Rescale scores to integer labels so that the largest observed score maps to 100.
# - The maximum is computed once, not once per row as a per-row lambda would do.
# - Values are rounded before the integer cast: plain truncation turns float error
#   such as 0.29 * 100 == 28.999999999999996 into 28 instead of 29.
# - With rounding, re-running this cell on already-normalized labels leaves them unchanged.
max_score = data['Score'].max()
data['Score'] = (data['Score'] / max_score * 100).round().astype(int)

In [None]:
# Convert the DataFrame to a Hugging Face Dataset and hold out 20% for validation.
# A fixed seed makes the split, and therefore the reported metrics, reproducible
# across kernel restarts.
SPLIT_SEED = 42
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
# train_test_split names the held-out part 'test'; expose it as 'validation'.
dataset_dict = DatasetDict({
    'train': dataset['train'],
    'validation': dataset['test']
})

In [None]:
# Tokenizing
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

def preprocess_function(examples):
    """Tokenize a batch of rows and attach their scores as labels.

    Args:
        examples: Batch mapping passed by ``Dataset.map(batched=True)``; the
            'Text' and 'Score' columns are read.

    Returns:
        The tokenizer output for ``examples['Text']`` with an added ``'labels'``
        entry holding ``examples['Score']``.
    """
    # Only truncate here. Padding is left to the DataCollatorWithPadding used at
    # training time, which pads each training batch to its own longest sequence
    # instead of to the longest sequence of a whole map() chunk.
    result = tokenizer(examples['Text'], truncation=True)
    result['labels'] = examples['Score']
    return result

# The raw columns are dropped after tokenization so only model inputs and labels remain.
raw_columns = ['Text', 'Suggestion', 'Score']
tokenized_train = dataset_dict['train'].map(preprocess_function, batched=True, remove_columns=raw_columns)
tokenized_validation = dataset_dict['validation'].map(preprocess_function, batched=True, remove_columns=raw_columns)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/2608 [00:00<?, ? examples/s]

Map:   0%|          | 0/653 [00:00<?, ? examples/s]

In [None]:
# Sanity check: list the distinct label values present in each split.
train_label_values = set(tokenized_train['labels'])
validation_label_values = set(tokenized_validation['labels'])
print(f"Unique labels in train set: {train_label_values}")
print(f"Unique labels in validation set: {validation_label_values}")

# Collator that pads the examples of each batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Unique labels in train set: {0, 4, 8, 10, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 82, 84, 88, 90, 92, 94, 96, 98, 100}
Unique labels in validation set: {0, 4, 8, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 94, 96, 100}


In [None]:
# Load the pretrained checkpoint with a freshly initialised classification head.
# One output class per integer label value 0..100 gives 101 classes.
checkpoint_name = "distilbert-base-multilingual-cased"
label_count = 101
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=label_count)

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks as ``(predictions, labels)``. The predicted
            class of each row is the ``argmax`` of ``predictions`` over axis 1.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(references, predicted_classes),
        "f1": f1_score(references, predicted_classes, average='weighted'),
    }


In [None]:
# Custom Trainer that adds an L1 penalty to the model's loss.
class CustomTrainer(Trainer):
    """Trainer whose training loss is the model loss plus an L1 penalty.

    The penalty is ``l1_lambda`` times the sum of absolute values of all model
    parameters.
    """

    # Weight of the L1 penalty term; override on a subclass or instance to tune it.
    l1_lambda = 5e-9

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the model loss augmented with the L1 penalty.

        Args:
            model: Model to run; called as ``model(**inputs)``.
            inputs: Batch of keyword inputs for the model. It must include the
                labels so that ``outputs.loss`` is populated.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted so that Trainer versions which pass
                this argument can call the override; it is not used here.

        Returns:
            The penalised loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        # Forward pass; the model computes its standard loss from the labels.
        outputs = model(**inputs)

        # L1 penalty over every parameter of the model.
        l1_norm = sum(p.abs().sum() for p in model.parameters())
        loss = outputs.loss + self.l1_lambda * l1_norm

        return (loss, outputs) if return_outputs else loss

# Training configuration, grouped by purpose.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='fatemehramezani/my_awesome_model',

    # Optimisation.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.001,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Evaluate and checkpoint once per epoch; keep at most two checkpoints.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_steps=10,
    save_total_limit=2,

    # Reload the checkpoint with the highest "f1" when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # Misc.
    logging_steps=10,
    remove_unused_columns=False
)


# Build the Trainer.
# Metrics come from compute_metrics (accuracy and weighted F1). An inline lambda
# that reports plain accuracy under the "f1" key would make the F1 column, and the
# best-checkpoint selection that relies on it, a copy of accuracy.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model.
trainer.train()

# Save the model weights and the tokenizer files side by side in one local directory.
save_dir = './saved_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.7379,2.700971,0.192956,0.192956
2,2.5556,2.690208,0.185299,0.185299
3,2.3675,2.709349,0.18683,0.18683


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [None]:
# Upload the trainer's model and training artifacts to the Hugging Face Hub.
# This requires the Hub login performed earlier in the notebook.
trainer.push_to_hub()

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

events.out.tfevents.1723878921.923924bc5c95.336.2:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/fatemehramezani/my_awesome_model/commit/b68c7c9f42adcc32f1f3714a5ac9635600bac9d7', commit_message='End of training', commit_description='', oid='b68c7c9f42adcc32f1f3714a5ac9635600bac9d7', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
import os

# Verify that the local model directory was written, and list its contents.
model_dir = './saved_model'
if not os.path.exists(model_dir):
    print("Model directory does not exist.")
else:
    print("Model directory exists.")
    print("Files:", os.listdir(model_dir))


Model directory exists.
Files: ['vocab.txt', 'tokenizer.json', 'special_tokens_map.json', 'model.safetensors', 'tokenizer_config.json', 'config.json']


In [None]:
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a "sentiment-analysis" pipeline.
# Passing the model's current device explicitly keeps inference on the device
# the weights already live on, instead of relying on the pipeline's default
# (which logs a warning when an accelerator is available but no device is given).
sentiment_model = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=model.device)

# Predict on two sample sentences. Each result carries a 'label' of the form
# LABEL_<k> and a 'score' for that label.
print(sentiment_model(['خیلی افتضاحه', 'عالی']))



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'LABEL_60', 'score': 0.27455851435661316}, {'label': 'LABEL_100', 'score': 0.2452296018600464}]


In [None]:
# Push to the Hub again.
# NOTE(review): this repeats the earlier push_to_hub() call; confirm whether a second upload is intended.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/fatemehramezani/my_awesome_model/commit/2e50d90b188733a9b559aa322b1dbf920f7c4930', commit_message='End of training', commit_description='', oid='2e50d90b188733a9b559aa322b1dbf920f7c4930', pr_url=None, pr_revision=None, pr_num=None)