In [1]:
import emoji
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df=pd.read_csv("../resources/dataset_train_comments_v1.2.csv")

In [3]:
df=df.drop(columns="Unnamed: 0",axis=0)

In [4]:
df=df.dropna()

In [5]:
# Supponendo che la colonna attuale si chiami "category" e vuoi rinominarla in "labels"
df = df.rename(columns={"category": "labels"})

# Se df["labels"] contiene stringhe, prima mappale su numeri
label2id = {label: i for i, label in enumerate(df["labels"].unique())}
df["labels"] = df["labels"].map(label2id)


In [6]:
label2id

{'Neutrale': 0,
 'Positiva': 1,
 'Negativo': 2,
 'Discriminatorio': 3,
 'Complottismo': 4,
 'Allarmismo': 5,
 'Disinformazione': 6,
 'Estremismi ideologici': 7}

### Clean emoj

In [7]:
df["text_clean"] = df["text"].apply(lambda x: emoji.demojize(x))
df

Unnamed: 0,commentId,text,labels,text_clean
0,UgyYF8bf6AXRrjhGdJ14AaABAg,Peggiori condivido le tre che hai messo in sbo...,0,Peggiori condivido le tre che hai messo in sbo...
1,Ugxywle64-SAKNjBKS94AaABAg,Rido troppo coi dissing al Pisa hahaha ormai è...,1,Rido troppo coi dissing al Pisa hahaha ormai è...
2,UgyYuLWkoyFkl-uwfed4AaABAg,Man utd❤,1,Man utd:red_heart:
3,UgzA-XvP_yXKGfI2hE14AaABAg,adesso voglio anche il video per le terze,0,adesso voglio anche il video per le terze
4,UgzWWZLdUVVxJCgHRLh4AaABAg,Ciao Vito. Voglio un tuo parere per la Partita...,0,Ciao Vito. Voglio un tuo parere per la Partita...
...,...,...,...,...
1616,2b9d6ed2-82e5-487b-a71e-7cbf4e8e9bda,I’m vax forever voi morirete,5,I’m vax forever voi morirete
1617,72057cf5-7c95-40ee-a026-c5c53b4f13b5,I’m no vax forever e voi Es morirete,5,I’m no vax forever e voi Es morirete
1618,69ea6aba-d5e3-4aa5-8d40-614576767c4e,I’m no vax forever e morirete voi,5,I’m no vax forever e morirete voi
1619,23ea4c72-b573-41b4-b53b-a2c29af1df68,I’m forever e voi morirete,5,I’m forever e voi morirete


### Tokenizer

In [8]:
# Trasformo il dataframe in Dataset Hugging Face
dataset = Dataset.from_pandas(df)

# Carico il tokenizer del modello scelto
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenizzazione
def tokenize(batch):
    return tokenizer(batch["text_clean"], padding="max_length", truncation=True)

dataset = dataset.map(tokenize, batched=True)

# Split train/test
dataset = dataset.train_test_split(test_size=0.2)


Map: 100%|██████████| 1605/1605 [00:00<00:00, 5142.33 examples/s]


In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['commentId', 'text', 'labels', 'text_clean', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1284
    })
    test: Dataset({
        features: ['commentId', 'text', 'labels', 'text_clean', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 321
    })
})

### Fine Tune

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(set(df["labels"]))
)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,1.898811
2,No log,1.607034
3,No log,1.348364
4,No log,1.09905
5,No log,0.997841
6,No log,0.907654
7,1.285900,0.855616
8,1.285900,0.843061


TrainOutput(global_step=648, training_loss=1.117331893355758, metrics={'train_runtime': 284.208, 'train_samples_per_second': 36.143, 'train_steps_per_second': 2.28, 'total_flos': 1360850716459008.0, 'train_loss': 1.117331893355758, 'epoch': 8.0})

In [11]:
model.save_pretrained("./yc107-comment-classifier")
tokenizer.save_pretrained("./yc107-comment-classifier")

('./yc107-comment-classifier/tokenizer_config.json',
 './yc107-comment-classifier/special_tokens_map.json',
 './yc107-comment-classifier/vocab.txt',
 './yc107-comment-classifier/added_tokens.json',
 './yc107-comment-classifier/tokenizer.json')

SyntaxError: invalid syntax (1355118443.py, line 1)