<a href="https://colab.research.google.com/github/firefive555/testColab/blob/main/Copia_di_multilabel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install matplot
!pip install torch
!pip install datasets
!pip install evaluate
!pip install accelerate
!pip install optuna

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV loaded further down is read from
# /content/emit_train_A.csv, which is outside this mount point — confirm the
# mount is still required.
from google.colab import drive
drive.mount('/content/drive')

# Multi-label emotion classification: setup, data preparation and hyperparameter search

In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel,AutoModelForSequenceClassification, BertForSequenceClassification, TrainingArguments, Trainer
import torch
from transformers import pipeline, EvalPrediction
from datasets import load_dataset, load_metric
import evaluate
from accelerate.utils import write_basic_config
from sklearn.model_selection import train_test_split
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import optuna
# Write accelerate's basic configuration, requesting fp16 mixed precision.
write_basic_config(mixed_precision="fp16")

In [None]:
# Load the tokenizer belonging to the UmBERTo checkpoint; the same checkpoint
# name is used when the classification model is instantiated in objective().
tokenizer = AutoTokenizer.from_pretrained("Musixmatch/umberto-wikipedia-uncased-v1")


In [None]:
# Names of the ten emotion categories.
# NOTE(review): this list is not referenced anywhere else in the visible
# notebook — the label names actually used are derived from the dataset
# columns (see `labels`). Confirm whether it can be removed.
attributes = ["Anger", "Joy" , "Disgust" , "Neutral" , "Surprise" , "Sadness" , "Fear" , "Trust" , "Anticipation" , "Love"]

In [None]:
# Read the training CSV and hold out 20% of the rows as a test split.
# The fixed seed keeps the split identical across runs.
dataset = load_dataset(
    'csv',
    data_files="/content/emit_train_A.csv",
    split='train',
).train_test_split(test_size=0.2, seed=0)


In [None]:
# Every column other than the raw text and the row id is treated as a label.
labels = [column for column in dataset['train'].features if column not in ('text', 'id')]
# Index <-> name lookup tables; both are handed to the model in objective().
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}
labels

In [None]:
def preprocess_data(examples):
  """Tokenize a batch of examples and attach a multi-hot label matrix.

  Args:
    examples: a batch mapping column name -> list of values; must contain
      "text" plus one column per entry in the module-level ``labels``.

  Returns:
    The tokenizer output for the batch with an extra "labels" entry: a list
    of rows, one per text, each holding one float per label in ``labels``
    order.
  """
  texts = examples["text"]
  # Pad/truncate every text to exactly 128 tokens.
  encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=128)

  # One row per text, one column per label; np.zeros yields floats.
  target = np.zeros((len(texts), len(labels)))
  for column, name in enumerate(labels):
    target[:, column] = examples[name]

  encoded["labels"] = target.tolist()
  return encoded

In [None]:
# Run preprocess_data over both splits in batches. The original columns are
# dropped, so only what preprocess_data returns is kept.
encoded_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

In [None]:
# Switch the output format of the encoded splits to "torch" (in place).
encoded_dataset.set_format("torch")

In [None]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, ROC AUC and accuracy for multi-label logits.

    Args:
        predictions: raw logits, array-like of shape (batch_size, num_labels).
        labels: ground-truth binary indicator matrix with the same shape.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with the keys 'f1' (micro-averaged F1 on thresholded
        predictions), 'roc_auc' (micro-averaged ROC AUC on the sigmoid
        probabilities) and 'accuracy' (accuracy_score on thresholded
        predictions).
    """
    # Logits -> independent per-label probabilities.
    logits = torch.as_tensor(predictions, dtype=torch.float32)
    probs = torch.sigmoid(logits).detach().cpu().numpy()
    # Probabilities -> hard 0/1 decisions (kept as floats, as before).
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric: it has to be scored on the continuous
    # probabilities. Feeding it the thresholded 0/1 predictions collapses the
    # ROC curve to a single operating point and misreports the score.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Adapter between Trainer's EvalPrediction and multi_label_metrics.

    Args:
        p: evaluation output; ``p.predictions`` is either the logits or a
            tuple whose first element is used as the logits, and
            ``p.label_ids`` holds the reference labels.

    Returns:
        The metrics dict produced by multi_label_metrics.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        # Only the first element of a tuple output is scored.
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)


In [None]:
# Per-device batch size, used for both training and evaluation in objective().
batch = 32
# Metric key handed to TrainingArguments as `metric_for_best_model`.
metric_name = "f1"

In [None]:
def objective(trial):
  """Optuna objective: fine-tune the classifier once and return its eval F1.

  Samples the number of training epochs and the learning rate from
  log-uniform ranges, fine-tunes a freshly loaded model on
  ``encoded_dataset["train"]`` and evaluates it on ``encoded_dataset["test"]``.

  Args:
    trial: the Optuna trial used to sample the hyperparameters.

  Returns:
    The ``eval_f1`` value reported by ``trainer.evaluate()``.
  """
  # suggest_float(..., log=True) is the supported replacement for the
  # deprecated suggest_loguniform; it samples from the same log-uniform
  # distribution and keeps the same parameter names in study.best_params.
  epoch = trial.suggest_float("epoch", 1, 15, log=True)
  learning = trial.suggest_float("learning_rate", 5e-6, 5e-4, log=True)

  # Load a fresh model for every trial so each one starts from the
  # pretrained checkpoint.
  model = AutoModelForSequenceClassification.from_pretrained("Musixmatch/umberto-wikipedia-uncased-v1",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)
  # No explicit model.to(...) here: Trainer moves the model to its own device.
  arg = TrainingArguments(
      "bert-finetuned-sem_eval-english",
      # NOTE(review): recent transformers releases name this argument
      # `eval_strategy` — confirm against the installed version.
      evaluation_strategy = "epoch",
      save_strategy = "epoch",
      learning_rate=learning,
      per_device_train_batch_size=batch,
      per_device_eval_batch_size=batch,
      num_train_epochs=epoch,
      weight_decay=0.01,
      # Reload the checkpoint with the best `metric_name` once training ends,
      # so the final evaluation below scores that checkpoint.
      load_best_model_at_end=True,
      metric_for_best_model=metric_name,
  )
  trainer = Trainer(
      model = model,
      args = arg,
      train_dataset=encoded_dataset["train"],
      eval_dataset=encoded_dataset["test"],
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
  )
  trainer.train()
  evaluation = trainer.evaluate()
  return evaluation["eval_f1"]


In [None]:
# Search for the hyperparameters that maximise the value returned by
# objective() (the evaluation F1).
study = optuna.create_study(direction="maximize")
# Each of the 100 trials calls objective(), which loads a model and runs
# trainer.train() — expect this cell to take a long time.
study.optimize(objective, n_trials = 100)

In [None]:
# Hyperparameters of the best trial found by the study.
best = study.best_params

In [None]:
# Display the best hyperparameters as the cell output.
best

{'epoch': 6.106632873269415, 'learning_rate': 2.980729613776038e-05}

# Nuova sezione