In [None]:
# GPU selection for this kernel, set in the first cell ahead of the torch import:
# order devices by PCI bus id and expose only device index 1.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

In [None]:
import transformers
import torch
import numpy as np
import random
import pandas as pd
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
import ast

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import datasets
from datasets import Dataset, DatasetDict

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Module-level tokenizer loaded from "google-bert/bert-base-cased"; encodeBig (defined later) reads this name.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")


In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Function definitions

# Read split tokens
def safe_literal_eval(val):
    """Parse the text of a Python literal, falling back to the input on failure.

    Args:
        val: Value handed to ``ast.literal_eval``.

    Returns:
        The parsed Python object, or ``val`` itself (after printing a message)
        when it cannot be parsed.
    """
    try:
        return ast.literal_eval(val)
    # literal_eval is documented to raise more than ValueError/SyntaxError on
    # malformed input (e.g. TypeError for an unhashable dict key, RecursionError
    # for very deep nesting); catch those too so one bad value does not abort
    # a whole ``apply`` over the column.
    except (ValueError, SyntaxError, TypeError, RecursionError) as e:
        print(f"Error parsing value {val}: {e}")
        return val  # Return the original value if there is an error

# Merge tokens into one text
def join_tokens(token_list):
    """Collapse a list of tokens into a single space-separated string.

    Anything that is not a ``list`` is handed back untouched.
    """
    return ' '.join(token_list) if isinstance(token_list, list) else token_list


# Split train + test (80-20 by default)
def split_train_test(df, label_name, test_size=0.2, random_state=42):
    """Stratified train/test split of ``df`` on the ``label_name`` column.

    Args:
        df: DataFrame to split.
        label_name: Column whose values are passed as ``stratify``.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2 (80/20).
        random_state: Seed forwarded to ``train_test_split``; defaults to 42.

    Returns:
        The ``(train, test)`` pair produced by ``train_test_split``.
    """
    train, test = train_test_split(
        df,
        test_size=test_size,
        stratify=df[label_name],
        random_state=random_state,
    )
    return train, test

def _classification_metrics(pred, average):
    """Shared implementation behind the ``compute_metrics*`` callbacks.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``.
        average: Averaging mode forwarded to ``precision_recall_fscore_support``.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    scores = pred.predictions
    # predictions may arrive as a tuple of arrays rather than a single array;
    # take the first entry as the class scores, following the Hugging Face
    # example scripts, instead of failing on tuple.argmax.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=average)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }


def compute_metricsweighted(pred):
    """Accuracy plus precision/recall/F1 computed with ``average='weighted'``."""
    return _classification_metrics(pred, average='weighted')


def compute_metricsbinary(pred):
    """Accuracy plus precision/recall/F1 computed with ``average='binary'``."""
    return _classification_metrics(pred, average='binary')

In [None]:
# Read the cleaned CSV and keep only the clean_review and is_spoiler columns.
CleanData = pd.read_csv("../Dataset/datiClean.csv")[["clean_review", "is_spoiler"]]

In [None]:
# Parse each clean_review value with safe_literal_eval, then join the parsed
# tokens into one string per row (stored as whole__text).
CleanData = CleanData.assign(
    clean_review=lambda frame: frame["clean_review"].apply(safe_literal_eval),
    whole__text=lambda frame: frame["clean_review"].apply(join_tokens),
)

In [None]:
# Integer label: 1 where is_spoiler equals True, 0 everywhere else.
CleanData['is_spoiler_numeric'] = (CleanData['is_spoiler'] == True).astype(int)

In [None]:
# Keep just the two columns used from here on, under the names 'text' and 'label'.
CleanData = CleanData.rename(
    columns={'whole__text': 'text', 'is_spoiler_numeric': 'label'}
)[['text', 'label']]

In [None]:
# Hold out the test set first, then split what remains into train and validation.
# Both steps go through split_train_test (stratified on 'label', 80/20, seed 42).
train, test = split_train_test(CleanData, 'label')
train, val = split_train_test(train, 'label')

In [None]:
# Wrap each split in a Dataset and drop its "__index_level_0__" column
# (presumably the pandas index carried over by from_pandas — TODO confirm).
Train, Eval, Test = (
    Dataset.from_pandas(split).remove_columns("__index_level_0__")
    for split in (train, val, test)
)

In [None]:
def encodeBig(text):
    """Tokenize the ``'text'`` entry of ``text`` with the module-level tokenizer.

    The tokenizer is called with ``padding="max_length"``, ``truncation=True``
    and ``max_length=128``; its result is returned as-is.
    """
    tokenizer_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(text['text'], **tokenizer_options)

In [None]:
# Apply encodeBig to the training split in batched mode.
Train = Train.map(function=encodeBig, batched=True)

In [None]:
# Apply encodeBig to the validation split in batched mode.
Eval = Eval.map(function=encodeBig, batched=True)

In [None]:
# Apply encodeBig to the test split in batched mode.
Test = Test.map(function=encodeBig, batched=True)

In [None]:
# Hyper-parameters passed to TrainingArguments further down.
EPOCHS = 3
BATCH_SIZE = 16
LR = 2e-5
WEIGHT_DECAY = 0.01

In [None]:
# Sequence-classification model loaded from what looks like a local checkpoint path.
# NOTE(review): the tokenizer is loaded from "google-bert/bert-base-cased" while the
# model comes from this absolute path — confirm both refer to the same checkpoint.
model=AutoModelForSequenceClassification.from_pretrained("/opt/models/bert-base-cased")

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration built from the hyper-parameter constants: evaluation
# strategy "epoch", save strategy 'no', fp16 enabled.
training_args = TrainingArguments(
    output_dir="test_dirClean",
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    save_strategy='no',
    fp16=True,
)


In [None]:
# Move the model to the GPU; as the cell's last expression, the returned value is also displayed.
# NOTE(review): Trainer presumably handles device placement itself — confirm whether this call is needed.
model.cuda()

In [None]:
# Assemble the Trainer: Train for fitting, Eval for evaluation, and
# compute_metricsbinary (average='binary') as the metrics callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metricsbinary,
    train_dataset=Train,
    eval_dataset=Eval,
)

In [None]:
# Run training; whatever trainer.train() returns is kept in `history`.
history=trainer.train()