In [None]:
import transformers
import torch
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [None]:
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)

In [None]:
# Read the reviews file as JSON Lines (one JSON record per line) into a DataFrame.
dataRew = pd.read_json(
    "Dataset/IMDB_reviews.json",
    lines=True,
)

In [None]:
# Discard the columns that the rest of this notebook does not read.
dataRew = dataRew.drop(
    columns=["movie_id", "rating", "review_date", "user_id", "review_summary"],
)

In [None]:
# Encode the boolean spoiler flag as an integer class id (True -> 1, False -> 0)
# and expose it under the column name "label".
dataRew = (
    dataRew
    .assign(is_spoiler=dataRew["is_spoiler"].map({True: 1, False: 0}))
    .rename(columns={"is_spoiler": "label"})
)

### Split the Dataset

In [None]:
# Hold out 20% of the reviews as the test set. Stratifying on the label keeps the
# class proportions equal in both parts; the fixed seed makes the split repeatable.
train, test = train_test_split(
    dataRew,
    test_size=0.2,
    random_state=42,
    stratify=dataRew["label"],
)

#### Divide the Training Data into Training and Validation Sets

In [None]:
# Carve 20% of the remaining training rows off as the validation set, again
# stratified on the label and with the same fixed seed.
train, val = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train["label"],
)

In [None]:
# Class balance of the training split.
train["label"].value_counts()

In [None]:
# Class balance of the validation split.
val["label"].value_counts()

In [None]:
# Class balance of the test split.
test["label"].value_counts()

### Convert the DataFrames to Hugging Face Datasets

In [None]:
# Wrap each pandas split (train / validation / test) in a `datasets.Dataset`.
Train, Eval, Test = (
    Dataset.from_pandas(frame) for frame in (train, val, test)
)

In [None]:
# Drop the "__index_level_0__" column from every split
# (presumably the DataFrame index carried over by `Dataset.from_pandas`).
Train, Eval, Test = (
    split.remove_columns("__index_level_0__") for split in (Train, Eval, Test)
)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# No tokenizer is loaded in this cell. The "Tokenization" section below builds
# `tokenizer` from the "roberta-base" checkpoint, which matches the RoBERTa model
# fine-tuned in this notebook. A "google-bert/bert-base-cased" tokenizer used to
# be loaded here, but it was overwritten before any use and its vocabulary does
# not belong to the RoBERTa model, so it could only cause harm if the cells were
# executed out of order.


### Tokenization

In [None]:
# Load the fast RoBERTa tokenizer from the "roberta-base" checkpoint, the same
# checkpoint name the model is loaded from later in this notebook.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

In [None]:
def encodeBig(text, max_length=128):
    """Tokenize the ``review_text`` entry of ``text`` with the notebook's tokenizer.

    Every sequence is truncated to ``max_length`` tokens and padded up to
    exactly ``max_length`` tokens.

    Parameters
    ----------
    text : mapping
        An example or batch that has a ``'review_text'`` entry; this is the
        object ``Dataset.map`` hands to the function.
    max_length : int, default 128
        Sequence length used for both truncation and padding. The default
        reproduces the previously hard-coded value; a different value can be
        supplied through ``Dataset.map(..., fn_kwargs={"max_length": N})``.

    Returns
    -------
    The object returned by calling ``tokenizer`` on the review texts.
    """
    return tokenizer(
        text['review_text'],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Tokenize the training split in batches.
Train = Train.map(encodeBig, batched=True)

In [None]:
# Tokenize the validation split in batches.
Eval = Eval.map(encodeBig, batched=True)

In [None]:
# Tokenize the test split in batches.
Test = Test.map(encodeBig, batched=True)

### Apply the Model

In [None]:
# Fine-tuning hyperparameters, passed to `TrainingArguments` further down.
EPOCHS = 3           # number of training epochs
LR = 2e-5            # learning rate
WEIGHT_DECAY = 0.01  # weight decay
BATCH_SIZE = 16      # per-device batch size (training and evaluation)


In [None]:

# Load the "roberta-base" checkpoint into a sequence-classification model.
# NOTE(review): the number of labels is not passed, so the library default
# applies — presumably 2, matching the 0/1 `label` column; confirm.
model = RobertaForSequenceClassification.from_pretrained("roberta-base")

In [None]:


# Training configuration: evaluate once per epoch, log to TensorBoard, and write
# run artefacts under "test_dirRob".
training_args = TrainingArguments(
    output_dir="test_dirRob",
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # NOTE(review): recent transformers releases appear to rename this argument
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    report_to="tensorboard",
)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics2(pred):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Parameters
    ----------
    pred
        Object exposing ``label_ids`` (the reference labels) and ``predictions``
        (per-class scores; the arg-max over the last axis is taken as the
        predicted class).

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.

    Notes
    -----
    With ``average='weighted'`` each class's score is weighted by its support,
    so the reported recall is numerically the same quantity as accuracy.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    accuracy = accuracy_score(y_true, y_pred)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    return dict(accuracy=accuracy, f1=f_score, precision=prec, recall=rec)

In [None]:
import numpy as np
import evaluate

# Load the "accuracy" metric from the `evaluate` library; it is used by
# `compute_metrics` below.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``metric``.

    ``eval_pred`` is unpacked into ``(logits, labels)``; the arg-max of the
    logits over the last axis is used as the predicted class. The value returned
    is whatever ``metric.compute`` produces.

    This helper is not the one handed to the ``Trainer`` in this notebook
    (``compute_metrics2`` is).
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)

In [None]:
# Put the model on the GPU when CUDA is available and keep it on the CPU
# otherwise; an unconditional `model.cuda()` raises on a machine without CUDA.
# NOTE(review): `Trainer` is documented to place the model on its own training
# device as well, so this cell may be redundant — confirm.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Assemble the Trainer: the model, its training configuration, the tokenized
# training / validation splits, and `compute_metrics2` as the evaluation metric
# function.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics2,
    eval_dataset=Eval,
    train_dataset=Train,
)

In [None]:
# Run the fine-tuning loop and keep the value returned by `train()`.
history = trainer.train()