In [6]:
from sklearn.model_selection import train_test_split
import torch
from tools.read import get_data
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
from datasets import load_metric

In [7]:
# Number of target classes; passed as `num_labels` when the model is loaded.
NUM_CLASSES = 2
# Run name; used as the Trainer output directory and as the save path for the final model.
NAME = f"{NUM_CLASSES}_camembert"
# Pretrained checkpoint identifier handed to both the tokenizer and the model loader.
BASE = "camembert-base"

In [8]:
# Flatten every comment of the dataset into two parallel lists: the comment
# text and a binary class label derived from the comment's grade.
# NOTE(review): each field is read through `[0]`, so it is presumably a
# one-element sequence — confirm against `tools.read.get_data`.
df = get_data("data/dataset500.csv")
text = []
labels = []
for comments in df["comments"]:
    for comment in comments:
        grade = comment["grade"][0]
        # Grades below 10 are class 0; grades from 10 up to (excluding) 21 are class 1.
        if grade < 10:
            label = 0
        elif grade < 21:
            label = 1
        else:
            # Out-of-range (or NaN) grade: drop the comment entirely. Appending
            # the text without a label would shift every later text/label pair.
            continue
        text.append(comment["comment"][0])
        labels.append(label)

# For a 4-class variant, use the thresholds 5 / 10 / 15 / 21 (labels 0-3)
# and set NUM_CLASSES to 4.

# The two lists are consumed pairwise downstream, so they must stay aligned.
assert len(text) == len(labels), "text and labels must have the same length"

In [9]:
class CustomTextBertDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing raw texts with their labels.

    Texts are stored untokenized; each item is tokenized on access through
    the notebook-level ``tokenize_bert_function``.
    """

    def __init__(self, text, labels):
        """Store the parallel ``text`` and ``labels`` sequences as given."""
        self.text = text
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx``.

        The result is a dict with the keys ``input_ids``, ``attention_mask``
        and ``label``.
        """
        target = self.labels[idx]
        encoded = tokenize_bert_function(self.text[idx])
        # Keep only the two tokenizer fields that are forwarded, then attach the label.
        item = {key: encoded[key] for key in ("input_ids", "attention_mask")}
        item["label"] = target
        return item

In [None]:
# Load the tokenizer for the BASE checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE)


def tokenize_bert_function(examples):
    """Tokenize ``examples`` with the notebook-level ``tokenizer``.

    Returns whatever the tokenizer call returns, using max-length padding
    and truncation.
    """
    encoding_options = {
        "padding": "max_length",  # Pad to the maximum length accepted by the model
        "truncation": True,  # Truncate to the maximum length accepted by the model
    }
    return tokenizer(examples, **encoding_options)

In [None]:
# Hold out 33% of the samples for evaluation; the fixed seed keeps the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    text,
    labels,
    test_size=0.33,
    random_state=42,
    shuffle=True,
)

In [None]:
# Load the BASE checkpoint as a sequence-classification model with
# NUM_CLASSES output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE,
    num_labels=NUM_CLASSES,
)

In [None]:
# Wrap each split in the dataset class; samples are tokenized lazily on access.
train_dataset, test_dataset = (
    CustomTextBertDataset(samples, targets)
    for samples, targets in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Accuracy metric object; `compute_metrics` calls its `compute` method.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
metric = load_metric("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the index of its largest logit along the last axis.
    Returns the result of ``metric.compute``.
    """
    scores, gold = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=gold,
    )

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: outputs go to the NAME directory, one epoch is run,
# evaluation happens per epoch, and the batch size is found automatically.
training_args = TrainingArguments(
    output_dir=NAME,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    auto_find_batch_size=True,
)

In [None]:
# Assemble the Trainer from the model, the training configuration, both
# dataset splits and the accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # held-out split produced by train_test_split
    compute_metrics=compute_metrics,  # computes accuracy from the argmax of the logits
)

In [None]:
# Run fine-tuning. `trainer.train()` returns a training summary rather than the
# model, so store it under its own name instead of rebinding `model`; the
# model object passed to the Trainer remains available as `model`.
train_output = trainer.train()

In [14]:
# Write the final model to the NAME directory.
# NOTE(review): the captured output lists only the config and the weights, and
# no tokenizer is passed to the Trainer — confirm whether the tokenizer should
# be saved alongside the model.
trainer.save_model(NAME)

Saving model checkpoint to 2_camembert
Configuration saved in 2_camembert\config.json
Model weights saved in 2_camembert\pytorch_model.bin
