# Background

Project ini merupakan implementasi transfer learning untuk memprediksi genre film berdasarkan judul dan sinopsisnya. Project ini menggunakan pretrained model BERT, yaitu bert-base-uncased (model yang dilatih pada teks yang sudah diubah menjadi huruf kecil), beserta tokenizernya. Model tersebut kemudian dilatih ulang menggunakan dataset baru; proses ini disebut fine-tuning. Pretrained model dan tokenizer yang digunakan dalam project ini disimpan dalam direktori lokal, sehingga untuk menjalankan script ini diperlukan direktori yang berisi model, tokenizer, dan konfigurasinya, yang dapat diunduh dari URL berikut: https://huggingface.co/bert-base-uncased. Fine-tuning dilakukan menggunakan framework deep learning PyTorch.

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn import metrics
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_from_disk

# pip3 install 'transformers[torch]'

# Load Dataset

In [2]:
# Read the preprocessed dataset from disk and convert the "genre" column
# into an integer-encoded class label in a single chained expression.
data = load_from_disk("cleaned_data/").class_encode_column("genre")

# Show the first training row as a quick sanity check.
data["train"][0]

{'id': 44978,
 'movie_name': 'Super Me',
 'synopsis': 'A young scriptwriter starts bringing valuable objects back from his short nightmares of being chased by a demon. Selling them makes him rich.',
 'genre': 4,
 'final_text': 'movie name - super me, synopsis - a young scriptwriter starts bringing valuable objects back from his short nightmares of being chased by a demon. selling them makes him rich.'}

# Load Pretrained BERT Model and Tokenizer

In [3]:
# Pick the best available accelerator instead of hard-coding Apple's MPS
# backend, so the notebook also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Tokenizer and weights are read from the local "bert-base-uncased/" directory.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased/", use_fast=True, do_lower_case=True)

# The classification head needs one output per genre; use the public
# ``num_classes`` attribute rather than the private ``_int2str`` list.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased/",
    num_labels=data["train"].features["genre"].num_classes,
).to(device)

mps


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Count Max Len

In [4]:
# Token count (special tokens included) of every training example.
token_counts = [
    len(tokenizer.encode(row["final_text"], add_special_tokens=True))
    for row in data["train"]
]

# Longest tokenized example; 0 when the training split is empty.
max_len = max(token_counts, default=0)

print(f'Max sentence len - {max_len}')

Max sentence len - 99


# Define Train and Evaluate Model

In [5]:
class ClassificationDataset:
    """Map-style dataset that tokenizes one row at a time for classification.

    Every row of ``data`` must provide a ``"final_text"`` entry (the text to
    tokenize) and a ``"genre"`` entry (the integer class label).

    Args:
        data: Indexable collection of rows supporting ``len()`` and
            integer indexing.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True)`` that returns a mapping containing
            ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every sample is padded or truncated to. When
            ``None`` (the default) the module-level ``max_len`` is used,
            which keeps existing two-argument callers working.
    """

    def __init__(self, data, tokenizer, max_length=None):
        self.data = data
        self.tokenizer = tokenizer
        # Resolve the fallback once, at construction time.
        self.max_length = max_len if max_length is None else max_length

    def __len__(self):
        """Return the number of rows in the wrapped data."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the tokenized sample at index ``item``.

        Returns:
            A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
            each a ``torch.long`` tensor on the CPU.
        """
        # Fetch the row once instead of indexing the dataset per field.
        row = self.data[item]
        text = str(row["final_text"])
        target = int(row["genre"])
        inputs = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

        # Tensors are deliberately left on the CPU: samples created on an
        # accelerator cannot be pinned or passed between data-loader worker
        # processes, and the training loop moves each batch to the model's
        # device on its own.
        return {
            "input_ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(target, dtype=torch.long),
        }


def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            one row of per-class scores per example.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    scores, labels = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_classes = np.argmax(scores, axis=1)
    return {"accuracy": metrics.accuracy_score(labels, predicted_classes)}


def train(ds, seed=None):
    """Fine-tune the global ``model`` and write test predictions to ``result.csv``.

    The ``"train"`` split of ``ds`` is divided 90/10 (stratified by genre)
    into training and validation sets. After training, the ``"test"`` split
    is predicted and the results are saved as a CSV with the columns ``id``
    and ``genre`` (genre as its string name).

    Args:
        ds: Dataset dict with ``"train"`` and ``"test"`` splits whose rows
            contain ``"id"``, ``"final_text"`` and a class-encoded ``"genre"``.
        seed: Optional seed for the train/validation split. ``None`` (the
            default) keeps the split random, as before; pass an integer to
            make the split reproducible.

    Returns:
        None. The outputs are the checkpoints under ``model/`` and the
        ``result.csv`` file.
    """
    ds_test = ds["test"]

    # Hold out 10% of the training data for per-epoch validation.
    split = ds["train"].train_test_split(
        test_size=0.1, stratify_by_column="genre", seed=seed
    )
    ds_train = split["train"]
    ds_val = split["test"]

    train_dataset = ClassificationDataset(ds_train, tokenizer)
    valid_dataset = ClassificationDataset(ds_val, tokenizer)
    test_dataset = ClassificationDataset(ds_test, tokenizer)

    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; kept as-is to match the version this notebook was
    # run with -- confirm when upgrading.
    args = TrainingArguments(
        "model",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="none",
        save_total_limit=1,
        optim="adamw_torch",
    )

    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Highest-scoring class index for every test example.
    preds = trainer.predict(test_dataset).predictions
    preds = np.argmax(preds, axis=1)

    # Generate the result file. Class indices are converted to genre names
    # *before* building the frame: writing strings into an existing integer
    # column through `.loc` is an incompatible-dtype assignment that pandas
    # has deprecated.
    genre_names = ds_train.features["genre"].int2str(preds.tolist())
    result = pd.DataFrame({"id": ds_test["id"], "genre": genre_names})
    result.to_csv("result.csv", index=False)

In [6]:
# Run the whole pipeline on the loaded dataset: fine-tune the model,
# predict the test split and write the predictions to result.csv.
train(data)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6129,1.601356,0.417963
