In [1]:
from data_preprocessing.data_preprocessor import DataPreprocessor
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, hamming_loss
from scipy.special import expit
import os
from joblib import dump

In [2]:
# Load the processed movie table and apply the genre filter.
PROCESSED_DATA_CSV = 'data/processed_data.csv'
GENRE_FILTER_K = 5  # forwarded as `k`; presumably how many genres are kept -- confirm in DataPreprocessor

data_preprocessor = DataPreprocessor(PROCESSED_DATA_CSV)
# Left as the final expression so the notebook renders whatever filter_genres returns.
data_preprocessor.filter_genres(k=GENRE_FILTER_K)

Unnamed: 0,plot_summary,genres
0,Murugappa is a small time farm labourer who l...,[Drama]
1,A hyper-vigilant agent of the Department of Pu...,"[Thriller, Drama]"
2,"Four friends- Gangu , Abdul , Nihal and Gary ...","[Action, Drama]"
3,A married man is having an affair with another...,[Drama]
4,"The movie concerns the life of Tomasina ""Tommy...","[Romance Film, Comedy]"
...,...,...
29990,Jimmy Boland has been sentenced to a Californ...,[Action]
29991,Englishman Ronald Quayle was accused of murder...,[Drama]
29995,Managing Editor Sam Gatlin arrives in the afte...,[Drama]
29997,The film is about a woman named Jennefer who ...,"[Thriller, Drama]"


In [3]:
# Encode the per-movie genre collections as a binary indicator matrix `y`
# (one column per class in `mlb.classes_`).
# Assumes every entry of the 'genres' column is an iterable of genre names -- TODO confirm.
genre_lists = data_preprocessor.data['genres']
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(genre_lists)

In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
plot_summaries = data_preprocessor.data['plot_summary']
X_train, X_test, y_train, y_test = train_test_split(
    plot_summaries,
    y,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Model and tokenizer are both loaded from the same pretrained checkpoint name.
BASE_CHECKPOINT = 'bert-base-uncased'

# One output per binarised genre class, configured for multi-label classification.
# The classifier head is newly initialised (see the warning this cell prints),
# so the model must be trained before its predictions mean anything.
model = BertForSequenceClassification.from_pretrained(
    BASE_CHECKPOINT,
    problem_type="multi_label_classification",
    num_labels=len(mlb.classes_),
)

tokenizer = BertTokenizerFast.from_pretrained(BASE_CHECKPOINT)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Tokenise both splits up front with identical options (truncation and padding enabled).
tokenizer_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(X_train.tolist(), **tokenizer_options)
test_encodings = tokenizer(X_test.tolist(), **tokenizer_options)

In [7]:
class MovieDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs pre-tokenised encodings with label rows.

    ``encodings`` is a mapping from field name to an indexable collection
    holding one entry per example; ``labels`` is an indexable collection
    holding one label row per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx``: each encoding field as a tensor, plus float32 ``labels``."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        # Labels are stored as float32 tensors regardless of the input dtype.
        example['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return example


In [8]:
# Wrap each split's encodings and label matrix in a dataset object for the Trainer.
train_dataset = MovieDataset(encodings=train_encodings, labels=y_train)
test_dataset = MovieDataset(encodings=test_encodings, labels=y_test)

In [9]:
def compute_metrics(eval_pred, threshold=0.5):
    """Compute multi-label evaluation metrics from raw model outputs.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds the raw
            (pre-sigmoid) scores; if it is a tuple, its first element is
            used as the logits. ``labels`` is the binary indicator matrix.
        threshold: Probability a label must exceed to count as predicted.
            Defaults to 0.5, the cutoff this function used before the
            parameter existed; bind another value (e.g. with
            ``functools.partial``) to evaluate at a different cutoff.

    Returns:
        dict with the sample-averaged F1 score under ``"f1"`` and the
        Hamming loss under ``"hamming_loss"``.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models hand back extra outputs next to the logits; keep the logits only.
        logits = logits[0]
    probabilities = expit(logits)  # element-wise sigmoid
    predictions = (probabilities > threshold).astype(int)
    return {
        # zero_division=0 yields the same value as the default but without the
        # undefined-metric warning for samples that end up with no labels.
        "f1": f1_score(labels, predictions, average='samples', zero_division=0),
        "hamming_loss": hamming_loss(labels, predictions),
    }


training_args = TrainingArguments(
    # Where run artefacts and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation settings: a single pass over the training data.
    num_train_epochs=1,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate at the end of each epoch; write no intermediate checkpoints.
    # NOTE(review): newer transformers releases appear to rename this argument to
    # `eval_strategy` -- confirm against the installed version before upgrading.
    evaluation_strategy='epoch',
    save_strategy='no',
)

# Assemble the Trainer; the held-out split doubles as the evaluation set and
# compute_metrics supplies the reported F1 / Hamming loss.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)


In [None]:
# Run fine-tuning; the call's return value is shown as the cell output.
train_output = trainer.train()
train_output



Epoch,Training Loss,Validation Loss


In [None]:
# Persist everything inference needs into a single directory.
MODEL_DIR = 'bert-genre-classifier'

trainer.save_model(MODEL_DIR)
# The Trainer in this notebook is constructed without a tokenizer, so the
# tokenizer files are written explicitly to keep the directory self-contained.
tokenizer.save_pretrained(MODEL_DIR)
# The fitted binarizer is stored alongside so predicted indices can be mapped
# back to genre names.
dump(mlb, os.path.join(MODEL_DIR, 'mlb.joblib'))

In [4]:
from modeling.model_handler import ModelHandler

# Reload the saved artefacts from disk; the call's result is unpacked into three objects.
# NOTE(review): this rebinds `model`, `tokenizer` and `mlb` from the training cells above.
MODEL_DIR = 'bert-genre-classifier'
model, tokenizer, mlb = ModelHandler.load_model_hf(MODEL_DIR)

In [7]:
# Smoke-test the reloaded model on one hand-written plot summary.
movie_summary = "A group of friends embark on a dangerous journey in an effort to imprison their oppressor, the evil wizard ZeeBad."
# NOTE(review): the last positional argument looks like a probability cutoff for
# reporting a genre -- confirm against ModelHandler.inference_model_hf.
min_score = 0.2

ModelHandler.inference_model_hf(model, tokenizer, mlb, movie_summary, min_score)

[('Action', 0.49227488), ('Drama', 0.65509963)]