In [1]:
from datasets import load_dataset

# Hub identifier of the movie-genre dataset used throughout this notebook.
DATASET_NAME = "datadrivenscience/movie-genre-prediction"

# Load every available split of the dataset into `data`.
data = load_dataset(DATASET_NAME)

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset parquet (/home/codespace/.cache/huggingface/datasets/datadrivenscience___parquet/datadrivenscience--movie-genre-prediction-01acd85570f2b187/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|██████████| 2/2 [00:00<00:00, 29.30it/s]


In [2]:
# Display the loaded dataset object to inspect its splits, features and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['id', 'movie_name', 'synopsis', 'genre'],
        num_rows: 54000
    })
    test: Dataset({
        features: ['id', 'movie_name', 'synopsis', 'genre'],
        num_rows: 36000
    })
})

In [3]:
# Peek at the first record of the train split to see the raw field values.
data["train"][0]

{'id': 44978,
 'movie_name': 'Super Me',
 'synopsis': 'A young scriptwriter starts bringing valuable objects back from his short nightmares of being chased by a demon. Selling them makes him rich.',
 'genre': 'fantasy'}

In [4]:
# Preprocessing
# Flatten each split into a list of plain records with three keys:
#   "id"    - the original row id,
#   "text"  - movie name and synopsis joined by a single space (model input),
#   "genre" - the genre string carried over unchanged.
# Each row is read once by iterating over the split, rather than looking the
# same row up by index for every field, and the loop variable no longer
# shadows the builtin `type`.
processed_data = {
    split: [
        {
            "id": row["id"],
            "text": row["movie_name"] + " " + row["synopsis"],
            "genre": row["genre"],
        }
        for row in data[split]
    ]
    for split in ("train", "test")
}

In [5]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Turn the preprocessed record lists into DataFrames (columns: id, text, genre).
train_df = pd.DataFrame(processed_data["train"])
test_df = pd.DataFrame(processed_data["test"])

# MultiLabelBinarizer expects one *collection* of labels per sample. The genre
# column holds a bare string per row, and a string is itself an iterable of
# characters, so passing the column directly would make every distinct letter
# a "class". Wrapping each genre in a one-element list makes each genre string
# a single label, so `mlb.classes_` holds the genre names.
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([[genre] for genre in train_df["genre"]])
# Reuse the binarizer fitted on the train split so the label columns line up.
test_labels = mlb.transform([[genre] for genre in test_df["genre"]])

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same truncation and padding options.
tokenize_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_df["text"].tolist(), **tokenize_options)
test_encodings = tokenizer(test_df["text"].tolist(), **tokenize_options)


In [7]:
import torch

class MovieDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-field encodings with a label vector per sample.

    Args:
        encodings: Mapping from field name to an indexable sequence holding
            one entry per sample (anything exposing ``.items()``).
        labels: Indexable sequence with one label vector per sample; its
            length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor, and the label vector is
        added under ``'labels'`` cast to floating point.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx]).float()
        return sample

In [8]:
# Wrap each split's encodings and label matrix in a torch-compatible dataset.
train_dataset = MovieDataset(encodings=train_encodings, labels=train_labels)
test_dataset = MovieDataset(encodings=test_encodings, labels=test_labels)

In [9]:
from transformers import set_seed

# Seed the RNGs *before* the model is created: the classification head added
# by `from_pretrained` is randomly initialised, and Trainer only applies its
# own seed later, so without this the head weights differ on every run.
# 42 is the TrainingArguments default, so the training-time seed is unchanged.
SEED = 42
set_seed(SEED)

# Training hyperparameters plus output and logging locations.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    seed=SEED,
)

# One output logit per class known to the fitted label binarizer.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(mlb.classes_))

# Trainer wires together the model, the arguments and the train/eval datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

Downloading model.safetensors: 100%|██████████| 440M/440M [00:02<00:00, 168MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [10]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()



: 

: 