In [4]:
!pip install transformers transformers[torch] datasets huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m115.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
from sklearn import metrics
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
# !pip install transformers transformers[torch], datasets
# pip install huggingface_hub

In [6]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the '

In [None]:
# !pip install transformers transformers[torch]

In [7]:
class ClassificationDataset:
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        # print(f"__getitem__ called with value {item}")
        text = str(self.data[item]['synopsis'])
        target = int(self.data[item]['genre'])
        inputs = self.tokenizer(text, max_length=20, padding="max_length", truncation=True)
        # print("inputs.keys()", inputs.keys())
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
            "labels": torch.tensor(target, dtype=torch.long)

        }

In [8]:
def compute_metrics(eval_pred):
    print("executing compute_metrics")
    prediction, labels = eval_pred
    prediction = np.argmax(prediction, axis=1)
    accuracy = metrics.accuracy_score(labels, prediction)
    return {
        "accuracy": accuracy
    }

In [11]:
from google.colab import files

In [17]:
def train():
    df = load_dataset("datadrivenscience/movie-genre-prediction")
    df = df.class_encode_column("genre")
    df_train = df["train"]
    df_test = df["test"]
    temp_df = df_train.train_test_split(test_size=0.2, stratify_by_column="genre")
    df_train = temp_df["train"]
    df_val = temp_df['test']
    # print('len(df_train.features["genre"].int2str)', len(df_train.features["genre"]._int2str))
    tokenizers = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                               num_labels=len(df_train.features["genre"]._int2str), )

    train_dataset = ClassificationDataset(df_train, tokenizers)
    val_dataset = ClassificationDataset(df_val, tokenizers)
    test_dataset = ClassificationDataset(df_test, tokenizers)
    args = TrainingArguments(
        "model",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=0.001,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to=["none"],
        save_total_limit=1
    )

    trainer = Trainer(model,
                      args,
                      train_dataset=train_dataset,
                      eval_dataset=val_dataset,
                      tokenizer=tokenizers,
                      compute_metrics=compute_metrics)

    trainer.train()
    preds = trainer.predict(test_dataset).predictions
    preds = np.argmax(preds, axis=1)
    submission = pd.DataFrame({"id": df_test['id'], "genre": preds})
    # submission.loc[:, "genre"] = submission.genre.astype('str')
    submission.loc[:, "genre"] = submission.genre.apply(lambda x: df_train.features['genre'].int2str(x))
    submission.to_csv("submission.csv", index=False)
    files.download('submission.csv')

In [18]:
train()



  0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch,Training Loss,Validation Loss,Accuracy
1,2.4685,2.351102,0.1
2,2.3095,2.306044,0.1
3,2.3068,2.304058,0.1
4,2.3053,2.303313,0.1
5,2.3024,2.302644,0.1


executing compute_metrics
executing compute_metrics
executing compute_metrics
executing compute_metrics
executing compute_metrics


executing compute_metrics


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
!zip -r /content/model.zip /content/model
files.download('/content/model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>