In [1]:
%load_ext autoreload
%autoreload 2


# Import Libraries

In [2]:
import os

from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer

from dataset import *
from model import *
from trainer import Trainer


In [3]:
PATH = "../"
MAX_LEN = 128
BATCH_SIZE = 64


# Loading data

In [4]:
train_data = pd.read_csv(os.path.join(PATH, "train.csv"))
test_data = pd.read_csv(os.path.join(PATH, "test.csv"))

train_data.head()


Unnamed: 0,movie_name,target,movie_description,id
0,Furies,0,Three furious vigilantes unite to take down a ...,133529636342330622371894152500993949030
1,RRR,0,The story of freedom fighters Komaram Bheem an...,133529660110779376651195430564179049830
2,John Wick,0,Legendary assassin John Wick (Keanu Reeves) re...,133529680710101630359923204885606137190
3,John Wick: Chapter 3 -- Parabellum,0,After gunning down a member of the High Table ...,133529687048354631501070212369122164070
4,Top Gun: Maverick,0,After more than thirty years of service as one...,133529699724860633783364227336154217830


# Train Test split

In [5]:
train_split, val_split = train_test_split(train_data, train_frac=0.85)


# Loading tokenizer from pretrained

In [6]:
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased", truncation=True, do_lower_case=True)


# Creating datasets and dataloaders

In [7]:
train_dataset = RottenTomatoesDataset(train_split, tokenizer, MAX_LEN)
val_dataset = RottenTomatoesDataset(val_split, tokenizer, MAX_LEN)
test_dataset = RottenTomatoesDataset(test_data, tokenizer, MAX_LEN)


In [8]:
train_params = {"batch_size": BATCH_SIZE,
                "shuffle": True,
                "num_workers": 0
                }

test_params = {"batch_size": BATCH_SIZE,
               "shuffle": False,
               "num_workers": 0
               }

train_dataloader = DataLoader(train_dataset, **train_params)
val_dataloader = DataLoader(val_dataset, **test_params)
test_dataloader = DataLoader(test_dataset, **test_params)


# Loading pretrained model from Huggingface

In [9]:
config = {
    "num_classes": 6,
    "dropout_rate": 0.1
}
model = DistilBertForClassification(
    "distilbert-base-uncased",
    config=config
)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Creating Trainer object and fitting the model

In [10]:
trainer_config = {
    "lr": 3e-4,
    "n_epochs": 2,
    "weight_decay": 1e-6,
    "batch_size": BATCH_SIZE,
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}
t = Trainer(trainer_config)
t.fit(
    model,
    train_dataloader,
    val_dataloader
)


Epoch 1/2


  0%|          | 0/107 [00:00<?, ?it/s]



  0%|          | 0/19 [00:00<?, ?it/s]

0.6054590344429016
Epoch 2/2


  0%|          | 0/107 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

0.6029776930809021


DistilBertForClassification(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): L

# Save model

In [11]:
t.save("baseline_model.ckpt")


# Load pretrained Model

In [12]:
t = Trainer.load("baseline_model.ckpt")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Get testset predictions


In [13]:
predictions = t.predict(test_dataloader)


# Create submission


In [14]:
sample_submission = pd.read_csv(os.path.join(PATH, "sample_submission.csv"))
sample_submission["target"] = predictions
sample_submission.head()


Unnamed: 0,id,target
0,133529667241314002934985813983134580070,5
1,133529693386607632642217219852638190950,0
2,133529737754378640630246272237250379110,1
3,133529756769137644053687294687798459750,0
4,133529828866765532034234504812793265510,0


In [15]:
sample_submission.to_csv("submission.csv", index=False)
