In [1]:
%load_ext autoreload
%autoreload 2


# Import Libraries

In [2]:
import os

from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer

from dataset import *
from model import *
from trainer import Trainer


In [3]:
PATH = "../"
MAX_LEN = 128
BATCH_SIZE = 64


# Loading data

In [4]:
train_data = pd.read_csv(os.path.join(PATH, "train.csv"))
test_data = pd.read_csv(os.path.join(PATH, "test.csv"))

train_data.head()


Unnamed: 0,id,movie_name,movie_description,target
0,3525e31d,Hellraiser,A new take on Clive Barker's 1987 horror class...,Horror
1,051f6309,Hocus Pocus 2,It's been 29 years since someone lit the Black...,Kids
2,12a9bfcf,X,"In 1979, a group of young filmmakers set out t...",Horror
3,e5373c77,Piggy,With the summer sun beating down on her rural ...,Horror
4,473cdb82,Deadstream,After a public controversy left him disgraced ...,Horror


# Encoding labels

In [5]:
le = LabelEncoder()
le.fit(train_data["target"])
train_data["target"] = le.transform(train_data["target"])
train_data.head()


Unnamed: 0,id,movie_name,movie_description,target
0,3525e31d,Hellraiser,A new take on Clive Barker's 1987 horror class...,3
1,051f6309,Hocus Pocus 2,It's been 29 years since someone lit the Black...,4
2,12a9bfcf,X,"In 1979, a group of young filmmakers set out t...",3
3,e5373c77,Piggy,With the summer sun beating down on her rural ...,3
4,473cdb82,Deadstream,After a public controversy left him disgraced ...,3


In [6]:
train_split, val_split = train_test_split(train_data, train_frac=0.85)


# Loading tokenizer from pretrained

In [7]:
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased", truncation=True, do_lower_case=True)


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

# Creating datasets and dataloaders

In [8]:
train_dataset = RottenTomatoesDataset(train_split, tokenizer, MAX_LEN)
val_dataset = RottenTomatoesDataset(val_split, tokenizer, MAX_LEN)
test_dataset = RottenTomatoesDataset(test_data, tokenizer, MAX_LEN)


In [9]:
train_params = {"batch_size": BATCH_SIZE,
                "shuffle": True,
                "num_workers": 0
                }

test_params = {"batch_size": BATCH_SIZE,
               "shuffle": False,
               "num_workers": 0
               }

train_dataloader = DataLoader(train_dataset, **train_params)
val_dataloader = DataLoader(val_dataset, **test_params)
test_dataloader = DataLoader(test_dataset, **test_params)


# Loading pretrained model from Huggingface

In [10]:
config = {
    "num_classes": 6,
    "dropout_rate": 0.3
}
model = DistilBertForClassification(
    "distilbert-base-uncased",
    config=config
)


Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Creating Trainer object and fitting the model

In [11]:
trainer_config = {
    "lr": 3e-4,
    "n_epochs": 2,
    "weight_decay": 1e-6,
    "batch_size": 64,
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}
t = Trainer(trainer_config)
t.fit(
    model,
    train_dataloader,
    val_dataloader
)


Epoch 1/2


  0%|          | 0/73 [00:00<?, ?it/s]



  0%|          | 0/13 [00:00<?, ?it/s]

0.5289855003356934
Epoch 2/2


  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

0.5458937287330627


DistilBertForClassification(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): L

# Save model

In [12]:
t.save("baseline_model.ckpt")


# Load pretrained Model

In [13]:
t = Trainer.load("baseline_model.ckpt")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Get testset predictions


In [14]:
predictions = t.predict(test_dataloader)


# Create submission


In [15]:
sample_submission = pd.read_csv(os.path.join(PATH, "sample_submission.csv"))
sample_submission["target"] = le.inverse_transform(predictions)
sample_submission.head()


Unnamed: 0,id,target
0,d996f823,Horror
1,1cf01f9c,Comedy
2,856ea05c,Horror
3,c97899ee,Horror
4,73f0740f,Comedy


In [16]:
sample_submission.to_csv("submission.csv", index=False)
