# Import Libraries

In [1]:
%load_ext autoreload
%autoreload 2
import os
from pathlib import Path

import torch
import pandas as pd
import gensim.downloader as api
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_sequence

from dataset import *
from model import *
from trainer import Trainer

# Read Data

In [2]:
# Paths are relative to the notebook's working directory.
train = pd.read_csv("../train.csv")
test = pd.read_csv("../test.csv")

# Keep only the rows that have a description. The explicit .copy() makes
# `train` an independent frame rather than a slice of the frame read above,
# so assigning to its columns later does not raise SettingWithCopyWarning.
train = train[train["movie_description"].notna()].copy()

train.head()

Unnamed: 0,movie_name,movie_description,target
0,Hellraiser,A new take on Clive Barker's 1987 horror class...,Horror
1,Hocus Pocus 2,It's been 29 years since someone lit the Black...,Kids
2,X,"In 1979, a group of young filmmakers set out t...",Horror
3,Piggy,With the summer sun beating down on her rural ...,Horror
4,Deadstream,After a public controversy left him disgraced ...,Horror


# Encoding Categorical Target

In [3]:
# Encode the genre names in `target` as integer class ids.
#
# The dtype guard keeps this cell idempotent: if it is re-run after `target`
# has already been encoded, the encoder is NOT refitted on the integer ids,
# so `le.inverse_transform` keeps mapping ids back to the genre names.
if not pd.api.types.is_numeric_dtype(train["target"]):
    le = LabelEncoder()
    # assign() builds a new frame instead of writing into a possibly sliced
    # one, which avoids SettingWithCopyWarning.
    train = train.assign(target=le.fit_transform(train["target"]))
train.head()

Unnamed: 0,movie_name,movie_description,target
0,Hellraiser,A new take on Clive Barker's 1987 horror class...,3
1,Hocus Pocus 2,It's been 29 years since someone lit the Black...,4
2,X,"In 1979, a group of young filmmakers set out t...",3
3,Piggy,With the summer sun beating down on her rural ...,3
4,Deadstream,After a public controversy left him disgraced ...,3


# Create Tokenizer and Vocabulary

In [4]:
# Tokenize every training description and build the vocabulary from the
# resulting token lists.
# NOTE(review): max_vocab_size presumably caps the number of kept tokens
# (special tokens excluded) -- confirm against Vocab in dataset.py.
tok = Tokenizer()
tok_texts = list(map(tok.tokenize, train["movie_description"].values))
vocab = Vocab(tok_texts, max_vocab_size=30000)

# Splitting Data

In [5]:
# `train_test_split` here is the helper star-imported from the local
# `dataset` module; it is unpacked as (train texts, train labels,
# validation texts, validation labels).
train_texts, train_labels, val_texts, val_labels = train_test_split(train)

# Both datasets share the tokenizer and the vocabulary built above.
train_dataset = TextDataset(list(map(tok.tokenize, train_texts)), train_labels, vocab)
val_dataset = TextDataset(list(map(tok.tokenize, val_texts)), val_labels, vocab)

# Create embeddings from tokens

In [6]:
# Store the downloaded GloVe vectors next to the notebook.
#
# gensim.downloader reads GENSIM_DATA_DIR only once, when the module is
# imported, and caches the result in `api.BASE_DIR`. The import already
# happened in the first cell, so setting the environment variable here has no
# effect by itself; `api.BASE_DIR` has to be updated as well.
gensim_data_dir = str(Path.cwd())
os.environ["GENSIM_DATA_DIR"] = gensim_data_dir
api.BASE_DIR = gensim_data_dir

gensim_model = api.load("glove-wiki-gigaword-100")
# Build the embedding matrix for our vocabulary from the pretrained vectors.
emb_matrix = prepare_emb_matrix(gensim_model, vocab)

# Init Model and Config

In [7]:
# Seed torch's global RNG before the model is created so that weight
# initialisation, DataLoader shuffling and dropout are repeatable between
# runs (on the same hardware/software stack).
SEED = 42
torch.manual_seed(SEED)

# Architecture options consumed by RecurrentClassifier.
config = {
    "freeze": True,  # presumably freezes the pretrained embeddings -- confirm in model.py
    "cell_type": "LSTM",
    "cell_dropout": 0.3,
    "num_layers": 2,
    "hidden_size": 128,
    "out_activation": "relu",
    "bidirectional": False,
    "out_dropout": 0.2,
    "out_sizes": [200],
}

# Optimisation options consumed by Trainer.
trainer_config = {
    "lr": 3e-4,
    "n_epochs": 10,
    "weight_decay": 1e-6,
    "batch_size": 128,
    # Use the GPU when one is available, otherwise fall back to the CPU.
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}
clf_model = RecurrentClassifier(config, vocab, emb_matrix)

# Create Dataloaders and Train

In [8]:
# Settings shared by both loaders; only the dataset, the shuffling flag and
# the collate function differ between training and validation.
loader_kwargs = dict(batch_size=trainer_config["batch_size"], num_workers=0)

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=train_dataset.collate_fn,
    **loader_kwargs,
)
val_dataloader = DataLoader(
    val_dataset,
    shuffle=False,
    collate_fn=val_dataset.collate_fn,
    **loader_kwargs,
)

t = Trainer(trainer_config)
# Kept as the cell's last expression so that the value returned by fit() is
# displayed below the progress bars.
t.fit(clf_model, train_dataloader, val_dataloader)

Epoch 1/10


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 2/10


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 3/10


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 4/10


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 5/10


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 6/10


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 7/10


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 8/10


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 9/10


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 10/10


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

RecurrentClassifier(
  (embeddings): Embedding(30003, 100, padding_idx=0)
  (cell): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.3)
  (out_dropout): Dropout(p=0.2, inplace=False)
  (out_proj): Sequential(
    (0): Linear(in_features=256, out_features=200, bias=True)
    (1): Linear(in_features=200, out_features=6, bias=True)
  )
)

# Save Model

In [9]:
# Persist the trainer state to a checkpoint file in the working directory.
checkpoint_file = "baseline_model.ckpt"
t.save(checkpoint_file)

# Load Saved Model

In [10]:
# Rebuild the trainer from the checkpoint file; `t` is rebound to the
# restored instance.
checkpoint_file = "baseline_model.ckpt"
t = Trainer.load(checkpoint_file)

In [11]:
def predict(model, text):
    """Predict the encoded genre id for a single raw description.

    The text is tokenized with the notebook-level ``tok`` and converted to
    indices with the notebook-level ``vocab``.

    Args:
        model: A ``torch.nn.Module`` classifier that accepts a PackedSequence
            of token indices (e.g. ``t.model``).
        text: Raw description string.

    Returns:
        int: Index of the highest score in the model output. Map it back to a
        genre name with ``le.inverse_transform``.
    """
    tok_text = tok.tokenize(text)
    # Put the input on the device the model actually lives on, instead of
    # relying on the global trainer `t`.
    device = next(model.parameters()).device
    indexed_text = torch.tensor(vocab.vectorize(tok_text)).to(device)

    # Inference must run in eval mode (dropout disabled) and without autograd
    # bookkeeping; otherwise predictions can be stochastic and waste memory.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            genre = model(pack_sequence([indexed_text])).argmax().item()
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    return genre

In [12]:
# Show the predicted genre name for the first three test descriptions.
for desc in test["movie_description"].iloc[:3].values:
    genre_id = predict(t.model, desc)
    # inverse_transform works on array-likes, hence the one-element list.
    genre_name = le.inverse_transform([genre_id])[0]
    print(f"Genre for description: '{desc}' is {genre_name}")
    print()

Genre for description: 'When Craig, a young boy living in a small town (Jaeden Martell) befriends Mr. Harrigan, an older, reclusive billionaire (Donald Sutherland), the two begin to form an unlikely bond over their love of books and reading. But when Mr. Harrigan sadly passes away, Craig discovers that not everything is dead and gone and strangely finds himself able to communicate with his friend from the grave through the iPhone in this supernatural coming-of-age story that shows that certain connections are never lost.' is Horror

Genre for description: 'Leslie (Andrea Riseborough) is a West Texas single mother struggling to provide for her son (Owen Teague) when she wins the lottery and a chance at a good life. But a few short years later the money is gone and Leslie is on her own, living hard and fast at the bottom of a bottle as she runs from the world of heartbreak she left behind. With her charm running out and with nowhere to go, Leslie is forced to return home to her former fr

In [13]:
# Build the test dataset with dummy labels (-1): the true targets are unknown.
# The loop variable is named `desc` so it cannot be confused with the
# trainer `t` used below.
# NOTE(review): unlike `train`, rows of `test` with a missing description are
# not filtered out here -- confirm test.csv has none.
test_dataset = TextDataset(
    [tok.tokenize(desc) for desc in test.movie_description.values],
    [-1] * test.shape[0],
    vocab,
)
# Use the test dataset's own collate_fn, mirroring how the train and
# validation loaders are built, rather than borrowing the validation one.
test_dataloader = DataLoader(test_dataset,
                             batch_size=trainer_config["batch_size"],
                             shuffle=False,  # keep row order aligned with the submission file
                             num_workers=0,
                             collate_fn=test_dataset.collate_fn)

predictions = t.predict(test_dataloader)

In [14]:
# Start from the provided template and overwrite its `target` column with
# the predicted genre names (class ids decoded by the label encoder).
sample_submission = pd.read_csv("../sample_submission.csv")
predicted_genres = le.inverse_transform(predictions)
sample_submission["target"] = predicted_genres
sample_submission.head()

Unnamed: 0,target
0,Kids
1,Drama
2,Kids
3,Horror
4,Action


In [15]:
# Write the submission next to the notebook; the DataFrame index is not
# written, only the existing columns.
submission_file = "submission.csv"
sample_submission.to_csv(submission_file, index=False)