In [1]:
import pandas as pd
from core.dataset import IMDBDataset
from core import models, training
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import torch
from omegaconf import OmegaConf

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config = OmegaConf.load('./config/config.yaml')

In [3]:
df = pd.read_csv(config['dataset_path'])
df = df.drop_duplicates(['title', 'simple_desc'],)

# make train test split
train, test = train_test_split(df, test_size=0.25, random_state=42, stratify=df.genre)
# make validation split
train, val = train_test_split(train, test_size=0.25, random_state=42, stratify=train.genre)
print(len(train), len(val), len(test))

6349 2117 2823


In [4]:
train = train.to_dict(orient='records')
test = test.to_dict(orient='records')
val = val.to_dict(orient='records')

train[0]

{'title': 'They Rode Good Horses',
 'simple_desc': "The Birth of the American Cowboy. Two young boys are stranded and alone in the bleak 1840's wilderness of the Wind River Mountains with nothing but their friendship, determination to survive and two good horses.",
 'genre': 'Western',
 'img_local_path': './datasets/ml-industry/task2/images/Western/They Rode Good Horses.jpg',
 'Action': 0,
 'Adventure': 0,
 'Animation': 0,
 'Biography': 0,
 'Comedy': 0,
 'Crime': 0,
 'Documentary': 0,
 'Drama': 0,
 'Family': 0,
 'Fantasy': 0,
 'Film Noir': 0,
 'History': 0,
 'Horror': 0,
 'Music': 0,
 'Musical': 0,
 'Mystery': 0,
 'Romance': 0,
 'Sci-Fi': 0,
 'Short Film': 0,
 'Sport': 0,
 'Superhero': 0,
 'Thriller': 0,
 'War': 0,
 'Western': 1}

In [5]:
# title dataset
title_trainset = IMDBDataset(train, type='title')
title_validset = IMDBDataset(val, type='title')
title_testset = IMDBDataset(test, type='title')

# description dataset
desc_trainset = IMDBDataset(train, type='description')
desc_validset = IMDBDataset(val, type='description')
desc_testset = IMDBDataset(test, type='description')

# image dataset
img_trainset = IMDBDataset(train, type='image')
img_validset = IMDBDataset(val, type='image')
img_testset = IMDBDataset(test, type='image')

In [6]:
# get label dictionary
label2id = title_trainset.label2id
id2label = title_trainset.id2label

## Multimodal with Late Fusion


#### Text Model (Title)

In [7]:
title_bert_tokenizer, title_bert_model = models.get_bert_model(config['text_model_ckpt'], label2id=label2id, id2label=id2label)
title_bert_model = title_bert_model#.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
# set tokenizer
title_trainset.set_tokenizer(title_bert_tokenizer)
title_validset.set_tokenizer(title_bert_tokenizer)
title_testset.set_tokenizer(title_bert_tokenizer)

In [9]:
# dataloader
title_train_loader = DataLoader(title_trainset, batch_size=config['batch_size'])
title_valid_loader = DataLoader(title_validset, batch_size=config['batch_size'])
title_test_loader = DataLoader(title_testset, batch_size=config['batch_size'])

In [10]:
title_bert_model(**next(iter(title_train_loader)))

SequenceClassifierOutput(loss=tensor(0.6889, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 2.6845e-02, -7.1120e-02,  2.8396e-01,  1.6569e-01, -1.6389e-01,
          1.1107e-01, -3.8786e-01, -3.2737e-01,  2.8821e-03,  2.1767e-01,
          9.2506e-02, -6.4739e-02, -3.8056e-01,  1.9024e-01,  2.0301e-01,
         -5.7529e-02,  2.4740e-01,  5.9080e-02, -8.0943e-02, -5.8867e-01,
         -7.2026e-02, -5.1621e-02, -4.2886e-01,  3.9723e-01],
        [ 6.2134e-02, -6.9676e-02,  3.9724e-01,  2.5927e-01, -3.9059e-02,
         -3.4112e-02, -4.2083e-01, -2.8844e-01,  3.3558e-02,  2.3532e-01,
          1.2541e-01, -5.7245e-02, -3.1408e-01,  2.2613e-01,  2.6800e-01,
         -1.9972e-02,  2.2119e-01, -1.2280e-02, -6.1951e-02, -6.3328e-01,
         -1.3159e-01, -1.2045e-01, -3.8325e-01,  5.2744e-01],
        [-7.1641e-02, -7.7957e-02,  6.4193e-01,  3.6660e-02, -2.2491e-02,
          2.0866e-01, -9.6747e-02, -4.3537e-01,  2.5131e-01,  7.4340e-02,
         -5.3770e-02, -1.1945e-01,

##### Modelling

In [None]:
model_name = 'title_bert'
model, history = training.train(title_bert_model, model_name, title_train_loader,
                                title_valid_loader, config=config, device=device)

#### Text Model (Description)

In [None]:
desc_bert_tokenizer, desc_bert_model = models.get_bert_model(config['text_model_ckpt'])
desc_bert_model = desc_bert_model.to(device)

In [None]:
# set tokenizer
desc_trainset.set_tokenizer(desc_bert_tokenizer)
desc_validset.set_tokenizer(desc_bert_tokenizer)
desc_testset.set_tokenizer(desc_bert_tokenizer)

In [None]:
# dataloader
desc_train_loader = DataLoader(desc_trainset, batch_size=config['batch_size'])
desc_valid_loader = DataLoader(desc_validset, batch_size=config['batch_size'])
desc_test_loader = DataLoader(desc_testset, batch_size=config['batch_size'])

##### Modelling

In [None]:
model_name = 'desc_bert'
model, history = training.train(desc_bert_model, model_name, desc_train_loader,
                                desc_valid_loader, config=config, device=device)