In [1]:
import pandas as pd
from core.dataset import IMDBDataset
from core import models, training
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import torch
from omegaconf import OmegaConf

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Experiment settings (e.g. 'text_model_ckpt', 'batch_size') are read from the YAML config.
config = OmegaConf.load('./config/config.yaml')

In [3]:
df = pd.read_csv('./datasets/ml-industry/task2/processed.csv')

# Hold out 25% of all rows as the test set, stratified by genre.
train, test = train_test_split(
    df,
    test_size=0.25,
    stratify=df['genre'],
    random_state=42,
)
# Carve the validation set out of the remaining rows (25% of them),
# again stratified by genre.
train, val = train_test_split(
    train,
    test_size=0.25,
    stratify=train['genre'],
    random_state=42,
)
# Report the split sizes in train / val / test order.
print(*map(len, (train, val, test)))

7119 2373 3164


In [4]:
# Convert every split from a DataFrame into a list of per-row dicts.
train, test, val = (
    split.to_dict(orient='records') for split in (train, test, val)
)

# Show the first training record as a sanity check.
train[0]

{'title': 'Tale of the Nine Tailed',
 'simple_desc': 'An urban dark fantasy drama about a gumiho who settles into the city and a producer who is after it.',
 'genre': 'Fantasy',
 'img_local_path': './datasets/ml-industry/task2/images/Fantasy/Tale of the Nine Tailed.jpg'}

In [5]:
# One IMDBDataset per (modality, split) pair. The `type` argument presumably
# selects which field of each record the dataset serves -- confirm in core.dataset.

# title datasets (train / validation / test)
title_trainset, title_validset, title_testset = (
    IMDBDataset(split, type='title') for split in (train, val, test)
)

# description datasets (train / validation / test)
desc_trainset, desc_validset, desc_testset = (
    IMDBDataset(split, type='description') for split in (train, val, test)
)

# image datasets (train / validation / test)
img_trainset, img_validset, img_testset = (
    IMDBDataset(split, type='image') for split in (train, val, test)
)

In [6]:
# Reuse the label <-> id mappings exposed by the title training set.
label2id, id2label = title_trainset.label2id, title_trainset.id2label

## Multimodal with Late Fusion


#### Text Model (Title)

In [7]:
# Load the tokenizer/classifier pair for titles from the configured checkpoint,
# passing the genre label mappings along.
title_bert_tokenizer, title_bert_model = models.get_bert_model(
    config['text_model_ckpt'],
    id2label=id2label,
    label2id=label2id,
)
# Move the classifier onto the selected device.
title_bert_model = title_bert_model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
# Attach the title tokenizer to each title split (train, validation, test).
for title_split in (title_trainset, title_validset, title_testset):
    title_split.set_tokenizer(title_bert_tokenizer)

In [8]:
# dataloader
# Reshuffle the training samples at every epoch so mini-batches differ between
# epochs; without shuffle=True the loader replays the same batches in the same
# order each epoch. Validation and test loaders keep a fixed order.
title_train_loader = DataLoader(title_trainset, batch_size=config['batch_size'], shuffle=True)
title_valid_loader = DataLoader(title_validset, batch_size=config['batch_size'])
title_test_loader = DataLoader(title_testset, batch_size=config['batch_size'])

##### Modelling

In [None]:
# Fine-tune the title classifier using the title train/validation loaders.
# training.train returns two values, bound here as `model` and `history`.
model_name = 'title_bert'
model, history = training.train(
    title_bert_model,
    model_name,
    title_train_loader,
    title_valid_loader,
    device=device,
    config=config,
)

#### Text Model (Description)

In [None]:
# Load the tokenizer/classifier pair for descriptions from the configured
# checkpoint. The genre label mappings are passed exactly as for the title
# model so both classifiers are built against the same label set; omitting
# them would leave the description model on get_bert_model's defaults.
desc_bert_tokenizer, desc_bert_model = models.get_bert_model(
    config['text_model_ckpt'],
    label2id=label2id,
    id2label=id2label,
)
# Move the classifier onto the selected device.
desc_bert_model = desc_bert_model.to(device)

In [None]:
# Attach the description tokenizer to each description split (train, validation, test).
for desc_split in (desc_trainset, desc_validset, desc_testset):
    desc_split.set_tokenizer(desc_bert_tokenizer)

In [None]:
# dataloader
# Reshuffle the training samples at every epoch so mini-batches differ between
# epochs; without shuffle=True the loader replays the same batches in the same
# order each epoch. Validation and test loaders keep a fixed order.
desc_train_loader = DataLoader(desc_trainset, batch_size=config['batch_size'], shuffle=True)
desc_valid_loader = DataLoader(desc_validset, batch_size=config['batch_size'])
desc_test_loader = DataLoader(desc_testset, batch_size=config['batch_size'])

##### Modelling

In [None]:
# Fine-tune the description classifier using the description train/validation loaders.
# training.train returns two values, bound here as `model` and `history`.
model_name = 'desc_bert'
model, history = training.train(
    desc_bert_model,
    model_name,
    desc_train_loader,
    desc_valid_loader,
    device=device,
    config=config,
)