In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the dataset paths used further down
# point into this mount.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!git clone https://github.com/fhrzn/industrial-ai

fatal: destination path 'industrial-ai' already exists and is not an empty directory.


In [None]:
!cd industrial-ai && git pull

Already up to date.


In [None]:
!cd industrial-ai/Task\ 2/ && pip install -q -r requirement.txt

In [None]:
import sys

# Expose the cloned repo's "Task 2" folder on the import path so the `core`
# package imported in the next cell can be resolved.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
TASK_DIR = 'industrial-ai/Task 2'
if TASK_DIR not in sys.path:
    sys.path.append(TASK_DIR)

In [None]:
import pandas as pd
import numpy as np
from core.dataset import IMDBDataset
from core import models, training
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import torch
from omegaconf import OmegaConf
from torch import optim
from transformers import get_scheduler

In [None]:
# Run on the GPU when the runtime exposes one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Load the project configuration shipped with the repository checkout.
config = OmegaConf.load('./industrial-ai/Task 2/config/config.yaml')
# Batch size used by every DataLoader / Trainer in this notebook.
config['batch_size'] = 128

In [None]:
BASE_PATH_DRIVE = './drive/MyDrive/ITMO Master/'

# Re-root the repo-relative dataset path ('./...') onto the Drive folder.
# BASE_PATH_DRIVE itself starts with './', so an unconditional
# str.replace('./', BASE_PATH_DRIVE) would prepend the prefix again each time
# this cell is re-run (and would also rewrite any './' later in the path).
# Guarding on the prefix keeps the cell idempotent.
dataset_path = config['dataset_path']
if dataset_path.startswith('./') and not dataset_path.startswith(BASE_PATH_DRIVE):
    config['dataset_path'] = BASE_PATH_DRIVE + dataset_path[len('./'):]

In [None]:
df = pd.read_csv(config['dataset_path'])
# Adjust the image paths because the notebook runs on Colab: only a leading './'
# is re-rooted onto the Drive folder, so a './' that happens to occur later
# inside a path (e.g. in a file name) is left untouched.
df['img_local_path'] = df['img_local_path'].apply(
    lambda path: BASE_PATH_DRIVE + path[len('./'):] if path.startswith('./') else path
)

# Hold out 25% of the rows as the test set, stratified by genre.
train, test = train_test_split(df, test_size=0.25, random_state=42, stratify=df.genre)
# Carve a further 25% of the remaining training rows off as the validation set.
train, val = train_test_split(train, test_size=0.25, random_state=42, stratify=train.genre)
print(len(train), len(val), len(test))

7119 2373 3164


In [None]:
# Turn each split from a DataFrame into a list of per-row dicts.
# Note: this rebinds train/test/val to lists, so re-running this cell on its own
# fails (lists have no `to_dict`); re-run the split cell first.
train, test, val = (
    split.to_dict(orient='records') for split in (train, test, val)
)

# Show one record as a quick sanity check.
train[0]

{'title': 'Tale of the Nine Tailed',
 'simple_desc': 'An urban dark fantasy drama about a gumiho who settles into the city and a producer who is after it.',
 'genre': 'Fantasy',
 'img_local_path': './drive/MyDrive/ITMO Master/datasets/ml-industry/task2/images/Fantasy/Tale of the Nine Tailed.jpg'}

In [None]:
# One train/valid/test dataset triple per modality, each built from the same
# record splits and differing only in the `type` passed to IMDBDataset.

# title dataset
title_trainset, title_validset, title_testset = (
    IMDBDataset(records, type='title') for records in (train, val, test)
)

# description dataset
desc_trainset, desc_validset, desc_testset = (
    IMDBDataset(records, type='description') for records in (train, val, test)
)

# image dataset
img_trainset, img_validset, img_testset = (
    IMDBDataset(records, type='image') for records in (train, val, test)
)

## Multimodal with Late Fusion

### Text Model (Title)

In [None]:
# Fetch the tokenizer/model pair for the checkpoint named in
# config['text_model_ckpt'] through the project's models.get_bert_model helper.
title_bert_tokenizer, title_bert_model = models.get_bert_model(config['text_model_ckpt'])
# Move the model onto the device selected earlier (GPU when available).
title_bert_model = title_bert_model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Attach the title tokenizer to every title split.
for title_dataset in (title_trainset, title_validset, title_testset):
    title_dataset.set_tokenizer(title_bert_tokenizer)

In [None]:
# dataloader
# Reshuffle the training samples at every epoch (DataLoader defaults to
# shuffle=False, which would feed identical batches in identical order each
# epoch). The evaluation loaders keep a fixed order.
title_train_loader = DataLoader(title_trainset, batch_size=config['batch_size'], shuffle=True)
title_valid_loader = DataLoader(title_validset, batch_size=config['batch_size'])
title_test_loader = DataLoader(title_testset, batch_size=config['batch_size'])

#### Modelling

In [None]:
# Fine-tune the title model with the project's own training loop; the call
# hands back a model and a history object.
model_name = 'title_bert'
model, history = training.train(
    title_bert_model,
    model_name,
    title_train_loader,
    title_valid_loader,
    config=config,
    device=device,
)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  0%|          | 0/168 [00:00<?, ?it/s]

TLoss: 3.196 | TAcc: 0.045 | VLoss: 3.174 | VAcc: 0.014
TLoss: 3.168 | TAcc: 0.054 | VLoss: 3.161 | VAcc: 0.020
TLoss: 3.157 | TAcc: 0.060 | VLoss: 3.154 | VAcc: 0.024


In [None]:
# Display the effective configuration, including the overrides applied in earlier cells.
config

{'dataset_path': './drive/MyDrive/ITMO Master/datasets/ml-industry/task2/processed.csv', 'text_model_ckpt': 'bert-base-uncased', 'image_model_ckpt': 'google/vit-base-patch16-224-in21k', 'image_model_ckpt_2': 'microsoft/resnet-152', 'batch_size': 128, 'num_epoch': 3, 'early_stop': 5, 'save_strategy': 'epoch', 'text_model_learning_rate': 1e-05, 'image_model_learning_rate': 1e-05, 'log_every': 1}

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the Hugging Face Trainer run on the title model.
training_args = TrainingArguments(
    output_dir='./content',
    num_train_epochs=5,
    # Use the learning rate configured for the text model rather than a
    # hard-coded 1e-3, which is far above the usual fine-tuning range for BERT
    # and disagrees with the project configuration.
    learning_rate=config['text_model_learning_rate'],
    per_device_train_batch_size=config['batch_size'],
    per_device_eval_batch_size=config['batch_size'],
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    disable_tqdm=False,
    # NOTE(review): the logging interval is derived from the batch size
    # (128 // 5 = 25 steps); confirm this coupling is intended.
    logging_steps=config['batch_size'] // 5,
)

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the class is
            taken as the argmax over the last axis) and ``label_ids`` (the
            reference labels). Presumably the evaluation-prediction object the
            Trainer hands to its ``compute_metrics`` callback.

    Returns:
        dict: ``{'accuracy': <score>}`` as computed by ``accuracy_score``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
# Sanity check: push one training batch through the model returned by
# training.train and display the raw output.
# Gradient tracking is switched off because nothing is back-propagated here;
# otherwise the displayed output (kept alive by the notebook's Out history)
# would hold on to the autograd graph.
sample_batch = next(iter(title_train_loader)).to(device)
with torch.no_grad():
    sample_output = model(**sample_batch)
sample_output

SequenceClassifierOutput(loss=tensor(3.0988, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0505, -0.3824, -0.4711,  ...,  0.3142,  0.6722,  0.8860],
        [ 0.4047,  1.0001,  0.9609,  ..., -0.4933, -0.5287, -0.9323],
        [ 0.0966,  0.0229,  0.4293,  ...,  0.2549, -0.9046, -0.5038],
        ...,
        [-0.1490, -0.4990, -0.6038,  ...,  0.6131,  0.4280,  1.0121],
        [ 0.0107, -0.6798, -0.4360,  ...,  0.6404,  0.5896,  0.8996],
        [-0.2443,  0.2599,  0.2968,  ..., -0.6214,  0.2111, -0.1794]],
       device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# Train the title model with the Hugging Face Trainer, using the arguments and
# the accuracy metric defined in the cells above.
# NOTE(review): title_bert_model was already passed to training.train earlier,
# so this run presumably starts from those weights rather than from the
# pretrained checkpoint - confirm that is intended.
trainer = Trainer(
    model=title_bert_model,
    args=training_args,
    train_dataset=title_trainset,
    eval_dataset=title_validset,
    tokenizer=title_bert_tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

***** Running training *****
  Num examples = 7119
  Num Epochs = 5
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 280
  Number of trainable parameters = 109500696


Epoch,Training Loss,Validation Loss,Accuracy
1,3.2092,3.195771,0.039612
2,3.1955,3.189003,0.042984
3,3.1895,3.183756,0.041298
4,3.1862,3.180531,0.042984
5,3.1799,3.177661,0.042984


***** Running Evaluation *****
  Num examples = 2373
  Batch size = 128
Saving model checkpoint to ./content/checkpoint-56
Configuration saved in ./content/checkpoint-56/config.json
Model weights saved in ./content/checkpoint-56/pytorch_model.bin
tokenizer config file saved in ./content/checkpoint-56/tokenizer_config.json
Special tokens file saved in ./content/checkpoint-56/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2373
  Batch size = 128
Saving model checkpoint to ./content/checkpoint-112
Configuration saved in ./content/checkpoint-112/config.json
Model weights saved in ./content/checkpoint-112/pytorch_model.bin
tokenizer config file saved in ./content/checkpoint-112/tokenizer_config.json
Special tokens file saved in ./content/checkpoint-112/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2373
  Batch size = 128
Saving model checkpoint to ./content/checkpoint-168
Configuration saved in ./content/checkpoint-168/config.json
Model wei

TrainOutput(global_step=280, training_loss=3.1956513677324567, metrics={'train_runtime': 417.0848, 'train_samples_per_second': 85.342, 'train_steps_per_second': 0.671, 'total_flos': 1170910995287040.0, 'train_loss': 3.1956513677324567, 'epoch': 5.0})