<a href="https://colab.research.google.com/github/egemenpamukcu/Content-Analysis-Final-Project/blob/main/multimodal_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive
drive.mount('/content/drive')
!pip install transformers

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from collections import Counter
import numpy as np
import pandas as pd
wiki = pd.read_csv('/content/drive/MyDrive/deep-learning/movies.csv')
wiki = wiki[~wiki['Genre'].isna()]

wiki['Genre'] = wiki['Genre'].apply(lambda x: x.split('|'))
genres = wiki['Genre'].sum()
genres = Counter(genres)
genres = [k for k, v in genres.items() if v > 900] + ['Western']
genres = dict(zip(genres, range(len(genres))))

one_hot_genres = []
for genre_list in wiki['Genre']: 
    one_hot_genre = np.zeros(len(genres))
    for genre in genre_list: 
        try:
            one_hot_genre[genres[genre]] = 1
        except KeyError: 
            continue
    one_hot_genres.append(one_hot_genre)

one_hot_genres = np.array(one_hot_genres)

In [3]:
# Create sentence and label lists
sentences = wiki['plot'].values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = one_hot_genres

In [4]:
from transformers import DistilBertTokenizerFast

MAX_LEN = 350

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', do_lower_case=True, truncation=True, max_length=MAX_LEN)

tokenized_texts = [tokenizer.tokenize(sent)[:MAX_LEN - 1] + ['[SEP]'] for sent in sentences]
print("Tokenize the first plot:")
print(tokenized_texts[1])

Token indices sequence length is longer than the specified maximum sequence length for this model (713 > 512). Running this sequence through the model will result in indexing errors


Tokenize the first plot:
['[CLS]', 'a', 'series', 'of', 'murders', 'of', 'rich', 'young', 'women', 'throughout', 'arizona', 'bear', 'distinctive', 'signatures', 'of', 'a', 'serial', 'killer', '.', 'clues', 'lead', 'detective', 'charles', 'mendoza', 'to', 'visit', 'paul', 'white', ',', 'a', 'sound', 'expert', 'installing', 'hi', '-', 'fi', 'systems', 'in', 'wealthy', 'people', "'", 's', 'homes', '.', 'his', 'special', 'talent', 'is', 'to', 'make', 'a', 'noise', 'which', 'echoes', 'through', 'the', 'air', 'ca', '##vi', '##ties', 'in', 'his', 'head', 'and', 'shows', 'him', 'where', 'the', 'sound', 'of', 'the', 'speakers', 'should', 'come', 'from', 'and', 'echo', 'in', 'the', 'room', '.', 'he', 'is', 'married', 'to', 'joan', ',', 'whom', ',', 'ten', 'years', 'earlier', ',', 'he', 'had', 'seduce', '##d', 'away', 'from', 'mike', 'des', '##anto', '##s', ',', 'her', 'then', 'current', 'boyfriend', '.', 'joan', 'is', 'questioned', 'by', 'mendoza', ',', 'but', 'does', 'not', 'believe', 'his', 'i

In [5]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
from keras.preprocessing.sequence import pad_sequences

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

from sklearn.model_selection import train_test_split

# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2020, test_size=0.2)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2020, test_size=0.2)


import torch

# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [6]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, DistilBertForSequenceClassification
from tqdm import tqdm, trange
import io

# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [7]:
output_dim = len(genres)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=output_dim).cuda()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

In [8]:
import torch.nn as nn
import torch.nn.functional as F

class MultiModalNet(nn.Module):

    def __init__(self):
        super(MultiModalNet, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.distilbert = model
        # self.conv1 = nn.Conv2d(1, 6, 5)
        # self.conv2 = nn.Conv2d(6, 16, 5)
        # # an affine operation: y = Wx + b
        # self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        # self.fc2 = nn.Linear(120, 84)
        self.fc1 = nn.Linear(output_dim, output_dim)

    def forward(self, input_ids, input_mask):
        # Max pooling over a (2, 2) window
        # x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # # If the size is a square, you can specify with a single number
        # x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        # x = F.relu(self.fc1(x))
        # x = F.relu(self.fc2(x))
        outputs = self.distilbert(input_ids, 
                            attention_mask=input_mask)
        x = F.relu(outputs[0])
        x = self.fc1(outputs[0])

        return x, outputs[1:]

mm_model = MultiModalNet().cuda()

In [9]:
param_optimizer = list(mm_model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [10]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(mm_model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [11]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

criterion = nn.BCEWithLogitsLoss()

In [12]:
import time
import datetime
from sklearn.metrics import roc_auc_score

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_labels = (preds > .5).astype(int)
    mean_accuracy = (labels == pred_labels).mean(axis = 0).mean()
    return mean_accuracy

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [13]:
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    mm_model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        mm_model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = mm_model(b_input_ids, b_input_mask)
        logits = torch.sigmoid(outputs[0])
        loss = criterion(logits, b_labels)
        
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        # loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(mm_model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    mm_model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
             outputs = mm_model(b_input_ids, b_input_mask)

        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = torch.sigmoid(outputs[0])

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    40  of    385.    Elapsed: 0:00:41.
  Batch    80  of    385.    Elapsed: 0:01:24.
  Batch   120  of    385.    Elapsed: 0:02:06.
  Batch   160  of    385.    Elapsed: 0:02:48.
  Batch   200  of    385.    Elapsed: 0:03:30.
  Batch   240  of    385.    Elapsed: 0:04:12.
  Batch   280  of    385.    Elapsed: 0:04:55.
  Batch   320  of    385.    Elapsed: 0:05:37.
  Batch   360  of    385.    Elapsed: 0:06:19.

  Average training loss: 0.79
  Training epcoh took: 0:06:45

Running Validation...
  Accuracy: 0.78
  Validation took: 0:00:38

Training...
  Batch    40  of    385.    Elapsed: 0:00:42.
  Batch    80  of    385.    Elapsed: 0:01:24.
  Batch   120  of    385.    Elapsed: 0:02:07.
  Batch   160  of    385.    Elapsed: 0:02:49.
  Batch   200  of    385.    Elapsed: 0:03:31.
  Batch   240  of    385.    Elapsed: 0:04:13.
  Batch   280  of    385.    Elapsed: 0:04:56.
  Batch   320  of    385.    Elapsed: 0:05:38.
  Batch   360  of    385.    Elapsed: 0:06:20.

In [14]:
start = True
for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():        
        outputs = mm_model(b_input_ids, b_input_mask)

    # Get the "logits" output by the model. The "logits" are the output
    # values prior to applying an activation function like the softmax.
    logits = torch.sigmoid(outputs[0])

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    if start: 
        probs = logits 
        test_labels = label_ids
        start = False
    else:
        probs = np.concatenate((probs, logits))
        test_labels = np.concatenate((test_labels, label_ids))

In [15]:
from sklearn import metrics
# precision, recall, accuracy, f1, roc_auc
genres_i = [k for k, v in sorted(genres.items(), key=lambda x: x[1])]

metric_dict = {
    'f1_score': [],
    'precision': [],
    'recall': [],
    'accuracy': [],
    'roc_auc': []
}
for i, genre in enumerate(genres_i):
    metric_dict['f1_score'].append(metrics.f1_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['precision'].append(metrics.precision_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['recall'].append(metrics.recall_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['accuracy'].append(metrics.accuracy_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['roc_auc'].append(metrics.roc_auc_score(test_labels[:, i], probs[:, i] > .5))

metric_dict['genres'] = genres_i
metric_df = pd.DataFrame(metric_dict)
metric_df.sort_values('roc_auc', ascending=False, inplace=True)
metric_df

Unnamed: 0,f1_score,precision,recall,accuracy,roc_auc,genres
4,0.602492,0.823529,0.475,0.771856,0.708374,Comedy
0,0.0,0.0,0.0,0.842379,0.5,Action
1,0.0,0.0,0.0,0.887553,0.5,Horror
2,0.0,0.0,0.0,0.936952,0.5,Sci-Fi
3,0.0,0.0,0.0,0.863503,0.5,Thriller
5,0.0,0.0,0.0,0.790705,0.5,Romance
6,0.69729,0.535262,1.0,0.535262,0.5,Drama
7,0.0,0.0,0.0,0.938252,0.5,Family
8,0.102374,0.053949,1.0,0.053949,0.5,Fantasy
9,0.0,0.0,0.0,0.825804,0.5,Crime


In [16]:
(probs > .0009).sum(0)

array([   0, 3077,  860,  589, 3077, 3077, 3077,  450, 3077, 3077,  551,
       3077, 3077])

In [17]:
genres_i

['Action',
 'Horror',
 'Sci-Fi',
 'Thriller',
 'Comedy',
 'Romance',
 'Drama',
 'Family',
 'Fantasy',
 'Crime',
 'Adventure',
 'Mystery',
 'Western']