https://colab.research.google.com/drive/1ayyWPuOw8ET2SRZ5KD-r4dwMH4jBn-B8#scrollTo=h2GTb3WrrH7y

# Data

In [1]:
import csv
import os
import sys

import pandas as pd

In [2]:
# Show the kernel's working directory; relative paths such as './logs' used later resolve against it.
os.getcwd()

'C:\\Users\\frank\\OneDrive\\Desktop\\7643_Project\\deephumor-master'

In [3]:
from deephumor.data.vocab import build_vocab_from_file
from deephumor.data.tokenizers import WordPunctTokenizer, CharTokenizer

# Root folder of the memes900k dataset. Set the MEMES900K_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original location is used, so existing runs behave exactly as before.
DATA_DIR = os.environ.get(
    'MEMES900K_DIR',
    'C:/Users/frank/OneDrive/Desktop/7643_Project/dataset/memes900k',
)
# Training captions file inside the dataset folder (read below as tab-separated text).
CAPTIONS_FILE = os.path.join(DATA_DIR, 'captions_train.txt')

In [4]:
# The captions are free text and can contain double quotes. With pandas'
# default quoting a '"' opens a quoted field that may swallow the following
# tab/newline separators, producing merged or "bad" (silently skipped) rows.
# QUOTE_NONE makes every quote character literal. Rows that are still
# malformed are skipped, as before.
df = pd.read_csv(
    CAPTIONS_FILE,
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
    on_bad_lines='skip',
)

In [5]:
# Preview the first rows of the parsed captions table.
df.head()

Unnamed: 0,0,1,2
0,Y U No,984,Victoria <sep> y u no tell us your secret?!
1,Y U No,908,KONY <sep> Y u no take justin bieber
2,Y U No,823,Google <sep> Y U NO LET ME FINISH TYPING?
3,Y U No,727,universal remote <sep> y u no work on universe?
4,Y U No,707,pink floyd <sep> y u no need no education?


In [6]:
# Will hold the caption text of every line in the training captions file.
list_cap = []

In [7]:
# Start from an empty list inside this cell so that re-running it does not
# append every caption a second time.
list_cap = []

# Decode explicitly as UTF-8 (the encoding pandas used to read the same file
# above) instead of relying on the platform default.
with open(CAPTIONS_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        # Each line has three tab-separated fields; only the last one (the
        # caption) is kept. A line with a different field count raises
        # ValueError, exactly as before.
        _, _, caption = line.strip().split('\t')
        list_cap.append(caption)

In [8]:
# Spot-check ten captions from inside the list.
list_cap[4000:4010]

['joined isis to meet girls <sep> they made him a suicide bomber',
 "Appeared in a girl's dream <sep> nightmare",
 'tried to step on a spider <sep> The spider was actully a nail...',
 'Finds out who put the bomp in the bomp-shabomp-shabomp <sep> Forgets',
 'gives dollar to bum <sep> needs it back',
 "There are 3.2 Billion women on earth <sep> Still can't get laid",
 'front row tickets for a comedy night <sep> pete prodge is the headline act',
 'reached for the last slice <sep> got my hand slapped',
 'Spends 6500 credits on repairs <sep> Picks up an Instant Repair PowerUp',
 'Gets a girlfriend <sep> its his long lost cousin']

# Word-Level

In [9]:
# Threshold passed to the vocabulary builder as `min_df`
# (presumably a minimum document frequency per token — confirm in deephumor.data.vocab).
MIN_DOC_COUNT = 5

# Word/punctuation-level tokenizer and the vocabulary built from the training captions file.
tokenizer = WordPunctTokenizer()
vocab = build_vocab_from_file(CAPTIONS_FILE, tokenizer, min_df=MIN_DOC_COUNT)
len(vocab)

36541

In [10]:
# Show the first 100 entries of the word-level vocabulary.
print(vocab.tokens[:100])

['<pad>', '<unk>', '<bos>', '<eos>', '<sep>', '<emp>', '!', '!!', '!!!', '!!!"', "!!!'", '!!!)', '!!!*', '!!"', "!!'", "!!''", '!!)', '!!,', '!!.', '!!?', '!"', "!'", "!''", '!)', '!*', '!,', '!-', '!.', '!..', '!:', '!=', '!?', '!?!', '!?"', '!??', '"', '"!', '"!!', '"!!!', '"!?', '"#', '"$', '"\'', '".', '"..', '"...', '"..."', '"?', '"?!', '"??', '"???', '#', '#!', '##', '###', '#%', "#'", '#*', '#?', '$', '$$', '$$$', '$&', '$,', '$.', '$?', '%', '%!', '%#', '%.', '%...', '%?', '%?!', '&', "'", "''", "''i", "''the", "''you", "'12", "'86", "'a", "'a'", "'all", "'and", "'b'", "'bad", "'bout", "'c'", "'cause", "'come", "'cuz", "'d", "'d'", "'do", "'don't", "'e'", "'em", "'er", "'free"]


# Character-level

In [11]:
# Character-level tokenizer and vocabulary. The threshold reuses the
# MIN_DOC_COUNT constant from the word-level cell (same value, 5) instead of
# repeating the literal, so both vocabularies are tuned in one place.
tokenizer_chars = CharTokenizer()
vocab_chars = build_vocab_from_file(CAPTIONS_FILE, tokenizer_chars, min_df=MIN_DOC_COUNT)
len(vocab_chars)

71

In [12]:
# Show every entry of the character-level vocabulary.
print(vocab_chars.tokens)

['<pad>', '<unk>', '<bos>', '<eos>', '<sep>', '<emp>', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '@', '[', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', '~']


# Datasets

In [13]:
from deephumor.data import MemeDataset

# NUM_CLASSES = 200  # use this to limit the dataset size
NUM_CLASSES = 2  # use this to limit the dataset size
# Index of the '<pad>' token in the word-level vocabulary.
PAD_IDX = vocab.stoi['<pad>']

from torchvision import transforms
# Image preprocessing applied by the dataset: resize, convert to tensor, normalise per channel.
# NOTE(review): Resize(224) with a single int scales the shorter side and keeps the
# aspect ratio (per torchvision docs); batching relies on images coming out 224x224 —
# confirm this holds for non-square templates.
# The mean/std values are the ImageNet channel statistics commonly used with torchvision models.
image_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# One MemeDataset per split, keyed by split name.
splits = ['train', 'val', 'test']
datasets = {
    # WORD-LEVEL
    split: MemeDataset(DATA_DIR, vocab, tokenizer, image_transform=image_transform,
                       num_classes=NUM_CLASSES, split=split, preload_images=True)
    
    # CHAR-LEVEL
    # split: MemeDataset(DATA_DIR, vocab_chars, tokenizer_chars, image_transform=image_transform,
    #                    num_classes=NUM_CLASSES, split=split, preload_images=True)
    for split in splits
}

# Report the number of samples in each split.
for split in splits:
    print(split, len(datasets[split]))

train 5000
val 500
test 500


## DataLoaders

In [14]:
from torch.utils.data import DataLoader
from deephumor.data import pad_collate

BATCH_SIZE = 128

# One DataLoader per split; only the training split is shuffled.
dataloaders = {
    split: DataLoader(
        dataset=datasets[split],
        batch_size=BATCH_SIZE,
        shuffle=(split == 'train'),
        collate_fn=pad_collate,
    )
    for split in splits
}

# Fetch exactly one validation batch for a shape sanity check.
# next(iter(...)) states that intent directly, replacing the for/break loop
# that leaked its loop variables into the notebook namespace.
labels, captions, images = next(iter(dataloaders['val']))
print(labels.size(), captions.size(), images.size())

# Unpadded caption lengths: padded width minus the number of PAD_IDX entries per row.
lengths = captions.size(1) - (captions == PAD_IDX).sum(dim=1)
lengths

torch.Size([128, 4]) torch.Size([128, 23]) torch.Size([128, 3, 224, 224])


tensor([11, 13,  9, 13,  9,  8, 10, 13, 11, 12, 18, 10,  8,  6, 11, 16, 12, 12,
        11, 12,  9, 12,  9, 15, 11, 14, 10,  6, 14, 12, 17, 10, 19,  9, 13, 10,
        12, 16,  9,  8, 10, 10, 11, 13, 11,  7, 10, 12, 14,  9, 11, 10,  9, 10,
        12, 13, 10,  9, 13, 11, 11,  8, 13, 10, 11, 12,  8, 14, 10, 11, 12, 11,
         9, 10, 14,  9,  9, 12, 11, 10,  7, 10, 12, 13,  9, 12,  8, 14, 10, 14,
         9,  7, 16,  9,  9,  9,  8, 23, 12,  8, 16, 10,  9, 11, 11, 10, 13,  9,
        12,  8,  9,  8, 11,  9, 12, 11, 12, 11,  9,  9, 14, 11,  9, 12,  9, 10,
        10,  7])

# Experiments

### Load models and Trainer

In [15]:
import torch
import torch.nn as nn
from deephumor.models import CaptioningLSTM, CaptioningLSTMWithLabels, CaptioningTransformer, CaptioningTransformerBase
from deephumor.experiments import Trainer

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')  # uncomment to force CPU execution

def count_parameters(model):
    """Return the total number of elements across the trainable parameters of ``model``.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Words

In [16]:
# Size of the word-level vocabulary.
NUM_TOKENS = len(vocab)
# NOTE(review): the word-level optimizer cell below hard-codes lr=8e-5 and does not read this value.
LEARNING_RATE = 2e-4

In [17]:
# Limits the dataset size (passed to MemeDataset as num_classes).
NUM_CLASSES = 2
# NUM_CLASSES = 200

# Rebuild the word-level datasets for every split (same construction as the "Datasets" cell above).
datasets = {
    # WORD-LEVEL
    split: MemeDataset(DATA_DIR, vocab, tokenizer, image_transform=image_transform,
                       num_classes=NUM_CLASSES, split=split, preload_images=True)
    
    # CHAR-LEVEL
    # split: MemeDataset(DATA_DIR, vocab_chars, tokenizer_chars, image_transform=image_transform,
    #                    num_classes=NUM_CLASSES, split=split, preload_images=True)
    for split in splits
}

# Report the number of samples in each split.
for split in splits:
    print(split, len(datasets[split]))

# One DataLoader per split; only the training split is shuffled.
dataloaders = {
    split:  DataLoader(dataset=datasets[split], batch_size=BATCH_SIZE, 
                       shuffle=split == 'train', collate_fn=pad_collate)
    for split in splits
}

# Take the first validation batch only (the loop breaks immediately); `labels`,
# `captions`, `images` and `lengths` stay in the namespace for the cells below.
for (labels, captions, images) in dataloaders['val']:
    # Unpadded lengths: padded width minus the count of PAD_IDX entries per row.
    lengths = captions.size(1) - (captions == PAD_IDX).sum(dim=1)
    print(labels.size(), captions.size(), images.size())
    break

train 5000
val 500
test 500
torch.Size([128, 4]) torch.Size([128, 23]) torch.Size([128, 3, 224, 224])


### Transformer Decoder with Global Image embedding (base)

In [18]:
# Fixed seed so the weight initialisation is repeatable.
torch.manual_seed(0)

# Word-level captioning transformer. The padding index is taken from the
# word vocabulary (PAD_IDX) instead of a hard-coded 0, so the model stays in
# sync with the value used to compute caption lengths.
model_transformer = CaptioningTransformerBase(
    num_tokens=len(vocab),
    hid_dim=512,
    n_layers=3,
    n_heads=8,
    pf_dim=2048,
    enc_dropout=0.3,
    dec_dropout=0.1,
    pad_index=PAD_IDX,
    max_len=128,
).to(device)

print('# parameters:', count_parameters(model_transformer))

# parameters: 47240893


In [19]:
# Forward-pass sanity check on the sample validation batch.
# The tensors are moved to `device` (the same device the model lives on)
# rather than calling .cuda(), which raises on a CPU-only machine.
with torch.no_grad():
    out = model_transformer(images.to(device), captions.to(device))

out.size()

torch.Size([128, 24, 36541])

In [20]:
# Log to the same relative './logs' directory that the TensorBoard cell reads
# (--logdir ./logs) and that the character-level Trainer uses; the previous
# absolute, user-specific path pointed at a different folder.
trainer_transformer = Trainer('TransformerDecoderBaseWords', log_dir='./logs', text_labels=False)

In [21]:
# (Re)load the TensorBoard notebook extension and open it on the relative ./logs directory.
%reload_ext tensorboard
%tensorboard --logdir ./logs

Reusing TensorBoard on port 6006 (pid 23484), started 5:34:14 ago. (Use '!kill 23484' to kill it.)

In [22]:
# Reseed so the training run is repeatable.
torch.manual_seed(0)
criterion = nn.CrossEntropyLoss()
# Earlier learning-rate setting, kept for reference:
# optimizer = torch.optim.Adam(model_transformer.parameters(), lr=3e-4) #LEARNING_RATE)
# NOTE(review): the learning rate is hard-coded here; the LEARNING_RATE constant is not used.
optimizer = torch.optim.Adam(model_transformer.parameters(), lr=8e-5) #LEARNING_RATE)
# Multiplies the learning rate by 0.95 on every scheduler step
# (presumably stepped once per epoch inside Trainer — confirm).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

# Train the word-level model for 20 epochs; the trainer's return value is kept in exp_data_transf.
exp_data_transf = trainer_transformer.train_model(
    model_transformer, dataloaders, optimizer, 
    criterion, scheduler=scheduler, 
    n_epochs=20
)

Epoch 01/20
  train loss: 8.10094, perplexity: 6364.940
  val   loss: 6.95657, perplexity: 1566.304
  epoch time: 28.73s
Epoch 02/20
  train loss: 6.43369, perplexity: 929.733
  val   loss: 6.13372, perplexity: 685.602
  epoch time: 28.51s
Epoch 03/20
  train loss: 5.86891, perplexity: 508.978
  val   loss: 5.88458, perplexity: 545.196
  epoch time: 39.79s
Epoch 04/20
  train loss: 5.65066, perplexity: 416.722
  val   loss: 5.71437, perplexity: 474.061
  epoch time: 39.93s
Epoch 05/20
  train loss: 5.48487, perplexity: 359.243
  val   loss: 5.80910, perplexity: 581.573
  epoch time: 39.68s
Epoch 06/20
  train loss: 5.42726, perplexity: 349.468
  val   loss: 5.54040, perplexity: 438.253
  epoch time: 46.26s
Epoch 07/20
  train loss: 5.29142, perplexity: 312.167
  val   loss: 5.45215, perplexity: 419.549
  epoch time: 42.22s
Epoch 08/20
  train loss: 5.21179, perplexity: 293.723
  val   loss: 5.39442, perplexity: 407.312
  epoch time: 44.41s
Epoch 09/20
  train loss: 5.15006, perplexity:

# Chars

In [16]:
# Size of the character-level vocabulary.
NUM_TOKENS = len(vocab_chars)
# Learning rate read by the character-level optimizer cell below; 5e-4 was an earlier setting.
# LEARNING_RATE = 5e-4
LEARNING_RATE = 1e-4

In [17]:
# Limits the dataset size (passed to MemeDataset as num_classes).
# NUM_CLASSES = 200
NUM_CLASSES = 2

# Character-level datasets for every split, built with the character
# vocabulary and tokenizer.
datasets = {
    split: MemeDataset(DATA_DIR, vocab_chars, tokenizer_chars, image_transform=image_transform,
                       num_classes=NUM_CLASSES, split=split, preload_images=True)
    for split in splits
}

# Report the number of samples in each split.
for split in splits:
    print(split, len(datasets[split]))

# One DataLoader per split; only the training split is shuffled.
dataloaders = {
    split: DataLoader(dataset=datasets[split], batch_size=BATCH_SIZE,
                      shuffle=(split == 'train'), collate_fn=pad_collate)
    for split in splits
}

# The captions in this section are encoded with `vocab_chars`, so the padding
# index must come from that vocabulary rather than from the word-level PAD_IDX.
CHAR_PAD_IDX = vocab_chars.stoi['<pad>']

# Take the first validation batch only (the loop breaks immediately).
for (labels, captions, images) in dataloaders['val']:
    # Unpadded lengths: padded width minus the count of padding entries per row.
    lengths = captions.size(1) - (captions == CHAR_PAD_IDX).sum(dim=1)
    print(labels.size(), captions.size(), images.size())
    break

train 5000
val 500
test 500
torch.Size([128, 7]) torch.Size([128, 94]) torch.Size([128, 3, 224, 224])


In [18]:
# Fixed seed so the weight initialisation is repeatable.
torch.manual_seed(0)

# Character-level captioning transformer. Both the vocabulary size and the
# padding index are read from `vocab_chars` instead of hard-coding 0, so the
# model matches the vocabulary that encodes its inputs.
model_transformer = CaptioningTransformerBase(
    num_tokens=len(vocab_chars),
    hid_dim=512,
    n_layers=3,
    n_heads=8,
    pf_dim=2048,
    enc_dropout=0.3,
    dec_dropout=0.1,
    pad_index=vocab_chars.stoi['<pad>'],
    max_len=128,
).to(device)

print('# parameters:', count_parameters(model_transformer))

# parameters: 10645575


In [19]:
# Trainer for the character-level run; logs go to the relative ./logs directory read by TensorBoard below.
trainer_transformer = Trainer('TransformerDecoderBaseChars', log_dir='./logs', text_labels=False)

In [20]:
# Uncomment to wipe all previous logs before starting TensorBoard (destructive).
# !rm -rf ./logs

# (Re)load the TensorBoard notebook extension and open it on the relative ./logs directory.
%reload_ext tensorboard
%tensorboard --logdir ./logs

Reusing TensorBoard on port 6006 (pid 23484), started 4:46:36 ago. (Use '!kill 23484' to kill it.)

In [21]:
# Reseed so the training run is repeatable.
torch.manual_seed(0)
criterion = nn.CrossEntropyLoss()
# Adam with the LEARNING_RATE set at the top of this section.
optimizer = torch.optim.Adam(model_transformer.parameters(), lr=LEARNING_RATE)
# Multiplies the learning rate by 0.95 on every scheduler step
# (presumably stepped once per epoch inside Trainer — confirm).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

# Train the character-level model for 50 epochs; the trainer's return value is kept in exp_data_transf.
exp_data_transf = trainer_transformer.train_model(
    model_transformer, dataloaders, optimizer, 
    criterion, scheduler=scheduler, 
    n_epochs=50
)

Epoch 01/50
  train loss: 3.17052, perplexity: 25.408
  val   loss: 2.99453, perplexity: 20.176
  epoch time: 51.05s
Epoch 02/50
  train loss: 2.98905, perplexity: 20.067
  val   loss: 2.96844, perplexity: 19.675
  epoch time: 50.00s
Epoch 03/50
  train loss: 2.96921, perplexity: 19.693
  val   loss: 2.95120, perplexity: 19.350
  epoch time: 50.89s
Epoch 04/50
  train loss: 2.93631, perplexity: 19.029
  val   loss: 2.84124, perplexity: 17.263
  epoch time: 52.03s
Epoch 05/50
  train loss: 2.87794, perplexity: 17.912
  val   loss: 2.80054, perplexity: 16.532
  epoch time: 51.88s
Epoch 06/50
  train loss: 2.85185, perplexity: 17.429
  val   loss: 2.78078, perplexity: 16.313
  epoch time: 50.70s
Epoch 07/50
  train loss: 2.83282, perplexity: 17.125
  val   loss: 2.75142, perplexity: 15.774
  epoch time: 51.18s
Epoch 08/50
  train loss: 2.81266, perplexity: 16.770
  val   loss: 2.73329, perplexity: 15.474
  epoch time: 53.12s
Epoch 09/50
  train loss: 2.79755, perplexity: 16.526
  val   lo