In [None]:
#Using https://www.kaggle.com/sauravmaheshkar/neural-image-captioning

In [15]:
%%capture
!pip install --upgrade wandb

## Importing Packages

import os
import torch
import random
import warnings
import numpy as np
import transformers
import pandas as pd 
from PIL import Image
import torch.nn as nn
warnings.filterwarnings("ignore")
import torchvision.transforms as T
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from typing import Callable, Optional

## Logging into Weights and Biases
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("WANDB_API_KEY")
import wandb
wandb.login(key=api_key);

wandb.init(project="show-and-tell", entity="collaborativeml")

## For Reproducibility
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(42)

## Tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased",do_lower_case=True)

## Device Configuration 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


OSError: Could not find kaggle.json. Make sure it's located in C:\Users\u1\.kaggle. Or use the environment method.

In [2]:
## Basic File Paths
data_dir = 'C:/Users/u1\Desktop/archive'
image_dir = f'{data_dir}/flickr30k_images'
csv_file = f'{data_dir}/results.csv'

In [3]:
df = pd.read_csv(csv_file, delimiter='|')
df[' comment_number'][19999] = ' 4'
df[' comment'][19999] = ' A dog runs across the grass .'
df['image_name'] = image_dir+'/'+df['image_name']
df.head(5)

Unnamed: 0,image_name,comment_number,comment
0,C:/Users/u1\Desktop/archive/flickr30k_images/1...,0,Two young guys with shaggy hair look at their...
1,C:/Users/u1\Desktop/archive/flickr30k_images/1...,1,"Two young , White males are outside near many..."
2,C:/Users/u1\Desktop/archive/flickr30k_images/1...,2,Two men in green shirts are standing in a yard .
3,C:/Users/u1\Desktop/archive/flickr30k_images/1...,3,A man in a blue shirt standing in a garden .
4,C:/Users/u1\Desktop/archive/flickr30k_images/1...,4,Two friends enjoy time spent together .


In [4]:
#Restructuring Data
image_name = {
    'image_name':df[df[' comment_number'] == df[' comment_number'][0]]['image_name'].values,
}
comments = {
    'comment_0':df[df[' comment_number'] == df[' comment_number'][0]][' comment'].values,
    'comment_1':df[df[' comment_number'] == df[' comment_number'][1]][' comment'].values,
    'comment_2':df[df[' comment_number'] == df[' comment_number'][2]][' comment'].values,
    'comment_3':df[df[' comment_number'] == df[' comment_number'][3]][' comment'].values,
    'comment_4':df[df[' comment_number'] == df[' comment_number'][4]][' comment'].values,
}

image_name_df = pd.DataFrame.from_dict(image_name)
comments_df = pd.DataFrame.from_dict(comments)

df = pd.concat([image_name_df,comments_df], axis=1)
df.head(5)

Unnamed: 0,image_name,comment_0,comment_1,comment_2,comment_3,comment_4
0,C:/Users/u1\Desktop/archive/flickr30k_images/1...,Two young guys with shaggy hair look at their...,"Two young , White males are outside near many...",Two men in green shirts are standing in a yard .,A man in a blue shirt standing in a garden .,Two friends enjoy time spent together .
1,C:/Users/u1\Desktop/archive/flickr30k_images/1...,Several men in hard hats are operating a gian...,Workers look down from up above on a piece of...,Two men working on a machine wearing hard hats .,Four men on top of a tall structure .,Three men on a large rig .
2,C:/Users/u1\Desktop/archive/flickr30k_images/1...,A child in a pink dress is climbing up a set ...,A little girl in a pink dress going into a wo...,A little girl climbing the stairs to her play...,A little girl climbing into a wooden playhouse,A girl going into a wooden building .
3,C:/Users/u1\Desktop/archive/flickr30k_images/1...,Someone in a blue shirt and hat is standing o...,A man in a blue shirt is standing on a ladder...,A man on a ladder cleans the window of a tall...,man in blue shirt and jeans on ladder cleanin...,a man on a ladder cleans a window
4,C:/Users/u1\Desktop/archive/flickr30k_images/1...,"Two men , one in a gray shirt , one in a blac...",Two guy cooking and joking around with the ca...,Two men in a kitchen cooking food on a stove .,Two men are at the stove preparing food .,Two men are cooking a meal .


In [5]:
#Splitting into Train, Valid and Split
from sklearn.model_selection import train_test_split

## Obtain Train and Test Split 
train, test = train_test_split(df, test_size=0.2, random_state=42)

## Reset Indexes 
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

## Obtain Train and Validation Split 
train, val = train_test_split(train, test_size=0.25, random_state=42)

## Reset Indexes 
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

## Let's see how many entries we have
print(train.shape)
print(val.shape)
print(test.shape)

(19069, 6)
(6357, 6)
(6357, 6)


In [6]:
from torch.utils.data import Dataset

class FlickrDataset(Dataset):
    
    def __init__(self, df, transforms: Optional[Callable] = None) -> None:
        self.df = df
        self.transforms = T.Compose([T.ToTensor(), T.Normalize(mean = [0.5], std = [0.5]), T.Resize((256,256)),])
        
    def __len__(self) -> int:
        return len(self.df)
    
    def __getitem__(self, idx: int):
        
        image_id = self.df.image_name.values[idx]
        image = Image.open(image_id).convert('RGB')
            
        if self.transforms is not None:
            image = self.transforms(image)
            
        comments = self.df[self.df.image_name == image_id].values.tolist()[0][1:][0] # Last zero is to obtain the first caption ONLY
        encoded_inputs = tokenizer(comments, return_token_type_ids = False, return_attention_mask = False, max_length = 100, padding = "max_length", return_tensors = "pt")
        
        sample = {"image":image.to(device),"captions": encoded_inputs["input_ids"].flatten().to(device)}
        
        return sample

In [7]:
batch_size = 32

train_dataset = FlickrDataset(train, transforms = True)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, drop_last=True)

val_dataset = FlickrDataset(val, transforms = True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size = batch_size,drop_last=True)

test_dataset = FlickrDataset(test, transforms = True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size = batch_size,drop_last=True)

In [8]:
class CNN(nn.Module):
    
    def __init__(self, embed_size):
        super(CNN, self).__init__()
        model = models.resnet50(pretrained=True)
        for param in model.parameters():
            param.requires_grad_(False)
        
        modules = list(model.children())[:-1]
        self.model = nn.Sequential(*modules)
        self.embed = nn.Linear(model.fc.in_features, embed_size)
        
    def forward(self, image):
        features = self.model(image)
        features = features.view(features.size(0), -1)
        features = self.embed(features)
                
        return features

In [9]:
class RNN(nn.Module):
    
    def __init__(self, input_size, hidden_size, embedding_dim,vocab_size):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        
        self.embedding = nn.Embedding(num_embeddings = vocab_size,embedding_dim = embedding_dim)
        
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            batch_first=True)
        
        self.fc = nn.Linear(hidden_size, vocab_size)
        
    def init_hidden(self, features):
        
        return (torch.autograd.Variable(torch.zeros(1,32,512).to(device)), 
                torch.autograd.Variable(features.unsqueeze(0)).to(device))
        
    def forward(self, features, captions):
        
        state = self.init_hidden(features)
        
        embed = self.embedding(captions)
                    
        lstm_out, state = self.lstm(embed, state)
                        
        outputs = self.fc(lstm_out)
        outputs = outputs.view(-1, self.vocab_size)
        
        return outputs

In [11]:
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer

example_batch = next(iter(train_dataloader))

image, captions = example_batch["image"], example_batch["captions"]

encoder = CNN(embed_size = 512).to(device)
decoder = RNN(input_size = 512, hidden_size = 512, embedding_dim=512, vocab_size = 28881).to(device)

features = encoder(image)
embed = decoder(features, captions)

print("Image Transformation: ", image.shape, " --> ", features.shape)
print("Captions Transformation: ", captions.shape, " --> ", embed.shape)

NameError: name 'tokenizer' is not defined