In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch import optim
import torchvision
from torchvision import datasets, transforms, models
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from collections import OrderedDict
from PIL import Image
import seaborn as sns
import splitfolders
from IPython.display import display, HTML

import torchtext
from torchtext.data import get_tokenizer   # for tokenization
from collections import Counter     # for tokenizer


In [2]:
# splitfolders.ratio("Food Images", output="food_images", 
#                    seed=30, ratio=(.7, .2, .1), 
#                    group_prefix=None)

In [3]:
# Load the recipe metadata table and peek at one value: row 1, column 4
# (the image file stem for that recipe, per the output shown below).
data_df = pd.read_csv(
    'Food Ingredients and Recipe Dataset with Image Name Mapping.csv'
)
data_df.iloc[1, 4]

'crispy-salt-and-pepper-potatoes-dan-kluger'

In [4]:
# Root of the split image folders and one sub-directory per split.
data_dir = './food_images'
train_dir, valid_dir, test_dir = (
    f'{data_dir}/{split_name}' for split_name in ('train', 'val', 'test')
)

In [5]:
# Training pipeline: random rotation / crop / flip augmentation, then tensor
# conversion and per-channel normalisation.
train_transforms = transforms.Compose([
    transforms.RandomRotation(degrees=30),
    transforms.RandomResizedCrop(size=224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])

# Validation pipeline: deterministic resize + centre crop, same normalisation.
valid_transforms = transforms.Compose([
    transforms.Resize(size=255),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])

# Test pipeline: same steps as the validation pipeline, kept as its own object.
test_transforms = transforms.Compose([
    transforms.Resize(size=255),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])

# train_data = datasets.ImageFolder(train_dir, transform=train_transforms)
# valid_data = datasets.ImageFolder(valid_dir, transform=valid_transforms)
# test_data = datasets.ImageFolder(test_dir, transform=test_transforms)


# trainloader = torch.utils.data.DataLoader(train_data,batch_size = 64, shuffle=True)
# validloader = torch.utils.data.DataLoader(valid_data,batch_size = 64, shuffle=True)
# testloader = torch.utils.data.DataLoader(test_data,batch_size = 64, shuffle=True)

In [6]:
class Vocab:
    """Word-level vocabulary mapping tokens to integer ids and back.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<START>``,
    ``<END>`` and ``<UNK>``; every other token receives the next free id the
    first time its running count reaches ``min_freq``.

    Attributes:
        itos: dict mapping id -> token.
        stoi: dict mapping token -> id.
        min_freq: minimum number of occurrences before a token gets an id.
        tokenizer: callable turning a string into a list of tokens.
        frequencies: Counter of every token seen by ``build_vocab``.
    """

    def __init__(self, min_freq = 1):
        self.itos = {0:'<PAD>',1:'<START>',2:'<END>',3:'<UNK>'}
        self.stoi = {token: idx for idx, token in self.itos.items()}

        self.min_freq = min_freq

        self.tokenizer = get_tokenizer('basic_english')
        self.frequencies = Counter()

    def __len__(self):
        """Return the number of known tokens, special tokens included."""
        return len(self.itos)

    def build_vocab(self, sentence_list):
        """Count the tokens of every sentence and register the frequent ones.

        May be called more than once: counts accumulate and new tokens are
        appended after the ids that already exist (the id counter is derived
        from the current vocabulary size instead of restarting at 4, which
        used to overwrite previously assigned ids on a second call).

        Args:
            sentence_list: iterable of sentences; each item is passed through
                ``str()`` before tokenisation.
        """
        for sentence in sentence_list:
            # TODO: handle null sentences/instructions. For now a missing value
            # is stringified by str() and tokenised like any other text.
            sentence_tokens = self.tokenizer(str(sentence))
            self.frequencies.update(sentence_tokens)
            for token in sentence_tokens:
                if token not in self.stoi and self.frequencies[token] >= self.min_freq:
                    # Ids are contiguous from 0, so the next free id is the size.
                    idx = len(self.itos)
                    self.stoi[token] = idx
                    self.itos[idx] = token

    def numericalize(self, sentence):
        """Convert a sentence to a list of ids, using ``<UNK>`` for unknown tokens."""
        unk_idx = self.stoi['<UNK>']
        return [self.stoi.get(token, unk_idx) for token in self.tokenizer(str(sentence))]



In [7]:
class CollateFn:
    """Batch collator for (image tensor, token-id tensor) samples.

    Images are stacked along a new leading batch dimension. Token sequences
    are padded with ``pad_idx`` to the longest sequence in the batch and
    returned time-major (``batch_first=False``).
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        # Equivalent to unsqueezing every image at dim 0 and concatenating.
        image_batch = torch.stack([sample[0] for sample in batch], dim=0)
        token_batch = pad_sequence(
            [sample[1] for sample in batch],
            batch_first=False,
            padding_value=self.pad_idx,
        )
        return image_batch, token_batch

In [8]:
class CustomDataset(Dataset):
    """Dataset of (food image, tokenised recipe instructions) pairs.

    Rows come from ``recipe_csv``; rows whose ``Image_Name`` is the
    placeholder ``'#NAME?'`` are dropped. The vocabulary is built from the
    ``Instructions`` column of the remaining rows.

    Args:
        img_dir: directory holding ``<Image_Name>.jpg`` files.
        recipe_csv: path to the CSV with ``Image_Name`` and ``Instructions``
            columns.
        transform: optional callable applied to the RGB PIL image.
        min_freq: minimum token frequency passed to ``Vocab``.
    """

    def __init__(self, img_dir, recipe_csv, transform = None, min_freq = 3):
        self.img_dir = img_dir
        self.transform = transform
        init_df = pd.read_csv(recipe_csv)
        ###TODO cleaning function : also drop rows whose 'Instructions' is null.
        # Until then a null instruction is stringified by str() before tokenising.
        clean_df = init_df[init_df["Image_Name"] != '#NAME?']

        self.recipe_df = clean_df
        self.tokenizer = get_tokenizer('basic_english')

        self.vocab = Vocab(min_freq)
        self.vocab.build_vocab(self.recipe_df["Instructions"].tolist())

    def __len__(self):
        """Return the number of usable recipe rows."""
        return len(self.recipe_df)

    def _encode_recipe(self, recipe):
        """Return ``<START>`` + token ids of ``recipe`` + ``<END>`` as a list."""
        stoi = self.vocab.stoi
        return [stoi['<START>'], *self.vocab.numericalize(str(recipe)), stoi['<END>']]

    def __getitem__(self, index):
        """Return ``(image, token_tensor)`` for the row at position ``index``."""
        # Columns are addressed by name, matching __init__, rather than by
        # position so a reordered CSV does not silently pick the wrong column.
        row = self.recipe_df.iloc[index]
        img_file = os.path.join(self.img_dir, row["Image_Name"] + '.jpg')

        # convert() returns an independent image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(img_file) as raw_img:
            img = raw_img.convert('RGB')

        if self.transform:
            img = self.transform(img)

        return img, torch.tensor(self._encode_recipe(row["Instructions"]))


In [9]:
# Build the image/recipe dataset from the raw image folder, applying the
# training augmentation pipeline to every image that is fetched.
torch_data = CustomDataset(
    img_dir='Food Images',
    recipe_csv='Food Ingredients and Recipe Dataset with Image Name Mapping.csv',
    transform=train_transforms,
)

In [10]:
# img,recipe_tokens = torch_data[10]

# fig, ax = plt.subplots()
# img


In [11]:
# [torch_data.vocab.itos[token] for token in recipe_tokens.tolist()]

In [12]:
# image_data_ref = pd.read_csv('Food Ingredients and Recipe Dataset with Image Name Mapping.csv')
# image_data_ref.ndim
# display(image_data_ref)


In [13]:
##recipe

In [14]:
# Module-level 'basic_english' tokenizer. Vocab and CustomDataset each create
# their own instance, so nothing visible in this notebook reads this variable.
tokenizer = get_tokenizer('basic_english')

In [15]:
# resnet = models.resnet101(pretrained = True)
# resnet.fc.in_features

In [16]:
class EncoderCNN(nn.Module):
    """Image encoder: a ResNet-101 whose final layer projects to ``embed_size``.

    Args:
        embed_size: size of the feature vector produced per image.
        train_CNN: when False (default) the ResNet backbone is frozen and only
            the replacement ``fc`` layer is trainable; when True every
            parameter is trainable. Previously this flag was stored but had
            no effect.
    """

    def __init__(self, embed_size, train_CNN=False):
        super(EncoderCNN, self).__init__()
        self.train_CNN = train_CNN
        # NOTE(review): newer torchvision releases prefer the `weights=`
        # argument; `pretrained=True` is kept so the notebook still runs on
        # the version it was written against.
        self.arch = models.resnet101(pretrained = True)
        self.arch.fc = nn.Linear(self.arch.fc.in_features, embed_size)
        self.relu = nn.ReLU()
        self.times = []
        self.dropout = nn.Dropout(0.5)

        # Apply the train_CNN flag: the new fc head is randomly initialised
        # and must always learn; the backbone only learns on request.
        for name, param in self.arch.named_parameters():
            param.requires_grad = bool(train_CNN) or name.startswith("fc.")

    def forward(self, images):
        """Return ``dropout(relu(resnet(images)))`` for a batch of images."""
        features = self.arch(images)
        return self.dropout(self.relu(features))
    
class DecoderRNN(nn.Module):
    """LSTM decoder that predicts vocabulary logits for every sequence step.

    The image feature vector is prepended to the embedded token sequence and
    acts as the first input step of the LSTM.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers
        )
        self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, features, recipe_tokens):
        """Return logits for the image step followed by every token step."""
        token_embeddings = self.dropout(self.embed(recipe_tokens))
        # The image features become step 0 of the (time-major) input sequence.
        image_step = features.unsqueeze(0)
        lstm_input = torch.cat((image_step, token_embeddings), dim=0)
        hiddens, _ = self.lstm(lstm_input)
        return self.linear(hiddens)

In [17]:
class CNNtoRNN(nn.Module):
    """Image-to-recipe model: a CNN encoder feeding an LSTM decoder."""

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(CNNtoRNN, self).__init__()
        self.encoderCNN = EncoderCNN(embed_size)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, images, recipe_tokens):
        """Return decoder logits for a batch of images and their token sequences."""
        features = self.encoderCNN(images)
        outputs = self.decoderRNN(features, recipe_tokens)
        return outputs

    def recipe_generate(self, image, vocabulary, max_length=50, temperature=0.8):
        """Sample a token sequence for one image.

        Generation runs in eval mode (so dropout is off and normalisation
        statistics are not updated by inference) and the previous
        train/eval mode is restored afterwards.

        Args:
            image: image batch holding a single image, as accepted by the encoder.
            vocabulary: object exposing an ``itos`` id -> token mapping.
            max_length: maximum number of tokens to sample.
            temperature: softmax temperature; lower values make sampling
                greedier. Must be positive.

        Returns:
            List of sampled tokens; ends with ``"<END>"`` if it was sampled
            before ``max_length`` was reached.

        Raises:
            ValueError: if ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        result_recipe = []
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                x = self.encoderCNN(image).unsqueeze(0)
                states = None

                for _ in range(max_length):
                    hiddens, states = self.decoderRNN.lstm(x, states)
                    output = self.decoderRNN.linear(hiddens.squeeze(0))

                    # softmax instead of a bare exp(): same sampling
                    # distribution, but large logits cannot overflow to inf.
                    probs = torch.softmax(output.reshape(-1) / temperature, dim=0)
                    top_i = torch.multinomial(probs, 1)[0]

                    result_recipe.append(top_i.item())

                    if vocabulary.itos[top_i.item()] == "<END>":
                        break

                    # Feed the sampled token back in as the next input step.
                    x = self.decoderRNN.embed(top_i.unsqueeze(0)).unsqueeze(0)
        finally:
            self.train(was_training)

        return [vocabulary.itos[idx] for idx in result_recipe]

In [18]:
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then serialise ``state`` to ``filename``."""
    print("Saving checkpoint ...")
    torch.save(state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore model and optimizer state from ``checkpoint``; return its step.

    ``checkpoint`` is a mapping with the keys ``"state_dict"``,
    ``"optimizer"`` and ``"step"``.
    """
    print("Loading checkpoint ... ")
    # Model first, optimizer second.
    for target, key in ((model, "state_dict"), (optimizer, "optimizer")):
        target.load_state_dict(checkpoint[key])
    return checkpoint["step"]

def train():
    """Train a CNNtoRNN model on the module-level ``torch_data`` dataset.

    Builds a padded, shuffled DataLoader, trains with Adam and a
    cross-entropy loss that ignores ``<PAD>`` targets, prints the loss every
    200 batches and, when ``save_model`` is set, writes one checkpoint after
    the last epoch.
    """
    pad_idx = torch_data.vocab.stoi["<PAD>"]

    # Shuffle so batches are not always drawn in CSV order.
    train_dls = torch.utils.data.DataLoader(
        torch_data,
        batch_size=64,
        shuffle=True,
        collate_fn=CollateFn(pad_idx=pad_idx),
    )
    torch.backends.cudnn.benchmark = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    load_model = False
    save_model = True
    train_CNN = False

    # Hyperparameters
    embed_size = 256
    hidden_size = 256
    vocab_size = len(torch_data.vocab)
    num_layers = 2
    learning_rate = 3e-4
    num_epochs = 10

    step = 0

    # initialize model, loss etc
    model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    # The optimizer keeps every parameter (frozen ones simply receive no
    # gradient) so optimizer state saved by earlier runs still loads.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Unless train_CNN is set, freeze the pretrained backbone but keep the
    # replacement fc head trainable: it starts from random weights and would
    # otherwise never learn.
    if not train_CNN:
        for param in model.encoderCNN.parameters():
            param.requires_grad = False
        for param in model.encoderCNN.arch.fc.parameters():
            param.requires_grad = True

    if load_model:
        # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
        step = load_checkpoint(
            torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer
        )

    model.train()

    for epoch in range(num_epochs):
        # Uncomment the line below to see a couple of test cases
        # print_examples(model, device, dataset)

        for idx, (imgs, recipe_tokens) in tqdm(
            enumerate(train_dls), total=len(train_dls), leave=False
        ):
            imgs = imgs.to(device)
            recipe_tokens = recipe_tokens.to(device)

            # The decoder prepends the image step, so feeding tokens[:-1]
            # yields exactly one prediction per target token.
            outputs = model(imgs, recipe_tokens[:-1])
            loss = criterion(
                outputs.reshape(-1, outputs.shape[2]), recipe_tokens.reshape(-1)
            )
            if idx % 200 == 0:
                print("Loss: ")
                print(loss.item())
            # writer.add_scalar("Training loss", loss.item(), global_step=step)
            step += 1

            optimizer.zero_grad()
            # Plain backward(): passing the loss as the `gradient` argument
            # scaled every gradient by the loss value.
            loss.backward()
            optimizer.step()

    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "step": step,
        }
        save_checkpoint(checkpoint)

In [19]:
# train()

In [20]:
path = 'my_checkpoint.pth.tar'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
checkpoint = torch.load(path, map_location=device)

# Hyperparameters (must match the values used when the checkpoint was trained)
embed_size = 256
hidden_size = 256
vocab_size = len(torch_data.vocab)
num_layers = 2
learning_rate = 3e-4
num_epochs = 1

model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
# This model is only used for inference below: switch off dropout and stop
# normalisation layers from updating their running statistics.
model.eval()
model.load_state_dict(checkpoint['state_dict'])

# No use
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# epoch = checkpoint['epoch']  # ALREADY DEFINED ABOVE
# loss = checkpoint['loss']



<All keys matched successfully>

In [21]:
# Fetch sample 1 as an (image, token-id tensor) pair. torch_data was built with
# train_transforms, so the image goes through that pipeline's random augmentations.
img,recipe_tokens = torch_data[1]
# model.caption_image(img.unsqueeze(0), torch_data.vocab, max_length = 100)



In [22]:
# Show the dataframe row at position 1 (title, ingredients, instructions, image name).
torch_data.recipe_df.iloc[1]

Unnamed: 0                                                             1
Title                                    Crispy Salt and Pepper Potatoes
Ingredients            ['2 large egg whites', '1 pound new potatoes (...
Instructions           Preheat oven to 400°F and line a rimmed baking...
Image_Name                    crispy-salt-and-pepper-potatoes-dan-kluger
Cleaned_Ingredients    ['2 large egg whites', '1 pound new potatoes (...
Name: 1, dtype: object