## Unimodal Training before Multimodal Training

### Imports and class name extraction

We write this notebook with the help of following notebook: 
https://colab.research.google.com/drive/1dMTdO5vxdVX0NA2Qe7AV9WGEy8ZH67Xn?usp=sharing

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')
import os
for dirname, _, filenames in os.walk('/kaggle/input/upmcfood101/texts'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/upmcfood101/texts/train_titles.csv
/kaggle/input/upmcfood101/texts/test_titles.csv


### Dataset organization

In [2]:
image_root = '/kaggle/input/upmcfood101/images/'
datalink = {
    'train': '/kaggle/input/upmcfood101/texts/train_titles.csv',
    'test' : '/kaggle/input/upmcfood101/texts/test_titles.csv'
}
col_Names=["Images", "Titles", "Labels"]
def fast_scandir(dirname):
    subfolders= [f.path for f in os.scandir(dirname) if f.is_dir()]
    for dirname in list(subfolders):
        subfolders.extend(fast_scandir(dirname))
    return subfolders

### Dataloader and Augmentation

In [3]:
classnames = fast_scandir('/kaggle/input/upmcfood101/images/test')
n_classes = len(classnames)
print('Number of classes: ',n_classes)

print(classnames[0].split('/')[-1])
for iter1 in range(len(classnames)):
    classnames[iter1] = classnames[iter1].split('/')[-1]

classes =  sorted(classnames)
print(classes)
col_Names=["Images", "Titles", "Labels"]

train_df = pd.read_csv(datalink['train'], names=col_Names)
test_df = pd.read_csv(datalink['test'], names=col_Names)

for iter1 in range(len(train_df)):
    train_df['Labels'][iter1] = train_df['Labels'][iter1].split('\n')[0]
    train_df['Labels'][iter1] = classes.index(train_df['Labels'][iter1])

for iter1 in range(len(test_df)):
    test_df['Labels'][iter1] = test_df['Labels'][iter1].split('\n')[0]
    test_df['Labels'][iter1] = classes.index(test_df['Labels'][iter1])

train_df.head()

Number of classes:  101
macarons
['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio', 'beef_tartare', 'beet_salad', 'beignets', 'bibimbap', 'bread_pudding', 'breakfast_burrito', 'bruschetta', 'caesar_salad', 'cannoli', 'caprese_salad', 'carrot_cake', 'ceviche', 'cheese_plate', 'cheesecake', 'chicken_curry', 'chicken_quesadilla', 'chicken_wings', 'chocolate_cake', 'chocolate_mousse', 'churros', 'clam_chowder', 'club_sandwich', 'crab_cakes', 'creme_brulee', 'croque_madame', 'cup_cakes', 'deviled_eggs', 'donuts', 'dumplings', 'edamame', 'eggs_benedict', 'escargots', 'falafel', 'filet_mignon', 'fish_and_chips', 'foie_gras', 'french_fries', 'french_onion_soup', 'french_toast', 'fried_calamari', 'fried_rice', 'frozen_yogurt', 'garlic_bread', 'gnocchi', 'greek_salad', 'grilled_cheese_sandwich', 'grilled_salmon', 'guacamole', 'gyoza', 'hamburger', 'hot_and_sour_soup', 'hot_dog', 'huevos_rancheros', 'hummus', 'ice_cream', 'lasagna', 'lobster_bisque', 'lobster_roll_sandwich', 'macaroni_a

Unnamed: 0,Images,Titles,Labels
0,apple_pie_851.jpg,Crock-Pot Ladies Crock-Pot Apple Pie Moonshine,0
1,apple_pie_140.jpg,Mom's Maple-Apple Pie Recipe | Taste of Home,0
2,apple_pie_858.jpg,Cookin&#8217; Canuck &#8211; Baked Apple Pie E...,0
3,apple_pie_449.jpg,Dutch Apple Pie Recipe | Just A Pinch Recipes,0
4,apple_pie_695.jpg,Our Share of the Harvest &raquo; Grandma&#8217...,0


In [4]:
for iter1 in range(len(train_df)):
    if len(train_df['Titles'][iter1]) < 2:
        print('Small title: ',train_df['Titles'][iter1],'-Index:', iter1)
        train_df = train_df.drop(iter1)

print(len(train_df), len(test_df))

Small title:  : -Index: 39338
67971 22716


In [5]:
#np.random.seed(112)
#df_train, df_val = np.split(train_df.sample(frac=1, random_state=35),[int(0.8*len(train_df))])
df_train = []
df_val = []

split_counter = 0
for iter1 in range(len(train_df)):
    split_counter = split_counter + 1
    if split_counter%10 ==0:
        df_val.append(train_df.iloc[iter1])
    if split_counter%10 >0:
        df_train.append(train_df.iloc[iter1])

df_train = pd.DataFrame(df_train, columns=train_df.columns)
df_train = df_train.dropna()

df_val = pd.DataFrame(df_val, columns=train_df.columns)
df_val = df_val.dropna()


df_test = test_df
print(len(df_train), len(df_val), len(df_test))

61174 6797 22716


## Text Classification Module gpt2

In [6]:
import torch
from transformers import GPT2Model, GPT2Tokenizer

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
#labels = classes

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [int(label) for label in df['Labels']]
        self.texts = [tokenizer(text,
                                padding='max_length',
                                max_length=64,
                                truncation=True,
                                return_tensors="pt") for text in df['Titles']]
        
    def classes(self):
        return self.labels
    
    def __len__(self):
        return len(self.labels)
    
    def get_batch_labels(self, idx):
        # Get a batch of labels
        return np.array(self.labels[idx])
    
    def get_batch_texts(self, idx):
        # Get a batch of inputs
        return self.texts[idx]
    
    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [8]:
from torch import nn
from torch.optim import Adam

from tqdm import tqdm

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

class SimpleGPT2SequenceClassifier(nn.Module):
    def __init__(self, hidden_size: int, num_classes:int ,max_seq_len:int, gpt_model_name:str):
        super(SimpleGPT2SequenceClassifier,self).__init__()
        self.gpt2model = GPT2Model.from_pretrained(gpt_model_name)
        self.fc1 = nn.Linear(hidden_size*max_seq_len, num_classes)

        
    def forward(self, input_id, mask):
        """
        Args:
                input_id: encoded inputs ids of sent.
        """
        gpt_out, _ = self.gpt2model(input_ids=input_id, attention_mask=mask, return_dict=False)
        batch_size = gpt_out.shape[0]
        linear_output = self.fc1(gpt_out.view(batch_size,-1))
        return linear_output

## Details of Texts

In [9]:
batch_size_all = 120
def train(model, train_data, val_data, learning_rate, epochs):
    train, val = Dataset(train_data), Dataset(val_data)
    
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size_all, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size_all)
    
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        
        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input["input_ids"].squeeze(1).to(device)
            
            model.zero_grad()

            output = model(input_id, mask)
            
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()
            
            acc = (output.argmax(dim=1)==train_label).sum().item()
            total_acc_train += acc

            batch_loss.backward()
            optimizer.step()
            
        total_acc_val = 0
        total_loss_val = 0
        
        with torch.no_grad():
            
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)
                
                output = model(input_id, mask)
                
                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                
                acc = (output.argmax(dim=1)==val_label).sum().item()
                total_acc_val += acc
                
            print(
            f"Epochs: {epoch_num + 1} | Train Loss: {total_loss_train/len(train_data): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f} \
            | Val Loss: {total_loss_val / len(val_data): .3f} \
            | Val Accuracy: {total_acc_val / len(val_data): .3f}")

In [10]:
EPOCHS = 3
model = SimpleGPT2SequenceClassifier(hidden_size=768, num_classes=n_classes, max_seq_len=64, gpt_model_name="gpt2")
LR = 1e-5

train(model, df_train, df_val, LR, EPOCHS)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

100%|██████████| 510/510 [05:59<00:00,  1.42it/s]


Epochs: 1 | Train Loss:  0.016             | Train Accuracy:  0.614             | Val Loss:  0.008             | Val Accuracy:  0.816


100%|██████████| 510/510 [05:58<00:00,  1.42it/s]


Epochs: 2 | Train Loss:  0.006             | Train Accuracy:  0.845             | Val Loss:  0.007             | Val Accuracy:  0.829


100%|██████████| 510/510 [05:59<00:00,  1.42it/s]


Epochs: 3 | Train Loss:  0.005             | Train Accuracy:  0.870             | Val Loss:  0.007             | Val Accuracy:  0.835


## Test

In [11]:
def evaluate(model, test_data, phase):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size_all)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

        
    # Tracking variables 
    predictions_labels = []
    true_labels = []
    
    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc
            
            # add original labels
            true_labels += test_label.cpu().numpy().flatten().tolist()
            # get predicitons to list
            predictions_labels.extend(output.data.cpu().numpy())
    
    print(phase+ f' Accuracy: {total_acc_test / len(test_data): .3f}')
    return true_labels, predictions_labels
    
test_labels, output_test = evaluate(model, df_test,'Test')
print(output_test[0].shape)

Test Accuracy:  0.846
(101,)


## Generating text output data for multimodal training

In [12]:
train_labels, output_train = evaluate(model, df_train,'Training')
val_labels, output_val = evaluate(model, df_val,'Validation')

Training Accuracy:  0.900
Validation Accuracy:  0.835


In [13]:
prediction_test_df = pd.DataFrame(output_test)
prediction_train_df = pd.DataFrame(output_train)
prediction_val_df = pd.DataFrame(output_val)

print(len(df_train), len(df_val), len(df_test))

train_df = pd.concat([df_train.reset_index(drop=True), prediction_train_df.reset_index(drop=True)], axis=1)#.fillna(-20)
test_df = pd.concat([df_test.reset_index(drop=True), prediction_test_df.reset_index(drop=True)], axis=1)#.fillna(-20)
val_df = pd.concat([df_val.reset_index(drop=True), prediction_val_df.reset_index(drop=True)], axis=1, ignore_index=True)#.fillna(-20)

print(len(train_df), len(val_df), len(test_df))

61174 6797 22716
61174 6797 22716


### Save model

In [14]:
train_df.to_csv('train.csv',index=False)
test_df.to_csv('test.csv',index=False)
val_df.to_csv('valid.csv',index=False)

#prediction_train_df.head()

PATH = "./NN_text_model.pt"
torch.save(model.state_dict(), PATH)

import pickle

with open('objs.pkl', 'wb') as f:
    pickle.dump(classes, f)

'''
# Getting back the objects:
with open('objs.pkl') as f: 
    vocab_size, classes = pickle.load(f)
'''

"\n# Getting back the objects:\nwith open('objs.pkl') as f: \n    vocab_size, classes = pickle.load(f)\n"