<a href="https://colab.research.google.com/github/chinmay5/guided-research/blob/colab/Guided_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Import the libraries needed for the task and download the NLTK stopword/punkt data
import pandas as pd
import json
import nltk
from pandas.io.json import json_normalize
nltk.download('stopwords')
nltk.download('punkt')
import re  
from nltk.corpus import stopwords
stops1 = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
import ipywidgets
import traitlets

In [0]:
# Load the preprocessed recipe table. RecipeData (defined below) reads its
# `Ingredient_Numeric` and `Recipe_id_numeric` columns, so the pickle must provide both.
# NOTE: unpickling can execute arbitrary code -- only load a processed_df.pkl you trust.
processed_df = pd.read_pickle('./processed_df.pkl')

In [0]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
df_train, df_test = train_test_split(processed_df, test_size = 0.2, random_state = 0)

## Neural Network Part

In [0]:
# Assumes the recipe categories are already available as integer class ids in the dataframe.
# The model below is an LSTM-based classifier that predicts one recipe category per
# ingredient sequence (multi-class, not binary).
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.utils.data.dataloader as dataloader
from torch.nn.utils.rnn import pad_sequence

In [0]:
# Mini-batch size, shared by both DataLoaders and the RecipePredictor constructor
batch_size = 5

In [0]:
# https://github.com/yunjey/seq2seq-dataloader/blob/master/data_loader.py
def collate_fn(data):
    """Collate a list of (ingredient_ids, recipe_id) samples into one padded mini-batch.

    The default collate function cannot merge variable-length sequences, so the
    source sequences are right-padded with zeros up to the longest sequence in
    the batch (dynamic padding).

    Args:
        data: list of (src_seq, trg) tuples.
            - src_seq: 1-D integer tensor; variable length.
            - trg: a single class id (0-d tensor or plain int).

    Returns:
        src_seqs: long tensor of shape (batch_size, padded_length), rows ordered
            from longest to shortest sequence.
        src_lengths: list of length batch_size; unpadded length of each row.
        trg_seqs: 1-D tensor of shape (batch_size,), in the same row order.
    """
    # Order the samples by sequence length (descending), which is what
    # pack_padded_sequence expects by default. sorted() works on a copy, so the
    # list handed in by the caller is left untouched.
    ordered = sorted(data, key=lambda sample: len(sample[0]), reverse=True)

    # Separate source sequences from their targets.
    src_seqs, trg_seqs = zip(*ordered)

    # Merge the sequences (tuple of 1-D tensors -> one zero-padded 2-D tensor).
    src_lengths = [len(seq) for seq in src_seqs]
    src_seqs = pad_sequence(
        [torch.as_tensor(seq) for seq in src_seqs],
        batch_first=True,
        padding_value=0,
    ).long()

    # Each target is a single class id, so stacking is enough -- no padding needed.
    trg_seqs = torch.stack([torch.as_tensor(trg) for trg in trg_seqs])
    return src_seqs, src_lengths, trg_seqs


In [0]:
class RecipeData(data.Dataset):
    """Dataset view over a recipe dataframe.

    Every item pairs a row's `Ingredient_Numeric` entry with the same row's
    `Recipe_id_numeric` entry, both converted to tensors.
    """

    def __init__(self, df):
        super().__init__()
        self.df = df

    def __len__(self):
        # One sample per dataframe row.
        return len(self.df)

    def __getitem__(self, idx):
        # .iloc makes idx positional, independent of the dataframe's index labels.
        ingredients = self.df.Ingredient_Numeric.iloc[idx]
        recipe_id = self.df.Recipe_id_numeric.iloc[idx]
        return torch.as_tensor(ingredients), torch.as_tensor(recipe_id)
    

In [0]:
train_dataset = RecipeData(df_train)
test_dataset = RecipeData(df_test)

# Both loaders share one configuration. drop_last=True discards a trailing batch that
# is smaller than batch_size (when the sample count is not an exact multiple of it).
_loader_options = dict(
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    drop_last=True,
)
train_data_loader = torch.utils.data.DataLoader(dataset=train_dataset, **_loader_options)
test_data_loader = torch.utils.data.DataLoader(dataset=test_dataset, **_loader_options)

In [0]:
# Peek at one training label.
# NOTE(review): assuming the dataframe has an integer index, `[4]` is a label lookup
# (the row whose index value is 4), not the 5th row of the shuffled split -- use
# `.iloc[4]` if a positional lookup is intended.
df_train.Recipe_id_numeric[4]

5

In [0]:
train_dataset[0]

(tensor([ 65,  33, 140,  36]), tensor(0))

In [0]:
# Fetch a single mini-batch and show the padded inputs, their true lengths and the labels.
for X, X_len, y in train_data_loader:
    print(X, X_len, y, sep="\n")
    break

tensor([[ 47, 349,  20, 161,  99,   1,  67, 384,  49,  33,  21,  36,  57, 279,
         342, 187,   7, 114],
        [ 44,  20, 242, 438,   7, 159,  42,  43, 260,   0,   0,   0,   0,   0,
           0,   0,   0,   0],
        [ 29,   3, 124,  19,  39,  95,  30,  61, 225,   0,   0,   0,   0,   0,
           0,   0,   0,   0],
        [ 20,  44,  17, 107, 142,  49,  21,  19,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0],
        [252, 208,  20,  24, 203, 542,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0]])
[18, 9, 9, 8, 6]
tensor([ 0,  8, 12,  0,  5])


In [0]:
class RecipePredictor(nn.Module):
    """LSTM classifier: padded ingredient-id sequences in, one score per recipe class out.

    Pipeline: embedding -> single-layer LSTM -> linear layer applied to the
    LSTM's final hidden state.

    Args:
        vocab_size: number of rows in the embedding table.
        hidden_dim: size of the LSTM hidden state.
        embedding_dim: size of each embedding vector.
        batch_size: nominal batch size; only used to shape the placeholder state
            created by init_hidden(). forward() accepts any batch size.
        output_dim: number of classes (width of the returned scores).

    Attributes:
        h_n, c_n: final hidden / cell state of the most recent forward pass,
            detached from the autograd graph (all zeros after init_hidden()).
    """

    def __init__(self, vocab_size, hidden_dim, embedding_dim, batch_size, output_dim):
        super(RecipePredictor, self).__init__()
        self.vocab_size = vocab_size
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_dim)
        self.predictor = nn.Linear(self.hidden_dim, self.output_dim)
        # Kept so the module's structure/repr is unchanged; forward() does not use it.
        self.non_linearity = nn.ReLU()
        self.init_hidden()

    def init_hidden(self):
        """Reset the stored LSTM state to zeros on the same device as the parameters."""
        device = self.predictor.weight.device
        self.h_n = torch.zeros(1, self.batch_size, self.hidden_dim, device=device)
        self.c_n = torch.zeros(1, self.batch_size, self.hidden_dim, device=device)

    def forward(self, input_sequence, max_len):
        """Score every sequence in a padded batch.

        Args:
            input_sequence: long tensor of shape (batch, padded_length) holding
                token ids.
            max_len: the unpadded length of each sequence (list of ints or 1-D
                tensor, on any device) -- one entry per row, despite the name.

        Returns:
            Tensor of shape (batch, output_dim) with unnormalised class scores.
        """
        embedded = self.embedding(input_sequence)

        # pack_padded_sequence requires the lengths on the CPU, whatever device
        # the inputs live on.
        lengths = torch.as_tensor(max_len, dtype=torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )

        # Every batch starts from the LSTM's default zero state: the sequences in
        # different (shuffled) batches are unrelated, so carrying state over would
        # leak information between samples and chain the autograd graphs of all
        # batches together.
        # nn.LSTM returns (h_n, c_n) -- hidden state first, cell state second.
        _, (h_n, c_n) = self.lstm(packed)

        # Keep the final state around for inspection only, cut off from the graph.
        self.h_n, self.c_n = h_n.detach(), c_n.detach()

        # h_n is (num_layers, batch, hidden_dim); classify from the last layer's
        # final hidden state.
        return self.predictor(h_n[-1])
        

In [0]:
# Hard-coded dataset sizes copied in by hand.
# TODO: these need to be updated to match the data files.
ingredient_vocabulary = 1152
recipe_vocabulary = 16
# One extra embedding row; presumably id 0 is reserved for the zero padding that
# collate_fn inserts -- TODO confirm that real ingredient ids start at 1.
vocab_size = ingredient_vocabulary + 1

In [0]:
# The classifier emits one score per recipe class (recipe_vocabulary + 1 classes).
num_recipes = recipe_vocabulary + 1
model = RecipePredictor(
    vocab_size=vocab_size,
    embedding_dim=300,
    hidden_dim=512,
    batch_size=batch_size,
    output_dim=num_recipes,
)

In [0]:
# Smoke test: one forward pass on the batch fetched above; the result should have
# one row per sequence and one column per recipe class.
model(X, torch.tensor(X_len))

tensor([[-0.1243,  0.0352,  0.0636,  0.0446,  0.0248,  0.1174,  0.0650,  0.0748,
          0.1017, -0.0285, -0.0722, -0.1552,  0.0567,  0.0761, -0.4016,  0.0074,
          0.2531],
        [ 0.0472, -0.0196, -0.2112,  0.0537, -0.0331,  0.0523, -0.0392, -0.2207,
          0.0740, -0.0726, -0.3089, -0.3189,  0.1394,  0.1757,  0.0067,  0.1906,
         -0.0131],
        [ 0.1656,  0.1994,  0.0592,  0.0142, -0.0989, -0.0683, -0.0934,  0.0474,
          0.1007, -0.0707, -0.2304,  0.0486,  0.1226,  0.1526,  0.0426,  0.0316,
          0.2209],
        [ 0.0229,  0.0763,  0.0403, -0.0182, -0.0252, -0.0771,  0.0806,  0.0205,
          0.1701,  0.1536, -0.0484, -0.1660, -0.2670, -0.0232,  0.2510, -0.0978,
         -0.0428],
        [-0.1517,  0.0475, -0.0226,  0.0896,  0.0544, -0.2032, -0.2437, -0.0679,
          0.1818,  0.2426,  0.0921, -0.1612, -0.1762,  0.1039,  0.0613, -0.1786,
         -0.1541]], grad_fn=<AddmmBackward>)

## Put loss function and categories

In [0]:
lr = 1e-3
# CrossEntropyLoss takes the raw (unnormalised) scores returned by the model together
# with integer class ids, so the model itself needs no softmax layer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


In [0]:
num_epoch = 2

In [0]:
# Select the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [0]:
# Move the model's parameters to the selected device (the module is modified in
# place and returned, hence the repr shown as cell output).
model.to(device)

RecipePredictor(
  (embedding): Embedding(1153, 300)
  (lstm): LSTM(300, 512)
  (predictor): Linear(in_features=512, out_features=17, bias=True)
  (non_linearity): ReLU()
)

In [0]:
def put_elements_to_device(a,b,c,device):
    """Move three tensors to `device` and return them in the same order.

    Args:
        a, b, c: objects exposing `.to(device)` (e.g. torch tensors).
        device: target device, e.g. torch.device("cuda") or "cpu".

    Returns:
        Tuple (a, b, c) with every element on `device`.
    """
    # No logging here: this runs once per batch, and the target is not always CUDA.
    return a.to(device), b.to(device), c.to(device)

In [0]:
!pip install -q tb-nightly

# Load the TensorBoard notebook extension
%load_ext tensorboard

In [0]:
from torch.utils.tensorboard import SummaryWriter
# With no arguments SummaryWriter logs into a fresh sub-directory of ./runs, which is
# the directory the `%tensorboard --logdir runs` cell reads from.
writer = SummaryWriter()

In [0]:
# Train for num_epoch epochs and report accuracy on the held-out split after each one.
global_step = 0  # TensorBoard x-axis; keeps increasing across epochs
for epoch in range(num_epoch):
    running_loss = 0.0

    model.train()
    model.init_hidden()
    for idx, (X, X_len, y) in enumerate(train_data_loader):
        model.zero_grad()

        # Inputs and targets must live on the same device as the model. X_len stays a
        # plain Python list: pack_padded_sequence needs the lengths on the CPU.
        X = X.to(device)
        y = y.to(device)

        prediction = model(X, X_len)
        loss = criterion(prediction, y)
        # NOTE(review): retain_graph=True is only required while the model feeds the
        # previous batch's LSTM state into the next forward pass; it can be dropped
        # once forward() starts every batch from a fresh or detached state.
        loss.backward(retain_graph=True)
        optimizer.step()

        running_loss += loss.item()
        if idx % 100 == 0:
            print("epoch {} step {} mean loss {:.4f}".format(
                epoch, idx, running_loss / (idx + 1)))
        writer.add_scalar('Loss/train', loss.item(), global_step)
        global_step += 1

    # Evaluate on the held-out split (a separate validation set is still to be created).
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for X, X_len, y in test_data_loader:
            X = X.to(device)
            y = y.to(device)
            outputs = model(X, X_len)
            predicted = outputs.argmax(dim=1)
            total += y.size(0)
            correct += (predicted == y).sum().item()

    # Guard against an empty loader (e.g. fewer test samples than batch_size with
    # drop_last=True) instead of dividing by zero.
    accuracy = 100 * correct / total if total else 0.0
    writer.add_scalar('Accuracy/test', accuracy, epoch)
    print('Accuracy of the network on the test samples: %d %%' % accuracy)
        
            

In [0]:
# !pip install tensorboard
%load_ext tensorboard
%tensorboard --logdir runs

In [0]:
# !pip uninstall tensorboard
# !pip install --force-reinstall tf-nightly-2.0-preview