In [1]:
import argparse
from collections import defaultdict, namedtuple
from io import open
import math
import os
from random import shuffle, uniform
from datetime import datetime
from future.utils import iterkeys, iteritems
import torch

from future.builtins import range
from future.utils import iteritems
import pickle
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [2]:
# !pip install scipy
# !pip install matplotlib
# !pip install sklearn
!pip install torchsummaryX -q
from torchsummaryX import summary

In [2]:
import gc

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [1]:
#Get Glove Vectors
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip /content/glove.6B.zip

--2023-04-07 04:56:56--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-04-07 04:56:56--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2023-04-07 04:59:35 (5.19 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  /content/glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [4]:
#Prepare Glove Vectors
word2vec = {}
with open("/content/glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for l in f:
        line = l.split()
        word = line[0]
        vect = np.array(line[1:]).astype(float)
        word2vec[word] = vect
pickle.dump(word2vec, open(f'/content/6B.50_word2Vec.pkl', 'wb'))

# Data Loading Utilities

In [5]:
class InstanceData(object):
    """
    A bare-bones class to store the included properties of each instance. This is meant to act as easy access to the
    data, and provides a launching point for deriving your own features from the data.
    """
    def __init__(self, instance_properties):

        # Parameters specific to this instance
        self.instance_id = instance_properties['instance_id']
        self.token = instance_properties['token']
        self.part_of_speech = instance_properties['part_of_speech']
        self.morphological_features = instance_properties['morphological_features']
        self.dependency_label = instance_properties['dependency_label']
        self.dependency_edge_head = instance_properties['dependency_edge_head']

        # Derived parameters specific to this instance
        self.exercise_index = int(self.instance_id[8:10])
        self.token_index = int(self.instance_id[10:12])

        # Derived parameters specific to this exercise
        self.exercise_id = self.instance_id[:10]

        # Parameters shared across the whole session
        self.user = instance_properties['user']
        self.countries = instance_properties['countries']
        self.days = instance_properties['days']
        self.client = instance_properties['client']
        self.session = instance_properties['session']
        self.format = instance_properties['format']
        self.time = instance_properties['time']
        self.prompt = instance_properties.get('prompt', None)

        # Derived parameters shared across the whole session
        self.session_id = self.instance_id[:8]

    def to_features(self):
        """
        Prepares those features that we wish to use in the LogisticRegression example in this file. We introduce a bias,
        and take a few included features to use. Note that this dict restructures the corresponding features of the
        input dictionary, 'instance_properties'.

        Returns:
            to_return: a representation of the features we'll use for logistic regression in a dict. A key/feature is a
                key/value pair of the original 'instance_properties' dict, and we encode this feature as 1.0 for 'hot'.
        """
        to_return = dict()

        to_return['bias'] = 1.0
        to_return['user:' + self.user] = 1.0
        to_return['format:' + self.format] = 1.0
        to_return['token:' + self.token.lower()] = 1.0

        to_return['part_of_speech:' + self.part_of_speech] = 1.0
        for morphological_feature in self.morphological_features:
            to_return['morphological_feature:' + morphological_feature] = 1.0
        to_return['dependency_label:' + self.dependency_label] = 1.0
        
        time = datetime.now()
        if(time.second %10 == 0 and time.microsecond == 0):
          print(time)
          
        return to_return

In [8]:
#Load data from the file
def load_data(filename):
    """
    This method loads and returns the data in filename. If the data is labelled training data, it returns labels too.

    Parameters:
        filename: the location of the training or test data you want to load.

    Returns:
        data: a list of InstanceData objects from that data type and track.
        labels (optional): if you specified training data, a dict of instance_id:label pairs.
    """

    # 'data' stores a list of 'InstanceData's as values.
    data = []

    # If this is training data, then 'labels' is a dict that contains instance_ids as keys and labels as values.
    training = False
    if filename.find('train') != -1:
        training = True

    if training:
        labels = dict()

    num_exercises = 0
    print('Loading instances...')
    instance_properties = dict()

    with open(filename, 'rt') as f:
        for line in f:
            line = line.strip()

            # If there's nothing in the line, then we're done with the exercise. Print if needed, otherwise continue
            if len(line) == 0:
                num_exercises += 1
                if num_exercises % 100000 == 0:
                    print('Loaded ' + str(len(data)) + ' instances across ' + str(num_exercises) + ' exercises...')
                instance_properties = dict()

            # If the line starts with #, then we're beginning a new exercise
            elif line[0] == '#':
                if 'prompt' in line:
                    instance_properties['prompt'] = line.split(':')[1]
                else:
                    list_of_exercise_parameters = line[2:].split()
                    for exercise_parameter in list_of_exercise_parameters:
                        [key, value] = exercise_parameter.split(':')
                        if key == 'countries':
                            value = value.split('|')
                        elif key == 'days':
                            value = float(value)
                        elif key == 'time':
                            if value == 'null':
                                value = None
                            else:
                                assert '.' not in value
                                value = int(value)
                        instance_properties[key] = value

            # Otherwise we're parsing a new Instance for the current exercise
            else:
                line = line.split()
                if training:
                    assert len(line) == 7
                else:
                    assert len(line) == 6
                assert len(line[0]) == 12

                instance_properties['instance_id'] = line[0]

                instance_properties['token'] = line[1]
                instance_properties['part_of_speech'] = line[2]

                instance_properties['morphological_features'] = dict()
                for l in line[3].split('|'):
                    [key, value] = l.split('=')
                    if key == 'Person':
                        value = int(value)
                    instance_properties['morphological_features'][key] = value

                instance_properties['dependency_label'] = line[4]
                instance_properties['dependency_edge_head'] = int(line[5])
                if training:
                    label = float(line[6])
                    labels[instance_properties['instance_id']] = label
                data.append(InstanceData(instance_properties=instance_properties))

        print('Done loading ' + str(len(data)) + ' instances across ' + str(num_exercises) +
              ' exercises.\n')

    if training:
        return data, labels
    else:
        return data

In [9]:
def load_labels(filename):
    """
    This loads labels, either the actual ones or your predictions.

    Parameters:
        filename: the filename pointing to your labels

    Returns:
        labels: a dict of instance_ids as keys and labels between 0 and 1 as values
    """
    labels = dict()

    with open(filename, 'rt') as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            else:
                line = line.split()
            instance_id = line[0]
            label = float(line[1])
            labels[instance_id] = label
    return labels


# Load Data

In [9]:
#test_data = load_data("/content/en_es/en_es.slam.20190204.test")

In [10]:
training_data, training_labels = load_data("/content/en_es/en_es.slam.20190204.train")

Loading instances...
Loaded 317049 instances across 100000 exercises...
Loaded 635368 instances across 200000 exercises...
Loaded 951536 instances across 300000 exercises...
Loaded 1271940 instances across 400000 exercises...
Loaded 1591344 instances across 500000 exercises...
Loaded 1911212 instances across 600000 exercises...
Loaded 2227444 instances across 700000 exercises...
Loaded 2546704 instances across 800000 exercises...
Done loading 2622957 instances across 824012 exercises.



In [11]:
valid_data = load_data("/content/en_es/en_es.slam.20190204.dev")
valid_labels = load_labels("/content/en_es/en_es.slam.20190204.dev.key")

Loading instances...
Loaded 334439 instances across 100000 exercises...
Done loading 387374 instances across 115770 exercises.



# Pytorch Datasets

In [12]:
class ExcerciseDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels, sequence_size): 
        '''
        Initializes the dataset.

        INPUTS: What inputs do you need here?
        '''
        token_idx = 1
        self.word2Idx = {"unk" : 0}
        self.token_vocabulary = ["unk"]


        pos_idx = 1
        self.pos2Idx = {"unk" : 0}
        self.pos_vocabulary = ["unk"]

        morph_idx = 1
        self.morph2Idx = {"unk" : 0}
        self.morph_vocab = ["unk"]

        dep_label_idx = 1
        self.depLabel2Idx = {"unk": 0}
        self.depLabelVocab = ["unk"]

        self.user_keyed_data = {}
        self.user_keyed_label = {}
        
        for i, instance in enumerate(data):
          user = instance.user
          if user not in self.user_keyed_data:
            self.user_keyed_data[user] = []
            self.user_keyed_label[user] = []
          
          exercise = []
          
          
          token = instance.token.lower()
          pos_tag = instance.part_of_speech.lower()
          morphology = instance.morphological_features
          dependency_label = instance.dependency_label.lower()
          label = labels[instance.instance_id]

          if token not in self.word2Idx:
            self.word2Idx[token] = token_idx
            self.token_vocabulary.append(token)
            token_idx += 1

          exercise.append(self.word2Idx[token])

          if pos_tag not in self.pos2Idx:
            self.pos2Idx[pos_tag] = pos_idx
            self.pos_vocabulary.append(pos_tag)
            pos_idx += 1

          exercise.append(self.pos2Idx[pos_tag])

          # morph_list = []
          # for morph_feature in morphology:
          #   if morph_feature not in self.morph2Idx:
          #     self.morph2Idx[morph_feature] = morph_idx
          #     self.morph_vocab.append(morph_feature)
          #     morph_idx += 1
          #   morph_list.append(self.morph2Idx[morph_feature])
          
          # exercise.append(morph_list)

          if dependency_label not in self.depLabel2Idx:
            self.depLabel2Idx[dependency_label] = dep_label_idx
            self.depLabelVocab.append(dependency_label)
            dep_label_idx += 1

          exercise.append(self.depLabel2Idx[dependency_label])

          self.user_keyed_data[user].append(torch.tensor(exercise, dtype = int))
          self.user_keyed_label[user].append(label)
          

        #At this point we have the exercises for each user.

        self.timesteped_data = []
        self.timesteped_labels = []

        #print(self.user_keyed_data[user])

        for user in self.user_keyed_data:
          #self.user_keyed_data[user] = torch.tensor(self.user_keyed_data[user])
          for i in range(0, len(self.user_keyed_data[user]), 50):
            chunk = self.user_keyed_data[user][i:i + sequence_size]
            if(len(chunk) < 2):
              continue;
            self.timesteped_data.append(torch.stack(chunk,dim=0))
            self.timesteped_labels.append(torch.FloatTensor(self.user_keyed_label[user][i:i + sequence_size]))
        
        self.length = len(self.timesteped_data)
        
    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        '''
        TODO: RETURN THE MFCC COEFFICIENTS AND ITS CORRESPONDING LABELS

        If you didn't do the loading and processing of the data in __init__,
        do that here.

        Once done, return a tuple of features and labels.
        '''
        data = self.timesteped_data[ind] # TODO
        labels = self.timesteped_labels[ind] # TODO
        return data, labels


    def collate_fn(batch):
        '''
        TODO:
        1.  Extract the features and labels from 'batch'
        2.  We will additionally need to pad both features and labels,
            look at pytorch's docs for pad_sequence
        3.  This is a good place to perform transforms, if you so wish. 
            Performing them on batches will speed the process up a bit.
        4.  Return batch of features, labels, lenghts of features, 
            and lengths of labels.
        '''
        # batch of input mfcc coefficients
        batch_data_encoder = [x[0:len(x)//2] for x,y in batch] # TODO
        batch_data_decoder = [x[len(x)//2:] for x,y in batch] # TODO
        # batch of output phonemes
        batch_labels_encoder = [y[0:len(y)//2] for x,y in batch] # TODO
        batch_labels_decoder = [y[len(y)//2:] for x,y in batch] 

        # HINT: CHECK OUT -> pad_sequence (imported above)
        # Also be sure to check the input format (batch_first)
        batch_data_encoder_pad = pad_sequence(batch_data_encoder, batch_first=True, padding_value=0) # TODO
        batch_data_decoder_pad = pad_sequence(batch_data_decoder, batch_first=True, padding_value=0)

        encoder_lengths_data = [len(x) for x in batch_data_encoder] # TODO
        decoder_lengths_data = [len(x) for x in batch_data_decoder] 

        batch_labels_encoder_pad = pad_sequence(batch_labels_encoder, batch_first=True, padding_value=0) # TODO
        batch_labels_decoder_pad = pad_sequence(batch_labels_decoder, batch_first=True, padding_value=0)
        encoder_lengths_labels =  [len(x) for x in batch_labels_encoder] # TODO
        decoder_lengths_labels =  [len(x) for x in batch_labels_decoder] # TODO


        # You may apply some transformation, Time and Frequency masking, here in the collate function;
        # Food for thought -> Why are we applying the transformation here and not in the __getitem__?
        #                  -> Would we apply transformation on the validation set as well?
        #                  -> Is the order of axes / dimensions as expected for the transform functions?
        
        # Return the following values: padded features, padded labels, actual length of features, actual length of the labels
        return batch_data_encoder_pad,batch_data_decoder_pad, batch_labels_encoder_pad,batch_labels_decoder_pad, torch.tensor(encoder_lengths_data), torch.tensor(decoder_lengths_data), torch.tensor(encoder_lengths_labels), torch.tensor(decoder_lengths_labels)

In [13]:
train_data = ExcerciseDataset(training_data, training_labels, 256) #TODO
train_loader = torch.utils.data.DataLoader(
    dataset     = train_data, 
    num_workers = 8,
    batch_size  = 64, 
    pin_memory  = True,
    shuffle     = True,
    collate_fn = ExcerciseDataset.collate_fn
) #TODO



In [14]:
val_dataset = ExcerciseDataset(valid_data, valid_labels, 256) #TODO
val_loader = torch.utils.data.DataLoader(
    dataset     = val_dataset, 
    num_workers = 8,
    batch_size  = 64, 
    pin_memory  = True,
    shuffle     = True,
    collate_fn = ExcerciseDataset.collate_fn
) #TODO

In [15]:
# sanity check
i = 0;

for data in val_loader:
    x_encoder,x_decoder, y_encoder,y_decoder, lx_encoder,lx_decoder, ly_encoder, ly_decoder = data
    print(x_encoder.shape,x_decoder.shape, y_encoder.shape,y_decoder.shape, lx_encoder.shape, lx_decoder.shape,ly_encoder.shape, ly_decoder.shape)
    i += 1
    if(i==2):
      break 

torch.Size([64, 128, 3]) torch.Size([64, 128, 3]) torch.Size([64, 128]) torch.Size([64, 128]) torch.Size([64]) torch.Size([64]) torch.Size([64]) torch.Size([64])
torch.Size([64, 128, 3]) torch.Size([64, 128, 3]) torch.Size([64, 128]) torch.Size([64, 128]) torch.Size([64]) torch.Size([64]) torch.Size([64]) torch.Size([64])


# Encoder

In [16]:
#Prepare GloVe for encoder
matrix_len = len(train_data.token_vocabulary)
weights_matrix = np.zeros((matrix_len, 50))
words_found = 0

for i, word in enumerate(train_data.token_vocabulary):
    try: 
        weights_matrix[i] = word2vec[word]
        words_found += 1
    except KeyError:
      if(i == 0):
        print("here")
        weights_matrix[i] = np.zeros((50, ))
      else:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(50, ))

In [17]:
len(train_data.pos_vocabulary)

17

In [18]:
class Encoder(torch.nn.Module):
    '''
    The Encoder takes utterances as inputs and returns latent feature representations
    '''
    def __init__(self, token_embedding_weight_matrix, encoder_hidden_size):
        super(Encoder, self).__init__()

        self.token_embedding = torch.nn.Embedding(len(train_data.token_vocabulary), 50)
        self.token_embedding.load_state_dict({'weight': torch.as_tensor(token_embedding_weight_matrix)})
        self.token_embedding.weight.requires_grad = False

        self.pos_embedding = torch.nn.Embedding(len(train_data.pos_vocabulary), 10)
        self.dependency_embedding = torch.nn.Embedding(len(train_data.depLabelVocab), 10)

        self.lstm1 = torch.nn.LSTM(input_size = 71, hidden_size = encoder_hidden_size, num_layers = 3, bidirectional = True, batch_first = True, dropout = 0.3)

    def forward(self, x, x_lens, labels):
        # Where are x and x_lens coming from? The dataloader
        #TODO: Call the embedding layer
        token_embeddings = self.token_embedding(x[:,:,0])
        pos_embeddings = self.pos_embedding(x[:,:,1])
        dependency_embeddings = self.dependency_embedding(x[:,:,2])

        concatenated_out = torch.cat((token_embeddings,pos_embeddings,dependency_embeddings, labels.reshape(labels.shape[0], labels.shape[1], 1)), dim=2)

        #print(concatenated_out.shape)
        # TODO: Pack Padded Sequence
        packed_out = pack_padded_sequence(concatenated_out, x_lens, batch_first = True, enforce_sorted=False)
        # TODO: Pass Sequence through the pyramidal Bi-LSTM layer
        out = self.lstm1(packed_out)[0]
        #out = self.pBLSTMs(out)
        # TODO: Pad Packed Sequence
        encoder_outputs, encoder_lens = pad_packed_sequence(out, batch_first = True)
        
        # Remember the number of output(s) each function returns

        return encoder_outputs, encoder_lens

In [19]:
# encoder = Encoder(weights_matrix, 16)

In [20]:
# for i, data in enumerate(train_loader):
#   x_encoder,x_decoder, y_encoder,y_decoder, lx_encoder,lx_decoder, ly_encoder, ly_decoder = data
#   encoder_outputs, encoder_lens = encoder(x_encoder, lx_encoder, y_encoder)
#   # encoder_out_new = torch.zeros((encoder_outputs.shape[0], encoder_outputs.shape[2]))
#   # for batch_output_index in range(encoder_outputs.shape[0]):
#   #   encoder_out_new[batch_output_index] = torch.sum(encoder_outputs[batch_output_index,0:encoder_lens[batch_output_index]], dim = 0)
#   # print(encoder_out_new.shape)
#   #print(torch.sum(encoder_outputs[:,:encoder_lens], dim = 1).shape)

# Decoder

In [21]:
class PermuteBlock(torch.nn.Module):
    def forward(self, x):
        return x.transpose(1, 2)

In [22]:
class Decoder(torch.nn.Module):

    def __init__(self,token_embedding_weight_matrix, decoder_hidden_size, output_size = 1):
        super().__init__()

        self.token_embedding = torch.nn.Embedding(len(train_data.token_vocabulary), 50)
        self.token_embedding.load_state_dict({'weight': torch.as_tensor(token_embedding_weight_matrix)})
        self.token_embedding.weight.requires_grad = False

        self.pos_embedding = torch.nn.Embedding(len(train_data.pos_vocabulary), 10)
        self.dependency_embedding = torch.nn.Embedding(len(train_data.depLabelVocab), 10)

        self.lstm1 = torch.nn.GRU(input_size = 70, hidden_size = decoder_hidden_size, num_layers = 1, bidirectional = False, batch_first = True, dropout = 0.3)

        self.mlp = torch.nn.Sequential(
            PermuteBlock(), 
            torch.nn.BatchNorm1d(decoder_hidden_size), 
            PermuteBlock(),
            torch.nn.Linear(decoder_hidden_size, decoder_hidden_size//2),
            PermuteBlock(), 
            torch.nn.BatchNorm1d(decoder_hidden_size//2), 
            PermuteBlock(),
            torch.nn.GELU(),
            torch.nn.Dropout(p = 0.2),
            torch.nn.Linear(decoder_hidden_size//2, output_size),
        )
        
        #self.sigmoid = torch.nn.Sigmoid(dim=2)

    def forward(self, encoder_out, x, x_lens):
        #TODO call your MLP
        #TODO Think what should be the final output of the decoder for the classification 

        #Assuming encoder_out is of size BatchSize, H_out
        token_embeddings = self.token_embedding(x[:,:,0])
        pos_embeddings = self.pos_embedding(x[:,:,1])
        dependency_embeddings = self.dependency_embedding(x[:,:,2])

        concatenated_out = torch.cat((token_embeddings,pos_embeddings,dependency_embeddings), dim=2)

        packed_out = pack_padded_sequence(concatenated_out, x_lens, batch_first = True, enforce_sorted=False)
        out = self.lstm1(packed_out, encoder_out.reshape(1,encoder_out.shape[0], encoder_out.shape[1]))[0]
        decoder_output, decoder_lens = pad_packed_sequence(out, batch_first = True)

        out = self.mlp(decoder_output)
        return out.reshape(out.shape[0],out.shape[1])

In [23]:
# decoder = Decoder(weights_matrix, 32, 1)

In [24]:
# for i, data in enumerate(train_loader):
#   x_encoder,x_decoder,y_encoder,y_decoder,lx_encoder,lx_decoder, ly_encoder, ly_decoder = data

#   #print(encoder(x_encoder, lx_encoder, y_encoder)[0].shape)
#   encoder_out, encoder_lens = encoder(x_encoder, lx_encoder, y_encoder)

#   encoder_out_new = torch.zeros((encoder_out.shape[0], encoder_out.shape[2]))

#   for batch_output_index in range(encoder_out.shape[0]):
#     encoder_out_new[batch_output_index] = torch.sum(encoder_out[batch_output_index,0:encoder_lens[batch_output_index]], dim = 0)

#   decoder_out  = decoder(encoder_out_new, x_decoder, lx_decoder)
#   print(decoder_out.shape)

# Complete Model

In [25]:
class KnowledgeModel(torch.nn.Module):

    def __init__(self, embed_size=128, output_size=1):
        super().__init__()

        #Prepare GloVe for encoder
        matrix_len = len(train_data.token_vocabulary)
        weights_matrix = np.zeros((matrix_len, 50))
        words_found = 0

        for i, word in enumerate(train_data.token_vocabulary):
          try: 
              weights_matrix[i] = word2vec[word]
              words_found += 1
          except KeyError:
            if(i == 0):
              weights_matrix[i] = np.zeros((50, ))
            else:
              weights_matrix[i] = np.random.normal(scale=0.6, size=(50, ))

        self.encoder        = Encoder(weights_matrix, embed_size) # TODO: Initialize Encoder
        self.decoder        = Decoder(weights_matrix, 2*embed_size, output_size) # TODO: Initialize Decoder 
    
    def forward(self, x_encoder,x_encoder_lengths, y_encoder_labels, x_decoder, x_decoder_lengths):
        encoder_out, encoder_lens = self.encoder(x_encoder, x_encoder_lengths, y_encoder_labels)
        encoder_out_new = torch.zeros((encoder_out.shape[0], encoder_out.shape[2]))
        for batch_output_index in range(encoder_out.shape[0]):
          encoder_out_new[batch_output_index] = torch.sum(encoder_out[batch_output_index,0:encoder_lens[batch_output_index]], dim = 0)
        #print(encoder_out_new.shape)
        encoder_out_new = encoder_out_new.to(device)
        decoder_out  = self.decoder(encoder_out_new, x_decoder,  x_decoder_lengths)

        return decoder_out

In [26]:
model = KnowledgeModel(
    embed_size  = 512,
    output_size = 1
).to(device)
print(model)
summary(model,x_encoder.to(device),lx_encoder,y_encoder.to(device), x_decoder.to(device),lx_decoder)



KnowledgeModel(
  (encoder): Encoder(
    (token_embedding): Embedding(1968, 50)
    (pos_embedding): Embedding(17, 10)
    (dependency_embedding): Embedding(42, 10)
    (lstm1): LSTM(71, 512, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (token_embedding): Embedding(1968, 50)
    (pos_embedding): Embedding(17, 10)
    (dependency_embedding): Embedding(42, 10)
    (lstm1): GRU(70, 1024, batch_first=True, dropout=0.3)
    (mlp): Sequential(
      (0): PermuteBlock()
      (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): PermuteBlock()
      (3): Linear(in_features=1024, out_features=512, bias=True)
      (4): PermuteBlock()
      (5): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): PermuteBlock()
      (7): GELU(approximate='none')
      (8): Dropout(p=0.2, inplace=False)
      (9): Linear(in_features=512, out_features=1, bias=True)
    )
  )
)


NameError: ignored

# **Training**

In [27]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler =  torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=3, mode='min') # fill this out
criterion = torch.nn.BCEWithLogitsLoss(reduction = 'none', pos_weight = torch.tensor([10]).to(device))
# Mixed Precision, if you need it
scaler = torch.cuda.amp.GradScaler()

In [28]:
def create_loss_mask(lens):
    mask = torch.arange(max(lens))
    mask = torch.tile(mask, (len(lens), 1))
    t = torch.tile( lens.reshape((len(lens), 1)) , (1, mask.shape[1]))
    mask = mask < t
    return mask

In [29]:
from tqdm import tqdm

def train_model(model, train_loader, criterion, optimizer):
    
    model.train()
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train') 

    total_loss = 0

    for i, data in enumerate(train_loader):
        optimizer.zero_grad()

        x_encoder,x_decoder,y_encoder,y_decoder,lx_encoder,lx_decoder, ly_encoder, ly_decoder = data
        x_encoder,x_decoder,y_encoder,y_decoder = x_encoder.to(device),x_decoder.to(device),y_encoder.to(device),y_decoder.to(device)
        #x, y = x.to(device), y.to(device)

        with torch.cuda.amp.autocast():     
            decoder_out = model(x_encoder,lx_encoder,y_encoder,x_decoder,lx_decoder)
            #print(h.shape)
            #print(lh)
            loss = criterion(decoder_out, y_decoder)
            loss_mask = create_loss_mask(ly_decoder)
            loss_mask = loss_mask.to(device)
            masked_loss = loss * loss_mask

            loss = torch.sum(masked_loss)/torch.sum(loss_mask)


        total_loss += loss.item()

        batch_bar.set_postfix(
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            lr="{:.06f}".format(float(optimizer.param_groups[0]['lr'])))

        batch_bar.update() # Update tqdm bar

        # Another couple things you need for FP16. 
        scaler.scale(loss).backward() # This is a replacement for loss.backward()
        scaler.step(optimizer) # This is a replacement for optimizer.step()
        scaler.update() # This is something added just for FP16

        del x_encoder,x_decoder,y_encoder,y_decoder,lx_encoder,lx_decoder, ly_encoder, ly_decoder, decoder_out, loss 
        torch.cuda.empty_cache()

    batch_bar.close() # You need this to close the tqdm bar
    
    return total_loss / len(train_loader)


def validate_model(model, val_loader):

    model.eval()
    batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    total_loss = 0
    vdist = 0

    for i, data in enumerate(val_loader):

        x_encoder,x_decoder,y_encoder,y_decoder,lx_encoder,lx_decoder, ly_encoder, ly_decoder = data
        x_encoder,x_decoder,y_encoder,y_decoder = x_encoder.to(device),x_decoder.to(device),y_encoder.to(device),y_decoder.to(device)

        #x, y = x.to(device), y.to(device)

        with torch.cuda.amp.autocast():   
          with torch.inference_mode():
              decoder_out = model(x_encoder,lx_encoder,y_encoder,x_decoder,lx_decoder)
              #h = torch.permute(h, (1, 0, 2))
              loss = criterion(decoder_out, y_decoder)
              loss_mask = create_loss_mask(ly_decoder)
              loss_mask = loss_mask.to(device)
              masked_loss = loss * loss_mask

              loss = torch.sum(masked_loss)/torch.sum(loss_mask)

        total_loss += float(loss.item())

        batch_bar.set_postfix(loss="{:.04f}".format(float(total_loss / (i + 1))), dist="{:.04f}".format(float(vdist / (i + 1))))

        batch_bar.update()
    
        del x_encoder,x_decoder,y_encoder,y_decoder,lx_encoder,lx_decoder, ly_encoder, ly_decoder,decoder_out, loss
        torch.cuda.empty_cache()
        
    batch_bar.close()
    total_loss = total_loss/len(val_loader)
    return total_loss

In [30]:
del word2vec

In [None]:
torch.cuda.empty_cache()
gc.collect()

for epoch in range(0, 50):

    print("\nEpoch: {}/{}".format(epoch+1, 50))
    
    curr_lr = optimizer.param_groups[0]['lr']

    train_loss              = train_model(model, train_loader, criterion, optimizer) 
    valid_loss  = validate_model(model, val_loader)
    scheduler.step(valid_loss)

    print("\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_loss, curr_lr))
    print("Val Loss {:.04f}".format(valid_loss))


    # wandb.log({
    #     'train_loss': train_loss,  
    #     'valid_dist': valid_dist, 
    #     'valid_loss': valid_loss, 
    #     'lr'        : curr_lr
    # })
    
#     save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, epoch_model_path)
#     wandb.save(epoch_model_path)
#     print("Saved epoch model")

#     if valid_dist <= best_lev_dist:
#         best_lev_dist = valid_dist
#         save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, best_model_path)
#         wandb.save(best_model_path)
#         print("Saved best model")
#       # You may find it interesting to exlplore Wandb Artifcats to version your models
# run.finish()


Epoch: 1/50


Train:  73%|███████▎  | 611/839 [07:45<51:17, 13.50s/it, loss=1.3144, lr=0.001000]

Extra


In [None]:
# class ExcerciseToken(object):
#   def __init__(self, token, id, pos_tag, morphological_features, dep_label, dep_edge_head):
#     self.token = token
#     self.id = id
#     self.pos_tag = pos_tag
#     self.morphological_features = morphological_features
#     self.dep_label = dep_label
#     self.dep_edge_head = dep_edge_head
  
#   def set_label(self, label):
#     self.label = label


In [None]:
# class ExcerciseInformation(object):
#   def __init__(self):
#     self.exercise_tokens = []
#     self.prompt = ""
#     self.user = ""
#     self.countries = ""
#     self.days = 0.0
#     self.client = ""
#     self.session = ""
#     self.format = ""
#     self.time = 0.0
  
#   def add_exercise_token(self, excercise_token_info: ExcerciseToken):
#     self.exercise_tokens.append(excercise_token_info)
  
#   def set_prompt(self, prompt: str):
#     self.prompt = prompt

#   def set_user(self, user: str):
#     self.user = user
  
#   def set_countries(self, countries: str):
#     self.countries = countries

#   def set_days(self, days: float):
#     self.days = days
  
#   def set_client(self, client: str):
#     self.client = client
  
#   def set_session(self, session: str):
#     self.session = session
  
#   def set_format(self, format: str):
#     self.format = format

#   def set_time(self, time: float):
#     self.time = time
  

