<a href="https://colab.research.google.com/github/subhalingamd/my-TechTrack/blob/next-step-prediction/lstm_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q pytorch-lightning 
!pip install -q bpemb  --no-deps
!pip install -q sentencepiece
!pip install -q transformers

[K     |████████████████████████████████| 584 kB 5.1 MB/s 
[K     |████████████████████████████████| 136 kB 56.4 MB/s 
[K     |████████████████████████████████| 409 kB 40.0 MB/s 
[K     |████████████████████████████████| 596 kB 45.1 MB/s 
[K     |████████████████████████████████| 1.1 MB 48.0 MB/s 
[K     |████████████████████████████████| 144 kB 52.3 MB/s 
[K     |████████████████████████████████| 271 kB 43.7 MB/s 
[K     |████████████████████████████████| 94 kB 2.9 MB/s 
[K     |████████████████████████████████| 1.2 MB 5.1 MB/s 
[K     |████████████████████████████████| 4.2 MB 5.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 42.4 MB/s 
[K     |████████████████████████████████| 84 kB 3.1 MB/s 
[?25h

In [2]:
%load_ext tensorboard

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import get_scheduler, AdamW

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence


# import torchtext
# from torchtext import datasets, vocab
# from torchtext.legacy import data as textdata
## ???from torchtext.vocab import GloVe

import numpy as np
import pandas as pd

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning import LightningModule
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

from bpemb import BPEmb

import random
import os

from argparse import ArgumentParser

import tqdm

import pickle
from collections import namedtuple 
Transition = namedtuple('Transition',['entity', 'property', 'value'])



In [4]:
def seed_everything(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything()

In [5]:
def parse_args():
	parser = ArgumentParser(description='Word Meaning Comparison')
	parser.add_argument('--data', '-d', type=str, default='./processed_data/sampled.0_25.new.cat_constrained.pkl')
	parser.add_argument('--embedding', '-e', type=str, default='bpe', choices=['bpe'])

	parser.add_argument('--gpu', type=int, default=1)
	parser.add_argument('--batch_size', type=int, default=32)
	parser.add_argument('--epochs', type=int, default=25)
	parser.add_argument('--lr', type=float, default=0.001)
	parser.add_argument('--lr_decay', type=float, default=0.0005)	# removed

	parser.add_argument('--rnn_hidden_dim', type=int, default=8)
	parser.add_argument('--num_rnn_layers', type=int, default=1)
	parser.add_argument('--dropout', type=float, default=0.5)

	return parser.parse_known_args()[0]


args = parse_args()
print(args)

Namespace(batch_size=32, data='./processed_data/sampled.0_25.new.cat_constrained.pkl', dropout=0.5, embedding='bpe', epochs=25, gpu=1, lr=0.001, lr_decay=0.0005, num_rnn_layers=1, rnn_hidden_dim=8)


In [6]:
with open(args.data, 'rb') as f:
    data = pickle.load(f)

In [7]:
len(data['train'][0]['next']), len(data['test'][0]['next'])

(6, 190)

In [8]:
x = [len(i['next']) for i in data['test']]
max(x), sum(x)/len(x), min(x)

(706, 474.85172413793106, 20)

In [9]:
# build Dataset class
class TechTrackDataset(Dataset):
    def __init__(self, data, embedding='bpe'):
        assert len(data) > 0, 'data must be a non-empty list'

        assert embedding in ['bpe'], 'embedding must be bpe'
        self.embedding_name = embedding
        if self.embedding_name == 'bpe':
            PRETRAINED_EMBEDDING_DIM = 50
            self.EMBEDDING_DIM = 2*PRETRAINED_EMBEDDING_DIM + 1
            self.embedding = BPEmb(lang='en', dim=PRETRAINED_EMBEDDING_DIM, vs=200000)

        self._PROPERTIES_MAP = {
            'isConnected': 'connected',
            'isOpened': 'opened',
            'isPowered': 'powered',
            'isInstalled': 'installed',
            'isSettingsChanged': 'settings changed',
            'isUsed': 'used',
            'isSetup': 'setup',
            'isRelatedDeviceConnected': 'related device connected',
        }

        self.data = []
        for d in tqdm.tqdm(data):
            given_embeddings = torch.stack([self._get_embeddings(entity=t.entity, property=t.property, value=t.value) for t in d['given']], dim=0)
            # print(given_embeddings.shape)
            next_embeddings = [self._get_embeddings(entity=t.entity, property=t.property, value=t.value) for t in d['next']]
            self.data.append({
                'given': given_embeddings,
                'next': next_embeddings,
                'answer': d['answer']
            })


    def _get_embeddings(self, entity, property, value):
        assert value in ('True', 'False'), 'value must be True or False'
        if self.embedding_name == 'bpe':
            entity_embedding = torch.tensor(self.embedding.embed(entity).mean(axis=0))
            property_embedding = torch.tensor(self.embedding.embed(self._PROPERTIES_MAP[property]).mean(axis=0))
            # value_embedding = torch.tensor(self.embedding.embed(value).mean(axis=0))
            value_embedding = torch.tensor([1]) if self.embedding.embed(value)=='True' else torch.tensor([0])
        overall_embedding =  torch.cat((entity_embedding, property_embedding, value_embedding), dim=0)
        assert overall_embedding.size()[0] == self.EMBEDDING_DIM, 'embedding size is wrong'
        return overall_embedding

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]



In [10]:
# construct train, val, test dataset
train_dataset = TechTrackDataset(data['train'])
val_dataset = TechTrackDataset(data['val'])
test_dataset = TechTrackDataset(data['test'])

seed_everything()

# construct dataloader
train_dataloader = DataLoader(
                            dataset=train_dataset, 
                            batch_size=1, 
                            shuffle=True,
                            )
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs200000.model


100%|██████████| 3776868/3776868 [00:00<00:00, 5343989.57B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs200000.d50.w2v.bin.tar.gz


100%|██████████| 38736185/38736185 [00:02<00:00, 17890763.37B/s]
100%|██████████| 1504/1504 [00:04<00:00, 302.82it/s]
100%|██████████| 179/179 [00:14<00:00, 11.98it/s]
100%|██████████| 290/290 [00:24<00:00, 11.85it/s]


In [11]:
# construct train, val, test dataset
test_dataset1 = TechTrackDataset(data['test'][:10])

seed_everything()

test_dataloader1 = DataLoader(test_dataset1, batch_size=1, shuffle=False)

100%|██████████| 10/10 [00:00<00:00, 12.03it/s]


In [12]:
# for d in val_dataloader:
#   print(len(d))
#   print(d.keys())
#   print(d['answer'].shape)
#   print(d['given'].shape)
#   print(len(d['next']))
#   print(d['next'][0].shape)
#   break

In [13]:
args.embedding_dim = train_dataset.EMBEDDING_DIM

In [14]:
# build a model
# takes a batch of sequences as input
# uses fasttext embeddings to convert sequences, lstm and mlp
# given a set of candidates, rank them using cosine similarity

class BaselineModel(LightningModule):
    def __init__(self, more_hparams):
        super(BaselineModel, self).__init__()
        assert more_hparams.num_rnn_layers == 1, '1 rnn layer only'
        self.save_hyperparameters()
        self.more_hparams = more_hparams
        # self.embedding = nn.Embedding(self.more_hparams.vocab_size, self.more_hparams.embedding_dim)
        self.lstm = nn.LSTM(input_size = self.more_hparams.embedding_dim, 
                            hidden_size = self.more_hparams.rnn_hidden_dim, 
                            batch_first = True,
                            # proj_size = self.more_hparams.embedding_dim,
                            num_layers = self.more_hparams.num_rnn_layers,
                            dropout = (0 if self.more_hparams.num_rnn_layers == 1 else self.more_hparams.dropout),
                            bidirectional = False,
                            )
        self.mlp = nn.Linear(self.more_hparams.rnn_hidden_dim, self.more_hparams.embedding_dim)



        self.mode = 'dot_bce' # 'dot_bce'
        if self.mode == 'cosine_ranking':
          self.activation = nn.CosineSimilarity(dim=1)
          self.loss = nn.CosineEmbeddingLoss() #margin=0.5)
        elif self.mode == 'dot_bce':
          self.loss = nn.BCEWithLogitsLoss()
          self.activation = nn.Sigmoid()
        elif self.mode == 'dot_mse':
          self.loss = nn.MSELoss()
          self.activation = nn.Sigmoid()
        elif self.mode == 'cosine_mse':
          self.activation = nn.CosineSimilarity(dim=1)
          self.loss = nn.MSELoss()
        else:
          raise NotImplementedError()


    def forward(self, x):
        given = x
        
        outputs, (ht, ct) = self.lstm(given)
  
        #output = ht.squeeze(0)
        output = torch.mean(ht, dim=0)
        assert len(output.size()) == 2, 'output size is wrong (size == 2)'

        output = self.mlp(output)
        # assert output.size(1) == next.size(1), 'output size is wrong (dim 1'

        return output

    def complete_step(self, output, next, answer):
      # losses = 0
      # predictions = []
      output = torch.repeat_interleave(output, len(next), dim=0)
      next = torch.cat([n for n in next], dim=0)
      label = [-1]*len(next)
      label[answer[0].item()] = 1
      label = torch.tensor(label)
      # print(output.shape, next.shape, label.shape)
      # for idx, n in enumerate(next):
        # if idx == answer:
        #   label = torch.tensor([1])
        # else:
        #   label = torch.tensor([-1])
      if self.mode == 'cosine_ranking':
          loss = self.loss(output, next, label)
          pred = self.activation(output, next)
      elif self.mode == 'dot_bce':
          label = (label>0).float()*label
          output = torch.sum(output*next, dim=1)
          loss = self.loss(output, label.float())
          pred = self.activation(output)
      elif self.mode == 'dot_mse':
          label = (label>0).float()*label
          output = torch.sum(output*next, dim=1)
          pred = self.activation(output)
          output = pred
          loss = self.loss(output, label.float())
      elif self.mode == 'cosine_mse':
          pred = self.activation(output, next)
          output = pred
          loss = self.loss(output, label.float())
        
        # predictions.append(pred)
        # losses += loss

      # predictions = torch.cat(predictions, dim=0)
      # losses/=len(next)

      # print('loss:',loss)

      # return (predictions, losses)

      # for i, x in enumerate(pred):
      #   print(i, x.isnan())

      assert not pred.isnan().any(), 'nan found!'
      return (pred, loss)

    def _compute_metrics(self, y_hat, ans_idx=0, mode="val_"):
        ranked = self._rank_candidates(y_hat)
        # precisions = self._precision_of_ranked_candidates(ranked, ans_idx=0)
        # metrics = {f'{mode}prec_{i+1}': p for i, p in enumerate(precisions)}
        # mrr = torch.sum(precisions*(1/torch.arange(1, len(precisions)+1)))
        rank = (ranked == ans_idx).nonzero(as_tuple=True)[0] + 1
        # if mode=="test_":
        #   print(rank, ranked.shape)
        # print(rank)
        metrics = {'mrr': 1/rank, 'acc': torch.tensor([1.0]) if rank==1 else torch.tensor([0.0])}
        # self.log(f'{mode}metrics', metrics, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log(f'{mode}mrr', metrics['mrr'], on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log(f'{mode}acc', metrics['acc'], on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return metrics

    def training_step(self, batch, batch_idx):
        given, next, answer = batch['given'], batch['next'], batch['answer']
        output = self.forward(given)
        pred, loss = self.complete_step(output, next, answer)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        given, next, answer = batch['given'], batch['next'], batch['answer']
        output = self.forward(given)
        pred, loss = self.complete_step(output, next, answer)
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        metrics = self._compute_metrics(pred, ans_idx=answer[0], mode="val_")
        metrics.update({'val_loss': loss, 'loss': loss})
        return metrics

    def test_step(self, batch, batch_idx):
        given, next, answer = batch['given'], batch['next'], batch['answer']
        output = self.forward(given)
        pred, loss = self.complete_step(output, next, answer)
        self.log("test_loss", loss, on_step=True, on_epoch=True,prog_bar=True, logger=True)
        metrics = self._compute_metrics(pred, ans_idx=answer[0], mode="test_")
        metrics.update({'test_loss': loss, 'loss': loss})
        return metrics

    def prediction_step(self, batch, batch_idx):
        given, next, answer = batch['given'], batch['next'], batch['answer']
        output = self.forward(given)
        pred, loss = self.complete_step(output, next, answer)
        ranked = self._rank_candidates(pred)
        return ranked

    def _rank_candidates(self, pred):
        # sort pred in descending order
        with torch.no_grad():
            return torch.sort(pred, descending=True).indices

    def _precision_of_ranked_candidates(self, ranked_candidates, ans_idx=0, at=1):
        # find P@2, P@3, P@4, ..., P@at
        # return a list of P@i
        with torch.no_grad():
            return torch.mean((ranked_candidates == ans_idx).float(), dim=0)
    

    def configure_optimizers(self):
        optim = AdamW(self.parameters(), lr=self.more_hparams.lr)
        return optim





In [15]:
#del model;

seed_everything()
model = BaselineModel(args)



In [16]:
seed_everything()

early_stop_callback = EarlyStopping(monitor="val_loss", mode="max", patience=3)
checkpoint_callback = ModelCheckpoint(monitor="val_loss", save_top_k=3, filename="{epoch}-{val_loss:.4f}-{step}")

trainer = Trainer(
    deterministic=True, 
    default_root_dir='out/',
    #gpus=args.gpu,
    max_epochs=100, # args.epochs,
    callbacks=[early_stop_callback, checkpoint_callback]
    # callbacks=[checkpoint_callback]
)


trainer.fit(model, train_dataloader, val_dataloader)




GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: out/lightning_logs

  | Name       | Type              | Params
-------------------------------------------------
0 | lstm       | LSTM              | 3.6 K 
1 | mlp        | Linear            | 909   
2 | loss       | BCEWithLogitsLoss | 0     
3 | activation | Sigmoid           | 0     
-------------------------------------------------
4.5 K     Trainable params
0         Non-trainable params
4.5 K     Total params
0.018     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [17]:
trainer.test(model, test_dataloader, verbose=True)

Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_acc_epoch        0.013793103396892548
     test_loss_epoch        0.18691052496433258
     test_mrr_epoch        0.053726308047771454
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_acc_epoch': 0.013793103396892548,
  'test_loss_epoch': 0.18691052496433258,
  'test_mrr_epoch': 0.053726308047771454}]

In [18]:
model.eval()
predictions = []
for batch in tqdm.tqdm(val_dataloader):
    given, next, answer = batch['given'], batch['next'], batch['answer']
    output = model(given)
    pred, loss = model.complete_step(output, next, answer)
    ranked = model._rank_candidates(pred)
    predictions.append(ranked.tolist())

assert len(predictions) == len(data['val'])



100%|██████████| 179/179 [00:00<00:00, 218.41it/s]


In [19]:
print(predictions[0][:10])

[229, 457, 190, 85, 363, 439, 35, 82, 132, 270]


In [20]:
top_k = 10

with open('output.txt', 'w') as file:
  print(*['given', 'gold', 'gold_at_rank', *[f'pred_rank_{i+1}' for i in range(top_k)]],sep=':::', file=file)
  for d, p in zip(data['val'], predictions):
    print(*[
            d['given'],
            d['next'][d['answer']],
            p.index(d['answer']) + 1,
            *[d['next'][idx] for idx in p[:top_k]],
    ], sep=':::', file=file)


In [21]:
data['test'][0]['given'], data['val'][0]['given']

((Transition(entity='printer', property='isPowered', value='True'),
  Transition(entity='menu', property='isOpened', value='True'),
  Transition(entity='system preference', property='isOpened', value='True')),
 (Transition(entity='apple icon', property='isOpened', value='True'),
  Transition(entity='system preference', property='isOpened', value='True'),
  Transition(entity='trackpad', property='isOpened', value='True')))

In [22]:
# while 1:
#   pass

In [23]:
# %tensorboard --logdir out/lightning_logs/version_0

In [24]:
### ! rm -r out