### Make sure your runtime is using a GPU

In [0]:
import os
os.chdir('/content/drive/My Drive/Project_3/seq2sql')

In [1]:
!ls

drive  sample_data


In [0]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
from torchvision import transforms
from torch.utils.data import DataLoader

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np
import random
import math
import time
#import lib
from wiki_sql import WikiSQL

from extract_data import load_pickle
from tqdm import tqdm

from utils import get_decoder_vocab_dicts, save_models, zero_all_grads
from extract_data import load_pickle

import matplotlib.pyplot as plt

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        #print("test1")
        src = src.cuda()
        embedded = self.dropout(self.embedding(src))
        #print("test2")
        #embedded = [src len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded)
        

                
        #outputs = [src len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        
        #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer
        
        #hidden [-2, :, : ] is the last of the forwards RNN 
        #hidden [-1, :, : ] is the last of the backwards RNN
        
        #initial decoder hidden is final hidden state of the forwards and backwards 
        #  encoder RNNs fed through a linear layer
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        #outputs = [src len, batch size, enc hid dim * 2]
        #hidden = [batch size, dec hid dim]
        
        return outputs, hidden


class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        ## modified dec_hid_dim -> dec_hid_dim*2
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim*2, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
        
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        #repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #hidden = [batch size, src len, dec hid dim]
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        
        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(2)
        
        #attention= [batch size, src len]
        
        return F.softmax(attention, dim=1)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        ## modified dec_hid_dim -> dec_hid_dim*2
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim*2)
        ## modified dec_hid_dim -> dec_hid_dim*2
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim*2 + emb_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):
             
        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        input = input.unsqueeze(0)
        input = input.cuda()
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        #print("embedded.size", embedded.size())
        
        a = self.attention(hidden, encoder_outputs)
                
        #a = [batch size, src len]
        
        a = a.unsqueeze(1)
        
        #a = [batch size, 1, src len] 

        # print("a.size", a.size())
        # a  = a.permute(2, 1, 0)
        # print("a.size", a.size())   

        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #encoder_outputs = [batch size, src len, enc hid dim * 2]

        #print("encoder_outputs.size", encoder_outputs.size())
        
        weighted = torch.bmm(a, encoder_outputs)
        
        #weighted = [batch size, 1, enc hid dim * 2]
        
        weighted = weighted.permute(1, 0, 2)
        
        #weighted = [1, batch size, enc hid dim * 2]
        
        #print("weighted.size", weighted.size())
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        
        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
            
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        
        #output = [seq len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]
        
        #seq len, n layers and n directions will always be 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        #this also means that output == hidden
        assert (output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden.squeeze(0)

class Seq2Seq(nn.Module):
    def __init__(self, encoder,encoder_schema, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.encoder_schema = encoder_schema
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, schema,  teacher_forcing_ratio = 0):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_src_outputs, src_hidden = self.encoder(src)
        encoder_schema_outputs, schema_hidden = self.encoder_schema(schema)
        # print(encoder_src_outputs.size(), src_hidden.size())
        # print(encoder_schema_outputs.size(), schema_hidden.size())
        encoder_outputs = torch.cat((encoder_src_outputs,encoder_schema_outputs), dim=0)
        hidden = torch.cat((src_hidden, schema_hidden),dim = -1)
        # print(encoder_outputs.size(), hidden.size())

        # hidden = src_hidden
        # encoder_outputs = encoder_src_outputs
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden state and all encoder hidden states
            #receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

In [0]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch[0]
        trg = batch[1]
        schema = batch[2]
        src = src.permute(1,0)
        trg = trg.permute(1,0)
        schema = schema.permute(1,0)
        #print("src", src.size())
        #print("trg", trg.size())
        
        optimizer.zero_grad()
        trg = trg.long().cuda()
        
        output = model(src.long().cuda(), trg, schema.long().cuda() )
        
        #print(output.size())
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        #print(output.size())
        trg = trg[1:].view(-1)
        #print(trg.size())
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()

        if i % 100 == 0:
            print("batch: ", i, " loss: " , loss.item())
        
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch[0]
            trg = batch[1]
            schema = batch[2]
            src = src.permute(1,0)
            trg = trg.permute(1,0)
            schema = schema.permute(1,0)

            
            trg = trg.long().cuda()
            schema = schema.long().cuda()

            output = model(src.long(), trg, schema, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

### Load Data

In [0]:
root = 'data'
batch_size = 32
train_transformed_dataset = WikiSQL(text=os.path.join(root, 'train/train_questions_tokenized.pkl'),
                                        sql=os.path.join(root, 'train/train_sql_tokenized.pkl'),
                                        schema=os.path.join(root, 'train/', 'train_schema.pkl')
                                        )

test_transformed_dataset = WikiSQL(text=os.path.join(root, 'test/test_questions_tokenized.pkl'),
                                    sql=os.path.join(root, 'test/test_sql_tokenized.pkl'),
                                    schema=os.path.join(root, 'test/test_schema.pkl')
                                    )

valid_transformed_dataset = WikiSQL(text=os.path.join(root, 'valid/valid_questions_tokenized.pkl'),
                                    sql=os.path.join(root, 'valid/valid_sql_tokenized.pkl'),
                                    schema=os.path.join(root, 'valid/valid_schema.pkl')
                                    )

word2idx = load_pickle(os.path.join(root, 'word_idx_mappings/word2idx.pkl'))
idx2word = load_pickle(os.path.join(root, 'word_idx_mappings/idx2word.pkl'))
col_dict = load_pickle(os.path.join(root, 'word_idx_mappings/column_mappings1.pkl'))
table_dict = load_pickle(os.path.join(root, 'word_idx_mappings/table_mappings1.pkl'))
vocab_size = len(word2idx.keys())

vocab_dicts = get_decoder_vocab_dicts(word2idx, table_dict, col_dict)

train_loader = DataLoader(train_transformed_dataset, batch_size=batch_size, shuffle=False,
                          collate_fn=train_transformed_dataset.collate)
test_loader = DataLoader(test_transformed_dataset, batch_size=batch_size, shuffle=False,
                          collate_fn=test_transformed_dataset.collate)
valid_loader = DataLoader(valid_transformed_dataset, batch_size=batch_size, shuffle=False,
                          collate_fn=valid_transformed_dataset.collate)

### Create Model & Set Hyperparameter

In [0]:
INPUT_DIM = vocab_size
OUTPUT_DIM = vocab_size
ENC_EMB_DIM = 300 #256
DEC_EMB_DIM = 300 #256
ENC_HID_DIM = 256 #512
DEC_HID_DIM = 256 #512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
LEARNING_RATE = 1e-4 # 5e-5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc_src = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
enc_schema = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc_src, enc_schema,dec, device).to(device)
model.apply(init_weights)

optimizer = optim.Adam(model.parameters(),lr = LEARNING_RATE)
criterion = nn.CrossEntropyLoss() #(ignore_index = TRG_PAD_IDX)

### Train


In [0]:
N_EPOCHS = 10
CLIP = 1

In [14]:
train_losses = []
val_losses = []
test_losses = []
best_valid_loss = 1000000


for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    # break
    valid_loss = evaluate(model, valid_loader, criterion)
    test_loss = evaluate(model,test_loader,criterion)
    train_losses.append(train_loss)
    val_losses.append(valid_loss)
    test_losses.append(test_loss)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    #     save_path = 'saved_models/baseline_vanila/emb{}_hid{}_Epoch{}.pt'.format(ENC_EMB_DIM,ENC_HID_DIM,epoch)
    #     torch.save(model.state_dict(), save_path )
    save_path = 'saved_models/baseline_schema/emb{}_hid{}_Epoch{}.pt'.format(ENC_EMB_DIM,ENC_HID_DIM,epoch)
    torch.save(model.state_dict(), save_path )
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    print(f'\tTest. Loss: {valid_loss:.3f} | Test. PPL: {math.exp(valid_loss):7.3f}')

batch:  0  loss:  11.490984916687012
batch:  100  loss:  5.096745014190674
batch:  200  loss:  5.087321758270264
batch:  300  loss:  3.816284418106079
batch:  400  loss:  5.426336288452148
batch:  500  loss:  3.047872543334961
batch:  600  loss:  4.1541748046875
batch:  700  loss:  2.8371331691741943
batch:  800  loss:  3.1558337211608887
batch:  900  loss:  3.417754650115967
batch:  1000  loss:  3.273749351501465
batch:  1100  loss:  3.4571971893310547
batch:  1200  loss:  2.123469114303589
batch:  1300  loss:  2.6073453426361084
batch:  1400  loss:  3.0521135330200195
batch:  1500  loss:  2.89853572845459
batch:  1600  loss:  2.779675006866455
batch:  1700  loss:  3.4157965183258057
Epoch: 01 | Time: 12m 40s
	Train Loss: 3.693 | Train PPL:  40.177
	 Val. Loss: 3.403 |  Val. PPL:  30.062
	Test. Loss: 3.403 | Test. PPL:  30.062
batch:  0  loss:  4.187197685241699
batch:  100  loss:  3.1269328594207764
batch:  200  loss:  3.4098920822143555
batch:  300  loss:  2.825165033340454
batch:  

In [15]:
saved_model_path = '/content/drive/My Drive/Project_3/seq2sql/saved_models/baseline_schema/emb300_hid256_Epoch9.pt'
model.load_state_dict(torch.load(saved_model_path))

<All keys matched successfully>

In [0]:
def evaluate_custom(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for j, batch in enumerate(iterator):

            src = batch[0]
            trg = batch[1]
            schema = batch[2]
            src = src.permute(1,0)
            trg = trg.permute(1,0)
            schema = schema.permute(1,0)

            
            trg = trg.long().cuda()
            schema = schema.long().cuda()

            output = model(src.long(), trg, schema,0) #turn off teacher forcing
            value, result = torch.max(output,2)
            print(output.size())
            print(result.size())
            print('#'*30)
            print('#'*30)
            for i in src[:,0]:
              print(idx2word[int(i.data)])
            print('#'*30)
            for i in result[:,0]:
              print(idx2word[int(i.data)])
            print('*' * 30)
            for i in trg[:,0]:
              print(idx2word[int(i.data)])


            if j ==2:
              break

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            # output_dim = output.shape[-1]
            
            # output = output[1:].view(-1, output_dim)
            # trg = trg[1:].view(-1)

            # #trg = [(trg len - 1) * batch size]
            # #output = [(trg len - 1) * batch size, output dim]

            # loss = criterion(output, trg)

            # epoch_loss += loss.item()
        
    # return epoch_loss / len(iterator)

In [26]:
evaluate_custom(model, train_loader, criterion)

torch.Size([11, 32, 97837])
torch.Size([11, 32])
##############################
##############################
<s>
tell
me
what
the
notes
are
for
south
australia
</s>
final_place
final_place
final_place
final_place
final_place
final_place
##############################
final_place
select
name
from
2-12943367-1
where
name
=
r_16
</s>
final_place
******************************
<s>
select
notes
from
1-1000181-1
where
current_slogan
=
south_australia
</s>
final_place
torch.Size([15, 32, 97837])
torch.Size([15, 32])
##############################
##############################
<s>
what
nationality
is
the
player
who
played
from
1997-98
</s>
final_place
final_place
final_place
final_place
final_place
final_place
final_place
final_place
final_place
final_place
##############################
final_place
select
player
from
2-1213511-5
where
position
=
guard
</s>
final_place
final_place
final_place
final_place
final_place
******************************
<s>
select
nationality
from
1-10015132-7
whe

### Explore Data

In [0]:
ii = 0
for text, sql, _ in train_loader:
  print(text.size())
  print(sql.size())
  print(text[0])
  print(sql[0])
  for i in text[0]:
    print(idx2word[int(i.data)])
  for i in sql[0]:
    print(idx2word[int(i.data)]) 
  ii += 1
  if ii == 10:
    break