In [3]:
import torch
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import sys
sys.path.append('skip-thoughts.torch/pytorch')
from skipthoughts import UniSkip,BiSkip
import pandas as pd
from Vocabulary import Vocabulary, preprocess

In [4]:
def make_vocab(tokens):
    voc = Vocabulary(['<PAD>','<UNK>'])
    voc.add_tokens(tokens)
    print('vocab len is {}'.format(len(voc.w2idx)))
    return voc

In [5]:
def load_data(file='story_cloze_data/cloze_test_val__spring2016 - cloze_test_ALL_val.csv'):
    df= pd.read_csv(file)
    df = df.drop('InputStoryid',axis=1)
    targets = df['AnswerRightEnding']
    df = df.drop('AnswerRightEnding',axis=1)
    df = df.drop('InputSentence1',axis=1)
    df = df.drop('InputSentence2',axis=1)
    df = df.drop('InputSentence3',axis=1)
    
    voc_str= ''
    for index, row in df.iterrows():
        voc_str+=' '.join(list(row)) + ' '
        
    df['AnswerRightEnding'] = targets
    return df,make_vocab(preprocess(voc_str))

In [6]:
import sys
from collections import OrderedDict 
class LastSentenceDataset(Dataset):
    def __init__(self,file='story_cloze_data/cloze_test_val__spring2016 - cloze_test_ALL_val.csv',vocab=None,df=None):

        super().__init__()
        
        created_df, created_vocab = load_data(file)
        if df is None:
            df = created_df
        if vocab:
            self.vocab = vocab
        else:
            self.vocab = created_vocab
      
        
        self.dir_st = 'data/skip-thoughts'
        self.biskip = BiSkip(self.dir_st, self.vocab.convert_to_list())
        
        self.uniskip = UniSkip(self.dir_st, self.vocab.convert_to_list())
        
        
        self.data = self.make_data(df)
        
        
    def __getitem__(self, idx):
        """
        Args:
            idx
        Returns: skip thought embedding of ending and 0/1 if it is the right ending 

        """
        return self.data[idx]

    def __len__(self):
        """
        Returns len of the dataset
        """
        return len(self.data)
       
    def make_data(self, df):
        data = []
        total = df.index
        print('skip thought encoding dataset')
        for i in total:
            #print(row['RandomFifthSentenceQuiz1'],row['RandomFifthSentenceQuiz2'])
            progress(i,len(total))
            endings =  self.gen_embbeding(df.at[i,'RandomFifthSentenceQuiz1'], 
                                          df.at[i,'RandomFifthSentenceQuiz2'],
                                          df.at[i,'InputSentence4'])
            if df.at[i,'AnswerRightEnding'] == 1:
                data.append((endings[0],1))
                data.append((endings[1],0))
            else:
                data.append((endings[0],0))
                data.append((endings[1],1))
        return data
    

    def zero_pad(self,l,n):
        l = (l + n * [0])[:n]
        return l
    
    def pad_input(self,d):
        d = OrderedDict(sorted(d.items(), key=lambda s: len(s[1])))
        for k,v in d.items():
            d[k]= self.zero_pad(v,len(list(d.items())[-1][1]))
        return d
        
    def gen_embbeding(self,sent1,sent2,last_sent):
        d = dict()
        sent1 = preprocess(sent1)
        sent2 = preprocess(sent2)
        ls = preprocess(last_sent)
        #remove random n token that is in one sentence
        if 'n' in sent2:
            sent2.remove('n')
        d['sent1'] = self.vocab.get_sentence(sent1)
        d['sent2'] = self.vocab.get_sentence(sent2)
        d['ls'] = self.vocab.get_sentence(ls)
        d = self.pad_input(d)
        
        batch = torch.LongTensor([d['sent1'],d['sent2'], d['ls']]) 
        top_half = self.uniskip(batch)
        bottom_half = self.biskip(batch)
        combine_skip = torch.cat([top_half,bottom_half],dim=1)
        end1 = combine_skip[0]
        end2 = combine_skip[1]
        ls = combine_skip[2]
        
        #print(end1[:20],end2[:20])
        end1.add_(ls)
        end2.add_(ls)
        #print('ls',ls[:20])
        #print('after',end1[:20])
        return end1,end2    
    
def progress(count, total, status=''):
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))

    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()

In [14]:
class LSModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.hd1 = torch.nn.Linear(4800, 2400)
        self.hd2 = torch.nn.Linear(2400, 1200)
        self.hd3 = torch.nn.Linear(1200, 600)
        self.output = torch.nn.Linear(600, 2)

    def forward(self, x):
        x = torch.nn.functional.relu(self.hd1(x))
        x = torch.nn.functional.relu(self.hd2(x))
        x = torch.nn.functional.relu(self.hd3(x))
        x = self.output(x)
        #print('output',x)
        return x

In [8]:
def score(pair,model):
    '''true if model predicts right'''
    ending1, ending2 = pair
    if ending1[1] == 1:
        target = 1
    else:
        target =  2 
        
    ending1 = ending1[0]
    ending2 = ending2[0]
    if torch.cuda.is_available():
        model = model.cuda()
        ending1 = ending1.cuda()
        ending2 = ending2.cuda()
    res1 = model(ending1)
    res2 = model(ending2)
    softm = torch.nn.Softmax(dim=0)
    prob_end1_right = softm(res1)[1].item() 
    prob_end2_right = softm(res2)[1].item()
    
    if prob_end1_right > prob_end2_right:
        pred = 1
    else:
        pred = 2
    
    if pred == target:
        return True
    else: 
        #print(prob_end1_right,prob_end2_right,pred,target)
        return False
   

In [9]:
def compute_accuracy(data_set):
    num_right = 0
    for i in range(0,len(data_set),2):
        if score((data_set[i],data_set[i+1]),model):
            num_right+=1
    return num_right / (len(data_set)/2)
    

In [8]:
def save(name):
    torch.save(model.state_dict(), 'saved_models/{}.save'.format(name))

In [11]:
from sklearn.model_selection import train_test_split
df, voc = load_data()
train, val = train_test_split(df, test_size=0.1,shuffle=False)

vocab len is 5303


In [10]:
import time
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
plt.style.use('seaborn-paper')
batch_size = 128
lr = 0.01
num_epochs = 10
report_every = 1

In [11]:
train_data_set = LastSentenceDataset(df=train,vocab=voc)
data_loader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size, shuffle=True,num_workers = 0)

vocab len is 5303


  "num_layers={}".format(dropout, num_layers))


skip thought encoding dataset

In [12]:
val_data_set = LastSentenceDataset(file='story_cloze_data/cloze_test_val__spring2016 - cloze_test_ALL_val.csv',df=val)

vocab len is 5303
skip thought encoding dataset

In [13]:
model = LSModel()
if torch.cuda.is_available():
    model = model.cuda()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)

In [14]:
tick = time.time()
name='LS_adagrad'
epoch_losses = []
best_score= 0.5
for epoch_num in range(1, num_epochs + 1):
    batch_losses = []
    for i, batch in enumerate(data_loader):
        ### YOUR CODE BELOW ###
        # Zero the gradients
        optimizer.zero_grad()
        # Extract the inputs and the targets
        inputs, targets = batch
        # Transfer the inputs and the targets to GPUs, if available
        if torch.cuda.is_available():
            inputs = torch.FloatTensor(inputs.float()).cuda()
            targets = torch.LongTensor(targets).cuda()
        # Run the model
        outputs = model(inputs)

        # Compute the loss
        loss = criterion(outputs,targets)
        
        
        # Backpropagate the error
        loss.backward(retain_graph=True)
        # Update the parameters
        optimizer.step()
        
        # Append the loss
        batch_losses.append(float (loss))
        ### YOUR CODE ABOVE ###
        epoch_loss = np.mean(np.array(batch_losses))
        epoch_losses.append(epoch_loss)
        
        acc = compute_accuracy(val_data_set)
        if acc > best_score:
            best_score = acc
            print('new best saving model')
            save('{}{}_best_acc_{:.2f}'.format(name,epoch_num,acc))

        if epoch_num % report_every == 0:
            tock = time.time()
            print("Epoch {}. Loss {:.4f}. accuracy {:.4f}. Elapsed {:.0f} seconds".format(epoch_num, epoch_loss,acc, tock-tick))
    print('saving model')
    save(name + str(epoch_num))

print("Total time elapsed: {:.0f} minutes".format((tock-tick)/60))

Epoch 1. Loss 0.6921. accuracy 0.4521. Elapsed 57 seconds
Epoch 1. Loss 46.2648. accuracy 0.4521. Elapsed 111 seconds
Epoch 1. Loss 84.0700. accuracy 0.4894. Elapsed 167 seconds
new best saving model
Epoch 1. Loss 66.0568. accuracy 0.5479. Elapsed 226 seconds
Epoch 1. Loss 52.9952. accuracy 0.4521. Elapsed 283 seconds
new best saving model
Epoch 1. Loss 44.3533. accuracy 0.5532. Elapsed 344 seconds
Epoch 1. Loss 38.2443. accuracy 0.4947. Elapsed 401 seconds
Epoch 1. Loss 33.5493. accuracy 0.4734. Elapsed 459 seconds
Epoch 1. Loss 29.9440. accuracy 0.4681. Elapsed 517 seconds
Epoch 1. Loss 27.0190. accuracy 0.5053. Elapsed 573 seconds
Epoch 1. Loss 24.7128. accuracy 0.4734. Elapsed 630 seconds
Epoch 1. Loss 22.7253. accuracy 0.5426. Elapsed 687 seconds
Epoch 1. Loss 21.0309. accuracy 0.4734. Elapsed 744 seconds
Epoch 1. Loss 19.5800. accuracy 0.5479. Elapsed 802 seconds
new best saving model
Epoch 1. Loss 18.3227. accuracy 0.5851. Elapsed 863 seconds
Epoch 1. Loss 17.2206. accuracy 0.58

Epoch 5. Loss 0.6728. accuracy 0.5798. Elapsed 7746 seconds
Epoch 5. Loss 0.6739. accuracy 0.6117. Elapsed 7763 seconds
saving model
Epoch 6. Loss 0.6546. accuracy 0.5851. Elapsed 7825 seconds
Epoch 6. Loss 0.6536. accuracy 0.6170. Elapsed 7884 seconds
Epoch 6. Loss 0.6537. accuracy 0.6011. Elapsed 7941 seconds
Epoch 6. Loss 0.6509. accuracy 0.5957. Elapsed 8000 seconds
Epoch 6. Loss 0.6455. accuracy 0.5957. Elapsed 8059 seconds
Epoch 6. Loss 0.6388. accuracy 0.6011. Elapsed 8116 seconds
Epoch 6. Loss 0.6453. accuracy 0.5957. Elapsed 8173 seconds
Epoch 6. Loss 0.6501. accuracy 0.6277. Elapsed 8230 seconds
Epoch 6. Loss 0.6556. accuracy 0.5798. Elapsed 8289 seconds
Epoch 6. Loss 0.6570. accuracy 0.5957. Elapsed 8346 seconds
Epoch 6. Loss 0.6579. accuracy 0.5904. Elapsed 8405 seconds
Epoch 6. Loss 0.6576. accuracy 0.5798. Elapsed 8464 seconds
Epoch 6. Loss 0.6535. accuracy 0.5904. Elapsed 8520 seconds
Epoch 6. Loss 0.6514. accuracy 0.5691. Elapsed 8576 seconds
Epoch 6. Loss 0.6535. accur

OSError: [Errno 28] No space left on device

In [12]:

train_data_set = None
test_data_set = LastSentenceDataset(file='story_cloze_data/cloze_test_test__spring2016 - cloze_test_ALL_test.csv',vocab=voc)

vocab len is 5226


  "num_layers={}".format(dropout, num_layers))


skip thought encoding dataset

In [25]:
def load(name):
    l_model = LSModel()
    l_model.load_state_dict(torch.load('saved_models/{}.save'.format(name)))
    return l_model
#model = load('LS_adagrad5')
#print('model',model)

In [26]:
compute_accuracy(test_data_set)

0.5793693212185996