In [8]:
import torch
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import sys
import numpy
sys.path.append('skip-thoughts.torch/pytorch')
from skipthoughts import UniSkip,BiSkip
import pandas as pd
from Vocabulary import Vocabulary, preprocess

In [9]:
def make_vocab(tokens):
    voc = Vocabulary(['<PAD>','<UNK>'])
    voc.add_tokens(tokens)
    print('vocab len is {}'.format(len(voc.w2idx)))
    return voc

In [10]:
def load_data(file='story_cloze_data/cloze_test_val__spring2016 - cloze_test_ALL_val.csv'):
    df= pd.read_csv(file)
    df=df.head(500)
    df = df.drop('InputStoryid',axis=1)
    targets = df['AnswerRightEnding']
    df = df.drop('AnswerRightEnding',axis=1)
    df = df.drop('InputSentence1',axis=1)
    df = df.drop('InputSentence2',axis=1)
    df = df.drop('InputSentence3',axis=1)
    
    voc_str= ''
    for index, row in df.iterrows():
        voc_str+=' '.join(list(row)) + ' '
        
    df['AnswerRightEnding'] = targets
    return df,make_vocab(preprocess(voc_str))

In [11]:
import sys
from collections import OrderedDict 
class LastSentenceDataset(Dataset):
    def __init__(self,file='story_cloze_data/cloze_test_val__spring2016 - cloze_test_ALL_val.csv',vocab=None,df=None):

        super().__init__()
        
        created_df, created_vocab = load_data(file)
        if df is None:
            df = created_df
        if vocab:
            self.vocab = vocab
        else:
            self.vocab = created_vocab
      
        
        self.dir_st = 'data/skip-thoughts'
        self.biskip = BiSkip(self.dir_st, self.vocab.convert_to_list())
        
        self.uniskip = UniSkip(self.dir_st, self.vocab.convert_to_list())
        
        
        self.data = self.make_data(df)
        
        
    def __getitem__(self, idx):
        """
        Args:
            idx
        Returns: skip thought embedding of ending and 0/1 if it is the right ending 

        """
        return self.data[idx]

    def __len__(self):
        """
        Returns len of the dataset
        """
        return len(self.data)
       
    def make_data(self, df):
        data = []
        total = df.index
        print('skip thought encoding dataset')
        for i in total:
            #print(row['RandomFifthSentenceQuiz1'],row['RandomFifthSentenceQuiz2'])
            progress(i,len(total))
            endings =  self.gen_embbeding(df.at[i,'RandomFifthSentenceQuiz1'], 
                                          df.at[i,'RandomFifthSentenceQuiz2'],
                                          df.at[i,'InputSentence4'])
            if df.at[i,'AnswerRightEnding'] == 1:
                data.append((endings[0].detach().numpy(),1))
                data.append((endings[1].detach().numpy(),0))
            else:
                data.append((endings[0].detach().numpy(),0))
                data.append((endings[1].detach().numpy(),1))
        return data
    

    def zero_pad(self,l,n):
        l = (l + n * [0])[:n]
        return l
    
    def pad_input(self,d):
        d = OrderedDict(sorted(d.items(), key=lambda s: len(s[1])))
        for k,v in d.items():
            d[k]= self.zero_pad(v,len(list(d.items())[-1][1]))
        return d
        
    def gen_embbeding(self,sent1,sent2,last_sent):
        d = dict()
        sent1 = preprocess(sent1)
        sent2 = preprocess(sent2)
        ls = preprocess(last_sent)
        #remove random n token that is in one sentence
        if 'n' in sent2:
            sent2.remove('n')
        d['sent1'] = self.vocab.get_sentence(sent1)
        d['sent2'] = self.vocab.get_sentence(sent2)
        d['ls'] = self.vocab.get_sentence(ls)
        d = self.pad_input(d)
        
        batch = torch.LongTensor([d['sent1'],d['sent2'], d['ls']]) 
        top_half = self.uniskip(batch)
        bottom_half = self.biskip(batch)
        combine_skip = torch.cat([top_half,bottom_half],dim=1)
        end1 = combine_skip[0]
        end2 = combine_skip[1]
        ls = combine_skip[2]
        
        #print(end1[:20],end2[:20])
        end1.add_(ls)
        end2.add_(ls)
        #print('ls',ls[:20])
        #print('after',end1[:20])
        return end1,end2    
    
def progress(count, total, status=''):
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))

    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()

In [12]:
class LSModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.hd1 = torch.nn.Linear(4800, 2400)
        self.hd2 = torch.nn.Linear(2400, 1200)
        self.hd3 = torch.nn.Linear(1200, 600)
        self.output = torch.nn.Linear(600, 2)

    def forward(self, x):
        x = torch.nn.functional.relu(self.hd1(x))
        x = torch.nn.functional.relu(self.hd2(x))
        x = torch.nn.functional.relu(self.hd3(x))
        x = self.output(x)
        #print('output',x)
        return x

In [32]:
def score(pair,model):
    '''true if model predicts right'''
    ending1, ending2 = pair
    if ending1[1] == 1:
        target = 1
    else:
        target =  2 
        
    ending1 = torch.tensor(ending1[0]).cuda()
    ending2 = torch.tensor(ending2[0]).cuda()
    res1 = model(ending1)
    res2 = model(ending2)
    softm = torch.nn.Softmax(dim=0)
    prob_end1_right = softm(res1)[1].item() 
    prob_end2_right = softm(res2)[1].item()
    
    if prob_end1_right > prob_end2_right:
        pred = 1
    else:
        pred = 2
    
    if pred == target:
        return True
    else: 
        #print(prob_end1_right,prob_end2_right,pred,target)
        return False
def score2(pair,clf):
    '''true if model predicts right'''
    ending1, ending2 = pair
    if ending1[1] == 1:
        target = 1
    else:
        target =  2 
        
    ending1 = ending1[0]
    ending2 = ending2[0]
    prob_end1_right = clf.predict_proba([ending1.detach().numpy()])[0][1]
    prob_end2_right = clf.predict_proba([ending2.detach().numpy()])[0][1]
    
    if prob_end1_right > prob_end2_right:
        pred = 1
    else:
        pred = 2
    
    if pred == target:
        return True
    else: 
        #print(prob_end1_right,prob_end2_right,pred,target)
        return False
   

In [14]:
def compute_accuracy(data_set,model):
    num_right = 0
    for i in range(0,len(data_set),2):
        if score((data_set[i],data_set[i+1]),model):
            num_right+=1
    return num_right / (len(data_set)/2)


      

In [15]:
def save(name):
    torch.save(model.state_dict(), 'saved_models/{}.save'.format(name))

In [16]:
from sklearn.model_selection import train_test_split
df, voc = load_data()
train, val = train_test_split(df, test_size=0.1,shuffle=False)

vocab len is 2522


In [35]:
import time
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
plt.style.use('seaborn-paper')
batch_size = 128
lr = 0.01
num_epochs = 100
report_every = 1

In [18]:
train_data_set = LastSentenceDataset(df=train,vocab=voc)
data_loader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size, shuffle=True,num_workers = 0)

vocab len is 2522


  "num_layers={}".format(dropout, num_layers))


skip thought encoding dataset

In [19]:
val_data_set = LastSentenceDataset(file='story_cloze_data/cloze_test_val__spring2016 - cloze_test_ALL_val.csv',df=val)

vocab len is 2522
skip thought encoding dataset

(3366, 4800)

SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [36]:
model = LSModel()
if torch.cuda.is_available():
    model = model.cuda()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)

In [37]:
tick = time.time()
name='LS_adagrad_no_retrain'
epoch_losses = []
best_score= 0.5
for epoch_num in range(1, num_epochs + 1):
    batch_losses = []
    for i, batch in enumerate(data_loader):
        #print(batch)
        ### YOUR CODE BELOW ###
        # Zero the gradients
        optimizer.zero_grad()
        # Extract the inputs and the targets
        inputs, targets = batch
        # Transfer the inputs and the targets to GPUs, if available
        if torch.cuda.is_available():
            inputs =inputs.cuda()
            targets = targets.cuda()
        # Run the model
        outputs = model(inputs)

        # Compute the loss
        loss = criterion(outputs,targets)
        
        
        # Backpropagate the error
        loss.backward()
        # Update the parameters
        optimizer.step()
        
        # Append the loss
        batch_losses.append(float (loss))
        ### YOUR CODE ABOVE ###
        epoch_loss = np.mean(np.array(batch_losses))
        epoch_losses.append(epoch_loss)
        
        acc = compute_accuracy(val_data_set,model)
        if acc > best_score:
            best_score = acc
            print('new best saving model')
            save('{}{}_best_acc_{:.2f}'.format(name,epoch_num,acc))

        if epoch_num % report_every == 0:
            tock = time.time()
            print("Epoch {}. Loss {:.4f}. accuracy {:.4f}. Elapsed {:.0f} seconds".format(epoch_num, epoch_loss,acc, tock-tick))
    print('saving model')
    save(name + str(epoch_num))

print("Total time elapsed: {:.0f} minutes".format((tock-tick)/60))

Epoch 1. Loss 0.6908. accuracy 0.4200. Elapsed 0 seconds
Epoch 1. Loss 126.6485. accuracy 0.4200. Elapsed 0 seconds
Epoch 1. Loss 115.3008. accuracy 0.4200. Elapsed 0 seconds
Epoch 1. Loss 90.0425. accuracy 0.4800. Elapsed 0 seconds
new best saving model
Epoch 1. Loss 72.1765. accuracy 0.5200. Elapsed 3 seconds
Epoch 1. Loss 61.1259. accuracy 0.5000. Elapsed 3 seconds
new best saving model
Epoch 1. Loss 52.6199. accuracy 0.6000. Elapsed 6 seconds
Epoch 1. Loss 46.1079. accuracy 0.5200. Elapsed 6 seconds
saving model
Epoch 2. Loss 1.4966. accuracy 0.4600. Elapsed 9 seconds
Epoch 2. Loss 1.1161. accuracy 0.5400. Elapsed 9 seconds
Epoch 2. Loss 1.1062. accuracy 0.5000. Elapsed 9 seconds
Epoch 2. Loss 1.0116. accuracy 0.4600. Elapsed 10 seconds
Epoch 2. Loss 0.9504. accuracy 0.4800. Elapsed 10 seconds
Epoch 2. Loss 0.9612. accuracy 0.6000. Elapsed 10 seconds
Epoch 2. Loss 0.9313. accuracy 0.5800. Elapsed 10 seconds
Epoch 2. Loss 0.8972. accuracy 0.5200. Elapsed 10 seconds
saving model
Epoc

Epoch 18. Loss 0.3758. accuracy 0.5600. Elapsed 66 seconds
Epoch 18. Loss 0.4184. accuracy 0.5600. Elapsed 66 seconds
Epoch 18. Loss 0.4281. accuracy 0.6000. Elapsed 66 seconds
Epoch 18. Loss 0.3917. accuracy 0.6000. Elapsed 66 seconds
Epoch 18. Loss 0.3632. accuracy 0.6000. Elapsed 67 seconds
Epoch 18. Loss 0.3374. accuracy 0.6000. Elapsed 67 seconds
Epoch 18. Loss 0.3314. accuracy 0.5400. Elapsed 67 seconds
Epoch 18. Loss 0.3296. accuracy 0.5600. Elapsed 67 seconds
saving model
Epoch 19. Loss 0.3014. accuracy 0.5800. Elapsed 70 seconds
Epoch 19. Loss 0.3415. accuracy 0.6400. Elapsed 70 seconds
Epoch 19. Loss 0.3293. accuracy 0.6200. Elapsed 70 seconds
Epoch 19. Loss 0.3084. accuracy 0.6200. Elapsed 70 seconds
Epoch 19. Loss 0.2889. accuracy 0.6400. Elapsed 70 seconds
Epoch 19. Loss 0.2764. accuracy 0.6400. Elapsed 70 seconds
Epoch 19. Loss 0.2712. accuracy 0.5800. Elapsed 70 seconds
Epoch 19. Loss 0.2790. accuracy 0.5600. Elapsed 70 seconds
saving model
Epoch 20. Loss 0.3060. accurac

KeyboardInterrupt: 

In [12]:

train_data_set = None
test_data_set = LastSentenceDataset(file='story_cloze_data/cloze_test_test__spring2016 - cloze_test_ALL_test.csv',vocab=voc)

vocab len is 5226


  "num_layers={}".format(dropout, num_layers))


skip thought encoding dataset

In [25]:
def load(name):
    l_model = LSModel()
    l_model.load_state_dict(torch.load('saved_models/{}.save'.format(name)))
    return l_model
#model = load('LS_adagrad5')
#print('model',model)

In [26]:
compute_accuracy(test_data_set)

0.5793693212185996