In [4]:
import torch
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import sys
import numpy
sys.path.append('skip-thoughts.torch/pytorch')
from skipthoughts import UniSkip,BiSkip
import pandas as pd
from Vocabulary import Vocabulary, preprocess
import time
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
plt.style.use('seaborn-paper')

In [5]:
def make_vocab(tokens):
    voc = Vocabulary(['<PAD>','<UNK>'])
    voc.add_tokens(tokens)
    print('vocab len is {}'.format(len(voc.w2idx)))
    return voc

In [6]:
def load_data(file='story_cloze_data/cloze_test_val__spring2016 - cloze_test_ALL_val.csv'):
    df= pd.read_csv(file)
    df = df.drop('InputStoryid',axis=1)
    targets = df['AnswerRightEnding']
    df = df.drop('AnswerRightEnding',axis=1)
    df = df.drop('InputSentence1',axis=1)
    df = df.drop('InputSentence2',axis=1)
    df = df.drop('InputSentence3',axis=1)
    
    voc_str= ''
    for index, row in df.iterrows():
        voc_str+=' '.join(list(row)) + ' '
        
    df['AnswerRightEnding'] = targets
    return df,make_vocab(preprocess(voc_str))

In [24]:
def load_data_train(file='story_cloze_data/ROCStories__spring2016 - ROCStories_spring2016.csv',random_seed=None):
    np.random.RandomState(random_seed)
    df = pd.read_csv(file)
    df = df.drop('storyid',axis=1)
    df = df.drop('storytitle',axis=1)
    df = df.drop('sentence1',axis=1)
    df = df.drop('sentence2',axis=1)
    df = df.drop('sentence3',axis=1)
    target = df['sentence5']
    df = df.drop('sentence5',axis=1)
    
    df = df.rename({'sentence4':'InputSentence4'},axis=1)
    answer1=[]
    answer2=[]
    trueanswer=[]
    for i in range(len(target)):
        k= np.random.randint(0,len(target))
        while k == i:
            k= np.random.randint(0,len(target))
        if np.random.random()<.5:
            answer1.append(target[i])
            answer2.append(target[k])
            trueanswer.append(1)
        else:
            answer1.append(target[k])
            answer2.append(target[i])
            trueanswer.append(2)
    
    df1 = pd.DataFrame({'RandomFifthSentenceQuiz1': answer1,'RandomFifthSentenceQuiz2': answer2})
    df = df.join(df1)
    voc_str= ''
    for index, row in df.iterrows():
        voc_str+=' '.join(list(row)) + ' '
    df['AnswerRightEnding'] = trueanswer
    return df,make_vocab(preprocess(voc_str))

In [8]:
import sys
from collections import OrderedDict 
class LastSentenceDataset(Dataset):
    def __init__(self,file='story_cloze_data/cloze_test_val__spring2016 - cloze_test_ALL_val.csv',vocab=None,df=None,train=None):

        super().__init__()
        
        created_df, created_vocab = load_data(file)
        if df is None:
            df = created_df
        if vocab:
            self.vocab = vocab
        else:
            self.vocab = created_vocab
      
        
        self.dir_st = 'data/skip-thoughts'
        self.biskip = BiSkip(self.dir_st, self.vocab.convert_to_list()[1:])
        
        self.uniskip = UniSkip(self.dir_st, self.vocab.convert_to_list()[1:])
        
        
        self.data = self.make_data(df)
        
        
    def __getitem__(self, idx):
        """
        Args:
            idx
        Returns: skip thought embedding of ending and 0/1 if it is the right ending 

        """
        return self.data[idx]

    def __len__(self):
        """
        Returns len of the dataset
        """
        return len(self.data)
       
    def make_data(self, df):
        data = []
        total = df.index
        print('skip thought encoding dataset')
        for i in total:
            #print(row['RandomFifthSentenceQuiz1'],row['RandomFifthSentenceQuiz2'])
            progress(i,len(total))
            endings =  self.gen_embbeding(df.at[i,'RandomFifthSentenceQuiz1'], 
                                          df.at[i,'RandomFifthSentenceQuiz2'],
                                          df.at[i,'InputSentence4'])
            if df.at[i,'AnswerRightEnding'] == 1:
                data.append((endings[0].detach().numpy(),1))
                data.append((endings[1].detach().numpy(),0))
            else:
                data.append((endings[0].detach().numpy(),0))
                data.append((endings[1].detach().numpy(),1))
        return data
    

    def zero_pad(self,l,n):
        l = (l + n * [0])[:n]
        return l
    
    def pad_input(self,d):
        d = OrderedDict(sorted(d.items(), key=lambda s: len(s[1])))
        for k,v in d.items():
            d[k]= self.zero_pad(v,len(list(d.items())[-1][1]))
        return d
        
    def gen_embbeding(self,sent1,sent2,last_sent):
        d = dict()
        sent1 = preprocess(sent1)
        sent2 = preprocess(sent2)
        ls = preprocess(last_sent)
        #remove random n token that is in one sentence
        if 'n' in sent2:
            sent2.remove('n')
        d['sent1'] = self.vocab.get_sentence(sent1)
        d['sent2'] = self.vocab.get_sentence(sent2)
        d['ls'] = self.vocab.get_sentence(ls)
        d = self.pad_input(d)
        
        batch = torch.LongTensor([d['sent1'],d['sent2'], d['ls']]) 
        top_half = self.uniskip(batch)
        bottom_half = self.biskip(batch)
        combine_skip = torch.cat([top_half,bottom_half],dim=1)
        end1 = combine_skip[0]
        end2 = combine_skip[1]
        ls = combine_skip[2]
        
        #print(end1[:20],end2[:20])
        end1.add_(ls)
        end2.add_(ls)
        #print('ls',ls[:20])
        #print('after',end1[:20])
        return end1,end2    
    
def progress(count, total, status=''):
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))

    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()

In [25]:
class LSModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.hd1 = torch.nn.Linear(4800, 2400)
        self.hd2 = torch.nn.Linear(2400, 1200)
        self.hd3 = torch.nn.Linear(1200, 600)
        self.output = torch.nn.Linear(600, 2)

    def forward(self, x):
        x = torch.nn.functional.relu(self.hd1(x))
        x = torch.nn.functional.relu(self.hd2(x))
        x = torch.nn.functional.relu(self.hd3(x))
        x = self.output(x)
        #print('output',x)
        return x

In [10]:
class LSShortModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        self.input = torch.nn.Linear(4800,256)
        self.hidden= torch.nn.Linear(256,64)
        self.output = torch.nn.Linear(64,2)
        
    def forward(self, inputs):
        hidden = torch.nn.functional.relu(self.input(inputs))
        hidden1 = torch.nn.functional.relu(self.hidden(hidden))
        output = self.output(hidden1)
        return output

In [28]:
def score(pair,model):
    '''true if model predicts right'''
    ending1, ending2 = pair
    if ending1[1] == 1:
        target = 1
    else:
        target =  2 
        
    ending1 = torch.tensor(ending1[0])
    ending2 = torch.tensor(ending2[0])
    res1 = model(ending1)
    res2 = model(ending2)
    softm = torch.nn.Softmax(dim=0)
    prob_end1_right = softm(res1)[1].item() 
    prob_end2_right = softm(res2)[1].item()
    
    if prob_end1_right > prob_end2_right:
        pred = 1
    else:
        pred = 2
    
    if pred == target:
        return True
    else: 
        #print(prob_end1_right,prob_end2_right,pred,target)
        return False


In [29]:
def compute_accuracy(data_set,model):
    num_right = 0
    for i in range(0,len(data_set),2):
        if score((data_set[i],data_set[i+1]),model):
            num_right+=1
    return num_right / (len(data_set)/2)


      

In [27]:
def save(name):
    torch.save(model.state_dict(), 'saved_models/{}.save'.format(name))

In [167]:
from sklearn.model_selection import train_test_split
df, voc = load_data(file='story_cloze_data/cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv')
train, val = train_test_split(df, test_size=0.1,shuffle=False)

vocab len is 4851


In [30]:
from sklearn.model_selection import train_test_split
df, voc = load_data_train()
train, val = train_test_split(df, test_size=0.1,shuffle=False)

vocab len is 22443


In [31]:
batch_size = 128
lr = 0.01
num_epochs = 20
report_every = 1

In [32]:
train_data_set = LastSentenceDataset(df=train, vocab=voc)
data_loader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size, shuffle=False)

vocab len is 5303
skip thought encoding dataset

In [17]:
voc.unk_ratio()

'1  out of 26004 ids generated were for unk(0.004 %)'

In [33]:
val_data_set = LastSentenceDataset(df=val,vocab=voc)

vocab len is 5303
skip thought encoding dataset

In [34]:
model = LSModel()
if torch.cuda.is_available():
    model = model.cuda()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [35]:
tick = time.time()
name='LS_new_4_26_train_ep'
epoch_losses = []
epoch_accs = []
best_score= 0.5
for epoch_num in range(1, num_epochs + 1):
    batch_losses = []
    for i, batch in enumerate(data_loader):
        #print(batch)
        ### YOUR CODE BELOW ###
        # Zero the gradients
        optimizer.zero_grad()
        # Extract the inputs and the targets
        inputs, targets = batch
        # Transfer the inputs and the targets to GPUs, if available
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            targets = targets.cuda()
        # Run the model
        outputs = model(inputs)

        # Compute the loss
        loss = criterion(outputs,targets)
        
        
        # Backpropagate the error
        loss.backward()
        # Update the parameters
        optimizer.step()
        
        # Append the loss
        batch_losses.append(float (loss))
        ### YOUR CODE ABOVE ###
        epoch_loss = np.mean(np.array(batch_losses))
        epoch_losses.append(epoch_loss)
        
        acc = compute_accuracy(val_data_set,model)
        epoch_accs.append(acc)
        

        if epoch_num % report_every == 0:
            tock = time.time()
            print("Epoch {}. Loss {:.4f}. accuracy {:.4f}. Elapsed {:.0f} seconds".format(epoch_num, epoch_loss,acc, tock-tick))
    epoch_acc = np.mean(np.array(epoch_accs))
    print('ep acc',epoch_acc)
    if epoch_acc > best_score:
            best_score = epoch_acc
            print('new best saving model')
            save('{}{}_best_acc_{:.2f}'.format(name,epoch_num,epoch_acc))
     
    #print('saving model')
    #save(name + str(epoch_num))

print("Total time elapsed: {:.0f} minutes".format((tock-tick)/60))

Epoch 1. Loss 0.6938. accuracy 0.4938. Elapsed 31 seconds
Epoch 1. Loss 11.2370. accuracy 0.4938. Elapsed 62 seconds
Epoch 1. Loss 67.9420. accuracy 0.5048. Elapsed 93 seconds
Epoch 1. Loss 51.9454. accuracy 0.5130. Elapsed 123 seconds
Epoch 1. Loss 42.3841. accuracy 0.5053. Elapsed 154 seconds
Epoch 1. Loss 35.6061. accuracy 0.4916. Elapsed 184 seconds
Epoch 1. Loss 30.6209. accuracy 0.5048. Elapsed 214 seconds
Epoch 1. Loss 26.8877. accuracy 0.5165. Elapsed 245 seconds
Epoch 1. Loss 24.0031. accuracy 0.5169. Elapsed 275 seconds
Epoch 1. Loss 21.6983. accuracy 0.5358. Elapsed 305 seconds
Epoch 1. Loss 19.7882. accuracy 0.5407. Elapsed 336 seconds
Epoch 1. Loss 18.2169. accuracy 0.5433. Elapsed 366 seconds
Epoch 1. Loss 16.8700. accuracy 0.5347. Elapsed 397 seconds
Epoch 1. Loss 15.7164. accuracy 0.5290. Elapsed 427 seconds
Epoch 1. Loss 14.7160. accuracy 0.5352. Elapsed 457 seconds
Epoch 1. Loss 13.8391. accuracy 0.5475. Elapsed 488 seconds
Epoch 1. Loss 13.0657. accuracy 0.5398. Elap

Epoch 1. Loss 2.2201. accuracy 0.5840. Elapsed 4401 seconds
Epoch 1. Loss 2.2091. accuracy 0.5736. Elapsed 4435 seconds
Epoch 1. Loss 2.1982. accuracy 0.5714. Elapsed 4467 seconds
Epoch 1. Loss 2.1875. accuracy 0.5730. Elapsed 4500 seconds
Epoch 1. Loss 2.1770. accuracy 0.5778. Elapsed 4533 seconds
Epoch 1. Loss 2.1666. accuracy 0.5771. Elapsed 4565 seconds
Epoch 1. Loss 2.1562. accuracy 0.5776. Elapsed 4600 seconds
Epoch 1. Loss 2.1461. accuracy 0.5776. Elapsed 4632 seconds
Epoch 1. Loss 2.1361. accuracy 0.5813. Elapsed 4665 seconds
Epoch 1. Loss 2.1263. accuracy 0.5818. Elapsed 4698 seconds
Epoch 1. Loss 2.1166. accuracy 0.5851. Elapsed 4730 seconds
Epoch 1. Loss 2.1070. accuracy 0.5714. Elapsed 4761 seconds
Epoch 1. Loss 2.0976. accuracy 0.5697. Elapsed 4794 seconds
Epoch 1. Loss 2.0881. accuracy 0.5668. Elapsed 4827 seconds
Epoch 1. Loss 2.0788. accuracy 0.5800. Elapsed 4859 seconds
Epoch 1. Loss 2.0696. accuracy 0.5789. Elapsed 4892 seconds
Epoch 1. Loss 2.0606. accuracy 0.5752. E

Epoch 1. Loss 1.4564. accuracy 0.5774. Elapsed 8627 seconds
Epoch 1. Loss 1.4536. accuracy 0.5778. Elapsed 8657 seconds
Epoch 1. Loss 1.4510. accuracy 0.5782. Elapsed 8686 seconds
Epoch 1. Loss 1.4482. accuracy 0.5798. Elapsed 8716 seconds
Epoch 1. Loss 1.4454. accuracy 0.5824. Elapsed 8746 seconds
Epoch 1. Loss 1.4427. accuracy 0.5855. Elapsed 8775 seconds
Epoch 1. Loss 1.4400. accuracy 0.5844. Elapsed 8805 seconds
Epoch 1. Loss 1.4374. accuracy 0.5842. Elapsed 8835 seconds
Epoch 1. Loss 1.4347. accuracy 0.5826. Elapsed 8864 seconds
Epoch 1. Loss 1.4321. accuracy 0.5815. Elapsed 8894 seconds
Epoch 1. Loss 1.4295. accuracy 0.5837. Elapsed 8923 seconds
Epoch 1. Loss 1.4269. accuracy 0.5815. Elapsed 8953 seconds
Epoch 1. Loss 1.4243. accuracy 0.5820. Elapsed 8983 seconds
Epoch 1. Loss 1.4218. accuracy 0.5846. Elapsed 9013 seconds
Epoch 1. Loss 1.4192. accuracy 0.5844. Elapsed 9042 seconds
Epoch 1. Loss 1.4166. accuracy 0.5837. Elapsed 9072 seconds
Epoch 1. Loss 1.4141. accuracy 0.5844. E

Epoch 1. Loss 1.1989. accuracy 0.6488. Elapsed 12823 seconds
Epoch 1. Loss 1.1977. accuracy 0.6418. Elapsed 12853 seconds
Epoch 1. Loss 1.1963. accuracy 0.6433. Elapsed 12884 seconds
Epoch 1. Loss 1.1950. accuracy 0.6380. Elapsed 12915 seconds
Epoch 1. Loss 1.1937. accuracy 0.6380. Elapsed 12946 seconds
Epoch 1. Loss 1.1925. accuracy 0.6367. Elapsed 12977 seconds
Epoch 1. Loss 1.1912. accuracy 0.6402. Elapsed 13008 seconds
Epoch 1. Loss 1.1900. accuracy 0.6446. Elapsed 13039 seconds
Epoch 1. Loss 1.1888. accuracy 0.6426. Elapsed 13069 seconds
Epoch 1. Loss 1.1876. accuracy 0.6440. Elapsed 13099 seconds
Epoch 1. Loss 1.1864. accuracy 0.6457. Elapsed 13128 seconds
Epoch 1. Loss 1.1851. accuracy 0.6488. Elapsed 13158 seconds
Epoch 1. Loss 1.1839. accuracy 0.6538. Elapsed 13188 seconds
Epoch 1. Loss 1.1827. accuracy 0.6567. Elapsed 13218 seconds
Epoch 1. Loss 1.1816. accuracy 0.6565. Elapsed 13247 seconds
Epoch 1. Loss 1.1804. accuracy 0.6563. Elapsed 13277 seconds
Epoch 1. Loss 1.1792. ac

KeyboardInterrupt: 

In [22]:

#train_data_set = None
test_data_set = LastSentenceDataset(file='story_cloze_data/cloze_test_test__spring2016 - cloze_test_ALL_test.csv',vocab=voc)

vocab len is 5226
skip thought encoding dataset

In [151]:
def load(name):
    l_model = LSModel()
    l_model.load_state_dict(torch.load('saved_models/{}.save'.format(name)))
    return l_model
model = load('LS_new_4_25_voc_fix_2_ep7_best_acc_0.70')
#print('model',model)

In [None]:
compute_accuracy(train_data_set,model)

In [36]:
compute_accuracy(test_data_set,model)

0.5344735435595938