In [1]:
import torch
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import sys
import numpy
sys.path.append('skip-thoughts.torch/pytorch')
from skipthoughts import UniSkip,BiSkip
import pandas as pd
from Vocabulary import Vocabulary, preprocess
import time
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
plt.style.use('seaborn-paper')

In [2]:
def make_vocab(tokens):
    voc = Vocabulary(['<PAD>','<UNK>'])
    voc.add_tokens(tokens)
    print('vocab len is {}'.format(len(voc.w2idx)))
    return voc

In [3]:
def load_data(file='story_cloze_data/cloze_test_val__spring2016 - cloze_test_ALL_val.csv'):
    df= pd.read_csv(file)
    df = df.drop('InputStoryid',axis=1)
    targets = df['AnswerRightEnding']
    df = df.drop('AnswerRightEnding',axis=1)
    df = df.drop('InputSentence1',axis=1)
    df = df.drop('InputSentence2',axis=1)
    df = df.drop('InputSentence3',axis=1)
    
    voc_str= ''
    for index, row in df.iterrows():
        voc_str+=' '.join(list(row)) + ' '
        
    df['AnswerRightEnding'] = targets
    return df,make_vocab(preprocess(voc_str))

In [4]:
def load_data_train(file='story_cloze_data/ROCStories__spring2016 - ROCStories_spring2016.csv',random_seed=None):
    np.random.RandomState(random_seed)
    df = pd.read_csv(file)
    df = df.drop('storyid',axis=1)
    df = df.drop('storytitle',axis=1)
    df = df.drop('sentence1',axis=1)
    df = df.drop('sentence2',axis=1)
    df = df.drop('sentence3',axis=1)
    target = df['sentence5']
    df = df.drop('sentence5',axis=1)
    
    df = df.rename({'sentence4':'InputSentence4'},axis=1)
    answer1=[]
    answer2=[]
    trueanswer=[]
    for i in range(len(target)):
        k= np.random.randint(0,len(target))
        while k == i:
            k= np.random.randint(0,len(target))
        if np.random.random()<.5:
            answer1.append(target[i])
            answer2.append(target[k])
            trueanswer.append(1)
        else:
            answer1.append(target[k])
            answer2.append(target[i])
            trueanswer.append(2)
    
    df1 = pd.DataFrame({'RandomFifthSentenceQuiz1': answer1,'RandomFifthSentenceQuiz2': answer2})
    df = df.join(df1)
    voc_str= ''
    for index, row in df.iterrows():
        voc_str+=' '.join(list(row)) + ' '
    df['AnswerRightEnding'] = trueanswer
    return df,make_vocab(preprocess(voc_str))

In [5]:
import sys
from collections import OrderedDict 
class LastSentenceDataset(Dataset):
    def __init__(self,file='story_cloze_data/cloze_test_val__spring2016 - cloze_test_ALL_val.csv',vocab=None,df=None,train=None):

        super().__init__()
        
        created_df, created_vocab = load_data(file)
        if df is None:
            df = created_df
        if vocab:
            self.vocab = vocab
        else:
            self.vocab = created_vocab
      
        
        self.dir_st = 'data/skip-thoughts'
        self.biskip = BiSkip(self.dir_st, self.vocab.convert_to_list()[1:])
        
        self.uniskip = UniSkip(self.dir_st, self.vocab.convert_to_list()[1:])
        
        
        self.data = self.make_data(df)
        
        
    def __getitem__(self, idx):
        """
        Args:
            idx
        Returns: skip thought embedding of ending and 0/1 if it is the right ending 

        """
        return self.data[idx]

    def __len__(self):
        """
        Returns len of the dataset
        """
        return len(self.data)
       
    def make_data(self, df):
        data = []
        total = df.index
        print('skip thought encoding dataset')
        for i in total:
            #print(row['RandomFifthSentenceQuiz1'],row['RandomFifthSentenceQuiz2'])
            progress(i,len(total))
            endings =  self.gen_embbeding(df.at[i,'RandomFifthSentenceQuiz1'], 
                                          df.at[i,'RandomFifthSentenceQuiz2'],
                                          df.at[i,'InputSentence4'])
            if df.at[i,'AnswerRightEnding'] == 1:
                data.append((endings[0].detach().numpy(),1))
                data.append((endings[1].detach().numpy(),0))
            else:
                data.append((endings[0].detach().numpy(),0))
                data.append((endings[1].detach().numpy(),1))
        return data
    

    def zero_pad(self,l,n):
        l = (l + n * [0])[:n]
        return l
    
    def pad_input(self,d):
        d = OrderedDict(sorted(d.items(), key=lambda s: len(s[1])))
        for k,v in d.items():
            d[k]= self.zero_pad(v,len(list(d.items())[-1][1]))
        return d
        
    def gen_embbeding(self,sent1,sent2,last_sent):
        d = dict()
        sent1 = preprocess(sent1)
        sent2 = preprocess(sent2)
        ls = preprocess(last_sent)
        #remove random n token that is in one sentence
        if 'n' in sent2:
            sent2.remove('n')
        d['sent1'] = self.vocab.get_sentence(sent1)
        d['sent2'] = self.vocab.get_sentence(sent2)
        d['ls'] = self.vocab.get_sentence(ls)
        d = self.pad_input(d)
        
        batch = torch.LongTensor([d['sent1'],d['sent2'], d['ls']]) 
        top_half = self.uniskip(batch)
        bottom_half = self.biskip(batch)
        combine_skip = torch.cat([top_half,bottom_half],dim=1)
        end1 = combine_skip[0]
        end2 = combine_skip[1]
        ls = combine_skip[2]
        
        #print(end1[:20],end2[:20])
        end1.add_(ls)
        end2.add_(ls)
        #print('ls',ls[:20])
        #print('after',end1[:20])
        return end1,end2    
    
def progress(count, total, status=''):
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))

    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()

In [6]:
class LSModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.hd1 = torch.nn.Linear(4800, 2400)
        self.hd2 = torch.nn.Linear(2400, 1200)
        self.hd3 = torch.nn.Linear(1200, 600)
        self.output = torch.nn.Linear(600, 2)

    def forward(self, x):
        x = torch.nn.functional.relu(self.hd1(x))
        x = torch.nn.functional.relu(self.hd2(x))
        x = torch.nn.functional.relu(self.hd3(x))
        x = self.output(x)
        #print('output',x)
        return x

In [7]:
class LSShortModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        self.input = torch.nn.Linear(4800,256)
        self.hidden= torch.nn.Linear(256,64)
        self.output = torch.nn.Linear(64,2)
        
    def forward(self, inputs):
        hidden = torch.nn.functional.relu(self.input(inputs))
        hidden1 = torch.nn.functional.relu(self.hidden(hidden))
        output = self.output(hidden1)
        return output

In [20]:
def score(pair,model):
    '''true if model predicts right'''
    ending1, ending2 = pair
    if ending1[1] == 1:
        target = 1
    else:
        target =  2 
    if torch.cuda.is_available():
        ending1 = torch.tensor(ending1[0]).cuda()
        ending2 = torch.tensor(ending2[0]).cuda()
    else:
        ending1 = torch.tensor(ending1[0])
        ending2 = torch.tensor(ending2[0])
    res1 = model(ending1)
    res2 = model(ending2)
    softm = torch.nn.Softmax(dim=0)
    prob_end1_right = softm(res1)[1].item() 
    prob_end2_right = softm(res2)[1].item()
    
    if prob_end1_right > prob_end2_right:
        pred = 1
    else:
        pred = 2
    
    if pred == target:
        return True
    else: 
        #print(prob_end1_right,prob_end2_right,pred,target)
        return False


In [9]:
def compute_accuracy(data_set,model):
    num_right = 0
    for i in range(0,len(data_set),2):
        if score((data_set[i],data_set[i+1]),model):
            num_right+=1
    return num_right / (len(data_set)/2)


In [10]:
def save(name):
    torch.save(model.state_dict(), 'saved_models/{}.save'.format(name))

In [11]:
from sklearn.model_selection import train_test_split
df, voc = load_data(file='story_cloze_data/cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv')
train, val = train_test_split(df, test_size=0.1,shuffle=False)

vocab len is 4851


In [12]:
#uncomment to load training data
#from sklearn.model_selection import train_test_split
#df, voc = load_data_train()
#train, val = train_test_split(df, test_size=0.1,shuffle=False)

In [13]:
batch_size = 128
lr = 0.01
num_epochs = 7
report_every = 1

In [22]:
train_data_set = LastSentenceDataset(df=train, vocab=voc)
data_loader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size, shuffle=True)

vocab len is 5303


  "num_layers={}".format(dropout, num_layers))


skip thought encoding dataset

In [15]:
voc.unk_ratio()

'2  out of 33769 ids generated were for unk(0.006 %)'

In [16]:
val_data_set = LastSentenceDataset(df=val,vocab=voc)

vocab len is 5303
skip thought encoding dataset

In [23]:
model = LSModel()
if torch.cuda.is_available():
    model = model.cuda()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [24]:
tick = time.time()
name='LS_new_4_26_train_ep'
epoch_losses = []
epoch_accs = []
best_score= 0.5
for epoch_num in range(1, num_epochs + 1):
    batch_losses = []
    for i, batch in enumerate(data_loader):
        #print(batch)
        ### YOUR CODE BELOW ###
        # Zero the gradients
        optimizer.zero_grad()
        # Extract the inputs and the targets
        inputs, targets = batch
        # Transfer the inputs and the targets to GPUs, if available
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            targets = targets.cuda()
        # Run the model
        outputs = model(inputs)

        # Compute the loss
        loss = criterion(outputs,targets)
        
        
        # Backpropagate the error
        loss.backward()
        # Update the parameters
        optimizer.step()
        
        # Append the loss
        batch_losses.append(float (loss))
        ### YOUR CODE ABOVE ###
        epoch_loss = np.mean(np.array(batch_losses))
        epoch_losses.append(epoch_loss)
        
        acc = compute_accuracy(val_data_set,model)
        epoch_accs.append(acc)
        

        if epoch_num % report_every == 0:
            tock = time.time()
            print("Epoch {}. Loss {:.4f}. accuracy {:.4f}. Elapsed {:.0f} seconds".format(epoch_num, epoch_loss,acc, tock-tick))
    epoch_acc = np.mean(np.array(epoch_accs))
    print('ep acc',epoch_acc)
    if epoch_acc > best_score:
            best_score = epoch_acc
            print('new best saving model')
            save('{}{}_best_acc_{:.2f}'.format(name,epoch_num,epoch_acc))
     
    #print('saving model')
    #save(name + str(epoch_num))

print("Total time elapsed: {:.0f} minutes".format((tock-tick)/60))

Epoch 1. Loss 0.6930. accuracy 0.5823. Elapsed 1 seconds
Epoch 1. Loss 21.8699. accuracy 0.5063. Elapsed 3 seconds
Epoch 1. Loss 104.6657. accuracy 0.4620. Elapsed 4 seconds
Epoch 1. Loss 80.8051. accuracy 0.5886. Elapsed 5 seconds
Epoch 1. Loss 65.0139. accuracy 0.3924. Elapsed 6 seconds
Epoch 1. Loss 54.3510. accuracy 0.5063. Elapsed 7 seconds
Epoch 1. Loss 46.7884. accuracy 0.5316. Elapsed 9 seconds
Epoch 1. Loss 41.0415. accuracy 0.4430. Elapsed 10 seconds
Epoch 1. Loss 36.5722. accuracy 0.4304. Elapsed 11 seconds
Epoch 1. Loss 33.0038. accuracy 0.5127. Elapsed 12 seconds
Epoch 1. Loss 30.0670. accuracy 0.5443. Elapsed 13 seconds
Epoch 1. Loss 27.6195. accuracy 0.5253. Elapsed 14 seconds
Epoch 1. Loss 25.5475. accuracy 0.4873. Elapsed 16 seconds
Epoch 1. Loss 23.7790. accuracy 0.5633. Elapsed 17 seconds
Epoch 1. Loss 22.2457. accuracy 0.6266. Elapsed 18 seconds
Epoch 1. Loss 20.8984. accuracy 0.5886. Elapsed 19 seconds
Epoch 1. Loss 19.7088. accuracy 0.5380. Elapsed 20 seconds
Epoc

Epoch 6. Loss 0.5607. accuracy 0.7405. Elapsed 160 seconds
Epoch 6. Loss 0.5612. accuracy 0.6962. Elapsed 161 seconds
ep acc 0.6818473674555128
new best saving model
Epoch 7. Loss 0.5263. accuracy 0.7025. Elapsed 162 seconds
Epoch 7. Loss 0.5380. accuracy 0.7089. Elapsed 163 seconds
Epoch 7. Loss 0.5334. accuracy 0.7342. Elapsed 165 seconds
Epoch 7. Loss 0.5256. accuracy 0.7152. Elapsed 166 seconds
Epoch 7. Loss 0.5480. accuracy 0.7025. Elapsed 167 seconds
Epoch 7. Loss 0.5476. accuracy 0.6962. Elapsed 168 seconds
Epoch 7. Loss 0.5538. accuracy 0.7342. Elapsed 169 seconds
Epoch 7. Loss 0.5460. accuracy 0.7342. Elapsed 170 seconds
Epoch 7. Loss 0.5389. accuracy 0.7722. Elapsed 172 seconds
Epoch 7. Loss 0.5425. accuracy 0.7785. Elapsed 173 seconds
Epoch 7. Loss 0.5384. accuracy 0.7658. Elapsed 174 seconds
Epoch 7. Loss 0.5394. accuracy 0.7658. Elapsed 175 seconds
Epoch 7. Loss 0.5384. accuracy 0.7342. Elapsed 176 seconds
Epoch 7. Loss 0.5376. accuracy 0.6962. Elapsed 177 seconds
Epoch 7.

In [None]:

#train_data_set = None
test_data_set = LastSentenceDataset(file='story_cloze_data/cloze_test_test__spring2016 - cloze_test_ALL_test.csv',vocab=voc)

In [None]:
def load(name):
    l_model = LSModel()
    l_model.load_state_dict(torch.load('saved_models/{}.save'.format(name)))
    return l_model
#model = load('LS_new_4_25_voc_fix_2_ep7_best_acc_0.70')
#print('model',model)

In [None]:
compute_accuracy(train_data_set,model)

In [None]:
compute_accuracy(test_data_set,model)