## Load Dependencies

In [1]:
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn import LSTM
from torch.nn import Parameter
from torch.nn import MSELoss, L1Loss, SmoothL1Loss, CrossEntropyLoss
from scipy.special import huber
#from torch.autograd import Variable


import pandas as pd
import numpy as np
import math
import json
import re
from nltk.tokenize import word_tokenize

from collections import Iterable
from tqdm import tqdm

import matplotlib.pyplot as plt

sys.path.insert(0, '/Users/sidvash/facts_lab/factslab-python/factslab/')
from utility import load_glove_embedding
from datastructures import ConstituencyTree
from pytorch.childsumtreelstm import *
#from pytorch.rnnregression import RNNRegressionTrainer
from pytorch.rnnregression import RNNRegression


torch.manual_seed(1)

<torch._C.Generator at 0x105822a70>

## Load the data from MTurK

In [2]:
#Data locations:
data_file = "/Users/sidvash/facts_lab/factslab-protocols-eventtemporal/" 
data_file += "Temporal_relations/testing_HIT/agreement_data_protocol2.csv"

turk_output = "/Users/sidvash/facts_lab/factslab-protocols-eventtemporal/" 
turk_output += "Temporal_relations/testing_HIT/Batch_3220834_batch_results.csv"

embed_path = '/Users/sidvash/kaggle/'

In [3]:
def filter_sentence(s):
    '''
    Filter out html <span> tags from a sentence 
    '''
    
    s = re.sub(r'\<span class\=\\"predicate\\">', r'', s)
    
    s = re.sub(r'\<\/span\>', r' ', s)
    
    s = re.sub(r'  ', r'', s) #remove extra spaces

    return s

def extract_dataframe(data):
    '''
    Input: Pandas csv dataframe obtained from MTurk
    
    Output: Pandas dataframe levelled by (User x Sentenced_ID)
    '''
    data["dicts"] = data["Input_var_arrays"].map(lambda x: json.loads(x))
    global_list = []
    
    for row in data.itertuples():
        for idx, local_dict in enumerate(row.dicts):
            temp_dict = local_dict.copy()
            var_pred1 = "Answer_sld_pred1_" + str(idx+1)
            var_pred2 =  "Answer_sld_pred2_" + str(idx+1)
            var_conf = "Answer_confidence_range" + str(idx+1)
            
            temp_dict["slider1_posn"] = getattr(row, var_pred1)
            temp_dict["slider2_posn"] = getattr(row, var_pred2)
            temp_dict["confidence"] = getattr(row, var_conf)
            temp_dict["worker_id"] = row.WorkerId
            temp_dict["hit_id"] = row.HITId

            global_list.append(temp_dict)
            
    return pd.DataFrame(global_list)

def time_ml_tag(row, var1, var2):
    '''
    Creates time ML tags from slider positions of two events 
    
    Eg: EVENT 1 "is before" EVENT2
    
    Eg: Relatins: is before
    '''
    ans = ""
    
    slider1 = getattr(row, var1)
    slider2 = getattr(row, var2)
    
    event1 = [int(x) for x in slider1.split("-")]
    event2 = [int(x) for x in slider2.split("-")]
    
    if (event1[0] == event2[0]) and (event1[1] == event2[1]):
        ans = "simultaneous"
        
    elif event1[1] <= event2[0]:
        ans = "before"
    
    elif event1[0] >= event2[1]:
        ans = "after"
        
    elif event1[1] >= event2[1] and event1[0] <= event2[0]:
        ans = "includes"
        
    elif event1[1] <= event2[1] and event1[0] >= event2[0]:
        ans = "is_included"
    
    elif event1[0] < event2[0] and event1[1] > event2[0]:
        ans = "before_cont"
        
    elif event1[0] < event2[1] and event1[1] > event2[1]:
        ans = "after_cont"
    
    else: 
        ans = "other"
    
    return ans

In [4]:
data = pd.read_csv(turk_output)
data.columns = [c.replace('.', '_') for c in data.columns]

pilot_data = extract_dataframe(data)

#Re-arrange the order 
cols = list(pilot_data.columns)
#pilot_data = pilot_data[pilot_data.columns[::-1]].drop('sentence', axis=1)
pilot_data = pilot_data[pilot_data.columns[::-1]]


#Drop start, end, instant variables that were stored in the json data
pilot_data = pilot_data.drop(['start_pred1', 'start_pred2', 'end_pred1', 'end_pred2',
                             'instant_pred1', 'instant_pred2'], axis=1)


#pilot_data.replace('na',np.NaN, inplace=True)
pilot_data.head
pilot_data['sent_token'] = pilot_data['sentence_id'] + "_" +\
                            pilot_data['pred_token1'].map(lambda x: str(x)) + "_" +\
                             pilot_data['pred_token2'].map(lambda x: str(x))
pilot_data['timeML_tag'] = pilot_data.apply(lambda row: time_ml_tag(row, 'slider1_posn', 'slider2_posn'), axis=1)

#pilot_data.head()

In [5]:
## Data features
pilot_data['filter_sent'] = pilot_data['sentence'].map(lambda x: [filter_sentence(x).split(" ")])
pilot_data['pred1_token_int'] = pilot_data['sent_token'].map(lambda x: int(x.split("_")[-2]) + 1)
pilot_data['pred2_token_int'] = pilot_data['sent_token'].map(lambda x: int(x.split("_")[-1]) + 1)
pilot_data['pred_tokens'] = pilot_data.apply(lambda row: [[row.pred1_token_int, row.pred2_token_int]], axis=1)

pilot_data['slider1_start'] = pilot_data['slider1_posn'].map(lambda x: [int(ch) for ch in x.split("-")][0])
pilot_data['slider1_end'] = pilot_data['slider1_posn'].map(lambda x: [int(ch) for ch in x.split("-")][1])

pilot_data['slider2_start'] = pilot_data['slider2_posn'].map(lambda x: [int(ch) for ch in x.split("-")][0])
pilot_data['slider2_end'] = pilot_data['slider2_posn'].map(lambda x: [int(ch) for ch in x.split("-")][1])
#Convert objects to numerics

X_data = pilot_data[['filter_sent','pred1_token_int','pred2_token_int', 'pred_tokens']]
y_data = pilot_data[['slider1_start', 'slider1_end', 'slider2_start', 'slider2_end']]


## View Data

In [6]:
X_data.head()

Unnamed: 0,filter_sent,pred1_token_int,pred2_token_int,pred_tokens
0,"[[On, Wednesday, guerrillas, had, kidnapped, a...",5,17,"[[5, 17]]"
1,"[[Yesterday, there, were, tens, of, them, putt...",7,27,"[[7, 27]]"
2,"[[He, did, once, make, an, unforgivable, error...",4,15,"[[4, 15]]"
3,"[[He, added, that, &quot;, America, does, not,...",2,58,"[[2, 58]]"
4,"[[As, a, child, in, the, 50&#39;s, I, had, a, ...",8,35,"[[8, 35]]"


In [7]:
y_data.head()

Unnamed: 0,slider1_start,slider1_end,slider2_start,slider2_end
0,39,46,27,46
1,25,41,6,17
2,35,37,19,26
3,35,36,57,80
4,10,16,15,16


In [8]:
X = X_data.filter_sent.values
idxs = X_data.pred_tokens.values
y = list(list(zip(y_data[['slider1_start', 'slider1_end']].values, 
             y_data[['slider2_start', 'slider2_end']].values)))

data = list(zip(X, idxs, y))
##Check sample values
# print(X[0])
# print(idxs[0])
# print(y[0])

In [9]:
print(data[0])

([['On', 'Wednesday', 'guerrillas', 'had', 'kidnapped', 'a', 'cosmetic', 'surgeon', 'and', 'his', 'wife', 'while', 'they', 'were', 'on', 'their', 'way', 'home', '.']], [[5, 17]], (array([39, 46]), array([27, 46])))


### Model Class

In [10]:
## import from factslab

## Load embeddings

#### Glove embeddings and Word_to_ix dict

In [11]:
vocab_set = set()
for [sent] in X:
    for word in sent:
        if word not in vocab_set:
            vocab_set.add(word) 
print("Vocab size: {}".format(len(vocab_set)))

vocab = list(vocab_set)

Vocab size: 1275


In [12]:
glove_embeddings = load_glove_embedding(embed_path + 'glove.42B.300d', vocab)

## Instantiate the RNN-Regression model

In [13]:
model = RNNRegression(embeddings = glove_embeddings, rnn_classes = LSTM, 
                      rnn_hidden_sizes = 300, num_rnn_layers=1, bidirectional=True, attention=False,
                      regression_hidden_sizes = [32, 16], output_size = 2)

## Fit the model and backpropagate

In [14]:
optimizer_class=torch.optim.Adam
optimizer = optimizer_class(model.parameters())
#optimizer

In [15]:
total_obs = len(data)
losses = []
for epoch in range(20):
    for i, (words, [idx], target) in tqdm(enumerate(data)):
        optimizer.zero_grad()

        predicts = model(words, idxs=idx)
        actuals = [torch.from_numpy(arr).float().view(1,2) for arr in target]

        loss_fn = MSELoss()
        losses = []

        for predicted, actual in list(zip(predicts, actuals)):

            losses.append(loss_fn(predicted, actual))


        overall_loss = sum(losses)
        
        
        overall_loss.backward()
        optimizer.step()
        
    losses.append(overall_loss)
    print("Epoch {} completed. Overall MSE Loss: {}".format((epoch+1), overall_loss))

500it [00:31, 15.68it/s]
2it [00:00, 13.11it/s]

Epoch 1 completed. Overall MSE Loss: 5894.22314453125


500it [00:31, 16.04it/s]
2it [00:00, 14.80it/s]

Epoch 2 completed. Overall MSE Loss: 5886.68017578125


500it [00:35, 14.28it/s]
0it [00:00, ?it/s]

Epoch 3 completed. Overall MSE Loss: 5879.5390625


500it [00:32, 15.45it/s]
2it [00:00, 17.14it/s]

Epoch 4 completed. Overall MSE Loss: 5872.353515625


500it [00:30, 16.37it/s]
2it [00:00, 17.38it/s]

Epoch 5 completed. Overall MSE Loss: 5864.8837890625


500it [00:36, 13.78it/s]
2it [00:00, 10.88it/s]

Epoch 6 completed. Overall MSE Loss: 5857.78955078125


500it [00:36, 13.59it/s]
2it [00:00, 14.82it/s]

Epoch 7 completed. Overall MSE Loss: 5851.7939453125


500it [00:31, 15.74it/s]
2it [00:00, 14.92it/s]

Epoch 8 completed. Overall MSE Loss: 5846.443359375


500it [00:31, 15.78it/s]
2it [00:00, 14.36it/s]

Epoch 9 completed. Overall MSE Loss: 5840.56396484375


500it [00:32, 15.36it/s]
2it [00:00, 15.88it/s]

Epoch 10 completed. Overall MSE Loss: 5834.6640625


500it [00:33, 15.14it/s]
2it [00:00, 15.89it/s]

Epoch 11 completed. Overall MSE Loss: 5829.216796875


500it [00:33, 15.00it/s]
2it [00:00, 15.93it/s]

Epoch 12 completed. Overall MSE Loss: 5824.13623046875


500it [00:33, 14.90it/s]
2it [00:00, 14.67it/s]

Epoch 13 completed. Overall MSE Loss: 5819.3125


500it [00:35, 14.01it/s]
2it [00:00, 14.95it/s]

Epoch 14 completed. Overall MSE Loss: 5814.72509765625


500it [00:34, 14.46it/s]
2it [00:00, 15.85it/s]

Epoch 15 completed. Overall MSE Loss: 5810.30859375


500it [00:32, 15.19it/s]
2it [00:00, 15.26it/s]

Epoch 16 completed. Overall MSE Loss: 5805.93017578125


500it [00:32, 15.24it/s]
2it [00:00, 15.57it/s]

Epoch 17 completed. Overall MSE Loss: 5801.75390625


500it [00:32, 15.49it/s]
2it [00:00, 15.40it/s]

Epoch 18 completed. Overall MSE Loss: 5797.7431640625


500it [00:32, 15.51it/s]
2it [00:00, 16.15it/s]

Epoch 19 completed. Overall MSE Loss: 5793.92919921875


500it [00:35, 13.94it/s]

Epoch 20 completed. Overall MSE Loss: 5790.26171875





## Prediction

In [20]:
data_idx = 140

words = data[data_idx][0]
[idx] = data[data_idx][1]
print("Predicted value")
model(words, idxs = idx)

Predicted value


[tensor([[ 0.7649,  0.6589]]), tensor([[ 0.4548,  0.2953]])]

In [18]:
print("Actual value")
data[data_idx][2]

Actual value


(array([17, 48]), array([70, 80]))

## Rough Work

In [19]:
# embeddings = torch.nn.Embedding(1275, 300)

# indices = [[1], [2], [5], [45]]
# indices = torch.tensor(indices, dtype=torch.long)
# print(indices.type())
# indices.requires_grad