In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import logging
import code
import pickle
import os
from torch import autograd
from torch.autograd import Variable
from torch.nn import Embedding
from argparse import ArgumentParser


class DCN_Model(nn.Module):

    def __init__(self, hidden_dim, embedding_matrix, dropout_ratio, maxout_pool_size, max_number_of_iterations):
        super(DCN_Model, self).__init__()

        self.encoder = Word_Level_Encoder(hidden_dim, emb_matrix, dropout_ratio)
        self.coattention_encoder = Coattention_Encoder(hidden_dim, maxout_pool_size, embedding_matrix, max_number_of_iterations, dropout_ratio)
        self.decoder = Dynamic_Decoder(hidden_dim, maxout_pool_size, max_number_of_iterations, dropout_ratio)

    def forward(self, context_word_indexes, context_word_mask, question_word_indexes, question_word_mask,span_tensor):
        passage_representation = self.encoder.forward(context_word_indexes, context_word_mask)

        question_representation = self.encoder.forward(question_word_indexes, question_word_mask)
       

        U_matrix = self.coattention_encoder.forward(question_representation, passage_representation,context_word_mask)

#         print(span_tensor[0].size())

        loss, index_start, index_end = self.decoder.forward(U_matrix, context_word_mask, span_tensor)

        return loss, index_start, index_end

In [13]:
def index_files_using_word_to_index(filename, _dict, max_words):

    f = open(filename, "r", encoding="utf-8")

    lines = f.readlines()
    lines  = [l.lower() for l in lines]
    encoded_lines = []
    for l in lines:
        tokens = l.split()
        tokens = tokens[:max_words]
        temp = []
        for t in tokens:
            if t in _dict:
                temp.append(_dict[t])
            else:
                temp.append(1)

        encoded_lines.append(temp[:])

    return encoded_lines
def find_max_length(data):

    """ Finds the maximum sequence length for data
        Args:
            data: The data from which sequences will be chosen
    """
    temp = 0
    index = 0
    for i, _ in enumerate(data):

        if (len(data[i]) > temp):
            temp = len(data[i])
            index = i
    return temp,index


def pad_data(data):

    """ Pad the data to max_length given
        Args:
            data: Data that needs to be padded
            max_length : The length to be achieved with padding
        Returns:
            padded_data : Each sequence is padded to make it of length
                          max_length.
    """
    padded_data = []
    max_length,index =  find_max_length(data)

    for lines in data:
        if (len(lines) < max_length):
            temp = np.lib.pad(lines, (0,max_length - len(lines)),
                'constant', constant_values=0)
        else:
            temp = lines[:max_length]
        padded_data.append(temp)

    padded_data = torch.from_numpy(np.array(padded_data)).type(torch.int64)

    return padded_data


In [39]:
import pickle
import torch
import os
import numpy as np
from torch.autograd import Variable
import torch.nn as nn
data_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\"
def get_pretrained_embedding(embedding_matrix):
    embedding = nn.Embedding(*embedding_matrix.shape)
    embedding.weight = nn.Parameter(torch.from_numpy(embedding_matrix).float())
    embedding.weight.requires_grad = False
    return embedding
with open(data_dir + "glove_word_embeddings.pkl", "rb") as input_file:
    embedding_matrix_words = pickle.load(input_file)
    
embedding = get_pretrained_embedding(embedding_matrix_words)

with open(os.path.join(data_dir , "dictionaries.pkl"), "rb") as input_file:
    dictionaries = pickle.load(input_file)
word_to_index = dictionaries["word_to_index"]


context_path_train = os.path.join(data_dir, "train.context")
context_tokens = open(context_path_train, "r", encoding="utf-8").readlines()
    

context_word_index_old = index_files_using_word_to_index(context_path_train, word_to_index, 400)

context_word_index = context_word_index_old[0:100]
context_word_index = Variable(pad_data(context_word_index))
context_word_index.requires_grad = False

word_sequence_embeddings = embedding(context_word_index)


In [50]:
print(word_sequence_embeddings[0][1])
print(context_word_index[0])
print(context_tokens[0])



tensor([-0.1696,  0.0160, -0.0499,  0.1167,  0.0508, -0.0192,  0.1246,  0.0109,
        -0.0703,  0.0379, -0.1481,  0.1130,  0.0287, -0.1677,  0.0618, -0.1429,
        -0.1560,  0.1138, -0.0494,  0.1030, -0.1634, -0.0556, -0.1604,  0.0163,
        -0.1033,  0.1691, -0.0814,  0.0414, -0.0799, -0.0599,  0.0862, -0.1673,
         0.1429,  0.0342,  0.0771,  0.0640, -0.0052, -0.1170, -0.0212,  0.1423,
         0.1076,  0.0930, -0.1570, -0.0211,  0.0964, -0.0314, -0.1519,  0.0555,
         0.0997, -0.0666,  0.0491, -0.1069,  0.0014,  0.0559, -0.0516, -0.0303,
        -0.0859,  0.0625,  0.0753,  0.0113, -0.0553,  0.1540, -0.1374,  0.1481,
         0.1406, -0.0216, -0.0647, -0.1412,  0.0747,  0.0547,  0.1113, -0.0444,
        -0.0984,  0.0201,  0.0704,  0.1566,  0.1341, -0.1072, -0.0030,  0.0875,
         0.0781,  0.0268, -0.0842, -0.0226, -0.0542, -0.0460,  0.1508, -0.1167,
         0.0659,  0.0403,  0.0854, -0.0103,  0.1290, -0.0669, -0.1134,  0.0838,
         0.1443, -0.0277, -0.0063, -0.10

In [60]:
embeddings_index = {}
file = open(data_dir + "glove_embeddings100.txt", "r", encoding="utf-8")
for line in f:
    values = line.split(' ')
    word = values[0] ## The first entry is the word
    vector = np.asarray(values[1:], dtype='float32') ## These are the vectors representing the embedding for the word
    embeddings_index[word] = vector
file.close()
embedding_matrix = np.zeros((num_words, 100))
for word, i in word_to_index.items():
    embedding_vector = embeddings_index.get(word) ## This references the loaded embeddings dictionary
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

NameError: name 'num_words' is not defined

In [80]:
file = open(os.path.join(data_dir , "train.context"),"r", encoding="utf-8")

In [111]:

glove_embeddings = os.path.join(data_dir, "glove_embeddings100.txt")

glove_embeddings = open(glove_embeddings,'r', encoding = 'utf-8')
word_embedding_size = 100
temp_embeddings = []

embedding_matrix = np.zeros((len(word_to_index), 100))
for word, i in word_to_index.items():
    embedding_vector = embeddings_index.get(word) ## This references the loaded embeddings dictionary
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

for word in word_to_index:

        if word in ['<pad>', '<sos>','<unk>']:
            temp_vector = np.zeros((word_embedding_size))
        elif word not in glove_embeddings:
            temp_vector = np.random.uniform(-np.sqrt(3)/np.sqrt(word_embedding_size), np.sqrt(3)/np.sqrt(word_embedding_size), word_embedding_size)
        else:
            temp_vector = glove_embeddings[word]
            
#         print(str(word) + str(temp_vector))
        temp_embeddings.append(temp_vector)

temp_embeddings = np.asarray(temp_embeddings)
temp_embeddings = temp_embeddings.astype(np.float32)

In [117]:
temp_embeddings

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.11415606,  0.09369559,  0.13041882, ..., -0.17320384,
         0.07250953,  0.08871087],
       [ 0.10268117,  0.13954978,  0.05073194, ..., -0.0747427 ,
         0.04370208, -0.13305794],
       [ 0.12480463, -0.1261571 , -0.07739303, ...,  0.07390368,
        -0.06525229, -0.1287655 ]], dtype=float32)

In [116]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.68520999,  0.63674998,  0.14244001, ..., -0.0029878 ,
        -0.55435997,  0.67944998],
       [-0.56307   , -0.13433   ,  0.11604   , ...,  0.26135001,
         0.28470999,  0.41460001],
       [-0.7335    ,  0.29925001, -0.078124  , ...,  0.00693   ,
         0.51097   ,  0.84179002]])

In [100]:
word_to_index


{'<pad>': 0,
 '<sos>': 1,
 '<unk>': 2,
 'of': 3,
 'and': 4,
 'in': 5,
 'to': 6,
 'sos': 7,
 'was': 8,
 's': 9,
 'as': 10,
 'for': 11,
 'on': 12,
 'that': 13,
 'is': 14,
 'with': 15,
 'by': 16,
 'at': 17,
 'his': 18,
 'from': 19,
 'what': 20,
 'were': 21,
 'which': 22,
 'it': 23,
 'he': 24,
 'new': 25,
 'are': 26,
 'her': 27,
 'who': 28,
 'first': 29,
 'city': 30,
 'also': 31,
 'beyoncé': 32,
 'has': 33,
 'one': 34,
 'have': 35,
 'york': 36,
 'or': 37,
 'had': 38,
 'their': 39,
 'be': 40,
 'chopin': 41,
 'this': 42,
 'did': 43,
 'season': 44,
 'its': 45,
 'after': 46,
 'west': 47,
 'not': 48,
 'music': 49,
 'she': 50,
 'many': 51,
 'most': 52,
 'when': 53,
 'been': 54,
 'other': 55,
 'million': 56,
 'but': 57,
 'they': 58,
 'all': 59,
 'album': 60,
 'world': 61,
 'two': 62,
 'american': 63,
 'chinese': 64,
 'than': 65,
 'during': 66,
 'time': 67,
 'torch': 68,
 'some': 69,
 'more': 70,
 'year': 71,
 'over': 72,
 'show': 73,
 'people': 74,
 'china': 75,
 'into': 76,
 'how': 77,
 'such': 