In [3]:
import json

In [4]:
TRAIN_1_FILE = '../data/train/Task_1_train.jsonl'

In [5]:
with open(TRAIN_1_FILE,'r') as f:
    lines = [json.loads(line) for line in f.read().splitlines()]

In [6]:
example = lines[0]

In [7]:
from transformers import BertTokenizer
bt = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
article_token = bt.encode(example['article'])[:512]

Token indices sequence length is longer than the specified maximum sequence length for this model (637 > 512). Running this sequence through the model will result in indexing errors


In [9]:
question_token = bt.encode(example['question'].replace('@placeholder','[MASK]'))

In [10]:
options_tokens = [bt.encode(example[f'option_{i}']) for i in range(5)]

In [11]:
import torch

In [12]:
article_token = torch.LongTensor([article_token])

In [13]:
question_token = torch.LongTensor([question_token])

In [14]:
question_token.shape

torch.Size([1, 26])

In [15]:
options_tokens = torch.LongTensor(options_tokens)

In [16]:
options_tokens.shape

torch.Size([5, 3])

In [17]:
from transformers import BertModel
m = BertModel.from_pretrained('bert-base-uncased')

In [18]:
m.embeddings(article_token).shape

torch.Size([1, 512, 768])

In [19]:
m.embeddings(question_token).shape

torch.Size([1, 26, 768])

In [20]:
m.embeddings(options_tokens).shape

torch.Size([5, 3, 768])

## GABert

In [21]:
m1 = BertModel.from_pretrained('bert-base-uncased')
m2 = BertModel.from_pretrained('bert-base-uncased')

In [22]:
e1 = m1.embeddings

In [23]:
layer_1_1 = m1.encoder.layer[0]
layer_2_1 = m2.encoder.layer[0]

In [24]:
article_embeds = e1(article_token)
print(article_embeds.shape)

torch.Size([1, 512, 768])


In [25]:
question_embeds = e1(question_token)
print(question_embeds.shape)

torch.Size([1, 26, 768])


In [26]:
options_embeds = e1(options_tokens)
print(options_embeds.shape)

torch.Size([5, 3, 768])


In [27]:
class GatedAttention(torch.nn.Module):      
    def forward(self, question_states, article_states):
        question_att = question_states.permute(0,2,1)
        att_matrix = torch.bmm(article_states,question_att)
        
        att_weights = torch.nn.functional.softmax(att_matrix.view(-1,att_matrix.size(-1)),dim=1).view_as(att_matrix)
        question_rep = torch.bmm(att_weights, question_states)
    
        question_to_article = torch.mul(article_states, question_rep)
        
        return question_to_article ##Attention applied on articles    

In [28]:
ga = GatedAttention()

In [29]:
layer_1_1_out = layer_1_1(question_embeds)
layer_2_1_out = layer_2_1(article_embeds)

In [30]:
layer_1_1_out[0].shape

torch.Size([1, 26, 768])

In [31]:
layer_2_1_out[0].shape

torch.Size([1, 512, 768])

In [32]:
ga(layer_1_1_out[0],layer_2_1_out[0]).shape

torch.Size([1, 512, 768])

In [33]:
inp_1 = question_embeds
inp_2 = article_embeds

## Need to check attention_mask and layer_head_mask
for i in range(len(m1.encoder.layer)):
    current_layer_1 = m1.encoder.layer[i]
    current_layer_2 = m2.encoder.layer[i]
    inp_1 = current_layer_1(inp_1)[0]
    int_2 = current_layer_2(inp_2)[0]
    inp_2 = ga(inp_1,int_2)

In [34]:
inp_1.shape

torch.Size([1, 26, 768])

In [35]:
inp_2.shape

torch.Size([1, 512, 768])

In [36]:
inp_2

tensor([[[ 8.1008e-02,  8.4509e-02,  9.0809e-02,  ..., -2.0987e-02,
           2.0178e-02,  1.3246e-01],
         [ 7.2911e-02,  5.7018e-02,  7.9535e-02,  ..., -1.7414e-02,
           4.2224e-02,  1.2953e-01],
         [ 6.1251e-02,  2.1614e-02,  7.0671e-02,  ..., -8.3078e-03,
           6.3368e-02,  1.2328e-01],
         ...,
         [ 5.1315e-02, -4.3455e-01, -1.0332e-01,  ...,  2.9795e-02,
          -5.7426e-02, -1.7541e-04],
         [ 2.6858e-02, -4.4956e-01, -1.2668e-01,  ...,  5.7979e-02,
          -2.2074e-01, -4.7746e-02],
         [-1.1459e-01,  3.6559e-02, -1.0911e-01,  ..., -6.2565e-02,
          -1.8362e-01,  7.9108e-02]]], grad_fn=<MulBackward0>)

In [37]:
article_embeds

tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [-0.4367,  0.5360, -0.0514,  ..., -0.0397,  0.6783, -0.5318],
         [ 0.7838, -0.3506, -1.1582,  ..., -0.8033,  0.1465,  0.2171],
         ...,
         [ 0.2022,  0.0762,  0.3220,  ...,  0.5130, -0.6300, -0.0597],
         [ 1.3539,  0.4626,  0.3129,  ..., -0.9238, -0.9422, -0.4833],
         [ 0.7480,  0.4874, -0.3261,  ..., -0.5679,  0.9606, -1.7922]]],
       grad_fn=<NativeLayerNormBackward>)

In [38]:
class GABertEmbeddings(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = BertModel.from_pretrained('bert-base-uncased').embeddings
    
    def forward(self, article_tokens, question_tokens, options_tokens):
        article_embeds = self.embeddings(article_tokens)
        question_embeds = self.embeddings(question_tokens)
        options_embeds = self.embeddings(options_tokens)
    
        return article_embeds, question_embeds, options_embeds

In [44]:
embeddings = GABertEmbeddings()
article_embeds, question_embeds, options_embeds = embeddings(article_token,question_token, options_tokens)

In [47]:
class GABertEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.m1 = BertModel.from_pretrained('bert-base-uncased')
        self.m2 = BertModel.from_pretrained('bert-base-uncased')
        self.ga = GatedAttention()
        
    def forward(self, article_contexts, question_contexts, article_attention_mask=None, question_attention_mask=None):
        
        for i in range(len(self.m1.encoder.layer)):
            current_layer_1 = self.m1.encoder.layer[i]
            current_layer_2 = self.m2.encoder.layer[i]
            question_contexts = current_layer_1(question_contexts, question_attention_mask)[0]
            article_intermediates = current_layer_2(article_contexts, article_attention_mask)[0]
            article_contexts = ga(question_contexts,article_intermediates)
        
        return article_contexts, question_contexts

In [48]:
encoder = GABertEncoder()
article_contexts, question_contexts= encoder(article_embeds, question_embeds)

In [49]:
class GABertPooler(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.pooler = BertModel.from_pretrained('bert-base-uncased').pooler
    def forward(self,contexts):
        return self.pooler(contexts)

## Final Layer Out (Baseline GAReader)

In [50]:
class Linear(torch.nn.Module):
    def __init__(self, in_features, out_features):
        super(Linear, self).__init__()

        self.linear = nn.Linear(in_features=in_features, out_features=out_features)
        self.init_params()

    def init_params(self):
        torch.nn.init.kaiming_normal_(self.linear.weight.data)
        torch.nn.init.constant_(self.linear.bias.data, 0)

    def forward(self, x):

        # x: [batch_size, seq_len, in_features]
        x = self.linear(x)
        # x: [batch_size, seq_len, out_features]
        return x


In [51]:
class MLPAttention(torch.nn.Module):
    def __init__(self, dim, dropout):
        super(MLPAttention, self).__init__()

        self.Q_W = Linear(dim, dim)
        self.K_W = Linear(dim, dim)
        self.V_W = Linear(dim, dim)

        self.tanh = torch.nn.Tanh()
        self.V = Linear(dim, 1)

        self.dropout = nn.Dropout(dropout)

    def forward(self, Q, K, V):
        # Q: [batch_size, dim]
        # K: [batch_size, seq_len, dim]
        # V: [batch_size, seq_len, dim]

        Q = self.dropout(self.Q_W(Q))  # [batch_size, dim]
        K = self.dropout(self.K_W(K))  # [batch_size, seq_len, dim]
        V = self.dropout(self.V_W(V))  # [batch_size, seq_len, dim]

        Q = Q.unsqueeze(1)  # [batch_size, 1, dim]
        M = self.dropout(self.tanh(Q + K))  # [batch_size, seq_len, dim]
        scores = self.dropout(self.V(M))  # [batch_size, seq_len, 1]
        scores = F.softmax(scores, dim=1)  # [batch_size, seq_len, 1]

        R = self.dropout(V * scores)  # [batch_size, seq_len, dim]

        feat = torch.sum(R, dim=1)  # [batch_size, dim]

        return feat

In [None]:
class BaselineOut(torch.nn.Module):
    def __init__(dropout, hidden_size):
        self.dropout = torch.nn.Dropout(dropout)
        self.mlp_att = MLPAttention(hidden_size*2, dropout)
        self.dot_layer = MLPAttention(hidden_size*2,dropout)
        self.final_linear = Linear(hidden_size*10,output_dim)
    def forward(self, article_contexts,question_contexts,options_embeds):

        bsz = article_contexts.shape[0]
        options_embeds = options_embeds.view(bsz,5,-1,hidden_size)
        
        article_question_attention = self.mlp_att(question_contexts, article_contexts, article_contexts)
        
        
        att_opt0 = 

## Selection and Pooling