In [1]:
import torch
import torch.nn as nn

import random

from transformers import BertTokenizer, BertModel
from torchtext import data

In [2]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint; the
# same checkpoint name is used for the encoder inside BERTSentiment.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

30522

In [4]:
# Mixed-case sample sentence: the printed tokens come back lower-cased and the
# trailing '?' becomes its own token.
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')

print(tokens)

['hello', 'world', 'how', 'are', 'you', '?']


In [5]:
# Map every token string from the previous cell to its integer vocabulary id.
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

[7592, 2088, 2129, 2024, 2017, 1029]


In [6]:
# String forms of the tokenizer's special tokens.
# Note: `eos_token` holds the tokenizer's separator token ([SEP]), which is
# used here as the end-of-sequence marker.
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [7]:
# First way to obtain the special-token ids: convert the token strings from
# the previous cell through the vocabulary lookup.
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [8]:
# Second way to obtain the same ids: read them straight from the tokenizer's
# `*_token_id` attributes. This rebinds the four names assigned in the
# previous cell; the printed values are identical (101 102 0 100).
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [9]:
# Longest token sequence accepted for this checkpoint (prints 512);
# predict_sentiment truncates its input to this length minus the two
# special tokens it adds.
# NOTE(review): `max_model_input_sizes` is a per-checkpoint lookup table that
# newer transformers releases are believed to have dropped in favour of
# `tokenizer.model_max_length` -- confirm against the installed version.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [10]:
class BERTSentiment(nn.Module):
    """Pretrained BERT encoder topped with a linear layer that emits one logit.

    The attribute names ``bert`` and ``output`` determine the parameter names
    in the module's state dict, so they must match the saved checkpoints that
    are loaded into this class.
    """

    def __init__(self):
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # The head's input width follows the encoder's hidden size.
        hidden_size = self.bert.config.hidden_size
        self.output = nn.Linear(hidden_size, 1)

    def forward(self, text):
        """Score a batch of token-id sequences.

        Args:
            text: tensor of token ids that is passed straight to the encoder
                (the caller in this notebook supplies shape ``(1, seq_len)``).

        Returns:
            Tensor with one raw (pre-sigmoid) logit per sequence in the batch.
        """
        # Element 0 of the encoder output holds the per-token hidden states.
        token_states = self.bert(text)[0]

        # Only the hidden state at sequence position 0 feeds the linear head.
        first_position_state = token_states[:, 0, :]

        return self.output(first_position_state)

In [11]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

models = []

# Load the two saved checkpoints (para_rank_model_1.pt, para_rank_model_2.pt);
# predict_sentiment later combines their outputs.
for i in range(2):
    new_model = BERTSentiment().to(device)
    # map_location remaps tensors that were saved from a GPU onto `device`,
    # which keeps loading from failing on a CPU-only host.
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    state_dict = torch.load(f'para_rank_model_{i+1}.pt', map_location=device)
    new_model.load_state_dict(state_dict)
    models.append(new_model)

In [12]:
def predict_sentiment(models, tokenizer, sentence, max_length=None):
    """Return the ensemble probability for ``sentence``.

    The sentence is tokenized, truncated, wrapped in the tokenizer's [CLS] and
    [SEP] ids, and fed to every model. The models' logits are summed and passed
    through a sigmoid.

    Args:
        models: non-empty sequence of modules that map a ``(1, seq_len)``
            tensor of token ids to a single logit. All models are expected to
            live on the same device as the first one.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token_id`` and ``sep_token_id``.
        sentence: raw input text.
        max_length: maximum total sequence length including the two special
            tokens. Defaults to the notebook-level ``max_input_length``.

    Returns:
        float: sigmoid of the summed logits.

    Raises:
        ValueError: if ``models`` is empty.
    """
    if not models:
        raise ValueError('predict_sentiment needs at least one model')

    for m in models:
        m.eval()

    if max_length is None:
        max_length = max_input_length

    tokens = tokenizer.tokenize(sentence)
    # Reserve two positions for the [CLS] and [SEP] ids added below.
    tokens = tokens[:max_length - 2]
    indexed = (
        [tokenizer.cls_token_id]
        + tokenizer.convert_tokens_to_ids(tokens)
        + [tokenizer.sep_token_id]
    )

    # Build the input on whatever device the models live on instead of
    # assuming CUDA.
    device = next(models[0].parameters()).device
    tensor = torch.tensor(indexed, dtype=torch.long, device=device).unsqueeze(0)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        # Sum over every model that was passed in, not just the first two.
        logits = sum(m(tensor) for m in models)
        prediction = torch.sigmoid(logits)
    return prediction.item()

In [13]:
def para_scores(para, question):
    """Print a relevance score for every sentence of ``para`` against ``question``.

    The paragraph is split on '.', and each non-empty fragment is paired with
    the question as ``question + ' [SEP] ' + fragment.lower()`` and scored with
    ``predict_sentiment`` using the notebook-level ``models`` and ``tokenizer``.

    Args:
        para: paragraph text; sentences are assumed to be separated by '.'.
        question: question text, used verbatim.

    Returns:
        None. The question and each fragment with its score are printed.
    """
    # Trim surrounding whitespace and drop empty fragments. Filtering (rather
    # than always discarding the final fragment) keeps the last sentence when
    # the paragraph does not end with a period.
    fragments = (fragment.strip() for fragment in para.split('.'))
    lines = [fragment for fragment in fragments if fragment]

    print(f"QUESTION : {question}\n")
    for line in lines:
        input_text = question + ' [SEP] ' + line.lower()
        p = predict_sentiment(models, tokenizer, input_text)

        print(f"LINE : {line}\nPRED : {p:.4f}\n")

In [14]:
# Question to score the paragraph against. The commented-out lines are
# alternative questions; uncomment exactly one to try it.
#question = 'when is ipl held ?'
#question = 'who founded ipl ?'

#question = 'how much is ipl worth ?'
#question = 'how much did ipl contribute to the indian economy ?'

#question = 'how many seasons of ipl have been held ?'
question = 'who are the current title holder of ipl ?'

In [15]:
# Paragraph whose sentences are scored; para_scores splits it on '.'.
para = 'There have been twelve seasons of the IPL tournament. The current IPL title holders are the Mumbai Indians, who won the 2019 season. The venue for the 2020 season has been moved, due to the ongoing COVID-19 pandemic; games will now take place in the United Arab Emirates from September 19 through November 10.'

In [16]:
# Score each sentence of the paragraph against the selected question.
para_scores(para, question)

QUESTION : who are the current title holder of ipl ?

LINE : There have been twelve seasons of the IPL tournament
PRED : 0.0000

LINE :  The current IPL title holders are the Mumbai Indians, who won the 2019 season
PRED : 1.0000

LINE :  The venue for the 2020 season has been moved, due to the ongoing COVID-19 pandemic; games will now take place in the United Arab Emirates from September 19 through November 10
PRED : 0.0000

