# Import PyTorch and BERT

In [1]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from IPython.display import clear_output

# Name of the pretrained checkpoint whose tokenizer is loaded below.
# NOTE(review): this is a *cased* checkpoint, while the sentences displayed
# later in this notebook appear lower-cased — confirm this matches how the
# saved classifier was trained.
PRETRAINED_MODEL_NAME = "bert-base-cased"  

# Load the tokenizer that belongs to this pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Clear anything this cell has displayed so far, so that only the version
# line below remains in the output.
clear_output()
print("PyTorch version : ", torch.__version__)

PyTorch version :  1.6.0


# Import Data

In [82]:
import pandas as pd

In [83]:
# Load the previously scored sentence/Wikidata pairs. The cells below read the
# 'sentence', 'wikidata' and 'label' columns of this frame.
df_2 = pd.read_csv('./data/predicted_label_first_two_sentence.csv')

In [84]:
df_2

Unnamed: 0,sentence,wikidata,label
0,sid ahmed ghozali born 31 march 1937 in maghni...,place of birth ouedjda,0.993846
1,ahmed ben bella ; 25 december 1916 – 11 april ...,place of birth ouedjda,0.991789
2,"ahmed ben bella was born in maghnia, in the fo...",place of birth ouedjda,0.991534
3,born lourens alma tadema ; 8 january 1836 – 25...,place of birth Dronryp,0.989448
4,he was a member of the national liberation fro...,place of birth ouedjda,0.985167
...,...,...,...
159897,she is known as one of the members of the japa...,place of birth Hiroshima,0.000752
159898,"and mabel née woodward johnson, in 1931 at aug...",place of birth Augusta,0.000695
159899,"born in saint-étienne-de-lauzon, quebec, rober...",place of birth Lévis,0.000691
159900,"265 bc – 241 bc, the elder son of eudamidas ii...",place of birth Sparta,0.000689


In [125]:
# df      : sentence/Wikidata pairs that get written to ./data/test.csv and scored.
# df_test : the hand-picked original rows together with their earlier score.
df = pd.DataFrame(columns=['sentence', 'wikidata'])
df_test = pd.DataFrame(columns=['sentence', 'wikidata', 'inconsistent_prob'])

In [126]:
# Positional indices (into df_2) of the rows picked for the perturbation experiment.
test_iloc = [30, 39, 50, 52, 53]

# Select every row in one step instead of appending them one at a time:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# rebuilding df_test from df_2 keeps this cell idempotent (re-running it no
# longer duplicates the rows).
df_test = (
    df_2.iloc[test_iloc][['sentence', 'wikidata', 'label']]
    .rename(columns={'label': 'inconsistent_prob'})  # the earlier score is stored under 'label'
    .reset_index(drop=True)                           # 0..n-1, as ignore_index=True produced
)

In [127]:
df_test

Unnamed: 0,sentence,wikidata,inconsistent_prob
0,"born and raised in morocco, he immigrated to t...",place of birth casablanca,0.850393
1,"born in casablanca, el adoua made his senior d...",place of birth casablanca,0.770453
2,"was born on february 10, 1955 in casablanca, t...",place of birth casablanca,0.546604
3,"born in fenghuang, xiangxi prefecture of hunan...",place of birth fenghuang,0.515357
4,"flynn was born in casablanca, morocco but grew...",place of birth casablanca,0.502653


In [128]:
# Specific birthplace token -> coarser region substituted for it.
# Keys are tried in this order and the first one found in a row is replaced.
place_to_region = {'casablanca': 'morocco', 'fenghuang': 'china'}

perturbed_rows = []
for i in range(len(df_test)):
    test_sentence = df_test.iloc[i]['sentence']
    test_wikidata_list = df_test.iloc[i]['wikidata'].split(' ')

    place = next((p for p in place_to_region if p in test_wikidata_list), None)
    if place is None:
        # The original code reached list.remove('fenghuang') here and failed
        # with an opaque message; fail with the same exception type but say why.
        raise ValueError(
            "row %d: none of %s found in wikidata %r"
            % (i, sorted(place_to_region), df_test.iloc[i]['wikidata'])
        )

    # Drop the first occurrence of the specific place and put the region at the end.
    test_wikidata_list.remove(place)
    test_wikidata_list.append(place_to_region[place])
    perturbed_rows.append({'sentence': test_sentence,
                           'wikidata': " ".join(test_wikidata_list)})

# Build the frame once: DataFrame.append was removed in pandas 2.0, and
# rebuilding df here (rather than growing it) makes the cell safe to re-run.
df = pd.DataFrame(perturbed_rows, columns=['sentence', 'wikidata'])

In [129]:
# test_sentence_list = test_sentence.split(' ')
# len_test_sentence = len(test_sentence_list)

In [130]:
# for i in range(len_test_sentence):
#     test_sentence_list.remove(test_sentence_list[i])
#     new_row = {'sentence':" ".join(test_sentence_list), 'wikidata':test_wikidata}
#     df = df.append(new_row, ignore_index=True)
#     test_sentence_list = test_sentence.split(' ')

In [131]:
df

Unnamed: 0,sentence,wikidata
0,"born and raised in morocco, he immigrated to t...",place of birth morocco
1,"born in casablanca, el adoua made his senior d...",place of birth morocco
2,"was born on february 10, 1955 in casablanca, t...",place of birth morocco
3,"born in fenghuang, xiangxi prefecture of hunan...",place of birth china
4,"flynn was born in casablanca, morocco but grew...",place of birth morocco


In [132]:
# Saved as "test.csv" because WikiDataset("test", ...) reads './data/' + mode + ".csv".
# The row index is left out of the file.
df.to_csv("./data/test.csv", index=False)

## Make Dataset

In [133]:
import os

from torch.utils.data import Dataset

class WikiDataset(Dataset):
    """Sentence / Wikidata-statement pairs encoded as BERT sentence-pair inputs.

    The data is read from ``<data_dir>/<mode>.csv``. The first two columns are
    taken as the sentence and the Wikidata statement. In "train" mode each row
    must consist of exactly three values, the third being the label.

    Args:
        mode: one of "train", "validation" or "test".
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)`` (e.g. a BERT tokenizer).
        data_dir: directory holding the CSV files. Defaults to "./data/",
            the location that was previously hard-coded.
    """

    def __init__(self, mode, tokenizer, data_dir="./data/"):
        assert mode in ["train", "validation", "test"]
        self.mode = mode
        self.df = pd.read_csv(os.path.join(data_dir, mode + ".csv"))
        self.len = len(self.df)
        self.tokenizer = tokenizer  # use BERT tokenizer

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_tensor)`` for row ``idx``.

        ``label_tensor`` is None in "test" and "validation" mode.
        """
        if self.mode in ("test", "validation"):
            # Only the first two columns are used; no label is returned.
            sentence, wikidata = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            sentence, wikidata, label_id = self.df.iloc[idx, :].values
            label_tensor = torch.tensor(label_id)

        # First segment: [CLS] + sentence tokens + [SEP]
        word_pieces = ["[CLS]"] + self.tokenizer.tokenize(sentence) + ["[SEP]"]
        len_a = len(word_pieces)

        # Second segment: Wikidata tokens + [SEP]
        word_pieces += self.tokenizer.tokenize(wikidata) + ["[SEP]"]
        len_b = len(word_pieces) - len_a

        # Convert the whole token sequence into an index sequence.
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # Segment ids: 0 for every position of the first segment, 1 for the second.
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b, dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        """Number of rows in the loaded CSV."""
        return self.len

### Make testset

In [134]:
# NOTE(review): despite its name this holds the "test" split (./data/test.csv).
# The name is kept because the DataLoader cell below refers to `trainset`.
trainset = WikiDataset("test", tokenizer=tokenizer)

## DataLoader

In [135]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def create_mini_batch(samples):
    """Collate a list of `WikiDataset` samples into one padded batch.

    Each sample is a 3-tuple ``(tokens_tensor, segments_tensor, label_tensor)``.
    The first two tensors are zero-padded to the longest sequence in the batch
    and an attention mask is derived from the padded tokens.

    Returns:
        ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)``;
        ``label_ids`` is None when the first sample carries no label.
    """
    token_seqs = [sample[0] for sample in samples]
    segment_seqs = [sample[1] for sample in samples]

    # Labels are stacked only when the batch has them (decided by the first sample).
    label_ids = None
    if samples[0][2] is not None:
        label_ids = torch.stack([sample[2] for sample in samples])

    # Zero padding up to the longest sequence of the batch.
    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # Attention mask: 1 wherever the token id is non-zero, 0 on the padding,
    # so that BERT only attends to the real tokens.
    is_real_token = tokens_tensors.ne(0)
    masks_tensors = torch.zeros(tokens_tensors.size(), dtype=torch.long).masked_fill(is_real_token, 1)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [136]:
# Batches are assembled by create_mini_batch (padding + attention masks).
# `shuffle` is not passed, so the DataLoader default applies; the later cell
# that assigns predictions back to df relies on rows coming out in CSV order.
testloader = DataLoader(trainset, batch_size=128, collate_fn=create_mini_batch)

## Load model

In [137]:
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code — only load checkpoints that come from a trusted source.
# map_location='cpu' lets a checkpoint that was saved from a GPU be opened on a
# CPU-only machine; the model is moved to the selected device in a later cell.
model = torch.load('model_place_of_birth', map_location='cpu')

## Prediction

In [138]:
def get_predictions(model, dataloader):
    """Run `model` over every batch of `dataloader` and return the stacked logits.

    Args:
        model: module called as ``model(input_ids=..., token_type_ids=...,
            attention_mask=...)``; the first element of its output is taken
            as the logits.
        dataloader: iterable of batches whose first three items are the
            tokens, segments and masks tensors. Any further items (e.g. a
            label tensor or None) are ignored.

    Returns:
        The logits of all batches concatenated along dim 0, or None when the
        dataloader yields no batch.
    """
    # Send the inputs to wherever the model lives instead of assuming "cuda:0";
    # looked up once rather than on every batch.
    device = next(model.parameters()).device
    batch_logits = []

    with torch.no_grad():
        for data in dataloader:
            # first 3 tensors are tokens, segments and masks
            tokens_tensors, segments_tensors, masks_tensors = [
                t.to(device) for t in data[:3]
            ]
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors)

            # record current batch
            batch_logits.append(outputs[0].detach())

    if not batch_logits:
        return None
    # One concatenation at the end instead of re-copying on every batch.
    return torch.cat(batch_logits)

In [139]:
# Put the model in evaluation mode before running inference.
model.eval()
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

device: cuda:0


In [140]:
%%time

# get_predictions already runs its loop under torch.no_grad(); this outer
# context is therefore redundant, but harmless.
with torch.no_grad():
    preds = get_predictions(model, testloader)
# Release cached GPU memory that is no longer in use.
torch.cuda.empty_cache()

Wall time: 51 ms


In [141]:
# Softmax applied to one prediction row at a time (dim 0 of a single row).
m = torch.nn.Softmax(dim=0)

# For every row keep the probability at class index 1 as a plain Python float.
inconsistence_prob = [
    m(row_logits)[1].cpu().numpy().tolist()
    for row_logits in preds
]

In [142]:
# Attach the new scores to the perturbed rows. This relies on the predictions
# being in the same order as the rows of df.
df['inconsistent_prob'] = inconsistence_prob
df

Unnamed: 0,sentence,wikidata,inconsistent_prob
0,"born and raised in morocco, he immigrated to t...",place of birth morocco,0.489572
1,"born in casablanca, el adoua made his senior d...",place of birth morocco,0.989376
2,"was born on february 10, 1955 in casablanca, t...",place of birth morocco,0.478744
3,"born in fenghuang, xiangxi prefecture of hunan...",place of birth china,0.280171
4,"flynn was born in casablanca, morocco but grew...",place of birth morocco,0.579261


In [81]:
import os

# Remove the temporary CSV that was written for WikiDataset("test", ...).
# A missing file is tolerated so the cell can be re-run (or run after a
# manual clean-up) without raising FileNotFoundError.
try:
    os.remove("./data/test.csv")
except FileNotFoundError:
    pass

## Compare predictions before and after perturbing the Wikidata value

In [145]:
# Columns printed for every row, in display order.
_report_columns = ['sentence', 'wikidata', 'inconsistent_prob']

for row_pos in range(len(df_test)):
    # Original pair with its earlier score ...
    for column in _report_columns:
        print(df_test.iloc[row_pos][column])
    print('-------------------------------')
    # ... followed by the same row of df (perturbed Wikidata value, new score).
    for column in _report_columns:
        print(df.iloc[row_pos][column])
    print('')
    print('')

born and raised in morocco, he immigrated to the united states with his family when he was 13
place of birth casablanca
0.8503934741020203
-------------------------------
born and raised in morocco, he immigrated to the united states with his family when he was 13
place of birth morocco
0.4895717203617096


born in casablanca, el adoua made his senior debuts for his hometown's wydad casablanca, and formed a solid partnership with hicham louissi during his spell at the club
place of birth casablanca
0.7704533934593201
-------------------------------
born in casablanca, el adoua made his senior debuts for his hometown's wydad casablanca, and formed a solid partnership with hicham louissi during his spell at the club
place of birth morocco
0.9893761873245239


was born on february 10, 1955 in casablanca, then part of french morocco
place of birth casablanca
0.5466042757034302
-------------------------------
was born on february 10, 1955 in casablanca, then part of french morocco
place of 