# Import PyTorch and BERT

In [1]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from IPython.display import clear_output

PRETRAINED_MODEL_NAME = "bert-base-cased"  # checkpoint identifier handed to from_pretrained below

# Load the tokenizer that belongs to this pretrained checkpoint, so the text is split
# into the same word pieces / vocabulary ids the model was trained with.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Wipe whatever this cell has printed so far, leaving only the version line below.
clear_output()
print("PyTorch version : ", torch.__version__)

PyTorch version :  1.6.0


# Import Data

In [2]:
import pandas as pd

In [3]:
# Load the unlabeled (sentence, wikidata) pairs that will be scored by the model.
# NOTE(review): the file name says "place_of_death", but the rows displayed further down
# and the model loaded later both refer to "place of birth" — confirm this is the intended file.
df = pd.read_csv('./data/data_place_of_death_unlabel.csv')

In [4]:
# Drop every row whose 'sentence' is not text. pandas reads an empty CSV cell as a
# float NaN, which is what the float check detects.
#
# One vectorised mask replaces a Python loop of `df.iloc[i]` lookups (one temporary
# Series per row). Filtering by mask is also correct for any index, whereas collecting
# row *positions* and passing them to the label-based `df.drop` only works while the
# index is the default 0..n-1 range.
_sentence_missing = df['sentence'].map(lambda sentence: isinstance(sentence, float)).astype(bool)

# Index labels of the rows being removed, kept under the original name for inspection.
drop_row = df.index[_sentence_missing].tolist()

# Surviving rows keep their original index labels (no reset). `.copy()` gives an
# independent frame so later column assignments do not raise SettingWithCopyWarning.
df = df[~_sentence_missing].copy()

In [5]:
# Number of rows left after the rows without a sentence were dropped.
print(len(df))

159902


# Text Cleaning

In [6]:
# Corpus overview: row count plus the longest and the mean sentence length,
# both measured with len() on each sentence (i.e. in characters for strings).
sentence_lengths = df.sentence.map(len)
longest_sentence = sentence_lengths.max()
average_sentence = sentence_lengths.mean()

print('For this data')
print("Number of sentences : ", len(df))
print("Longest sentence\'s length : " + str(longest_sentence))
print("Average length of the sentences : " + str(average_sentence))

For this data
Number of sentences :  159902
Longest sentence's length : 2910
Average length of the sentences : 129.85074608197522


### Limit the maximum sentence length

In [7]:
def max_text_length(text, length):
    """Return the leading part of ``text``, cut with the slice ``[:length]``.

    Args:
        text: any sliceable sequence (a string in this notebook).
        length: slice end; a non-negative value keeps at most that many leading
            items, a negative value follows normal Python slice semantics.

    Returns:
        ``text[:length]`` — the input itself is not modified.
    """
    return text[:length]

In [8]:
# Cap every sentence at 1000 characters; Series.apply forwards `length=1000`
# to max_text_length for each element.
df['sentence'] = df['sentence'].apply(max_text_length, length=1000)

df

Unnamed: 0,sentence,wikidata,label
0,ludmila manicler born 6 july 1987 is an argent...,place of birth San Pedro,
1,she is a former member of the argentina women'...,place of birth San Pedro,
2,li ge born 12 april 1969 is a chinese former g...,place of birth Zigong,
3,"kazuhiro ""daimajin"" sasaki 佐々木 主浩 sasaki kazuh...",place of birth Sendai,
4,he played his entire npb career with the yokoh...,place of birth Sendai,
...,...,...,...
159902,"he served as chairman of the board, chairman, ...",place of birth Cardiff,
159903,"levon isayevich mirzoyan ; november 14, 1897 –...",place of birth Shusha,
159904,mirzoyan was born in the village of ashan in s...,place of birth Shusha,
159905,benjamin brian colin parker born 8 november 19...,place of birth Pontefract,


In [9]:
# Write the cleaned frame to './data/test.csv', the path WikiDataset("test", ...) reads
# ('./data/' + mode + '.csv'). index=False keeps the row index out of the file, so the
# first two CSV columns are the ones WikiDataset takes as sentence and wikidata.
df.to_csv("./data/test.csv", index=False)

## Make Dataset

In [10]:
from torch.utils.data import Dataset

class WikiDataset(Dataset):
    """(sentence, wikidata) pairs encoded as BERT sentence-pair inputs.

    Every item is the token sequence ``[CLS] sentence [SEP] wikidata [SEP]``
    converted to vocabulary ids, together with its segment ids and — in "train"
    mode only — its label.
    """

    def __init__(self, mode, tokenizer, max_len=512):
        """Read ``./data/<mode>.csv`` and remember the tokenizer.

        Args:
            mode: one of "train", "validation" or "test"; selects the CSV file and
                whether labels are returned (only "train" returns them).
            tokenizer: object providing ``tokenize(text)`` and
                ``convert_tokens_to_ids(tokens)`` (the BERT tokenizer here).
            max_len: upper bound on the encoded length, special tokens included.
                Defaults to 512, the model's ``max_position_embeddings``; longer
                inputs make BERT fail on its position embeddings. Pass ``None``
                to disable truncation.

        Raises:
            AssertionError: if ``mode`` is not one of the three accepted values.
            ValueError: if ``max_len`` is smaller than the 3 special tokens.
        """
        assert mode in ["train", "validation", "test"]
        if max_len is not None and max_len < 3:
            raise ValueError("max_len must be at least 3 to hold [CLS] and two [SEP] tokens")
        self.mode = mode
        self.df = pd.read_csv('./data/' + mode + ".csv")
        self.len = len(self.df)
        self.tokenizer = tokenizer  # use BERT tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_tensor)`` for row ``idx``.

        ``label_tensor`` is ``None`` in "test" and "validation" mode.
        """
        if self.mode == "test" or self.mode == "validation":
            # Only the first two columns are used; any label column is ignored.
            sentence, wikidata = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            # Training rows must have exactly three columns: sentence, wikidata, label.
            sentence, wikidata, label_id = self.df.iloc[idx, :].values
            label_tensor = torch.tensor(label_id)

        tokens_a = self.tokenizer.tokenize(sentence)
        tokens_b = self.tokenizer.tokenize(wikidata)

        # Keep the pair within max_len. The wikidata statement is kept whole where
        # possible and the tail of the sentence is trimmed.
        if self.max_len is not None:
            budget = max(self.max_len - 3, 0)  # room left after [CLS] and two [SEP]
            tokens_b = tokens_b[:budget]
            tokens_a = tokens_a[:budget - len(tokens_b)]

        # First segment: [CLS] sentence [SEP]
        word_pieces = ["[CLS]"] + tokens_a + ["[SEP]"]
        len_a = len(word_pieces)

        # Second segment: wikidata [SEP]
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a

        # Convert the whole token sequence into a vocabulary-id sequence.
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # Segment ids: 0 for the first segment, 1 for the second.
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b, dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        """Number of rows read from the CSV file."""
        return self.len

### Make testset

In [11]:
# NOTE(review): despite its name, `trainset` holds the unlabeled "test" split
# (./data/test.csv). The name is kept because the DataLoader cell refers to it.
trainset = WikiDataset("test", tokenizer=tokenizer)

## DataLoader

In [12]:
"""
Create a DataLoader that can return a mini-batch each time
This DataLoader works with the 'WikiDataset' we define previously
we need 4 tensors when training a BERT model：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# The input `samples` of this function is a list;
# every element in it is a sample returned by 'WikiDataset'.

# Every sample contains 3 items:
# - tokens_tensor
# - segments_tensor
# - label_tensor (None when the sample is unlabeled)

# The function zero-pads the first two tensors to a common length,
# then creates the matching masks_tensors.
def create_mini_batch(samples):
    """Collate a list of WikiDataset samples into one zero-padded mini-batch.

    Args:
        samples: non-empty list of ``(tokens_tensor, segments_tensor, label_tensor)``
            tuples. Whether the batch is labeled is decided from the first sample.

    Returns:
        ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)``. The first
        three have shape (batch_size, longest sequence in the batch); ``label_ids``
        is the stacked labels, or ``None`` when the first sample has no label.
    """
    # Labeled or not is decided by the first sample only.
    if samples[0][2] is None:
        label_ids = None
    else:
        label_ids = torch.stack([sample[2] for sample in samples])

    # Right-pad every sequence with zeros up to the longest one in the batch.
    tokens_tensors = pad_sequence([sample[0] for sample in samples], batch_first=True)
    segments_tensors = pad_sequence([sample[1] for sample in samples], batch_first=True)

    # Attention mask: 1 where the token id is non-zero, 0 on the zero padding,
    # so BERT only attends to real tokens.
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [13]:
# Mini-batches of 128 samples, padded and masked by create_mini_batch. No shuffle
# argument is passed, so batches follow dataset order (DataLoader's default) — the
# predictions are later written back to the frame row by row, which relies on that order.
testloader = DataLoader(trainset, batch_size=128, collate_fn=create_mini_batch)

## Load model

In [14]:
# Load the fine-tuned classifier, which was saved as a whole pickled module.
# SECURITY: torch.load unpickles the file and can therefore execute arbitrary code —
# only load checkpoints that come from a trusted source.
# map_location="cpu" lets the load succeed even if the checkpoint was saved from a GPU
# that is not available on this machine; the model is moved to the selected device in
# a later cell.
# NOTE(review): on PyTorch >= 2.6 torch.load defaults to weights_only=True and refuses a
# fully pickled module; pass weights_only=False there (not accepted by the 1.6.0 used here).
model = torch.load('model_place_of_birth', map_location="cpu")

In [15]:
# Display the configuration stored on the loaded model (printed below this cell).
model.config

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 3,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 28996
}

## Prediction

In [16]:
def get_predictions(model, dataloader):
    """Run ``model`` over every batch of ``dataloader`` and return the stacked logits.

    Args:
        model: ``torch.nn.Module`` called with the keyword arguments ``input_ids``,
            ``token_type_ids`` and ``attention_mask``; element 0 of its output is
            taken as the logits.
        dataloader: iterable of batches whose first three elements are the token,
            segment and mask tensors. Further elements (e.g. labels) are ignored.

    Returns:
        The logits of all batches concatenated along dim 0, on the model's device,
        or ``None`` if the dataloader yields no batch.
    """
    batch_logits = []

    # Send every batch to wherever the model lives instead of hard-coding "cuda:0".
    device = next(model.parameters()).device

    # Inference must run in eval mode: a module that was pickled while training would
    # otherwise keep dropout active and give noisy predictions. The previous mode is
    # restored afterwards so the caller's model state is left as it was.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for data in dataloader:
                # first 3 tensors are tokens, segments and masks
                tokens_tensors, segments_tensors, masks_tensors = [
                    t.to(device) for t in data[:3]
                ]
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)

                # record current batch
                batch_logits.append(outputs[0].detach())
    finally:
        model.train(was_training)

    if not batch_logits:
        return None

    # Concatenate once at the end; concatenating inside the loop re-copies every
    # earlier prediction on each batch.
    return torch.cat(batch_logits)

In [17]:
# run the model on GPU
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
# Move the model's parameters to the chosen device; get_predictions looks at where the
# parameters live to decide where the input batches have to go.
model = model.to(device)

device: cuda:0


In [18]:
%%time

# get_predictions already disables gradient tracking internally, so this outer
# no_grad() is redundant but harmless.
with torch.no_grad():
    preds = get_predictions(model, testloader)
# Hand cached GPU memory that is no longer occupied back to the driver.
torch.cuda.empty_cache()

Wall time: 26min 53s


In [19]:
# Raw logits, one row per input pair; three columns, matching num_labels = 3 in the
# model config printed earlier.
preds

tensor([[ 2.7095, -2.2033, -1.1276],
        [-2.1357, -2.3624,  4.7262],
        [ 2.3143, -1.2662, -1.9053],
        ...,
        [ 3.5406, -1.4478, -2.3116],
        [ 2.0807, -2.1764, -1.0328],
        [-1.9644, -1.9608,  4.6672]], device='cuda:0')

In [20]:
# Turn every row of logits into class probabilities and keep the probability of class 1.
# NOTE(review): class index 1 is presumably the "inconsistent" label, judging by the
# variable name — confirm against the label encoding used for training.
#
# dim=-1 normalises over the last axis, so `m` behaves exactly as before on a single 1-D
# row of logits and also handles the whole (n_rows, n_classes) matrix in one call. That
# replaces one softmax and one device-to-host copy per row with one batched operation.
m = torch.nn.Softmax(dim=-1)
inconsistence_prob = m(preds)[:, 1].cpu().numpy().tolist()

In [21]:
# Store the scores in the 'label' column. Assigning a plain list is positional, so this
# relies on `inconsistence_prob` being in the same row order as `df`.
# NOTE(review): that holds only if test.csv was written from this same frame and the
# DataLoader did not shuffle — confirm before trusting the pairing.
df['label'] = inconsistence_prob
df

Unnamed: 0,sentence,wikidata,label
0,ludmila manicler born 6 july 1987 is an argent...,place of birth San Pedro,0.007146
1,she is a former member of the argentina women'...,place of birth San Pedro,0.000833
2,li ge born 12 april 1969 is a chinese former g...,place of birth Zigong,0.026727
3,"kazuhiro ""daimajin"" sasaki 佐々木 主浩 sasaki kazuh...",place of birth Sendai,0.003321
4,he played his entire npb career with the yokoh...,place of birth Sendai,0.001312
...,...,...,...
159902,"he served as chairman of the board, chairman, ...",place of birth Cardiff,0.001119
159903,"levon isayevich mirzoyan ; november 14, 1897 –...",place of birth Shusha,0.001601
159904,mirzoyan was born in the village of ashan in s...,place of birth Shusha,0.006751
159905,benjamin brian colin parker born 8 november 19...,place of birth Pontefract,0.013378


In [22]:
# Highest score first.
df = df.sort_values(by=['label'], ascending=False)

In [27]:
# Keep only the rows whose score is above 0.33.
# NOTE(review): 0.33 is presumably ~1/3, the uniform probability over the model's
# 3 classes — confirm this is the intended threshold.
df_1 = df[df['label']>0.33]

In [33]:
# Save the shortlisted rows; index=False leaves the row index out of the file.
df_1.to_csv('./data/predicted_label_first_two_sentence.csv', index=False)

In [36]:
import os
# Delete the intermediate CSV that was written for WikiDataset. os.remove raises
# FileNotFoundError if the file is already gone, e.g. when this cell is run twice.
os.remove("./data/test.csv")

In [37]:
# Print every shortlisted row as: sentence, wikidata statement, score, then an empty line.
# Walking the three columns in parallel avoids building a row Series three times per row.
for sentence, wikidata, label in zip(df_1['sentence'], df_1['wikidata'], df_1['label']):
    print(sentence)
    print(wikidata)
    print(label)
    print('')

born in fenghuang, xiangxi prefecture of hunan, china, xiong was also a chinese scholar
place of birth fenghuang
0.9822410941123962

karim alami born 24 may 1973 is a retired tennis player from morocco, who turned professional in 1990
place of birth casablanca
0.9809897541999817

ahmed ben bella was born in maghnia, in the former department of oran, western algeria,lyes laribi , l'algérie des généraux, max milo, 2007, p.11 on 25 december 1916,shown as 1916 in some sources, but his father changed his year of birth from 1916 to enable him to leave school early and help him on his farm during the height of the french colonial period
place of birth ouedjda
0.9801813960075378

issam el adoua born 9 december 1986 is a moroccan footballer who plays for al dhafra as a central defender.issam el adoua no vitória por dois anos issam el adoua in vitória for two years; vitória guimarães' official website, 4 june 2011
place of birth casablanca
0.9785626530647278

gad elmaleh gād el-māleḥ; born 19 ap