# Import PyTorch and BERT

In [1]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from IPython.display import clear_output

PRETRAINED_MODEL_NAME = "bert-base-uncased"  

# import the tokenizer which is used on this pretrained model
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

clear_output()
print("PyTorch version : ", torch.__version__)

PyTorch version :  1.6.0


# Import Data

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('./data/data_place_of_birth_unlabel.csv')

In [4]:
drop_row = []
for i in range(len(df)):
    sentence = df.iloc[i]['sentence']
    if type(sentence) == float:
        drop_row.append(i)

df.drop(drop_row, inplace=True)

In [5]:
print(len(df))

134236


# Text Cleaning

In [6]:
print('For this data')
print("Number of sentences : ", len(df))
print("Longest sentence\'s length : " + str(df.sentence.map(len).max()))
print("Average length of the sentences : " + str(df.sentence.map(len).mean()))

For this data
Number of sentences :  134236
Longest sentence's length : 1279
Average length of the sentences : 117.97902202091838


### Limit the tweet max length

In [7]:
def max_text_length(text, length):
    text = text[:length]
    return text

In [8]:
df['sentence'] = df['sentence'].map(lambda x: max_text_length(x, 1000))

df

Unnamed: 0,sentence,wikidata,label,raw_sentence
0,ludmila manicler (born 6 july 1987) is an arge...,place of birth San Pedro,,Ludmila Manicler (born 6 July 1987) is an Arge...
1,she is a former member of the argentina women'...,place of birth San Pedro,,She is a former member of the Argentina women'...
2,li ge (born 12 april 1969) is a chinese former...,place of birth Zigong,,Li Ge (born 12 April 1969) is a Chinese former...
3,"kazuhiro ""daimajin"" sasaki (佐々木 主浩 sasaki kaz...",place of birth Sendai,,"Kazuhiro ""Daimajin"" Sasaki (佐々木 主浩 Sasaki Kaz..."
4,he played his entire npb career with the yokoh...,place of birth Sendai,,He played his entire NPB career with the Yokoh...
...,...,...,...,...
134977,"he served as chairman of the board, chairman, ...",place of birth Cardiff,,"He served as chairman of the board, chairman, ..."
134978,"levon isayevich mirzoyan (; ) (november 14, 18...",place of birth Shusha,,"Levon Isayevich Mirzoyan (; ) (November 14, 18..."
134979,he was executed during the great purge.,place of birth Shusha,,He was executed during the Great Purge.
134980,benjamin brian colin parker (born 8 november 1...,place of birth Pontefract,,Benjamin Brian Colin Parker (born 8 November 1...


In [9]:
df.to_csv("./data/test.csv", index=False)

## Make Dataset

In [10]:
from torch.utils.data import Dataset

class WikiDataset(Dataset):
    # read the tsv we make and initialize some parameters
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "validation", "test"]
        self.mode = mode
        self.df = pd.read_csv('./data/' + mode + ".csv")
        self.len = len(self.df)
        self.tokenizer = tokenizer  # use BERT tokenizer
    
    # define a function that reutrn a training or testing data
    def __getitem__(self, idx):
        if self.mode == "test" or self.mode == "validation":
            sentence, wikidata = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            sentence, wikidata, label_id = self.df.iloc[idx, :].values
            label_tensor = torch.tensor(label_id)
            
        # BERT tokens
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(sentence)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)
        
        tokens_b = self.tokenizer.tokenize(wikidata)
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a
        
        # convert hole token sequence into index sequence
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # segments_tensor
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b, dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len

### Make testset

In [11]:
trainset = WikiDataset("test", tokenizer=tokenizer)

## DataLoader

In [12]:
"""
Create a DataLoader that can return a mini-batch each time
This DataLoader works with the 'WikiDataset' we define previously
we need 4 tensors when training a BERT model：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# The input samples of this function is a list,
# every element in it is a sample return by the 'WikiDataset'

# Every sample contains 3 tensors : 
# - tokens_tensor
# - segments_tensor
# - label_tensor

# It will procecss zero padding on the first two tensors,
# then create a masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # with labels or not
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pading
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)
    
    # attention masks, 
    # set the locations that are not zero padding tokens_tensors to 1 in order to let bert only focus on those tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [13]:
testloader = DataLoader(trainset, batch_size=128, collate_fn=create_mini_batch)

## Load model

In [14]:
model = torch.load('model_place_of_birth')

In [15]:
model.config

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 3,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 28996
}

## Prediction

In [16]:
def get_predictions(model, dataloader):
    predictions = None
      
    with torch.no_grad():
        for data in dataloader:
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # first 3 tensors are tokens, segments and masks 
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            pred = logits.data
                
            # record current batch
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    return predictions

In [17]:
# run the model on GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

device: cuda:0


In [18]:
%%time

with torch.no_grad():
    preds = get_predictions(model, testloader)
torch.cuda.empty_cache()

Wall time: 15min 40s


In [19]:
preds

tensor([[-1.4852, -2.1898,  3.8251],
        [-1.7556, -1.7699,  3.5746],
        [ 0.2725, -2.1025,  1.7854],
        ...,
        [-1.9682, -1.9183,  3.9040],
        [-0.6922, -2.3805,  2.8402],
        [-1.7205, -1.5201,  3.5702]], device='cuda:0')

In [20]:
m = torch.nn.Softmax(dim=0)
consistence_prob = []
inconsistence_prob = []
irrelenant_prob = []

for i in range(len(preds)):
    value_0 = m(preds[i])[0].cpu().numpy().tolist()
    value_1 = m(preds[i])[1].cpu().numpy().tolist()
    value_2 = m(preds[i])[2].cpu().numpy().tolist()
    consistence_prob.append(value_0)
    inconsistence_prob.append(value_1)
    irrelenant_prob.append(value_2)

In [21]:
df['0_prob'] = consistence_prob
df['1_prob'] = inconsistence_prob
df['2_prob'] = irrelenant_prob

In [22]:
df_1 = df.sort_values(by=['1_prob'], ascending=False)

In [23]:
df_1 = df[df['1_prob']>0.33]

In [24]:
df.to_csv('./data/predicted_label_first_two_sentence_all_label.csv', index=False)

In [25]:
import os
os.remove("./data/test.csv")

In [26]:
for i in range(len(df_1)):
    print(df_1.iloc[i]['sentence'])
    print(df_1.iloc[i]['wikidata'])
    print(df_1.iloc[i]['label'])
    print('')

diego klattenhoff (born november 30, 1979) is a canadian actor known for his portrayals of mike faber in the showtime series homeland and as fbi agent donald ressler in the blacklist
place of birth French River, Pictou County
nan

alex caruso (born february 28, 1994) is an american professional basketball player for the los angeles lakers of the national basketball association (nba)
place of birth College Station
nan

jean-baptiste-camille corot ( ,  , ; july 16, 1796his birth certificate initially indicated 27 messidor (july 15), but this was corrected to 28 – february 22, 1875) was a french landscape and portrait painter  as well as a printmaker in etching
place of birth rue du Bac
nan

jon watts (born june 28, 1981) is an american film director, producer, and screenwriter
place of birth Fountain
nan

marat zhaksylykuly abiyev (, marat jaqsylyquly ábıev; born september 6, 1989) is a kazakh businessman, author of the kazakhstani dream book
place of birth Kandyagash
nan

lim yong-kyu (

# Proportion of each label

In [27]:
m = torch.nn.Softmax(dim=0)
labels_prob = []

for i in range(len(preds)):
    value_list = m(preds[i]).cpu().numpy().tolist()
    pred_label = value_list.index(max(value_list))
    labels_prob.append(pred_label)

In [28]:
df['label'] = labels_prob
df

Unnamed: 0,sentence,wikidata,label,raw_sentence,0_prob,1_prob,2_prob
0,ludmila manicler (born 6 july 1987) is an arge...,place of birth San Pedro,2,Ludmila Manicler (born 6 July 1987) is an Arge...,0.004904,0.002424,0.992672
1,she is a former member of the argentina women'...,place of birth San Pedro,2,She is a former member of the Argentina women'...,0.004797,0.004729,0.990474
2,li ge (born 12 april 1969) is a chinese former...,place of birth Zigong,2,Li Ge (born 12 April 1969) is a Chinese former...,0.177534,0.016512,0.805954
3,"kazuhiro ""daimajin"" sasaki (佐々木 主浩 sasaki kaz...",place of birth Sendai,0,"Kazuhiro ""Daimajin"" Sasaki (佐々木 主浩 Sasaki Kaz...",0.973907,0.007905,0.018188
4,he played his entire npb career with the yokoh...,place of birth Sendai,2,He played his entire NPB career with the Yokoh...,0.002745,0.002493,0.994762
...,...,...,...,...,...,...,...
134977,"he served as chairman of the board, chairman, ...",place of birth Cardiff,2,"He served as chairman of the board, chairman, ...",0.002701,0.002249,0.995050
134978,"levon isayevich mirzoyan (; ) (november 14, 18...",place of birth Shusha,2,"Levon Isayevich Mirzoyan (; ) (November 14, 18...",0.008293,0.005563,0.986145
134979,he was executed during the great purge.,place of birth Shusha,2,He was executed during the Great Purge.,0.002800,0.002944,0.994256
134980,benjamin brian colin parker (born 8 november 1...,place of birth Pontefract,2,Benjamin Brian Colin Parker (born 8 November 1...,0.028257,0.005223,0.966519


In [29]:
df['label'].value_counts()

2    109338
0     24869
1        29
Name: label, dtype: int64

#   

In [30]:
df_0 = df.sort_values(by=['0_prob'], ascending=False)

In [31]:
for i in range(20):
    print(df_0.iloc[i]['sentence'])
    print(df_0.iloc[i]['wikidata'])
    print(df_0.iloc[i]['0_prob'])
    print(' ')
    print(' ')

he was born in blue island, illinois, raised in tinley park, illinois, graduated in the class of 1962 from bremen high school (midlothian, illinois) in midlothian, illinois  and at the time of his death resided in monrovia, indiana.
place of birth Tinley Park
0.9895918965339661
 
 
dong yu (chinese: 董宇; born 15 july 1994 in qingdao) is a chinese footballer who plays for hangzhou greentown in the china league one.
place of birth Qingdao
0.9882164597511292
 
 
elisabeth clara heath-sladen (1 february 1946sladen was born in 1946, though this was often erroneously reported as 1948.
place of birth Liverpool
0.9876894950866699
 
 
fløgstad was born in the industrial city of sauda in ryfylke, rogaland
place of birth Sauda
0.9875656962394714
 
 
he was born 10 december 1934 at yunlin county in taiwan, and graduated from national taiwan normal university in 1958 with a degree in english
place of birth Beigang, Yunlin
0.9872835874557495
 
 
he was born in yilan, taiwan.lan cheng-lung at douban.c