# Import PyTorch and BERT

In [1]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from IPython.display import clear_output

PRETRAINED_MODEL_NAME = "bert-base-cased"  

# import the tokenizer which is used on this pretrained model
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

clear_output()
print("PyTorch version : ", torch.__version__)

PyTorch version :  1.6.0


# Import Data

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('./data/data_place_of_death_unlabel.csv')

In [4]:
drop_row = []
for i in range(len(df)):
    sentence = df.iloc[i]['sentence']
    if type(sentence) == float:
        drop_row.append(i)

df.drop(drop_row, inplace=True)

In [5]:
print(len(df))

159902


# Text Cleaning

In [6]:
print('For this data')
print("Number of sentences : ", len(df))
print("Longest sentence\'s length : " + str(df.sentence.map(len).max()))
print("Average length of the sentences : " + str(df.sentence.map(len).mean()))

For this data
Number of sentences :  159902
Longest sentence's length : 2910
Average length of the sentences : 129.85074608197522


### Limit the tweet max length

In [7]:
def max_text_length(text, length):
    text = text[:length]
    return text

In [8]:
df['sentence'] = df['sentence'].map(lambda x: max_text_length(x, 1000))

df

Unnamed: 0,sentence,wikidata,label
0,ludmila manicler born 6 july 1987 is an argent...,place of birth San Pedro,
1,she is a former member of the argentina women'...,place of birth San Pedro,
2,li ge born 12 april 1969 is a chinese former g...,place of birth Zigong,
3,"kazuhiro ""daimajin"" sasaki 佐々木 主浩 sasaki kazuh...",place of birth Sendai,
4,he played his entire npb career with the yokoh...,place of birth Sendai,
...,...,...,...
159902,"he served as chairman of the board, chairman, ...",place of birth Cardiff,
159903,"levon isayevich mirzoyan ; november 14, 1897 –...",place of birth Shusha,
159904,mirzoyan was born in the village of ashan in s...,place of birth Shusha,
159905,benjamin brian colin parker born 8 november 19...,place of birth Pontefract,


In [9]:
df.to_csv("./data/test.csv", index=False)

## Make Dataset

In [10]:
from torch.utils.data import Dataset

class WikiDataset(Dataset):
    # read the tsv we make and initialize some parameters
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "validation", "test"]
        self.mode = mode
        self.df = pd.read_csv('./data/' + mode + ".csv")
        self.len = len(self.df)
        self.tokenizer = tokenizer  # use BERT tokenizer
    
    # define a function that reutrn a training or testing data
    def __getitem__(self, idx):
        if self.mode == "test" or self.mode == "validation":
            sentence, wikidata = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            sentence, wikidata, label_id = self.df.iloc[idx, :].values
            label_tensor = torch.tensor(label_id)
            
        # BERT tokens
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(sentence)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)
        
        tokens_b = self.tokenizer.tokenize(wikidata)
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a
        
        # convert hole token sequence into index sequence
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # segments_tensor
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b, dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len

### Make testset

In [11]:
trainset = WikiDataset("test", tokenizer=tokenizer)

## DataLoader

In [12]:
"""
Create a DataLoader that can return a mini-batch each time
This DataLoader works with the 'WikiDataset' we define previously
we need 4 tensors when training a BERT model：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# The input samples of this function is a list,
# every element in it is a sample return by the 'WikiDataset'

# Every sample contains 3 tensors : 
# - tokens_tensor
# - segments_tensor
# - label_tensor

# It will procecss zero padding on the first two tensors,
# then create a masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # with labels or not
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pading
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)
    
    # attention masks, 
    # set the locations that are not zero padding tokens_tensors to 1 in order to let bert only focus on those tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [13]:
testloader = DataLoader(trainset, batch_size=128, collate_fn=create_mini_batch)

## Load model

In [14]:
model = torch.load('model_place_of_birth')

In [15]:
model.config

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 3,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 28996
}

## Prediction

In [16]:
def get_predictions(model, dataloader):
    predictions = None
      
    with torch.no_grad():
        for data in dataloader:
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # first 3 tensors are tokens, segments and masks 
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            pred = logits.data
                
            # record current batch
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    return predictions

In [17]:
# run the model on GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

device: cuda:0


In [18]:
%%time

with torch.no_grad():
    preds = get_predictions(model, testloader)
torch.cuda.empty_cache()

Wall time: 28min 17s


In [19]:
preds

tensor([[ 2.6423, -1.8725, -0.2144],
        [-1.9707, -2.4925,  4.3746],
        [ 2.4522, -1.9449, -0.1466],
        ...,
        [ 2.8695, -1.8689, -0.6677],
        [ 2.8081, -1.7425, -0.4461],
        [-1.9700, -2.6761,  5.0075]], device='cuda:0')

In [20]:
m = torch.nn.Softmax(dim=0)
consistence_prob = []
inconsistence_prob = []
irrelenant_prob = []

for i in range(len(preds)):
    value_0 = m(preds[i])[0].cpu().numpy().tolist()
    value_1 = m(preds[i])[1].cpu().numpy().tolist()
    value_2 = m(preds[i])[2].cpu().numpy().tolist()
    consistence_prob.append(value_0)
    inconsistence_prob.append(value_1)
    irrelenant_prob.append(value_2)

In [21]:
df['0_prob'] = consistence_prob
df['1_prob'] = inconsistence_prob
df['2_prob'] = irrelenant_prob

In [22]:
df_1 = df.sort_values(by=['1_prob'], ascending=False)

In [23]:
df_1 = df[df['1_prob']>0.33]

In [24]:
df.to_csv('./data/predicted_label_first_two_sentence_all_label.csv', index=False)

In [25]:
import os
os.remove("./data/test.csv")

In [26]:
for i in range(len(df_1)):
    print(df_1.iloc[i]['sentence'])
    print(df_1.iloc[i]['wikidata'])
    print(df_1.iloc[i]['label'])
    print('')

matthew thomas cain born october 1, 1984, nicknamed the horse, big daddy, and big sugar, is an american former professional baseball pitcher, who played 13 seasons in major league baseball mlb for the san francisco giants
place of birth Dothan
nan

scott andrews born 1 august 1989 is a welsh international rugby union player for pro14 side cardiff blues
place of birth Church Village
nan

william james packwood born may 21, 1993 is a former american soccer player who played as a defender or defensive midfielder
place of birth Concord
nan

hoàng cẩm vân born 31 may 1959 is a vietnamese female singer.dale alan olsen popular music of vietnam the politics of remembering routledge 2008 "cẩm vân" pp48, 139-140, 192, 212, 227cẩm vân - khắc triệu 20 năm ấy, biết bao vui buồn she appeared on vietnam idol season 1, and was scheduled to be a judge on cặp đôi hoàn hảo season 1
place of birth District 1
nan

desmond tremaine mason born october 11, 1977 is an american former professional basketball pl


born november 29, 1987 is an american professional basketball player for the new york knicks of the national basketball association nba
place of birth Wynnewood
nan

zoë yadira saldaña nazario born june 19, 1978 is an american actress
place of birth Passaic
nan

april jeanette mendez born march 19, 1987 is an american author and former professional wrestler
place of birth Union City
nan

john david douglas born june 12, 1956 is a retired american professional basketball player who played in the national basketball association nba
place of birth Town Creek
nan

john bradley holland born december 6, 1956 is a retired american professional basketball player
place of birth Billings
nan

michael james vogel born july 17, 1979 is an american actor and former model
place of birth Abington Township
nan

azu born december 8, 1981 is a japanese r and b singer
place of birth Tsu
nan

is a classical pianist and conductor, born in japan and naturalised in britain, particularly noted for her interp

# Proportion of each label

In [27]:
m = torch.nn.Softmax(dim=0)
labels_prob = []

for i in range(len(preds)):
    value_list = m(preds[i]).cpu().numpy().tolist()
    pred_label = value_list.index(max(value_list))
    labels_prob.append(pred_label)

In [28]:
df['label'] = labels_prob
df

Unnamed: 0,sentence,wikidata,label,0_prob,1_prob,2_prob
0,ludmila manicler born 6 july 1987 is an argent...,place of birth San Pedro,0,0.935978,0.010244,0.053778
1,she is a former member of the argentina women'...,place of birth San Pedro,2,0.001750,0.001039,0.997211
2,li ge born 12 april 1969 is a chinese former g...,place of birth Zigong,0,0.920235,0.011331,0.068435
3,"kazuhiro ""daimajin"" sasaki 佐々木 主浩 sasaki kazuh...",place of birth Sendai,0,0.962115,0.007489,0.030396
4,he played his entire npb career with the yokoh...,place of birth Sendai,2,0.001486,0.000678,0.997835
...,...,...,...,...,...,...
159902,"he served as chairman of the board, chairman, ...",place of birth Cardiff,2,0.001039,0.000437,0.998524
159903,"levon isayevich mirzoyan ; november 14, 1897 –...",place of birth Shusha,2,0.002036,0.000632,0.997332
159904,mirzoyan was born in the village of ashan in s...,place of birth Shusha,0,0.963532,0.008433,0.028034
159905,benjamin brian colin parker born 8 november 19...,place of birth Pontefract,0,0.953129,0.010067,0.036804


In [29]:
df['label'].value_counts()

2    107140
0     52341
1       421
Name: label, dtype: int64

#   

In [30]:
df_0 = df.sort_values(by=['0_prob'], ascending=False)

In [31]:
for i in range(20):
    print(df_0.iloc[i]['sentence'])
    print(df_0.iloc[i]['wikidata'])
    print(df_0.iloc[i]['0_prob'])
    print(' ')
    print(' ')

wilson was born in the parish of glencorse, midlothian to annie clark harper and john wilson, a sheep farmer
place of birth Glencorse
0.9820007681846619
 
 
kathy chow hoi-mei ; born december 6, 1966 in hong kong is a hong kong actress and singer who is widely known for her leading roles in hong kong tvb series during the late 1980s to 1990s such as the breaking point and time before time
place of birth Hong Kong
0.9804829359054565
 
 
trần thị cẩm ly born 30 march 1970 in saigon better known as cẩm ly, is a vietnamese pop singer, who is also known for southern vietnam folk songs
place of birth Ho Chi Minh City
0.9793263673782349
 
 
mings was born in bath, avon to former non-league striker adie mings
place of birth Bath
0.9793061017990112
 
 
odilo pedro scherer ; born september 21, 1949 is a brazilian cardinal of the catholic church, who has been the archbishop of são paulo since march 2007
place of birth Cerro Largo, Rio Grande do Sul
0.9791168570518494
 
 
ildikó mincza-nébald born