# Import PyTorch and BERT

In [1]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from IPython.display import clear_output

PRETRAINED_MODEL_NAME = "bert-base-cased"  

# import the tokenizer which is used on this pretrained model
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

clear_output()
print("PyTorch version : ", torch.__version__)

PyTorch version :  1.6.0


# Import Data

In [2]:
import pandas as pd

In [4]:
df = pd.read_csv('./data/data_place_of_death_unlabel.csv')

In [6]:
df = df[:5000]

# Text Cleaning

In [11]:
print('For this data')
print("Number of sentences : ", len(df))
print("Longest sentence\'s length : " + str(df.sentence.map(len).max()))
print("Average length of the sentences : " + str(df.sentence.map(len).mean()))

For this data
Number of sentences :  5000
Longest sentence's length : 1200
Average length of the sentences : 128.044


### Limit the tweet max length

In [9]:
def max_text_length(text, length):
    text = text[:length]
    return text

In [10]:
df['sentence'] = df['sentence'].map(lambda x: max_text_length(x, 1200))

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,sentence,wikidata,label
0,ludmila manicler born 6 july 1987 is an argent...,place of birth San Pedro,
1,she is a former member of the argentina women'...,place of birth San Pedro,
2,li ge born 12 april 1969 is a chinese former g...,place of birth Zigong,
3,"kazuhiro ""daimajin"" sasaki 佐々木 主浩 sasaki kazuh...",place of birth Sendai,
4,he played his entire npb career with the yokoh...,place of birth Sendai,
...,...,...,...
4995,clemens fritz born 7 december 1980 is a german...,place of birth Erfurt,
4996,he is mostly known for his 11-year spell at we...,place of birth Erfurt,
4997,"jim elmer larue august 11, 1925 – march 29, 20...",place of birth Clinton,
4998,he served as the head coach at the university ...,place of birth Clinton,


In [12]:
df.to_csv("test.csv", index=False)

## Make Dataset

In [13]:
from torch.utils.data import Dataset

class WikiDataset(Dataset):
    # read the tsv we make and initialize some parameters
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "validation", "test"]
        self.mode = mode
        self.df = pd.read_csv(mode + ".csv")
        self.len = len(self.df)
        self.tokenizer = tokenizer  # use BERT tokenizer
    
    # define a function that reutrn a training or testing data
    def __getitem__(self, idx):
        if self.mode == "test" or self.mode == "validation":
            sentence, wikidata = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            sentence, wikidata, label_id = self.df.iloc[idx, :].values
            label_tensor = torch.tensor(label_id)
            
        # BERT tokens
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(sentence)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)
        
        tokens_b = self.tokenizer.tokenize(wikidata)
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a
        
        # convert hole token sequence into index sequence
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # segments_tensor
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b, dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len

### Make testset

In [14]:
trainset = WikiDataset("test", tokenizer=tokenizer)

## DataLoader

In [15]:
"""
Create a DataLoader that can return a mini-batch each time
This DataLoader works with the 'WikiDataset' we define previously
we need 4 tensors when training a BERT model：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# The input samples of this function is a list,
# every element in it is a sample return by the 'WikiDataset'

# Every sample contains 3 tensors : 
# - tokens_tensor
# - segments_tensor
# - label_tensor

# It will procecss zero padding on the first two tensors,
# then create a masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # with labels or not
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pading
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)
    
    # attention masks, 
    # set the locations that are not zero padding tokens_tensors to 1 in order to let bert only focus on those tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [24]:
testloader = DataLoader(trainset, batch_size=128, collate_fn=create_mini_batch)

## Load model

In [17]:
model = torch.load('model_place_of_birth')

In [18]:
model.config

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 28996
}

## Prediction

In [38]:
def get_predictions(model, dataloader):
    predictions = None
      
    with torch.no_grad():
        for data in dataloader:
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # first 3 tensors are tokens, segments and masks 
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            pred = logits.data
                
            # record current batch
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    return predictions

In [39]:
# run the model on GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

device: cuda:0


In [40]:
with torch.no_grad():
    preds = get_predictions(model, testloader)
torch.cuda.empty_cache()

In [41]:
preds

tensor([[ 2.7277, -2.7563],
        [ 2.3453, -2.4456],
        [ 2.8337, -2.7861],
        ...,
        [ 2.7025, -2.8294],
        [ 1.7974, -2.5492],
        [ 2.8001, -2.9380]], device='cuda:0')

In [59]:
m = torch.nn.Softmax(dim=0)
inconsistence_prob = []

for i in range(len(preds)):
    value = m(preds[i])[1].cpu().numpy().tolist()
    inconsistence_prob.append(value)

In [61]:
df['label'] = inconsistence_prob
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,sentence,wikidata,label
0,ludmila manicler born 6 july 1987 is an argent...,place of birth San Pedro,0.004135
1,she is a former member of the argentina women'...,place of birth San Pedro,0.008236
2,li ge born 12 april 1969 is a chinese former g...,place of birth Zigong,0.003612
3,"kazuhiro ""daimajin"" sasaki 佐々木 主浩 sasaki kazuh...",place of birth Sendai,0.003284
4,he played his entire npb career with the yokoh...,place of birth Sendai,0.008221
...,...,...,...
4995,clemens fritz born 7 december 1980 is a german...,place of birth Erfurt,0.005101
4996,he is mostly known for his 11-year spell at we...,place of birth Erfurt,0.002713
4997,"jim elmer larue august 11, 1925 – march 29, 20...",place of birth Clinton,0.003943
4998,he served as the head coach at the university ...,place of birth Clinton,0.012785


In [64]:
df = df.sort_values(by=['label'], ascending=False)

In [67]:
for i in range(10):
    print(df.iloc[i]['sentence'])
    print(df.iloc[i]['wikidata'])
    print(df.iloc[i]['label'])
    print('')

she was the country's first female prime minister, attorney general, and leader of the opposition, pnm lose to peoples partnership in trinidad elections 2010]
place of birth Penal
0.993681788444519

november 1879 mit seiner majestät alphons xii., könig von spanien, inhaber des infanterie-regiments nr
place of birth Židlochovice
0.977453887462616

thumb|righteousness permits no turning back – hong kong art museum exhibit
place of birth Shunde District
0.9736953973770142

he portrays jake sully in the avatar film series, marcus wright in terminator salvation, and perseus in clash of the titans as well as its sequel wrath of the titans
place of birth Godalming
0.9644454717636108

in 2000 she was declared a saint by the catholic church
place of birth Sultanate of Darfur
0.962001383304596

her first piece, a one-shot, was about old-style talking demons
place of birth Kitakyūshū
0.959858775138855

he was an instrumental contributor to a number of major events in hungarian history, including 