## Import PyTorch and BERT

In [1]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from IPython.display import clear_output

PRETRAINED_MODEL_NAME = "bert-base-uncased"  

# import the tokenizer which is used on this pretrained model
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

clear_output()
print("PyTorch version : ", torch.__version__)

PyTorch version :  1.6.0


In [2]:
vocab = tokenizer.vocab
print("vocab size : ", len(vocab))

vocab size :  28996


In [3]:
# text = 'yokozeki was born in osaka prefecture on september 11, 1979'
# tokens = tokenizer.tokenize(text)
# ids = tokenizer.convert_tokens_to_ids(tokens)

# print(text)
# print(tokens[:10], '...')
# print(ids[:10], '...')

## Import Data

In [4]:
import pandas as pd
from sklearn.utils import shuffle

In [5]:
# data_place_of_birth / data_place_of_death / data_occupation

data_0 = pd.read_csv('data_place_of_birth_0.csv')
data_1 = pd.read_csv('data_place_of_birth_1.csv')
data_2 = pd.read_csv('data_POB_POD_OCC_2.csv')

In [6]:
data_0_train = data_0[0:900]
data_0_val = data_0[900:1800]
data_0_test = data_0[1800:2700]

data_1_train = data_1[0:100]
data_1_val = data_1[100:200]
data_1_test = data_1[200:300]

data_2_train = data_2[0:1000]
data_2_val = data_2[1000:2000]
data_2_test = data_2[2000:3000]

frame_train = [data_0_train, data_1_train, data_2_train]
frame_val = [data_0_val, data_1_val, data_2_val]
frame_test = [data_0_test, data_1_test, data_2_test]

df_train = shuffle(pd.concat(frame_train, ignore_index=True))
df_val = shuffle(pd.concat(frame_val, ignore_index=True))
df_test = shuffle(pd.concat(frame_test, ignore_index=True))

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [7]:
print("Training Set:"% df_train.columns, df_train.shape)
print("Validation Set:"% df_val.columns, df_val.shape)
print("Test Set:"% df_test.columns, df_test.shape)

Training Set: (2000, 3)
Validation Set: (2000, 3)
Test Set: (2000, 3)


In [8]:
df_train.head()

Unnamed: 0,sentence,wikidata,label
0,"van niekerk was born in kraaifontein, cape tow...",place of birth Cape Town,0
1,"shui junyi , born september 20, 1963 in sanjia...",place of birth Lanzhou,0
2,"huang's family was from meixian, guangdong but...",place of birth Tianjin,0
3,"born in hitchin, hertfordshire, kitson spent h...",place of birth Hitchin,0
4,in june 1841 he completed his first scientific...,place of birth Heilbronn,2


## Text Cleaning

In [9]:
print('For train data')
print("Number of sentences : ", len(df_train))
print("Longest sentence\'s length : " + str(df_train.sentence.map(len).max()))
print("Average length of the sentences : " + str(df_train.sentence.map(len).mean()))

For train data
Number of sentences :  2000
Longest sentence's length : 1247
Average length of the sentences : 131.105


In [10]:
print('For validation data')
print("Number of sentences : ", len(df_val))
print("Longest sentence\'s length : " + str(df_val.sentence.map(len).max()))
print("Average length of the sentences : " + str(df_val.sentence.map(len).mean()))

For validation data
Number of sentences :  2000
Longest sentence's length : 1456
Average length of the sentences : 133.755


In [11]:
print('For test data')
print("Number of sentences : ", len(df_test))
print("Longest sentence\'s length : " + str(df_test.sentence.map(len).max()))
print("Average length of the sentences : " + str(df_test.sentence.map(len).mean()))

For test data
Number of sentences :  2000
Longest sentence's length : 725
Average length of the sentences : 130.024


### Limit the tweet max length

In [12]:
def max_text_length(text, length):
    text = text[:length]
    return text

In [13]:
df_test['sentence'] = df_test['sentence'].map(lambda x: max_text_length(x, 1200))

df_test

Unnamed: 0,sentence,wikidata,label
0,"he was one of the sons of saint clotilda, born...",place of birth Reims,0
1,kim han-sol was born in pyongyang in 1995 and ...,place of birth Pyongyang,0
2,"john robin warren ac born 11 june 1937, in ade...",place of birth Adelaide,0
3,the couple had four children; art was the olde...,place of birth Toledo,0
4,"""neil harbisson, ciborg de colors"", catalunya ...",place of birth Mataró,2
...,...,...,...
1995,allam was born in egypt and raised by muslim p...,place of birth london,1
1996,"his former teams are nec nijmegen, roda jc, fc...",place of birth Assen,2
1997,"born august 6, 1990 in tokyo is a japanese act...",place of birth Tokyo,0
1998,wiles states that he came across fermat's last...,place of birth Cambridge,2


In [14]:
df_val['sentence'] = df_val['sentence'].map(lambda x: max_text_length(x, 1200))

df_val

Unnamed: 0,sentence,wikidata,label
0,pinkerton was not raised in a religious upbrin...,place of birth Glasgow,2
1,his father was a māori farmer and artist of te...,place of birth Raukokore,2
2,maas was born in saarlouis to a catholic famil...,place of birth Saarlouis,0
3,he became the first brazilian badminton player...,place of birth Campinas,2
4,in 2000 he resigned from his position on micro...,place of birth Seattle,2
...,...,...,...
1995,berg graduated from henderson high school in w...,place of birth Philadelphia,2
1996,baumgarten was born in berlin as the fifth of ...,place of birth Berlin,0
1997,"sayers, her life and soul london hodder and st...",place of birth Oxford,2
1998,"muro was born in yanaka, musashi province mode...",place of birth Yanaka,0


In [15]:
len(df_train)

2000

In [16]:
L = 2000
MAX_LENGTH = 300
print('Number of sentences that exceed 300 characters')
print(L - len(df_train[~(df_train.sentence.apply(lambda x : len(x)) > MAX_LENGTH)]))

# print("Number of tweets : ", len(df_train))
# df_train['sentence'] = df_train['sentence'].map(lambda x: max_text_length(x, 300))
df_train = df_train[~(df_train.sentence.apply(lambda x : len(x)) > MAX_LENGTH)]

df_train

Number of sentences that exceed 300 characters
68


Unnamed: 0,sentence,wikidata,label
0,"van niekerk was born in kraaifontein, cape tow...",place of birth Cape Town,0
1,"shui junyi , born september 20, 1963 in sanjia...",place of birth Lanzhou,0
2,"huang's family was from meixian, guangdong but...",place of birth Tianjin,0
3,"born in hitchin, hertfordshire, kitson spent h...",place of birth Hitchin,0
4,in june 1841 he completed his first scientific...,place of birth Heilbronn,2
...,...,...,...
1995,until he was pressured to retire shortly befor...,place of birth Greenwich,2
1996,lindenberg started his musical career as a dru...,place of birth Gronau,2
1997,mott was born in leeds to lilian mary reynolds...,place of birth Leeds,0
1998,"rafael eduardo medina born february 15, 1975 i...",place of birth Panama City,0


In [17]:
df_train.to_csv("train.csv", index=False)
df_val.to_csv("validation.csv", index=False)
df_test.to_csv("test.csv", index=False)

In [18]:
T = pd.read_csv('train.csv')
T1, T2, T3= T.iloc[0, :].values
print(T1)
print(T2)
print(T3)

van niekerk was born in kraaifontein, cape town, to wayne van niekerk and sprinter odessa swarts.wayde's olympic glory what his parents have to say, iol he was born prematurely and needed a blood transfusion
place of birth Cape Town
0


## Make Dataset

In [19]:
from torch.utils.data import Dataset

class WikiDataset(Dataset):
    # read the tsv we make and initialize some parameters
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "validation", "test"]
        self.mode = mode
        self.df = pd.read_csv(mode + ".csv")
        self.len = len(self.df)
        self.tokenizer = tokenizer  # use BERT tokenizer
    
    # define a function that reutrn a training or testing data
    def __getitem__(self, idx):
        if self.mode == "test" or self.mode == "validation":
            sentence, wikidata = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            sentence, wikidata, label_id = self.df.iloc[idx, :].values
            label_tensor = torch.tensor(label_id)
            
        # BERT tokens
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(sentence)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)
        
        tokens_b = self.tokenizer.tokenize(wikidata)
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a
        
        # convert hole token sequence into index sequence
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # segments_tensor
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b, dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len


### Make trainset

In [20]:
# Dataset
trainset = WikiDataset("train", tokenizer=tokenizer)

In [21]:
# select first sample
sample_idx = 0

# compare with the original document
sentence_0, wikidata_0, label_0 = trainset.df.iloc[sample_idx].values

# use the Dataset we built to extract the transformed id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# convert the tokens_tensor to original document
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)

print(f"""[original document]
sentence  ：{sentence_0}
wikidata  ：{wikidata_0}
label ：{label_0}

--------------------

[ tensors return by Dataset ]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[tokens_tensors word pieces]
{combined_text}
""")

[original document]
sentence  ：van niekerk was born in kraaifontein, cape town, to wayne van niekerk and sprinter odessa swarts.wayde's olympic glory what his parents have to say, iol he was born prematurely and needed a blood transfusion
wikidata  ：place of birth Cape Town
label ：0

--------------------

[ tensors return by Dataset ]
tokens_tensor  ：tensor([  101,  3498, 11437,  4820,  1200,  1377,  1108,  1255,  1107,   180,
         1611,  3814, 14467, 11656,  1394,   117, 23546,  1411,   117,  1106,
         1236,  1673,  3498, 11437,  4820,  1200,  1377,  1105, 24360,   184,
         4704,  3202,   188, 18320,  1116,   119,  1236,  2007,   112,   188,
          184,  1193,  8223,  1596, 12887,  1184,  1117,  2153,  1138,  1106,
         1474,   117,   178,  4063,  1119,  1108,  1255, 24505,  1193,  1105,
         1834,   170,  1892, 14715, 17149,   102,  1282,  1104,  3485,  4343,
         2779,   102])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## DataLoader

In [22]:
"""
Create a DataLoader that can return a mini-batch each time
This DataLoader works with the 'WikiDataset' we define previously
we need 4 tensors when training a BERT model：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# The input samples of this function is a list,
# every element in it is a sample return by the 'WikiDataset'

# Every sample contains 3 tensors : 
# - tokens_tensor
# - segments_tensor
# - label_tensor

# It will procecss zero padding on the first two tensors,
# then create a masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # with labels or not
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pading
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)
    
    # attention masks, 
    # set the locations that are not zero padding tokens_tensors to 1 in order to let bert only focus on those tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

### trainloader

In [23]:
# Initialize a DataLoader
# use `collate_fn` to combine list of samples into a mini-batch
BATCH_SIZE = 24
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

In [24]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([24, 86]) 
tensor([[  101,  3498, 11437,  ...,     0,     0,     0],
        [  101,   188, 23618,  ...,     0,     0,     0],
        [  101,   177, 25530,  ...,     0,     0,     0],
        ...,
        [  101,   175, 25081,  ...,     0,     0,     0],
        [  101, 22904,  1320,  ...,     0,     0,     0],
        [  101,  3840,  9238,  ...,     0,     0,     0]])
------------------------
segments_tensors.shape = torch.Size([24, 86])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([24, 86])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------

## Import Pretrained BERT Model

In [25]:
# n_class = 3
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = "bert-base-cased"
NUM_LABELS = 3

model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=3, bias=True)


In [26]:
model.config

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 3,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 28996
}

## Prediction

In [27]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        for data in dataloader:
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # first 3 tensors are tokens, segments and masks 
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # calculate accuracy when training
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # record current batch
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    

In [28]:
# run the model on GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
# _, acc = get_predictions(model, trainloader, compute_acc=True)
# print("classification acc:", acc)

device: cuda:0


In [29]:
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
     
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(f"""
number of parameters of whole model：{sum(p.numel() for p in model_params)}
number of parameters of the linear classifier：{sum(p.numel() for p in clf_params)}
""")


number of parameters of whole model：108312579
number of parameters of the linear classifier：2307



## Start training

In [30]:
%%time

# train model
model.train()

# using Adam Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


EPOCHS = 6    # number of epochs
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        # set the gradients to zero
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()


        # record batch loss
        running_loss += loss.item()
        
    # calculate accuracy
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

[epoch 1] loss: 47.299, acc: 0.918
[epoch 2] loss: 17.139, acc: 0.954
[epoch 3] loss: 11.167, acc: 0.974
[epoch 4] loss: 8.062, acc: 0.980
[epoch 5] loss: 6.243, acc: 0.986
[epoch 6] loss: 4.715, acc: 0.986
Wall time: 3min 59s


In [31]:
PATH = 'B:/Documents/2020五上/Wikidata/Wikipedia_Wikidata_Alignment/model_place_of_birth'
torch.save(model, PATH)

## Validation

In [32]:
valiset = WikiDataset("validation", tokenizer=tokenizer)

In [33]:
valiloader = DataLoader(valiset, batch_size=32, collate_fn=create_mini_batch)

with torch.no_grad():
    validations = get_predictions(model, valiloader)
torch.cuda.empty_cache()

In [34]:
validations_numpy = validations.cpu().clone().numpy()
validations_numpy = validations_numpy.reshape((2000, 1))

In [35]:
ids = np.arange(1,2001)
ids = ids.reshape((2000, 1))

In [36]:
validations_array = np.concatenate((ids, validations_numpy), axis=1)
validations_array

array([[   1,    2],
       [   2,    2],
       [   3,    0],
       ...,
       [1998,    2],
       [1999,    0],
       [2000,    0]], dtype=int64)

In [37]:
df_validations = pd.DataFrame(data = validations_array, columns=["id", "label"])
df_validations['label'] = df_validations['label'].map({0:0, 1:1, 2:2})
# df_validations.to_csv('answer.txt', index = False)

### Calculate F1 score

In [38]:
df_validations['label']

0       2
1       2
2       0
3       2
4       2
       ..
1995    2
1996    0
1997    2
1998    0
1999    0
Name: label, Length: 2000, dtype: int64

In [39]:
from sklearn.metrics import f1_score

print("F1 : ",f1_score(df_validations['label'], df_val['label'],  average = 'weighted'))

F1 :  0.9714806325693123


In [40]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [41]:
print(confusion_matrix(df_val['label'], df_validations['label']))

[[893   0   7]
 [  0 100   0]
 [ 50   0 950]]


In [42]:
print(classification_report(df_val['label'], df_validations['label']))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       900
           1       1.00      1.00      1.00       100
           2       0.99      0.95      0.97      1000

    accuracy                           0.97      2000
   macro avg       0.98      0.98      0.98      2000
weighted avg       0.97      0.97      0.97      2000

