## Import PyTorch and BERT

In [1]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from IPython.display import clear_output

PRETRAINED_MODEL_NAME = "bert-base-cased"  

# import the tokenizer which is used on this pretrained model
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

clear_output()
print("PyTorch version : ", torch.__version__)

PyTorch version :  1.6.0


In [2]:
vocab = tokenizer.vocab
print("vocab size : ", len(vocab))

vocab size :  28996


In [3]:
# text = 'yokozeki was born in osaka prefecture on september 11, 1979'
# tokens = tokenizer.tokenize(text)
# ids = tokenizer.convert_tokens_to_ids(tokens)

# print(text)
# print(tokens[:10], '...')
# print(ids[:10], '...')

## Import Data

In [4]:
import pandas as pd
from sklearn.utils import shuffle

In [5]:
# data_place_of_birth / data_place_of_death / data_occupation

data_0 = pd.read_csv('./data/data_occupation_0.csv')
data_1 = pd.read_csv('./data/data_occupation_1.csv')

In [6]:
data_0_train = data_0[0:3000]
data_0_val = data_0[3000:6000]
data_0_test = data_0[6000:9000]

data_1_train = data_1[0:3000]
data_1_val = data_1[3000:6000]
data_1_test = data_1[6000:9000]

df_train = shuffle(data_0_train.append(data_1_train, ignore_index=True))
df_val = shuffle(data_0_val.append(data_1_val, ignore_index=True))
df_test = shuffle(data_0_test.append(data_1_test, ignore_index=True))

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [7]:
print("Training Set:"% df_train.columns, df_train.shape)
print("Validation Set:"% df_val.columns, df_val.shape)
print("Test Set:"% df_test.columns, df_test.shape)

Training Set: (6000, 3)
Validation Set: (6000, 3)
Test Set: (6000, 3)


In [8]:
df_train.head()

Unnamed: 0,sentence,wikidata,label
0,category best musical or comedy actor golden g...,occupation actor,0
1,yearaward categorytitleresultsref.1947 academy...,occupation writer,1
2,"sean patrick hayes born june 26, 1970 is an am...",occupation film producer,0
3,peruggi did not accompany him to washington wh...,occupation writer,1
4,"retrieved 2015-03-27 on 21 december 2014, chan...",occupation actor,1


## Text Cleaning

In [9]:
print('For train data')
print("Number of sentences : ", len(df_train))
print("Longest sentence\'s length : " + str(df_train.sentence.map(len).max()))
print("Average length of the sentences : " + str(df_train.sentence.map(len).mean()))

For train data
Number of sentences :  6000
Longest sentence's length : 4825
Average length of the sentences : 179.35866666666666


In [10]:
print('For validation data')
print("Number of sentences : ", len(df_val))
print("Longest sentence\'s length : " + str(df_val.sentence.map(len).max()))
print("Average length of the sentences : " + str(df_val.sentence.map(len).mean()))

For validation data
Number of sentences :  6000
Longest sentence's length : 4467
Average length of the sentences : 176.75933333333333


In [11]:
print('For test data')
print("Number of sentences : ", len(df_test))
print("Longest sentence\'s length : " + str(df_test.sentence.map(len).max()))
print("Average length of the sentences : " + str(df_test.sentence.map(len).mean()))

For test data
Number of sentences :  6000
Longest sentence's length : 3520
Average length of the sentences : 176.91333333333333


### Limit the tweet max length

In [12]:
def max_text_length(text, length):
    text = text[:length]
    return text

In [13]:
df_test['sentence'] = df_test['sentence'].map(lambda x: max_text_length(x, 1200))

df_test

Unnamed: 0,sentence,wikidata,label
0,"baldwin was nominated for an academy award, a ...",occupation film producer,1
1,it was selected as one of the best science fic...,occupation writer,0
2,pope john iv ; died 12 october 642 was the bis...,occupation politician,1
3,"soon, franquin was considered an undisputed ma...",occupation animator,0
4,"while it is believed four died early, the last...",occupation poet,0
...,...,...,...
5995,she received her only nomination for a filmfar...,occupation actor,0
5996,william sami étienne grigahcine ; born 13 june...,occupation disc jockey,0
5997,"june 18, 2004 was an american helicopter engin...",occupation engineer,0
5998,justice harlan was very close to the law clerk...,occupation judge,0


In [14]:
df_val['sentence'] = df_val['sentence'].map(lambda x: max_text_length(x, 1200))

df_val

Unnamed: 0,sentence,wikidata,label
0,"yi is the author of ten books, and he served a...",occupation author,0
1,he studied engineering at the technical univer...,occupation architect,1
2,winnie hsin ; born 8 february 1962 is a taiwan...,occupation writer,1
3,", also spelled nukada, was a japanese poet of ...",occupation poet,0
4,"during the final eight days of the expedition,...",occupation theologian,1
...,...,...,...
5995,"2002 underground zero — director segment ""isai...",occupation screenwriter,1
5996,"2007 korean model awards model star award, fas...",occupation television presenter,1
5997,2010 jupiter award – best male actor tv for hi...,occupation screenwriter,1
5998,originally ellsworth did not have a first name...,occupation film producer,0


In [15]:
len(df_train)

6000

In [16]:
L = 6000
MAX_LENGTH = 300
print('Number of sentences that exceed 300 characters')
print(L - len(df_train[~(df_train.sentence.apply(lambda x : len(x)) > MAX_LENGTH)]))

# print("Number of tweets : ", len(df_train))
# df_train['sentence'] = df_train['sentence'].map(lambda x: max_text_length(x, 300))
df_train = df_train[~(df_train.sentence.apply(lambda x : len(x)) > MAX_LENGTH)]

df_train

Number of sentences that exceed 300 characters
546


Unnamed: 0,sentence,wikidata,label
0,category best musical or comedy actor golden g...,occupation actor,0
2,"sean patrick hayes born june 26, 1970 is an am...",occupation film producer,0
3,peruggi did not accompany him to washington wh...,occupation writer,1
4,"retrieved 2015-03-27 on 21 december 2014, chan...",occupation actor,1
5,justin haber born 9 june 1981 is a maltese pro...,occupation seiyū,1
...,...,...,...
5994,frans michel penning 12 september 1894 – 6 dec...,occupation politician,1
5996,vladimir volfovich zhirinovsky ; né eidelstein...,occupation military personnel,1
5997,alexander vlahos born 30 july 1988 is a welsh ...,occupation inventor,1
5998,anies rasyid baswedan born 7 may 1969 is an in...,occupation peasant,1


In [17]:
df_train.to_csv("train.csv", index=False)
df_val.to_csv("validation.csv", index=False)
df_test.to_csv("test.csv", index=False)

In [18]:
T = pd.read_csv('train.csv')
T1, T2, T3= T.iloc[0, :].values
print(T1)
print(T2)
print(T3)

category best musical or comedy actor golden globe television winners
occupation actor
0


## Make Dataset

In [19]:
from torch.utils.data import Dataset

class WikiDataset(Dataset):
    # read the tsv we make and initialize some parameters
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "validation", "test"]
        self.mode = mode
        self.df = pd.read_csv(mode + ".csv")
        self.len = len(self.df)
        self.tokenizer = tokenizer  # use BERT tokenizer
    
    # define a function that reutrn a training or testing data
    def __getitem__(self, idx):
        if self.mode == "test" or self.mode == "validation":
            sentence, wikidata = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            sentence, wikidata, label_id = self.df.iloc[idx, :].values
            label_tensor = torch.tensor(label_id)
            
        # BERT tokens
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(sentence)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)
        
        tokens_b = self.tokenizer.tokenize(wikidata)
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a
        
        # convert hole token sequence into index sequence
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # segments_tensor
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b, dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len


### Make trainset

In [20]:
# Dataset
trainset = WikiDataset("train", tokenizer=tokenizer)

In [21]:
# select first sample
sample_idx = 0

# compare with the original document
sentence_0, wikidata_0, label_0 = trainset.df.iloc[sample_idx].values

# use the Dataset we built to extract the transformed id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# convert the tokens_tensor to original document
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)

print(f"""[original document]
sentence  ：{sentence_0}
wikidata  ：{wikidata_0}
label ：{label_0}

--------------------

[ tensors return by Dataset ]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[tokens_tensors word pieces]
{combined_text}
""")

[original document]
sentence  ：category best musical or comedy actor golden globe television winners
wikidata  ：occupation actor
label ：0

--------------------

[ tensors return by Dataset ]
tokens_tensor  ：tensor([  101,  4370,  1436,  2696,  1137,  3789,  2811,  5404, 12868,  1778,
         5222,   102,  5846,  2811,   102])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])

label_tensor   ：0

--------------------

[tokens_tensors word pieces]
[CLS]categorybestmusicalorcomedyactorgoldenglobetelevisionwinners[SEP]occupationactor[SEP]



## DataLoader

In [22]:
"""
Create a DataLoader that can return a mini-batch each time
This DataLoader works with the 'WikiDataset' we define previously
we need 4 tensors when training a BERT model：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# The input samples of this function is a list,
# every element in it is a sample return by the 'WikiDataset'

# Every sample contains 3 tensors : 
# - tokens_tensor
# - segments_tensor
# - label_tensor

# It will procecss zero padding on the first two tensors,
# then create a masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # with labels or not
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pading
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)
    
    # attention masks, 
    # set the locations that are not zero padding tokens_tensors to 1 in order to let bert only focus on those tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

### trainloader

In [23]:
# Initialize a DataLoader
# use `collate_fn` to combine list of samples into a mini-batch
BATCH_SIZE = 24
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

In [24]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([24, 72]) 
tensor([[  101,  4370,  1436,  ...,     0,     0,     0],
        [  101,  2343,  1179,  ...,     0,     0,     0],
        [  101,  1679,  9610,  ...,     0,     0,     0],
        ...,
        [  101, 24181,  7702,  ...,     0,     0,     0],
        [  101,  1113,  1185,  ...,     0,     0,     0],
        [  101,  5871,  3740,  ...,     0,     0,     0]])
------------------------
segments_tensors.shape = torch.Size([24, 72])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([24, 72])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------

## Import Pretrained BERT Model

In [25]:
# n_class = 2
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = "bert-base-cased"
NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=2, bias=True)


In [26]:
model.config

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 28996
}

## Prediction

In [27]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        for data in dataloader:
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # first 3 tensors are tokens, segments and masks 
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # calculate accuracy when training
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # record current batch
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    

In [28]:
# run the model on GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
# _, acc = get_predictions(model, trainloader, compute_acc=True)
# print("classification acc:", acc)

device: cuda:0


In [29]:
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
     
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(f"""
number of parameters of whole model：{sum(p.numel() for p in model_params)}
number of parameters of the linear classifier：{sum(p.numel() for p in clf_params)}
""")


number of parameters of whole model：108311810
number of parameters of the linear classifier：1538



## Start training

In [30]:
%%time

# train model
model.train()

# using Adam Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


EPOCHS = 3    # number of epochs
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        # set the gradients to zero
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()


        # record batch loss
        running_loss += loss.item()
        
    # calculate accuracy
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

[epoch 1] loss: 64.420, acc: 0.964
[epoch 2] loss: 26.406, acc: 0.973
[epoch 3] loss: 18.959, acc: 0.974
Wall time: 5min 29s


In [31]:
# PATH = 'B:/Documents/2020五上/Wikidata/Wikipedia_Wikidata_Alignment/model_place_of_birth'
# torch.save(model, PATH)

## Validation

In [32]:
valiset = WikiDataset("validation", tokenizer=tokenizer)

In [33]:
valiloader = DataLoader(valiset, batch_size=32, collate_fn=create_mini_batch)

with torch.no_grad():
    validations = get_predictions(model, valiloader)
torch.cuda.empty_cache()

In [34]:
validations_numpy = validations.cpu().clone().numpy()
validations_numpy = validations_numpy.reshape((6000, 1))

In [35]:
ids = np.arange(1,6001)
ids = ids.reshape((6000, 1))

In [36]:
validations_array = np.concatenate((ids, validations_numpy), axis=1)
validations_array

array([[   1,    0],
       [   2,    1],
       [   3,    1],
       ...,
       [5998,    1],
       [5999,    0],
       [6000,    1]], dtype=int64)

In [37]:
df_validations = pd.DataFrame(data = validations_array, columns=["id", "label"])
df_validations['label'] = df_validations['label'].map({0:0, 1:1})
# df_validations.to_csv('answer.txt', index = False)

### Calculate F1 score

In [38]:
from sklearn.metrics import f1_score

print("F1 : ",f1_score(df_validations['label'], df_val['label'],  average = 'weighted'))

F1 :  0.9461907963061191


In [39]:
from sklearn.metrics import confusion_matrix

In [40]:
print(confusion_matrix(df_val['label'], df_validations['label']))

[[2902   98]
 [ 225 2775]]
