In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification, TFBertForQuestionAnswering, TFBertModel, TFBertForNextSentencePrediction

from tqdm.notebook import tqdm

from keras.preprocessing.sequence import pad_sequences


from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense

import _pickle as pickle

## Some useful functions to ease the processings
def save(file,name, folder = ""):
    if folder != "":
        outfile = open('./'+folder+'/'+name+'.pickle', 'wb')
    else:
        outfile = open(name+'.pickle', 'wb')
    pickle.dump(file, outfile)
    outfile.close
    
def load(name, folder = ""):
    if folder != "":
        outfile = open('./'+folder+'/'+name+'.pickle', 'rb')
    else:
        outfile = open(name+'.pickle', 'rb')
    file = pickle.load(outfile)
    outfile.close
    return file

Using TensorFlow backend.


## Importation and parsing of the json file into a pandas dataframe

In [None]:
import json
with open('train-v2.0.json') as f:
    data = json.load(f)

In [None]:
data.keys()

In [None]:
data['version']

In [None]:
data['data'][0].keys()

In [None]:
data['data'][1]['paragraphs']

In [None]:
count = 0

titles = []
ids = []
contextes = []
possible = []
answers_text = []
answers_pos = []
quests = []
for theme in data['data']:
    title = theme['title']
        
    for contexte in theme['paragraphs']:
        text =  contexte['context']
        
        for questions in contexte['qas']:
            
            cond = True
            
            id1 = questions['id']
            question = questions['question']
            is_impossible = questions['is_impossible']
            
            if is_impossible:
#                 print(questions['plausible_answers'])
                if len(questions['plausible_answers'])>0:
                    answer_text = questions['plausible_answers'][0]['text']
                    answer_pos = questions['plausible_answers'][0]['answer_start']
                else:
                    cond = False
            else:
                answer_text = questions['answers'][0]['text']
                answer_pos = questions['answers'][0]['answer_start']
            if cond:
                titles.append(title)
                ids.append(id1)
                contextes.append(text)
                possible.append(is_impossible)
                answers_text.append(answer_text)
                answers_pos.append(answer_pos)
                quests.append(question)
                
dico = {
    'title':titles,
    'id' : ids,
    'contexte':contextes,
    'question':quests,
    'is_impossible':possible,
    'answer_text':answers_text,
    'answer_pos': answers_pos
}

In [None]:
df = pd.DataFrame(dico)

In [None]:
save(df, 'train_raw')

In [None]:
df[df['is_impossible'] == True]

## Tokenization with bert tokenizer

In [None]:
df = load('train')

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_length = 256

X = list(np.zeros(df.shape[0]))
X_type = list(np.zeros(df.shape[0]))
X_masks = list(np.zeros(df.shape[0]))
Y = list(np.zeros(df.shape[0]))

text_pairs = []
for index, line in tqdm(df.iterrows(), total = df.shape[0]):
    s1 = line['question']
    s2 = line['contexte']
    s3 = line['answer_text']
    tokenized = tokenizer.encode_plus(str(s1), str(s2), add_special_tokens = True, max_length = max_length, pad_to_max_length = True)
    
    answer = tokenizer.encode_plus(str(s3), add_special_tokens = False)
    
    X[index] = tokenized['input_ids']
    X_type[index] = tokenized['token_type_ids']
    X_masks[index] = tokenized['attention_mask']
    Y[index] = answer['input_ids']
    text_pairs.append(tokenizer.decode(tokenized['input_ids']))

In [None]:
df['inputs_ids'] = X
df['inputs_type'] = X_type
df['masks'] = X_masks
df['encoded_answer'] = Y

In [None]:
save(df, 'train_refined')

## Creation of the target of the network

In [None]:
df = load('train_refined')

In [None]:
start = list(np.zeros(df.shape[0]))
end = list(np.zeros(df.shape[0]))
y_possible = list(np.zeros(df.shape[0]))

for index, line in tqdm(df.iterrows(), total = df.shape[0]):
    a = line['inputs_ids']
    b = line['encoded_answer']
    c = line['is_impossible']
    
    if c == False:
        y_possible[index] = 1
    else:
        y_possible[index] = 0
        
    indices = [(i, i+len(b)) for i in range(len(a)) if a[i:i+len(b)] == b]
    
    temp = list(np.zeros(len(a)))
    temp1 = list(np.zeros(len(a)))
    if len(indices) == 0:
        y_possible[index] = 0
    else:
        for ind in indices:
            temp[ind[0]] = 1
            temp1[ind[1]-1] = 1
    start[index] = temp
    end[index] = temp1
    
df['start'] = list(np.array(start).astype(int))
df['end'] = list(np.array(end).astype(int))
df['target_possible'] = np.array(y_possible).astype(int)

In [None]:
save(df, 'train_final')

## Preparing the data to go through Bert

In [2]:
df = load('train_final')

In [3]:
X = np.array([list(elt) for elt in df['inputs_ids'].values]).astype(int)
X_masks = np.array([list(elt) for elt in df['masks'].values]).astype(int)
X_type = np.array([list(elt) for elt in df['inputs_type']]).astype(int)

y_start = np.array([list(elt) for elt in df['start'].values]).astype('float64')
y_end = np.array([list(elt) for elt in df['end'].values]).astype('float64')
y_possible = df['target_possible'].values.astype(int).astype(int)

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

ind = 24

print(tokenizer.decode(X[ind]))
print(df.iloc[ind]['answer_text'])
print(tokenizer.decode([X[ind][elt] for elt in range(len(X[ind])) if y_start[ind][elt] == 1 or y_end[ind][elt] == 1])  )

[CLS] what was beyonce's first acting job, in 2006? [SEP] following the disbandment of destiny's child in june 2005, she released her second solo album, b'day ( 2006 ), which contained hits " deja vu ", " irreplaceable ", and " beautiful liar ". beyonce also ventured into acting, with a golden globe - nominated performance in dreamgirls ( 2006 ), and starring roles in the pink panther ( 2006 ) and obsessed ( 2009 ). her marriage to rapper jay z and portrayal of etta james in cadillac records ( 2008 ) influenced her third album, i am... sasha fierce ( 2008 ), which saw the birth of her alter - ego sasha fierce and earned a record - setting six grammy awards in 2010, including song of the year for " single ladies ( put a ring on it ) ". beyonce took a hiatus from music in 2010 and took over management of her career ; her fourth album 4 ( 2011 ) was subsequently mellower in tone, exploring 1970s funk, 1980s pop, and 1990s soul. her critically acclaimed fifth studio album, beyonce ( 2013 )

In [5]:
X_train_ids, X_test_ids, y_train_start, y_test_start = train_test_split(X, y_start, random_state=42, test_size=0.1)
X_train_masks, X_test_masks, y_train_possible, y_test_possible = train_test_split(X_masks, y_possible, random_state=42, test_size=0.1)
X_train_type, X_test_type, y_train_end , y_test_end = train_test_split(X_type, y_end, random_state=42, test_size=0.1)

X_train = [X_train_ids, X_train_masks, X_train_type]
X_test = [X_test_ids, X_test_masks, X_test_type]
y_train = [y_train_start, y_train_end, y_train_possible]#, y_train_possible]
y_test = [y_test_start, y_test_end, y_test_possible]

## Loading and preparing Bert

In [6]:
X_train[0]

array([[ 101, 2129, 2116, ...,    0,    0,    0],
       [ 101, 2043, 2020, ...,    0,    0,    0],
       [ 101, 2029, 6830, ...,    0,    0,    0],
       ...,
       [ 101, 2012, 2054, ...,    0,    0,    0],
       [ 101, 2043, 2003, ...,    0,    0,    0],
       [ 101, 2129, 2116, ...,    0,    0,    0]])

##### Model inputs : array of size 3 with :
                  In first position the sentence pair tokenized (question + context)
                  In second position the attention masks
                  In third position the inputs ids type, to separate question from answer
#### Model outputs : array of size 3 with:
                  In first position the most probable answer start
                  In second position the most probable answer end
                  In third position probability that there is an answer for the question in the contexte

In [7]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, TimeDistributed

max_length = 256

inputs_ids = Input(shape = (max_length,), dtype = 'int32')
inputs_mask = Input(shape = (max_length,), dtype = 'int32')
inputs_type = Input(shape = (max_length,), dtype = 'int32')

inputs = [inputs_ids, inputs_mask, inputs_type]

sentence_encoder = TFBertModel.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.  
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

encoded = sentence_encoder(inputs_ids, attention_mask = inputs_mask, token_type_ids = inputs_type)

word_embedding = encoded[0]
pooled_encoded = encoded[1]

output_start = Dense(1, activation = 'sigmoid')(word_embedding)
output_end = Dense(1, activation = 'sigmoid')(word_embedding)
output_possible = Dense(1, activation = 'sigmoid')(pooled_encoded)

model = Model(inputs, [output_start, output_end, output_possible])

In [8]:
encoded

(<tf.Tensor 'tf_bert_model/Identity:0' shape=(None, 256, 768) dtype=float32>,
 <tf.Tensor 'tf_bert_model/Identity_1:0' shape=(None, 768) dtype=float32>)

In [9]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 256, 768), ( 109482240   input_1[0][0]                    
______________________________________________________________________________________________

In [10]:
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.metrics import sparse_categorical_accuracy
import tensorflow as tf

loss_classif     =  'categorical_crossentropy'# find the right loss for multi-class classification
optimizer        =  Adam(3e-5, 1e-8) # find the right optimizer
metrics_classif  =  ['accuracy']


def compute_loss(positions, logits):
#     one_hot_positions = tf.one_hot(
#                     positions, depth=max_length, dtype=tf.float32
#                 )

    one_hot_positions = tf.convert_to_tensor(positions, dtype=tf.float32)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    loss = -tf.reduce_mean(
    tf.reduce_sum(one_hot_positions * log_probs, axis=-1)
                )
    return loss

# def loss(y_true, y_pred):
#     return sparse_categorical_crossentropy(y_true, y_pred,from_logits = True)
 
losses = [compute_loss, compute_loss, 'binary_crossentropy']

model.compile(loss=losses,
              optimizer=optimizer,
              metrics=metrics_classif)

In [11]:
bs = 12
n_epochs = 10
#, batch_size=bs
history = model.fit(X_train, y_train, batch_size=bs, epochs=n_epochs, validation_data=(X_test,  y_test))

Train on 117287 samples, validate on 13032 samples
Epoch 1/10
Epoch 2/10
   936/117287 [..............................] - ETA: 1:32:18 - loss: 11.0721 - dense_loss: 5.3907 - dense_1_loss: 5.3886 - dense_2_loss: 0.2927 - dense_accuracy: 0.9503 - dense_1_accuracy: 0.9616 - dense_2_accuracy: 0.8766

KeyboardInterrupt: 

In [13]:
model.save('bert_qa')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: bert_qa\assets


In [None]:
model.evaluate(X_test, y_test)

In [113]:
ind = 55
start, end, possible = model.predict([[X_test[0][ind]], [X_test[1][ind]], [X_test[2][ind]]])
start = start.reshape(256)
end = end.reshape(256)



In [114]:
st = np.argmax(start)
en = np.argmax(end)

st_true = np.argmax(y_test[0][ind])
en_true = np.argmax(y_test[1][ind])

In [115]:
print(st)
print(en)
print(st_true)
print(en_true)

124
128
124
128


In [116]:
print(tokenizer.decode(X_test[0][ind]))
print(tokenizer.decode(X_test[0][ind][st:en+1]))
print(tokenizer.decode(X_test[0][ind][st_true:en_true+1]))
print(y_test_possible[ind])
print(possible[0][0])

[CLS] what old form of accounting was created during the late 13th and early 14th centuries? [SEP] in the late 13th and early 14th centuries, a process took place – primarily in italy but partly also in the empire – that historians have termed a'commercial revolution '. among the innovations of the period were new forms of partnership and the issuing of insurance, both of which contributed to reducing the risk of commercial ventures ; the bill of exchange and other forms of credit that circumvented the canonical laws for gentiles against usury, and eliminated the dangers of carrying bullion ; and new forms of accounting, in particular double - entry bookkeeping, which allowed for better oversight and accuracy. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD