In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import _pickle as pickle
def save(file, name, folder=""):
    """Pickle an object to ``<name>.pickle``.

    Args:
        file: The picklable Python object to store (despite the name, this is
            the payload, not a file handle).
        name: Target file name without the ``.pickle`` extension.
        folder: Optional sub-directory, resolved relative to the current
            working directory. It must already exist. When empty, the file is
            written to ``name + '.pickle'``.

    Returns:
        None.
    """
    if folder != "":
        path = './' + folder + '/' + name + '.pickle'
    else:
        path = name + '.pickle'
    # The context manager closes (and therefore flushes) the handle even when
    # pickling raises; a bare `outfile.close` without parentheses is a no-op.
    with open(path, 'wb') as outfile:
        pickle.dump(file, outfile)
    
def load(name, folder=""):
    """Load and return the object stored in ``<name>.pickle``.

    Args:
        name: File name without the ``.pickle`` extension.
        folder: Optional sub-directory, resolved relative to the current
            working directory. When empty, ``name + '.pickle'`` is opened.

    Returns:
        The unpickled Python object.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    if folder != "":
        path = './' + folder + '/' + name + '.pickle'
    else:
        path = name + '.pickle'
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    # The context manager closes the handle even when unpickling raises;
    # a bare `outfile.close` without parentheses is a no-op.
    with open(path, 'rb') as infile:
        return pickle.load(infile)

from tqdm.notebook import tqdm

from transformers import BertTokenizer, TFBertForSequenceClassification, TFBertModel

In [None]:
# Read the pickled training set from `mlni_train_set.pickle` in the working directory.
# Presumably a pandas DataFrame: later cells call .head(), .iterrows() and index columns on it.
train = load('mlni_train_set')

In [None]:
# Preview the first rows of the training set as a sanity check.
train.head()

In [None]:
# Integer code for each gold label. Any label not listed here falls back to 0,
# i.e. it is encoded exactly like 'neutral'.
label_to_int = {'neutral': 0, 'contradiction': -1, 'entailment': 1}

Y = [label_to_int.get(label, 0) for label in tqdm(train['gold_label'])]

        




In [10]:
# Tokenizer for the 'bert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Fixed length (in tokens) passed to the tokenizer when encoding sentence pairs.
max_length = 64

In [None]:
# Encode every (sentence1, sentence2) pair of the training set into
# fixed-length inputs: token ids, token type ids and attention masks.
max_length = 64

X, X_type, X_masks, text_pairs = [], [], [], []

for row_idx, row in tqdm(train.iterrows(), total=len(train)):
    # str() makes non-string cells acceptable to the tokenizer.
    encoding = tokenizer.encode_plus(
        str(row['sentence1']),
        str(row['sentence2']),
        add_special_tokens=True,
        max_length=max_length,
        pad_to_max_length=True,
    )
    token_ids = encoding['input_ids']

    X.append(token_ids)
    X_type.append(encoding['token_type_ids'])
    X_masks.append(encoding['attention_mask'])
    # Keep the decoded text next to the ids so encoded pairs can be inspected.
    text_pairs.append(tokenizer.decode(token_ids))

In [None]:
# Persist the encoded inputs, decoded text pairs and labels as a single tuple
# in `mlni_dataset_padded.pickle`.
save((X, X_type, X_masks, text_pairs, Y), 'mlni_dataset_padded')

In [2]:
# Restore the tuple stored in `mlni_dataset_padded.pickle`; the unpacking order
# must match the order in which the tuple was saved.
(X, X_type, X_masks, text_pairs, Y) = load('mlni_dataset_padded')

In [3]:
from keras.utils import np_utils

# Convert the loaded sequences to NumPy arrays.
X, X_type, X_masks, Y = (np.array(seq) for seq in (X, X_type, X_masks, Y))

# Shift the labels by one so they become non-negative class indices, then
# one-hot encode them. With labels in {-1, 0, 1} this gives columns ordered
# (contradiction, neutral, entailment).
y = np_utils.to_categorical(Y + 1)

Using TensorFlow backend.


In [4]:
from sklearn.model_selection import train_test_split

# Split every aligned array in ONE call: the same shuffled indices are applied
# to ids, masks, type ids and labels, so row alignment is guaranteed by
# construction instead of relying on repeating identical random_state/test_size
# arguments across several independent calls.
(X_train_ids, X_test_ids,
 X_train_mask, X_test_mask,
 X_train_type, X_test_type,
 y_train, y_test) = train_test_split(
    X, X_masks, X_type, y, random_state=42, test_size=0.1
)

# Input order expected by the model: [token ids, attention masks, token type ids].
X_train = [X_train_ids, X_train_mask, X_train_type]
X_test = [X_test_ids, X_test_mask, X_test_type]

#### Model inputs: a list of 3 arrays
        First position: the input token ids
        Second position: the attention masks
        Third position: the token type ids (to tell the two sentences of a pair apart)
#### Model outputs: an array of size 3
        First position: probability of contradiction
        Second position: probability of neutral
        Third position: probability of entailment (agreement)

In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout

# Sequence length of every model input; must equal the length used when the
# sentence pairs were tokenized.
max_length = 64

# Three integer inputs of length `max_length`: token ids, attention mask and
# token type ids (in that order in `inputs`).
inputs_ids = Input(shape=(max_length,), dtype='int32')
inputs_mask = Input(shape=(max_length,), dtype='int32')
inputs_type = Input(shape=(max_length,), dtype='int32')

inputs = [inputs_ids, inputs_mask, inputs_type]

sentence_encoder = TFBertModel.from_pretrained(
    "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
    output_attentions=False,  # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

encoded = sentence_encoder(inputs_ids, attention_mask=inputs_mask, token_type_ids=inputs_type)
# Element 1 of the encoder output is used as the pooled, one-vector-per-pair
# representation (element 0 is the per-token sequence output).
pooled_encoded = encoded[1]

drop = Dropout(0.3)(pooled_encoded)

# Softmax, not sigmoid: the three classes are mutually exclusive and the model
# is trained with categorical cross-entropy, so the outputs must form a
# probability distribution that sums to 1.
out = Dense(3, activation='softmax')(drop)

model = Model(inputs, out)

In [6]:
# Print the layers, output shapes and parameter counts of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 64)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 64, 768), (N 109482240   input_1[0][0]                    
______________________________________________________________________________________________

In [23]:
from tensorflow.keras.optimizers import Adam, SGD

# Categorical cross-entropy matches the one-hot encoded 3-class targets.
loss_classif = 'categorical_crossentropy'
# Pass hyper-parameters by keyword: Adam's second positional parameter is
# beta_1, not epsilon, so a positional 1e-8 would effectively switch off the
# first-moment (momentum) average instead of setting the numerical epsilon.
optimizer = Adam(learning_rate=3e-6, epsilon=1e-8)
metrics_classif = ['accuracy']

model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)

In [24]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Training schedule.
bs = 54
n_epochs = 3

# Stop when val_loss stalls and roll back to the best weights seen.
# NOTE(review): patience=4 is larger than n_epochs=3, so this callback cannot
# fire within a single fit() call - confirm that this is intended.
early = EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=0.0001,
    patience=4,
    restore_best_weights=True,
    verbose=1,
)

# Halve the learning rate after 2 epochs without val_loss improvement.
reduce = ReduceLROnPlateau(
    monitor='val_loss',
    mode='auto',
    factor=0.5,
    patience=2,
    min_delta=0.0001,
    cooldown=0,
    min_lr=0,
    verbose=1,
)

# The held-out split doubles as the validation set monitored by both callbacks.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=bs,
    epochs=n_epochs,
    callbacks=[early, reduce],
)

Train on 352048 samples, validate on 39117 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


KeyboardInterrupt: 

In [25]:
# Export the current model to the `mlni_classification` directory.
model.save('mlni_classification')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: mlni_classification\assets


In [None]:
# Display the list of layer objects that make up the model.
model.layers

In [None]:
import tensorflow as tf
# Replace the in-memory `model` with one loaded from the `bert_agreement` directory.
# NOTE(review): the model in this notebook is saved as 'mlni_classification',
# not 'bert_agreement' - confirm which saved model is meant to be loaded here.
model = tf.keras.models.load_model('bert_agreement')

In [None]:
# Print the architecture of the model that is currently bound to `model`.
model.summary()

In [36]:
# Example sentence pair to classify.
s1 = "it is too light"
s2 = "i like the color of the product"

# Encode the pair with the same settings used for the training data.
tokenized = tokenizer.encode_plus(
    str(s1),
    str(s2),
    add_special_tokens=True,
    max_length=max_length,
    pad_to_max_length=True,
)

# Wrap each encoded sequence in a list so every array gets a leading batch
# dimension of size 1.
ids, mask, type1 = (
    np.array([tokenized[key]])
    for key in ('input_ids', 'attention_mask', 'token_type_ids')
)

# Same input order as at training time: ids, attention mask, token type ids.
inputs = [ids, mask, type1]

In [37]:
# Score the example pair; the result has one row with three class scores.
# Presumably ordered (contradiction, neutral, entailment) as in the training
# labels - verify for the model that was actually loaded.
model.predict(inputs)

array([[3.5618606e-04, 9.9145573e-01, 1.3154164e-03]], dtype=float32)