In [1]:
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
# Lift the column-width cap so long premise/hypothesis sentences display in full
pd.set_option('display.max_colwidth', None)

# All CSVs live in one directory: the raw Kaggle files and the "_mod" copies
# that parse_raw_csv_data writes.
data_dir = "../contradictory-my-dear-watson"
test_csv = f"{data_dir}/test.csv"
train_csv = f"{data_dir}/train.csv"
test_csv_mod = f"{data_dir}/test_mod.csv"
train_csv_mod = f"{data_dir}/train_mod.csv"

# possible predictions are [0, 1, 2] corresponding to entailment, neutral, and contradiction
# dataset name: contradictory my dear watson
# dataset url: https://www.kaggle.com/c/contradictory-my-dear-watson/overview

In [2]:
# Turn on memory growth for every visible GPU. With no GPU present the loop
# body never runs. A RuntimeError from TensorFlow is reported instead of
# aborting the notebook (per the TensorFlow GPU guide this happens when the
# setting is changed after the devices are initialised).
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for device in gpus:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as err:
    print(err)

In [3]:
def parse_raw_csv_data(target_file, modded_file):
    """Copy the header row and every English row of a CSV into a new CSV.

    A row is kept when its fourth column (index 3) is exactly "en".

    Args:
        target_file: Path of the UTF-8 encoded CSV to read.
        modded_file: Path of the CSV to create or overwrite.

    Both files are opened as UTF-8 with ``newline=""`` (as the ``csv`` module
    requires), so non-ASCII text in English rows is written out instead of
    failing on the platform's default encoding. Rows with fewer than four
    columns (for example a trailing blank line) are skipped rather than
    raising ``IndexError``. An empty input file produces an empty output file.
    """
    with open(target_file, encoding="utf-8", newline="") as rf, \
            open(modded_file, "w", encoding="utf-8", newline="") as wf:
        csv_reader = csv.reader(rf, delimiter=",")
        writer = csv.writer(wf)

        headers = next(csv_reader, None)
        if headers is None:
            # Nothing to copy, not even a header row.
            return
        writer.writerow(headers)

        writer.writerows(
            row for row in csv_reader
            if len(row) > 3 and row[3] == "en"
        )

In [4]:
## Only run this cell when the English-only CSV files need to be regenerated
# parse_raw_csv_data(train_csv, train_csv_mod)
# parse_raw_csv_data(test_csv, test_csv_mod)

In [5]:
# Load the filtered training CSV (the "_mod" file) into a DataFrame.
# Author's note: the last line of the csv file was blank and had to be deleted
# by hand before this worked; the exact cause was not investigated.
train = pd.read_csv(train_csv_mod, encoding="utf-8")

In [6]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulating the interim rules.,The rules developed in the interim were put together with these comments in mind.,en,English,0
1,5b72532a0b,"These are issues that we wrestle with in practice groups of law firms, she said.",Practice groups are not permitted to work on these issues.,en,English,2
2,5622f0c60b,you know they can't really defend themselves like somebody grown uh say my age you know yeah,They can't defend themselves because of their age.,en,English,0
3,fdcd1bd867,From Cockpit Country to St. Ann's Bay,From St. Ann's Bay to Cockpit Country.,en,English,2
4,7cfb3d272c,"Look, it's your skin, but you're going to be in trouble if you don't get busy.",The boss will fire you if he sees you slacking off.,en,English,1


In [7]:
# Name of the pretrained checkpoint; build_model loads the encoder from this same name.
model_name = "bert-base-uncased"
# Tokenizer loaded from that checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [8]:
def encode_sentence(s):
    """Return the token ids of `s` followed by the id of a closing '[SEP]'.

    Tokenisation and id lookup both use the notebook-level `tokenizer`.
    """
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)

In [9]:
# Sanity check on one premise: the final id printed belongs to the '[SEP]' token that encode_sentence appends.
print(encode_sentence(train.premise[1]))

[2122, 2024, 3314, 2008, 2057, 25579, 2007, 1999, 3218, 2967, 1997, 2375, 9786, 1010, 2016, 2056, 1012, 102]


In [10]:
def bert_encode(hypotheses, premises, tokenizer, max_len=None):
    """Encode sentence pairs as ``[CLS] first [SEP] second [SEP]`` model inputs.

    Args:
        hypotheses: Sequence of strings placed FIRST in each pair (segment 0).
            The name is historical: this notebook's call sites pass the
            premises here, so read it as "first sentences".
        premises: Sequence of strings placed SECOND in each pair (segment 1).
            The call sites pass the hypotheses here.
        tokenizer: Tokenizer providing ``tokenize`` and
            ``convert_tokens_to_ids``. It is used for the sentences as well as
            for the '[CLS]' id, so the argument is no longer ignored in favour
            of the notebook-level tokenizer.
        max_len: Optional fixed width of the returned tensors. ``None`` (the
            default) keeps the previous behaviour: the width is the longest
            encoded pair in this call. With an integer, shorter rows are
            zero-padded and longer rows are cut to that width (relies on
            ``RaggedTensor.to_tensor(shape=...)``). Cutting drops the trailing
            '[SEP]' of over-long pairs.

    Returns:
        dict of dense tensors with one row per pair:
            'input_word_ids': token ids, padded with 0.
            'input_mask':     1 on real tokens, 0 on padding.
            'input_type_ids': 0 for '[CLS]' and the first sentence,
                              1 for the second sentence, 0 on padding.
    """
    def _encode(sentence):
        # Token ids of one sentence, terminated by '[SEP]'.
        tokens = list(tokenizer.tokenize(sentence))
        tokens.append('[SEP]')
        return tokenizer.convert_tokens_to_ids(tokens)

    sentence1 = tf.ragged.constant([_encode(s) for s in np.array(hypotheses)])
    sentence2 = tf.ragged.constant([_encode(s) for s in np.array(premises)])

    # One single-element [CLS] row per example, prepended to every pair.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    input_type_ids = tf.concat(
        [tf.zeros_like(cls), tf.zeros_like(sentence1), tf.ones_like(sentence2)],
        axis=-1)

    # None lets to_tensor pick the widest row; otherwise force the width.
    dense_shape = None if max_len is None else [None, max_len]

    return {
        'input_word_ids': input_word_ids.to_tensor(shape=dense_shape),
        # Ones on the ragged values; the dense conversion pads with zeros.
        'input_mask': tf.ones_like(input_word_ids).to_tensor(shape=dense_shape),
        'input_type_ids': input_type_ids.to_tensor(shape=dense_shape),
    }

In [11]:
# Encode every training pair. Premises are passed first, so they fill bert_encode's
# first positional parameter (named `hypotheses`) and come first in each encoded pair.
train_input = bert_encode(train.premise.values, train.hypothesis.values, tokenizer)

In [12]:
# Fixed token length of the model inputs.
# NOTE(review): presumably the padded width bert_encode produced for the
# training set — confirm before changing the data.
max_len = 237
# TODO: add recall

def build_model(sequence_length=None):
    """Build and compile a 3-class classifier on top of a pretrained BERT encoder.

    Args:
        sequence_length: Token length of the three model inputs. Defaults to
            the notebook-level ``max_len`` when omitted, which matches the
            previous zero-argument behaviour.

    Returns:
        A compiled ``tf.keras.Model`` with inputs named 'input_word_ids',
        'input_mask' and 'input_type_ids' (the keys bert_encode returns) and a
        softmax output over 3 classes. Compiled with Adam (learning rate 1e-5),
        sparse categorical cross-entropy and accuracy.
    """
    if sequence_length is None:
        sequence_length = max_len

    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32, name="input_type_ids")

    # Element 0 of the encoder output is used as the per-token embeddings
    # (indexed below as [batch, token, feature]).
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]

    # [:, 0, :] keeps only the first token position of every example. With
    # inputs from bert_encode that position holds the '[CLS]' token.
    cls_embedding = embedding[:, 0, :]
    output = tf.keras.layers.Dense(3, activation='softmax')(cls_embedding)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

In [13]:
# Instantiate the classifier; this loads the pretrained encoder weights.
model = build_model()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cau

In [14]:
# Print the layer-by-layer summary of the model.
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 237)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 237)]        0                                            
__________________________________________________________________________________________________
input_type_ids (InputLayer)     [(None, 237)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109482240   input_word_ids[0][0]             
                                                                 input_mask[0][0]      

In [15]:
# Checkpoints go to a folder relative to the working directory. The working
# directory and its contents are printed so the location is visible.
checkpoint_path = "checkpoints"
print(os.getcwd())
os.makedirs(checkpoint_path, exist_ok=True)
print(os.listdir())

# Checkpoint callback monitoring validation accuracy, with save_best_only enabled.
checkpoint_kwargs = {
    "filepath": checkpoint_path,
    "save_best_only": True,
    "monitor": "val_accuracy",
    "verbose": 1,
}
cp_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_kwargs)

e:\Software\L3Mentorship\discriminator_model
['checkpoints', 'discriminator.ipynb', 'model_path']


In [16]:
# Train for one epoch with a batch size of 1, using validation_split=0.2 to
# hold out part of the data for validation.
# NOTE(review): `cp_callback` is defined earlier but not passed here via
# `callbacks=`, so no checkpoint is written during this fit — confirm intent.
model.fit(train_input, train.label.values, epochs = 1, verbose = 1,
        batch_size = 1, 
        validation_split = 0.2)

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


<tensorflow.python.keras.callbacks.History at 0x23039cafc08>

In [21]:
## Save model
## this solved an 'index out of range' error, must be something new
# NOTE(review): this wraps the trained model in a bare tf.keras.Model; confirm
# the wrapper actually carries the trained layers before relying on the export.
model2 = tf.keras.Model(model)

model_folder = "../model_path"
os.makedirs(model_folder, exist_ok=True)
# A dedicated variable is used for the export name. Reassigning `model_name`
# here would overwrite the "bert-base-uncased" checkpoint name that the
# tokenizer and build_model cells read, breaking any re-run of those cells.
saved_model_name = "discriminator"
model_path = os.path.join(model_folder, saved_model_name)

In [22]:
# Kept as a record of a known failure: in the recorded run this call raised
# NotImplementedError (message quoted in the comment below).
model.get_config()
# NotImplementedError: When subclassing the `Model` class, you should implement a `call` method.

NotImplementedError: 

In [20]:
# Export `model2` to `model_path` with tf.saved_model.save; the Keras
# `model.save` alternative is left commented out below.
tf.saved_model.save(model2, model_path)
# model.save(model_path)

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ../model_path\discriminator\assets


In [None]:
# Load the filtered test CSV and encode it the same way as the training data
# (premises passed first).
# NOTE(review): bert_encode pads to the longest pair in THIS frame, which may
# differ from the fixed `max_len` the model inputs were built with — confirm
# the widths match before predicting.
test_pd = pd.read_csv(test_csv_mod)
test_input = bert_encode(test_pd.premise.values, test_pd.hypothesis.values, tokenizer)

In [None]:
# li = [np.argmax(i) for i in model2.predict(test_input)]
# Reload the exported model from disk.
# NOTE(review): `model5` is loaded but never used; predictions below still
# come from `model2` — confirm which model is intended.
model5 = tf.keras.models.load_model(model_path)
# Predicted class per test row = index of the largest score in each prediction.
li = [np.argmax(i) for i in model2.predict(test_input)]

In [None]:
# Dump the full list of predicted class indices.
print(li)

In [None]:
# Pair each test id with its predicted class in a new frame, then preview it.
sub = test_pd[["id"]].assign(predictions=li)
sub.head()

In [None]:
# Preview the first rows of the test frame.
test_pd.head()