In [1]:
import csv

from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd

# Raw competition CSVs (inputs to parse_raw_csv_data).
test_csv, train_csv = (
    "../contradictory-my-dear-watson/test.csv",
    "../contradictory-my-dear-watson/train.csv",
)
# English-only copies ("_mod") that the rest of the notebook loads.
test_csv_mod, train_csv_mod = (
    "../contradictory-my-dear-watson/test_mod.csv",
    "../contradictory-my-dear-watson/train_mod.csv",
)

In [22]:
def parse_raw_csv_data(target_file, modded_file):
    """Copy the header and every English row of a CSV into a new CSV.

    A row is kept when its fourth column (index 3) equals ``"en"``.

    Both files are opened as UTF-8 with ``newline=""`` (the mode the ``csv``
    module documents for its file objects).  Writing with the platform default
    encoding previously raised ``UnicodeEncodeError`` on the first character
    the code page could not represent, and the export was silently cut short
    at that row.

    Args:
        target_file: Path of the CSV to read (UTF-8, comma separated, first
            row is the header).
        modded_file: Path of the CSV to create or overwrite.

    Returns:
        None.  The result is the file written at ``modded_file``.
    """
    with open(target_file, encoding="utf-8", newline="") as rf, \
            open(modded_file, "w", encoding="utf-8", newline="") as wf:
        csv_reader = csv.reader(rf, delimiter=",")
        writer = csv.writer(wf)

        headers = next(csv_reader, None)
        if headers is None:
            # Empty input: leave an empty output file instead of raising StopIteration.
            return
        writer.writerow(headers)

        for row in csv_reader:
            # Blank lines parse as [] and short rows have no language column;
            # skip them rather than failing with IndexError.
            if len(row) > 3 and row[3] == "en":
                writer.writerow(row)

In [23]:
## only run if you need to remake english csv files
# parse_raw_csv_data(train_csv, train_csv_mod)
# parse_raw_csv_data(test_csv, test_csv_mod)

2153


In [24]:
# Manual workaround: the last line of the CSV file was deleted by hand before loading.
# That line was blank, and the file would not load while it was present
# (root cause not investigated).
train = pd.read_csv(train_csv_mod, encoding="utf-8")

In [25]:
train.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
3,fdcd1bd867,From Cockpit Country to St. Ann's Bay,From St. Ann's Bay to Cockpit Country.,en,English,2
4,7cfb3d272c,"Look, it's your skin, but you're going to be i...",The boss will fire you if he sees you slacking...,en,English,1


In [27]:
# Name of the pretrained checkpoint; build_model() loads the encoder from the same
# name, so the tokenizer and the model stay in sync.
model_name = "bert-base-cased"
# Module-level tokenizer used by encode_sentence(); the first call downloads the
# checkpoint's vocabulary file.
tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading: 100%|██████████| 213k/213k [00:00<00:00, 4.54MB/s]


In [35]:
def encode_sentence(s):
    """Return the vocabulary ids of ``s`` followed by the id of ``[SEP]``.

    The sentence is split with the module-level ``tokenizer``; a literal
    ``'[SEP]'`` token is added after the last piece before the whole list is
    converted to ids.
    """
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)

In [36]:
print(encode_sentence(train.premise[1]))

These are issues that we wrestle with in practice groups of law firms, she said. 
['These', 'are', 'issues', 'that', 'we', 'w', '##restle', 'with', 'in', 'practice', 'groups', 'of', 'law', 'firms', ',', 'she', 'said', '.', '[SEP]']
[1636, 1132, 2492, 1115, 1195, 192, 22713, 1114, 1107, 2415, 2114, 1104, 1644, 9780, 117, 1131, 1163, 119, 102]


In [37]:
def bert_encode(hypotheses, premises, tokenizer, max_len=None):
    """Encode sentence pairs into the three dense inputs of a BERT model.

    Each example is laid out as ``[CLS] first [SEP] second [SEP]``, where the
    first segment comes from ``hypotheses`` and the second from ``premises``.
    NOTE(review): the segments are assigned purely by argument position; the
    notebook's call passes the premises as the first argument, so there the
    premise ends up in the first segment.

    Args:
        hypotheses: Sequence of strings used for the first segment.
        premises: Sequence of strings used for the second segment; must have
            the same length as ``hypotheses``.
        tokenizer: Tokenizer providing ``tokenize`` and
            ``convert_tokens_to_ids``.  It is used for every token, so the
            ``[CLS]``/``[SEP]`` ids always come from the same vocabulary as
            the sentence ids.
        max_len: Optional fixed sequence length.  When given, every example is
            cut to at most ``max_len`` ids (the tail, including the final
            ``[SEP]``, is dropped for long pairs) and the outputs are padded
            with zeros to exactly ``max_len`` columns.  Pass the same value the
            model's ``Input`` layers were built with.  When ``None`` (the
            default), outputs are padded to the longest example.

    Returns:
        dict with the keys ``'input_word_ids'``, ``'input_mask'`` and
        ``'input_type_ids'``; each value is a dense 2-D tensor with one row
        per example.  The mask is 1 on real tokens and 0 on padding; the type
        ids are 0 for ``[CLS]`` and the first segment, 1 for the second.

    Raises:
        ValueError: if ``max_len`` is given and is smaller than 1.
    """
    if max_len is not None and max_len < 1:
        raise ValueError("max_len must be a positive integer or None")

    def _encode(sentence):
        # Token ids of one sentence followed by the [SEP] id, using the
        # tokenizer that was passed in (not a module-level one).
        tokens = list(tokenizer.tokenize(sentence))
        tokens.append('[SEP]')
        return tokenizer.convert_tokens_to_ids(tokens)

    sentence1 = tf.ragged.constant([_encode(s) for s in np.array(hypotheses)])
    sentence2 = tf.ragged.constant([_encode(s) for s in np.array(premises)])

    # One single-element [CLS] row per example, prepended to every pair.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1)

    # Still ragged here, so converting to dense below leaves 0 on padded slots.
    input_mask = tf.ones_like(input_word_ids)

    if max_len is None:
        dense_shape = None
    else:
        # Truncate rows that are too long, then pad every row to max_len.
        input_word_ids = input_word_ids[:, :max_len]
        input_mask = input_mask[:, :max_len]
        input_type_ids = input_type_ids[:, :max_len]
        dense_shape = [None, max_len]

    inputs = {
        'input_word_ids': input_word_ids.to_tensor(shape=dense_shape),
        'input_mask': input_mask.to_tensor(shape=dense_shape),
        'input_type_ids': input_type_ids.to_tensor(shape=dense_shape)}

    return inputs

In [38]:
train_input = bert_encode(train.premise.values, train.hypothesis.values, tokenizer)

['and', 'these', 'comments', 'were', 'considered', 'in', 'formula', '##ting', 'the', 'interim', 'rules', '.', '[SEP]']
['These', 'are', 'issues', 'that', 'we', 'w', '##restle', 'with', 'in', 'practice', 'groups', 'of', 'law', 'firms', ',', 'she', 'said', '.', '[SEP]']
['you', 'know', 'they', 'can', "'", 't', 'really', 'defend', 'themselves', 'like', 'somebody', 'grown', 'uh', 'say', 'my', 'age', 'you', 'know', 'yeah', '[SEP]']
['From', 'Co', '##ck', '##pit', 'Country', 'to', 'St', '.', 'Ann', "'", 's', 'Bay', '[SEP]']
['Look', ',', 'it', "'", 's', 'your', 'skin', ',', 'but', 'you', "'", 're', 'going', 'to', 'be', 'in', 'trouble', 'if', 'you', 'don', "'", 't', 'get', 'busy', '.', '[SEP]']
['"', 'If', 'you', 'people', 'only', 'knew', 'how', 'fatal', '##ly', 'easy', 'it', 'is', 'to', 'poison', 'some', 'one', 'by', 'mistake', ',', 'you', 'wouldn', "'", 't', 'joke', 'about', 'it', '.', '[SEP]']
['My', 'own', 'little', 'corner', 'of', 'the', 'world', ',', 'policy', 'won', '##king', ',', 'is'

In [39]:
max_len = 50

def build_model():
    """Build and compile a 3-class classifier on top of a pretrained BERT encoder.

    Reads the module-level ``model_name`` (checkpoint to load) and ``max_len``
    (fixed sequence length of the three integer inputs).  The input layer
    names match the keys of the dict returned by ``bert_encode``.

    Returns:
        A compiled ``tf.keras.Model`` whose output is a softmax over 3 classes.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")

    # First output of the encoder; indexed below as (batch, position, features).
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    # Classify from position 0 only, which is where bert_encode places [CLS].
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:, 0, :])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])

    return model

In [41]:
model = build_model()
model.summary()

Downloading: 100%|██████████| 433/433 [00:00<00:00, 217kB/s]
Downloading: 100%|██████████| 527M/527M [00:13<00:00, 39.7MB/s]
Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
Please report this to the TensorFlow team.

In [42]:
model.fit(train_input, train.label.values, epochs = 2, verbose = 1, batch_size = 16, validation_split = 0.2)

Epoch 1/2
The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted:  OOM when allocating tensor with shape[9984,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node functional_1/tf_bert_model/bert/encoder/layer_._5/attention/self/key/Tensordot/MatMul (defined at e:\Software\L3Mentorship\venv\lib\site-packages\transformers\models\bert\modeling_tf_bert.py:276) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[gradient_tape/functional_1/tf_bert_model/bert/embeddings/position_embeddings/embedding_lookup/Reshape/_536]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted:  OOM when allocating tensor with shape[9984,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node functional_1/tf_bert_model/bert/encoder/layer_._5/attention/self/key/Tensordot/MatMul (defined at e:\Software\L3Mentorship\venv\lib\site-packages\transformers\models\bert\modeling_tf_bert.py:276) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_29396]

Function call stack:
train_function -> train_function
