In [91]:
import tensorflow as tf
# tf.config.run_functions_eagerly(True)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import cv2
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn
import pathlib
import io, os, re, string, time, datetime
from numpy import random
import gensim.downloader as api
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense, Flatten, InputLayer, BatchNormalization, Input, Embedding, TextVectorization
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dropout, Conv1D, MultiHeadAttention, LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy, TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorboard.plugins import projector
import traceback

from datasets import load_dataset
from transformers import BertTokenizerFast
from transformers import DataCollatorWithPadding
from transformers import TFBertForSequenceClassification, TFBertModel
from transformers import create_optimizer

In [50]:
BATCH_SIZE = 64
SEQUENCE_LENGTH = 512 # defalt for BertTokenizerFast

# Data preparation

### Tokenizer
- Ex: The Weather of today is great!
- Standardization : the weather of today is great!
- word splitting : the, weather, of, today, is, great, !
- Subword level tokenization : [CLS], the, wea, ##ther, of, to, ##day, is, great, !, [SEP]
  - bpe
  - wordpiece
  - sentencepiece
- numericalization
- postprocessing

In [22]:
dataset_id = 'imdb'
dataset = load_dataset(dataset_id)

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [23]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [24]:
dataset['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

### [Tokenizer](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizerFast)

In [26]:
model_id = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [27]:
tokenizer.is_fast

True

In [28]:
tokenizer.tokenize('The Weather of Today is Great!')

['the', 'weather', 'of', 'today', 'is', 'great', '!']

In [29]:
tokenizer(['The Weather of Today is Great!'])

{'input_ids': [[101, 1996, 4633, 1997, 2651, 2003, 2307, 999, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [32]:
test_input_1 = 'The Weather of Today is Great! zwp'
test_input_2 = 'How are you doing?'
inputs = [test_input_1, test_input_2]

tokenizer.tokenize(inputs)

['the',
 'weather',
 'of',
 'today',
 'is',
 'great',
 '!',
 'z',
 '##w',
 '##p',
 'how',
 'are',
 'you',
 'doing',
 '?']

In [33]:
tokenizer(inputs, padding=True)

{'input_ids': [[101, 1996, 4633, 1997, 2651, 2003, 2307, 999, 1062, 2860, 2361, 102], [101, 2129, 2024, 2017, 2725, 1029, 102, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]}

In [34]:
output = tokenizer(inputs, padding=True)

In [35]:
tokenizer.decode(output['input_ids'][0])

'[CLS] the weather of today is great! zwp [SEP]'

In [36]:
tokenizer.decode(output['input_ids'][1])

'[CLS] how are you doing? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [51]:
def preprocess_function(examples):
    return tokenizer(examples["text"], padding=True, truncation=True) #, max_length=128)
    # return tokenizer(examples["text"], truncation=True, max_length=128) # for DataCollatorWithPadding

In [52]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [53]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [54]:
tokenized_dataset['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [55]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [73]:
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE,
    # collate_fn=data_collator
)

In [74]:
tf_test_dataset = tokenized_dataset["test"].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    shuffle=False,
    batch_size=BATCH_SIZE,
    # collate_fn=data_collator
)

In [78]:
def swap_positions(dataset):
    return {'input_ids': dataset['input_ids'],
            'token_type_ids': dataset['token_type_ids'],
            'attention_mask': dataset['attention_mask']}, \
            dataset['label']

In [77]:
tf_train_dataset = tf_train_dataset.map(swap_positions)
tf_test_dataset = tf_test_dataset.map(swap_positions)

In [79]:
tf_train_dataset

<_MapDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [80]:
for i in tf_train_dataset.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(64, 512), dtype=int64, numpy=
array([[  101, 10468,  1010, ...,     0,     0,     0],
       [  101,  2023,  2186, ...,     0,     0,     0],
       [  101,  2023,  2659, ...,  1012,  1026,   102],
       ...,
       [  101,  2023,  3185, ...,     0,     0,     0],
       [  101,  2023,  3185, ...,     0,     0,     0],
       [  101,  2043,  2009, ...,     0,     0,     0]])>, 'token_type_ids': <tf.Tensor: shape=(64, 512), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(64, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}, <tf.Tensor: shape=(64,), dtype=int64

# Modeling

In [60]:
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [63]:
model.layers[0].trainable=False
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 1538 (6.01 KB)
Non-trainable params: 109482240 (417.64 MB)
_________________________________________________________________


# Training

In [65]:
num_epochs = 3
batches_per_epoch = len(tokenized_dataset["train"]) // BATCH_SIZE
total_train_steps = int(batches_per_epoch * num_epochs)

In [66]:
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)



In [None]:
model.compile(
    # loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(2e-5), # optimizer
    metrics=['accuracy'],
)

In [None]:
model.evaluate(tf_test_dataset.take(1))

In [88]:
model.compile(
    # loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(2e-5), # optimizer
    metrics=['accuracy'],
)

In [89]:
history = model.fit(
    tf_train_dataset,
    validation_data=tf_test_dataset,
    epochs=1)



KeyboardInterrupt: 

### Based on TFBertModel

In [92]:
model = TFBertModel.from_pretrained("bert-base-uncased", num_labels=1)
model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
Total params: 109482240 (417.64 MB)
Trainable params: 109482240 (417.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [93]:
input_ids = Input(shape=(512,), dtype=tf.int64, name='input_ids')
token_type_ids = Input(shape=(512,), dtype=tf.int64, name='token_type_ids')
attention_mask = Input(shape=(512,), dtype=tf.int64, name='attention_mask')

x = model([input_ids, token_type_ids, attention_mask])
print(x)
x = Dense(128, activation='relu')(x[0][:, 0, :])
output = Dense(1, activation='sigmoid', name='label')(x)

custom_bert = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=output)

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<KerasTensor: shape=(None, 512, 768) dtype=float32 (created by layer 'tf_bert_model')>, pooler_output=<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'tf_bert_model')>, past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)


In [94]:
custom_bert.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 512)]                0         []                            
                                                                                                  
 token_type_ids (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                                  
 attention_mask (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                              

In [97]:
custom_bert.compile(
    # loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(2e-5), # optimizer
    metrics=['accuracy'],
)

In [98]:
custom_bert.evaluate(tf_test_dataset.take(1))



[0.0, 0.046875]

In [105]:
custom_bert.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(2e-5), # optimizer
    metrics=['accuracy'],
)

In [106]:
history = custom_bert.fit(
    tf_train_dataset,
    validation_data=tf_test_dataset,
    epochs=1)









  1/391 [..............................] - ETA: 6:40:41 - loss: 0.6845 - accuracy: 0.5938

KeyboardInterrupt: 