# 

- **Questions**: [Here](../data/homework_4/HW.pdf)
- **Answer Set** : 04
- **Full Name** : Fatemeh Karimi Barikarasfi
- **Student Code** : 610301060

The goal of this homework is IMDB Sentiment Classification using Implemented Transformer Encoder and Pre-Trained Bert Model.

[Dataset](https://drive.google.com/drive/folders/1YJvoIpInw0fYz2fjHSR65mjs6IYFZUvN)

## Importing Needed Libraries

In [6]:
import numpy as np
import pickle as pk
import tensorflow as tf
import tensorflow_hub as tfh
import tensorflow_text as tft
import tensorflow_models as tfm
import matplotlib.pyplot as plt

## Importing and Preprocessing data



In [8]:
with open('/gdrive/MyDrive/unsupervised.pickle', 'rb') as f:
    x_unsupervised = pk.load(f)

with open('/gdrive/MyDrive/x_train.pickle', 'rb') as f:
    x_train = pk.load(f)

with open('/gdrive/MyDrive/x_val.pickle', 'rb') as f:
    x_valid = pk.load(f)

with open('/gdrive/MyDrive/x_test.pickle', 'rb') as f:
    x_test = pk.load(f)

y_train = np.loadtxt('/gdrive/MyDrive/y_train.txt', dtype='int32')
y_valid = np.loadtxt('/gdrive/MyDrive/y_val.txt', dtype='int32')

unwanted_digit=['0','1','2','3','4','5','6','7','8','9']

for digit in unwanted_digit:
    x_unsupervised = [sent.replace(digit, "") for sent in x_unsupervised]
    x_train = [sent.replace(digit, "") for sent in x_train]
    x_valid = [sent.replace(digit, "") for sent in x_valid]
    x_test = [sent.replace(digit, "") for sent in x_test]

unwanted_punc=['<br />','...',',','"','=','@','&','%',',',':','\\','$','^','<','>','{','}',';','\n','\t','(',')','[',']','/','*','+','#','-','_','|']

for punc in unwanted_punc:
    x_unsupervised = [sent.replace(punc, "") for sent in x_unsupervised]
    x_train = [sent.replace(punc, "") for sent in x_train]
    x_valid = [sent.replace(punc, "") for sent in x_valid]
    x_test = [sent.replace(punc, "") for sent in x_test]

x_unsupervised = [sent.lower() for sent in x_unsupervised]
x_train = [sent.lower() for sent in x_train]
x_valid = [sent.lower() for sent in x_valid]
x_test = [sent.lower() for sent in x_test]

trainset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).map(lambda x, y: (x, tf.one_hot(y, 2))).batch(32)
validset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid)).map(lambda x, y: (x, tf.one_hot(y, 2))).batch(32)
testset = tf.data.Dataset.from_tensor_slices(x_test).batch(32)

## BPE

In [5]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset

vocab = bert_vocab_from_dataset.bert_vocab_from_dataset(
    tf.data.Dataset.from_tensor_slices(x_unsupervised),
    vocab_size = 8000,
    learn_params = {},
    reserved_tokens = ["[PAD]", "[UNK]", "[MASK]", "[START]", "[END]"],
    bert_tokenizer_params = dict(lower_case=True)
)

with open('/gdrive/MyDrive/vocab.txt', 'w') as f:
    for token in vocab:
        print(token, file=f)

## Transformer encoder

In [9]:
class TransformerProcessor(tf.keras.layers.Layer):
    def __init__(self, max_length=500):
        super().__init__()

        self.max_length = max_length
        self.tokenizer = tft.BertTokenizer('/gdrive/MyDrive/vocab.txt')
        self.trimmer = tft.RoundRobinTrimmer(max_seq_length=max_length)

    def call(self, x):
        # input_type_ids, input_word_ids, input_mask
        tokens = self.tokenizer.tokenize(x).merge_dims(1, -1)
        tokens = self.trimmer.trim([tokens])[0]

        words, mask = tft.pad_model_inputs(tokens, max_seq_length=self.max_length)

        return {
            "input_mask": tf.cast(mask, tf.int32),
            "input_word_ids": tf.cast(words, tf.int32),
            # "input_type_ids": tf.cast(tf.zeros(words.shape), tf.int32),
        }

class TransformerEncoder(tf.keras.layers.Layer):
    def __init__(self, embed_size, num_layers, num_heads, dense_size, max_length=500):
        super().__init__()

        self.embedding = tf.keras.layers.Embedding(input_dim=8000, output_dim=embed_size, mask_zero=True)
        self.position = tfm.nlp.layers.PositionEmbedding(max_length=max_length)
        self.encoder = tfm.nlp.models.TransformerEncoder(num_layers=num_layers, num_attention_heads=num_heads, intermediate_size=dense_size)
        self.pooler = tf.keras.layers.GlobalAveragePooling1D()

    def call(self, inputs):
        # input_type_ids, input_word_ids, input_mask
        outputs = self.embedding(inputs["input_word_ids"])
        outputs = tf.keras.layers.add([outputs, self.position(outputs)])
        outputs = self.encoder(outputs)
        outputs = self.pooler(outputs)

        return outputs

#### Model 1 (EmbedSize: 128, NumLayer: 2, NumHead: 2)

In [10]:
input = tf.keras.Input(shape=(), dtype=tf.string)
output = TransformerProcessor(max_length=500)(input)
output = TransformerEncoder(embed_size=128, num_layers=2, num_heads=2, dense_size=50, max_length=500)(output)
output = tf.keras.layers.Dropout(0.1)(output)
output = tf.keras.layers.Dense(2, activation='softmax')(output)

model_1 = tf.keras.Model(inputs=input, outputs=output)

model_1.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

print(model_1.summary())


model_1.fit(trainset, validation_data=validset, epochs=5)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 transformer_processor (Tra  {'input_mask': (None, 500)   0         ['input_1[0][0]']             
 nsformerProcessor)          , 'input_word_ids': (None,                                           
                              500)}                                                               
                                                                                                  
 transformer_encoder (Trans  (None, 128)                  1246308   ['transformer_processor[0][0]'
 formerEncoder)                                                     , 'transformer_processor[0

<keras.src.callbacks.History at 0x7f3cb875eda0>

In [11]:
print(model_1.evaluate(validset))

[0.6222927570343018, 0.8571199774742126]


#### Model 2 (EmbedSize: 128, NumLayer: 4, NumHead: 4)

In [12]:
input = tf.keras.Input(shape=(), dtype=tf.string)
output = TransformerProcessor(max_length=500)(input)
output = TransformerEncoder(embed_size=128, num_layers=4, num_heads=4, dense_size=50, max_length=500)(output)
output = tf.keras.layers.Dropout(0.1)(output)
output = tf.keras.layers.Dense(2, activation='softmax')(output)

model_2 = tf.keras.Model(inputs=input, outputs=output)

model_2.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

print(model_2.summary())


model_2.fit(trainset, validation_data=validset, epochs=5)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 transformer_processor_1 (T  {'input_mask': (None, 500)   0         ['input_2[0][0]']             
 ransformerProcessor)        , 'input_word_ids': (None,                                           
                              500)}                                                               
                                                                                                  
 transformer_encoder_2 (Tra  (None, 128)                  1404360   ['transformer_processor_1[0][0
 nsformerEncoder)                                                   ]',                     

<keras.src.callbacks.History at 0x7f3ca8992740>

In [13]:
print(model_2.evaluate(validset))

[0.544844388961792, 0.8555999994277954]


#### Model 3 (EmbedSize: 128, NumLayer: 8, NumHead: 8)

In [14]:
input = tf.keras.Input(shape=(), dtype=tf.string)
output = TransformerProcessor(max_length=500)(input)
output = TransformerEncoder(embed_size=128, num_layers=8, num_heads=8, dense_size=50, max_length=500)(output)
output = tf.keras.layers.Dropout(0.1)(output)
output = tf.keras.layers.Dense(2, activation='softmax')(output)

model_3 = tf.keras.Model(inputs=input, outputs=output)

model_3.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

print(model_3.summary())

model_3.fit(trainset, validation_data=validset, epochs=5)

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 transformer_processor_2 (T  {'input_mask': (None, 500)   0         ['input_3[0][0]']             
 ransformerProcessor)        , 'input_word_ids': (None,                                           
                              500)}                                                               
                                                                                                  
 transformer_encoder_4 (Tra  (None, 128)                  1720464   ['transformer_processor_2[0][0
 nsformerEncoder)                                                   ]',                     

<keras.src.callbacks.History at 0x7f3ca5c14970>

In [15]:
print(model_3.evaluate(validset))

[0.5002080798149109, 0.857200026512146]


In [16]:
np.savetxt("/gdrive/MyDrive/y_test.txt", np.argmax(model_1.predict(testset), axis=1), fmt="%d")



## Masked LM BERT

In [17]:
preprocessor = tfh.load("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")

mask_id = preprocessor.tokenize.get_special_tokens_dict()["mask_id"].numpy()
padding_id = preprocessor.tokenize.get_special_tokens_dict()["padding_id"].numpy()
end_of_segment_id = preprocessor.tokenize.get_special_tokens_dict()["end_of_segment_id"].numpy()
start_of_sequence_id = preprocessor.tokenize.get_special_tokens_dict()["start_of_sequence_id"].numpy()
vocab_size = preprocessor.tokenize.get_special_tokens_dict()["vocab_size"].numpy()

selector = tft.RandomItemSelector(max_selections_per_batch=20, selection_rate=0.2, unselectable_ids=[mask_id, padding_id, end_of_segment_id, start_of_sequence_id])
chooser = tft.MaskValuesChooser(vocab_size=vocab_size, mask_token=mask_id, mask_token_rate=0.8, random_token_rate=0.1)

def preprocess(x):
    input = preprocessor(x)

    masked_input_ids, masked_lm_positions, masked_lm_ids = tft.mask_language_model(
        input_ids=tf.RaggedTensor.from_tensor(input["input_word_ids"]),
        item_selector=selector,
        mask_values_chooser=chooser
    )

    words, _ = tft.pad_model_inputs(masked_input_ids, max_seq_length=128)
    positions, _ = tft.pad_model_inputs(masked_lm_positions, max_seq_length=20)
    reals, _ = tft.pad_model_inputs(masked_lm_ids, max_seq_length=20)

    input["input_word_ids"] = tf.cast(words, tf.int32)
    input["masked_lm_positions"] = tf.cast(positions, tf.int32)

    return (input, tf.one_hot(reals, vocab_size))

mlmset = tf.data.Dataset.from_tensor_slices(x_unsupervised).batch(32).map(lambda x: preprocess(x))

tf.saved_model.save(preprocessor, "./bert_preprocessor")

In [20]:
bert = tfh.load("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/2")

mask = tf.keras.Input(shape=(128), name="input_mask", dtype=tf.int32)
words = tf.keras.Input(shape=(128), name="input_word_ids", dtype=tf.int32)
types = tf.keras.Input(shape=(128), name="input_type_ids", dtype=tf.int32)
positions = tf.keras.Input(shape=(20), name="masked_lm_positions", dtype=tf.int32)
output = tfh.KerasLayer(bert.mlm, trainable=True)({"input_mask": mask, "input_word_ids": words, "input_type_ids": types, "masked_lm_positions": positions})

model_mlm = tf.keras.Model(inputs=[mask, words, types, positions], outputs=output["mlm_logits"])

model_mlm.compile(
    optimizer="adam",
    loss="categorical_crossentropy"
)

print(model_mlm.summary())

model_mlm.fit(mlmset, epochs=5)

tf.saved_model.save(bert, "./bert_model")

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_mask (InputLayer)     [(None, 128)]                0         []                            
                                                                                                  
 input_type_ids (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 input_word_ids (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                            

## Model BERT Non-Trainable (SeqLen: 128, NumLayer: 4, NumHead: 4)

In [21]:
input = tf.keras.Input(shape=(), dtype=tf.string)
output = tfh.KerasLayer("./bert_preprocessor")(input)
output = tfh.KerasLayer("./bert_model", trainable=False)(output)
output = tf.keras.layers.Dropout(0.1)(output["pooled_output"])
output = tf.keras.layers.Dense(2, activation='softmax')(output)

model_4 = tf.keras.Model(inputs=input, outputs=output)

model_4.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

print(model_4.summary())

model_4.fit(trainset, validation_data=validset, epochs=5)

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 keras_layer_3 (KerasLayer)  {'input_type_ids': (None,    0         ['input_4[0][0]']             
                             128),                                                                
                              'input_word_ids': (None,                                            
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             }                                                              

<keras.src.callbacks.History at 0x7f3bc07906d0>

In [26]:
np.savetxt("/gdrive/MyDrive/y_test2.txt", np.argmax(model_4.predict(testset), axis=1), fmt="%d")



## Model BERT Trainable (SeqLen: 128, NumLayer: 4, NumHead: 4)

<div dir="rtl">
در این قسمت مدل بخش 5 به عنوان استخراج کننده ویژگی به صورت قابل آموزش استفاده خواهد شد.
 </div>

In [22]:
input = tf.keras.Input(shape=(), dtype=tf.string)
output = tfh.KerasLayer("./bert_preprocessor")(input)
output = tfh.KerasLayer("./bert_model", trainable=False)(output)
output = tf.keras.layers.Dropout(0.1)(output["pooled_output"])
output = tf.keras.layers.Dense(2, activation='softmax')(output)

model_5 = tf.keras.Model(inputs=input, outputs=output)

model_5.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

print(model_5.summary())

model_5.fit(trainset, validation_data=validset, epochs=5)

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 keras_layer_5 (KerasLayer)  {'input_word_ids': (None,    0         ['input_5[0][0]']             
                             128),                                                                
                              'input_type_ids': (None,                                            
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             }                                                              

<keras.src.callbacks.History at 0x7f3b64d29630>

In [23]:
np.savetxt("/gdrive/MyDrive/y_test3.txt", np.argmax(model_5.predict(testset), axis=1), fmt="%d")

