In [11]:
import tensorflow as tf
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))
tf.config.experimental.set_memory_growth(physical_devices[0], True)

Num GPUs Available:  1


In [12]:
!nvidia-smi -L

GPU 0: NVIDIA GeForce GTX 1050 Ti (UUID: GPU-ada268bc-35a4-5a51-f812-70e7584578bb)


In [13]:
from transformers import TFAutoModel
from transformers import BertTokenizer

bert128k = TFAutoModel.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")
#bert = TFAutoModel.from_pretrained("dbmdz/bert-base-turkish-uncased")
#distilbert = TFAutoModel.from_pretrained("dbmdz/distilbert-base-turkish-cased")

Some layers from the model checkpoint at dbmdz/bert-base-turkish-128k-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [14]:
#bert128k.summary()

In [15]:
bert128k.summary()

Model: "tf_bert_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  184345344 
Total params: 184,345,344
Trainable params: 184,345,344
Non-trainable params: 0
_________________________________________________________________


In [16]:
import tensorflow as tf

# two input layers, we ensure layer name variables match to dictionary keys in TF dataset
input_ids = tf.keras.layers.Input(shape=(160,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(160,), name='attention_mask', dtype='int32')

# we access the transformer model within our bert object using the bert attribute (eg bert.bert instead of bert)
embeddings = bert128k.bert(input_ids, attention_mask=mask)[1]  # access final activations (alread max-pooled) [1]
# convert bert embeddings into 6 output classes
x = tf.keras.layers.Dropout(0.2)(embeddings)
x = tf.keras.layers.Dense(512, activation='relu')(embeddings) # (x)
outputs = tf.keras.layers.Dense(6, activation='softmax', name='outputs')(x)

# model
model = tf.keras.Model(inputs=[input_ids, mask], outputs=outputs)

In [17]:
model.layers[2].trainable=False
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 160)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 184345344   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 512)          393728      bert[0][1]                 

In [18]:
optimizer = tf.keras.optimizers.Adam(lr=2e-5, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, 
              loss=loss, 
              metrics=[acc])

In [19]:
element_spec = ({'input_ids': tf.TensorSpec(shape=(16, 160), dtype=tf.int32, name=None),
                 'attention_mask': tf.TensorSpec(shape=(16, 160), dtype=tf.int32, name=None)},
                tf.TensorSpec(shape=(16, 6), dtype=tf.float64, name=None))

# load the training and validation sets
train_ds = tf.data.experimental.load('train', element_spec=element_spec)
val_ds = tf.data.experimental.load('val', element_spec=element_spec)

# view the input format
train_ds.take(1)

<TakeDataset shapes: ({input_ids: (16, 160), attention_mask: (16, 160)}, (16, 6)), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.float64)>

In [20]:
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.prefetch(tf.data.AUTOTUNE)
   
train_ds

<PrefetchDataset shapes: ({input_ids: (16, 160), attention_mask: (16, 160)}, (16, 6)), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.float64)>

In [21]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=3,
    verbose=1
)

Epoch 1/3
Epoch 2/3
Epoch 3/3
