In [29]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report

from transformers import BertTokenizer, TFBertModel

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import *

import tensorflow_text as text
import tensorflow_addons as tfa

from transformers import create_optimizer

# Only let TensorFlow's logger emit records of severity ERROR or higher.
tf.get_logger().setLevel('ERROR')

In [7]:
%%capture
!pip install tensorflow-addons tensorflow-text

In [9]:
# Turn off Jedi-based tab completion in IPython's completer.
%config Completer.use_jedi = False
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [10]:
# Notebook-wide settings.
max_seq_length = 128  # tokens per caption after padding / truncation
batch_size = 32       # examples per batch (not referenced in the cells shown here)
num_epochs = 12       # training epochs (not referenced in the cells shown here)

In [13]:
%%capture
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [28]:
def tokenize(caption):
    """Encode ``caption`` with the module-level BERT ``tokenizer``.

    The text is truncated and padded to exactly ``max_seq_length`` tokens and
    the encoding is requested as TensorFlow tensors.

    Args:
        caption: Text to encode (passed straight to the tokenizer).

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids)`` taken from the
        tokenizer output.
    """
    encoding = tokenizer(
        caption,
        max_length=max_seq_length,
        padding='max_length',
        truncation=True,
        return_tensors="tf",
    )
    return tuple(
        encoding[field]
        for field in ("input_ids", "attention_mask", "token_type_ids")
    )

def preprocess_dataset(dataset):
    """Tokenize the ``text`` column of ``dataset`` into fixed-length BERT inputs.

    Args:
        dataset: pandas DataFrame with a ``text`` column holding one caption
            per row. The frame is only read; its index is left untouched.

    Returns:
        Tuple ``(input_ids, attention_masks, token_type_ids)`` of int32 NumPy
        arrays, each of shape ``(len(dataset), max_seq_length)``. Row ``i``
        holds the encoding of the i-th row of ``dataset`` (positional order,
        regardless of the frame's index labels).
    """
    shape = (len(dataset), max_seq_length)

    # Token ids and masks are integers, and the model's Input layers are
    # declared as int32, so allocate int32 instead of NumPy's float64 default.
    input_ids = np.zeros(shape, dtype=np.int32)
    attention_masks = np.zeros(shape, dtype=np.int32)
    token_type_ids = np.zeros(shape, dtype=np.int32)

    # Enumerate the column positionally: this gives 0..n-1 row numbers without
    # resetting the caller's index in place, and avoids building a Series per
    # row as iterrows() does.
    for i, caption in enumerate(dataset["text"]):
        input_ids[i, :], attention_masks[i, :], token_type_ids[i, :] = tokenize(caption)

    return input_ids, attention_masks, token_type_ids

In [21]:
def build_model(trainable_encoder=False):
    """Wrap the pre-trained BERT encoder into a Keras feature extractor.

    The model takes the three tokenizer outputs (int32, ``max_seq_length``
    tokens per example) and returns the maximum of BERT's ``last_hidden_state``
    over the sequence axis, i.e. one feature vector per example.

    Args:
        trainable_encoder: Whether the BERT weights may be updated during
            training. Defaults to ``False`` (encoder frozen).

    Returns:
        A ``tf.keras.Model`` whose inputs are keyed ``input_ids``,
        ``attention_mask`` and ``token_type_ids``.

    Side effects:
        Sets ``trainable`` on the shared module-level ``bert_model``.
    """
    input_ids = tf.keras.Input(shape=(max_seq_length, ), dtype='int32', name="input_ids")
    attention_mask = tf.keras.Input(shape=(max_seq_length, ), dtype='int32', name="attention_mask")
    token_type_ids = tf.keras.Input(shape=(max_seq_length, ), dtype='int32', name="token_type_ids")

    encoded_output = bert_model({"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids})
    # NOTE(review): the pooling runs over every position, padding included;
    # attention_mask is only handed to the encoder, not to the pooling layer.
    output = GlobalMaxPooling1D(name="global_max_pooling1d")(encoded_output["last_hidden_state"])

    model = tf.keras.models.Model(inputs={"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids}, outputs=output)

    # Freeze (or unfreeze) the encoder by addressing it directly. A positional
    # lookup such as model.layers[2] depends on Keras' internal layer ordering,
    # and with three Input layers the encoder is not expected to sit at index 2.
    bert_model.trainable = trainable_encoder
    return model

In [22]:
# Build the feature extractor with the default trainable_encoder=False.
model = build_model()

In [30]:
# Collect the paths of all caption CSV files under data/captions/.
caption_files = glob.glob("data/captions/*.csv")

In [45]:
# Directory that receives one .npy feature file per caption CSV.
output_dir = "data/features/text"
# np.save does not create missing parent directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

for caption_file in caption_files:
    # Derive "<stem>.npy" with os.path so that only the final extension is
    # swapped and the result does not depend on "/" being the separator that
    # glob returned.
    stem = os.path.splitext(os.path.basename(caption_file))[0]
    output_file = f"{output_dir}/{stem}.npy"

    caption_data = pd.read_csv(caption_file)

    input_ids, attention_mask, token_type_ids = preprocess_dataset(caption_data)

    # One pooled feature vector per caption row, in CSV row order.
    features = model.predict({
        "input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids
    })

    np.save(output_file, features)

In [46]:
# Hand the generated feature files to uid:gid 1000 - presumably the host user
# when the notebook runs as root inside a container; TODO confirm.
!chown -R 1000:1000 data/features/text/*