# Text classification model based on BERT and LSTM

In [None]:
# Install BERT for tf2 module
%pip install bert-for-tf2
# Install sentencepiece library for text cleaning
%pip install sentencepiece

In [4]:
%pip install --upgrade tensorflow-hub

Collecting tensorflow-hub
  Obtaining dependency information for tensorflow-hub from https://files.pythonhosted.org/packages/6e/1a/fbae76f4057b9bcdf9468025d7a8ca952dec14bfafb9fc0b1e4244ce212f/tensorflow_hub-0.15.0-py2.py3-none-any.whl.metadata
  Downloading tensorflow_hub-0.15.0-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading tensorflow_hub-0.15.0-py2.py3-none-any.whl (85 kB)
   ---------------------------------------- 85.4/85.4 kB 2.4 MB/s eta 0:00:00
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.15.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
from tensorflow import keras

In [21]:
%conda install -c conda-forge cudatoolkit=11.2 cudnn=8.1.0
# Anything above 2.10 is not supported on the GPU on Windows Native
%python -m pip install "tensorflow=2.10"

Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.




  current version: 23.1.0
  latest version: 23.9.0

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.9.0


UsageError: Line magic function `%python` not found (But cell magic `%%python` exists, did you mean that instead?).


In [22]:

import tensorflow

In [25]:
from keras.callbacks import CSVLogger

In [77]:
# Import all necessary libraries
try:
    %tensorflow_version 2.x
except Exception:
    pass

import tensorflow as tf
import tensorflow_hub as hub
from keras import layers
from keras import callbacks
from keras import optimizers
from keras import utils
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
import bert
import os
import numpy as np
import re
import pandas as pd
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Import the training and test .csv files
colnames=['image_path', 'text', 'food']
train = pd.read_csv('train_titles.csv', names=colnames, header=None, sep = ',', index_col=['image_path'])
test = pd.read_csv('test_titles.csv', names=colnames, header=None, sep = ',', index_col=['image_path'])

In [3]:
# Sort values by 'image_path'
test = test.sort_values('image_path')
train = train.sort_values('image_path')

In [4]:
train.head()

Unnamed: 0_level_0,text,food
image_path,Unnamed: 1_level_1,Unnamed: 2_level_1
apple_pie_0.jpg,Apple pie - Wikipedia,apple_pie
apple_pie_100.jpg,Glazed Apple Pie Squares Recipe | Taste of Home,apple_pie
apple_pie_101.jpg,Mock Apple Pie Recipe - Allrecipes.com,apple_pie
apple_pie_102.jpg,Crock-Pot Ladies Crock-Pot Apple Pie Moonshine,apple_pie
apple_pie_104.jpg,All-Star Apple Pie Recipe | Taste of Home,apple_pie


In [5]:
# Check the shapes
print("train samples:",train.shape[0])
print("test samples:",test.shape[0])

train samples: 67972
test samples: 22716


In [6]:
# Cleaning text function

def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    sentence = sentence.lower()

    return sentence

def remove_tags(text):
    return TAG_RE.sub('', text)

TAG_RE = re.compile(r'<[^>]+>')
vec_preprocess_text = np.vectorize(preprocess_text)

In [7]:
# Check number of classes
nClasses = train.food.nunique()
print(nClasses)

101


In [8]:
encoder = LabelEncoder()
processed_train = vec_preprocess_text(train.text.values)
processed_test = vec_preprocess_text(test.text.values)


encoded_labels_train = encoder.fit_transform(train.food.values)
labels_train = utils.to_categorical(encoded_labels_train, nClasses)

encoded_labels_test = encoder.fit_transform(test.food.values)
labels_test = utils.to_categorical(encoded_labels_test, nClasses)

print("Processed text sample:", processed_train[0])
print("Shape of train labels:", labels_train.shape)

Processed text sample: apple pie wikipedia
Shape of train labels: (67972, 101)


In [10]:
# Import the BERT BASE model from Tensorflow HUB (layer, vocab_file and tokenizer)
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [11]:
# Preprocessing of texts according to BERT

def get_masks(text, max_length):
    """Mask for padding"""
    tokens = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    length = len(tokens)
    if length > max_length:
        tokens = tokens[:max_length]

    return np.asarray([1]*len(tokens) + [0] * (max_length - len(tokens)))
vec_get_masks = np.vectorize(get_masks, signature = '(),()->(n)')

def get_segments(text, max_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    tokens = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    length = len(tokens)
    if length > max_length:
        tokens = tokens[:max_length]
    
    segments = []
    current_segment_id = 0
    with_tags = ["[CLS]"] + tokens + ["[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return np.asarray(segments + [0] * (max_length - len(tokens)))
vec_get_segments = np.vectorize(get_segments, signature = '(),()->(n)')

def get_ids(text, tokenizer, max_length):
    """Token ids from Tokenizer vocab"""
    tokens = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    length = len(tokens)
    if length > max_length:
        tokens = tokens[:max_length]

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = np.asarray(token_ids + [0] * (max_length-length))
    return input_ids
vec_get_ids = np.vectorize(get_ids, signature = '(),(),()->(n)')


def prepare(text_array, tokenizer, max_length = 128):
    
    ids = vec_get_ids(text_array, 
                      tokenizer, 
                      max_length).squeeze()
    masks = vec_get_masks(text_array,
                      max_length).squeeze()
    segments = vec_get_segments(text_array,
                      max_length).squeeze()

    return ids, segments, masks

In [63]:
max_length =40 # that must be set according to your dataset
ids_train, segments_train, masks_train = prepare(processed_train,
                                                 tokenizer,
                                                 max_length)
ids_test, segments_test, masks_test = prepare(processed_test, 
                                               tokenizer,
                                               max_length)

In [64]:
input_word_ids = layers.Input(shape=(max_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = layers.Input(shape=(max_length,), dtype=tf.int32,
                                   name="input_masks")
segment_ids = layers.Input(shape=(max_length,), dtype=tf.int32,
                                    name="segment_ids")
den_out, seq_out = bert_layer([input_word_ids, input_mask, segment_ids])

In [65]:
# Classification Model
input_word_ids = layers.Input(shape=(max_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = layers.Input(shape=(max_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = layers.Input(shape=(max_length,), dtype=tf.int32,
                                    name="segment_ids")
den_out, seq_out = bert_layer([input_word_ids, input_mask, segment_ids])

X = layers.LSTM(128)(seq_out)
X = layers.Dropout(0.5)(X)
X = layers.Dense(256, activation="relu")(X)
X = layers.Dropout(0.5)(X)
output = layers.Dense(nClasses, activation = 'softmax')(X)

model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[output])

In [66]:
# Adam optimizer
opt = optimizers.Adam(lr=.001)

# Compile model
model.compile(loss = 'categorical_crossentropy',
              optimizer = opt,
              metrics = ['accuracy'])



In [67]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_word_ids (InputLayer  [(None, 40)]                 0         []                            
 )                                                                                                
                                                                                                  
 input_mask (InputLayer)     [(None, 40)]                 0         []                            
                                                                                                  
 segment_ids (InputLayer)    [(None, 40)]                 0         []                            
                                                                                                  
 keras_layer (KerasLayer)    multiple                     1094822   ['input_word_ids[0][0]',

In [68]:
es = callbacks.EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True)

In [69]:
# Setup callbacks, logs and early stopping condition
checkpoint_path = "BERT_LSTM/weights-improvement-{epoch:02d}-{val_accuracy:.2f}.hdf5"
cp = keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='val_accuracy',save_best_only=True,verbose=1, mode='max')
csv_logger = keras.callbacks.CSVLogger('BERT_LSTM/BERT_LSTM.log')
es = keras.callbacks.EarlyStopping(patience = 3, restore_best_weights=True)

In [70]:
# Reduce learning rate if no improvement is observed
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_accuracy', factor=0.1, patience=1, min_lr=0.00001)

In [72]:
history = model.fit([ids_train, masks_train, segments_train], 
          labels_train,
          epochs = 1,
          batch_size = 512,
          validation_split = 0.3,
          callbacks = [csv_logger, reduce_lr]
          )



In [73]:
# Load the log file
df = pd.read_csv('BERT_LSTM/BERT_LSTM.log')

In [74]:
# Training and Test accuracy
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['epoch'], y=df['accuracy'],
                    mode='lines',
                    name='training'))

fig.add_trace(go.Scatter(x=df['epoch'], y=df['val_accuracy'],
                    mode='lines',
                    name='test'))

fig.update_layout(
    font_size = 20,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_xaxes(showgrid=True, gridwidth=0.5, gridcolor='Gray')
fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor='Gray')

In [75]:
# Training and Test loss
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['epoch'], y=df['loss'],
                    mode='lines',
                    name='training'))

fig.add_trace(go.Scatter(x=df['epoch'], y=df['val_loss'],
                    mode='lines',
                    name='test'))

fig.update_layout(
    font_size = 20,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_xaxes(showgrid=True, gridwidth=0.5, gridcolor='Gray')
fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor='Gray')

In [76]:
model.evaluate([ids_test, masks_test, segments_test],
               labels_test, 
               batch_size = 512)



[4.200828552246094, 0.534337043762207]