This notebook explores a neural network model built on top of BERT. We also conduct finetuning of the underlying bert weights in addition to the neural network weights. This model does not use any up/down sampling to address the class imbalance. Instead it uses the class weights, to intialize the bias to push the model towards favoring the minority class more. 

Also given the size of the model we introduce global quantization to reduce precision from 32 to 16 bit precision. 

# Installs

In [1]:
# !pip install pyspark
from pyspark.sql import SparkSession
import os
import findspark
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" 
os.environ["SPARK_HOME"] = '/home/ubuntu/spark-3.5.1-bin-hadoop3'

spark = SparkSession.builder \
        .appName("Spark") \
        .master("local[*]") \
        .config("spark.driver.memory", "120G") \
        .getOrCreate()


print("Apache Spark version: ", spark.version)
print("Apache Spark version: ", spark.version) 
findspark.init()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/23 16:25:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Apache Spark version:  3.5.1
Apache Spark version:  3.5.1


In [2]:
# !pip install torch
# !pip install findspark
import findspark
findspark.init()
import torch
print("Pytorch CUDA Available =", torch.cuda.is_available())
print("Pytorch CUDA Device Count =", torch.cuda.device_count())
print("Pytorch CUDA Current Device =", torch.cuda.current_device())
print("Pytorch CUDA Current Device Name =", torch.cuda.get_device_name(torch.cuda.current_device()))

Pytorch CUDA Available = True
Pytorch CUDA Device Count = 4
Pytorch CUDA Current Device = 0
Pytorch CUDA Current Device Name = NVIDIA A10G


In [3]:
!pip install -q --upgrade keras-nlp
!pip install -q --upgrade keras # Upgrade to Keras 3.
# from google.colab import drive
# drive.mount('/content/drive/')
import tensorflow as tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers 
import keras_nlp
import pandas as pd
import numpy as np
import pyspark.sql.functions as F

In [5]:
import os
import tensorflow as tf
from tensorflow.keras.mixed_precision import set_global_policy

# Set TensorFlow to use TensorFlow as backend
os.environ["KERAS_BACKEND"] = "tensorflow"

# Configure the GPU memory and set logical devices before initializing them
physical_devices = tf.config.list_physical_devices("GPU")
if physical_devices:
    try:
        # Assuming you have one GPU available, split it into two logical devices
        tf.config.set_logical_device_configuration(
            physical_devices[0],
            [
                tf.config.LogicalDeviceConfiguration(memory_limit=15360 // 2),
                tf.config.LogicalDeviceConfiguration(memory_limit=15360 // 2),
            ]
        )
    except RuntimeError as e:
        print(e)

# Now initialize the logical devices
logical_devices = tf.config.list_logical_devices("GPU")
print(logical_devices)

base_batch_size = 32
base_learning_rate = 1e-4

# Initialize the distributed training strategy after setting up logical devices
strategy = tf.distribute.MirroredStrategy()
print(f"Number of devices: {strategy.num_replicas_in_sync}")

scaled_batch_size = base_batch_size * strategy.num_replicas_in_sync 
print(scaled_batch_size)
scaled_learning_rate = base_learning_rate * strategy.num_replicas_in_sync
print(scaled_learning_rate)

strategy = tf.distribute.MirroredStrategy()
print(f"Number of devices: {strategy.num_replicas_in_sync}")

# Set the global policy to mixed_float16
set_global_policy('mixed_float16')

[LogicalDevice(name='/device:GPU:0', device_type='GPU'), LogicalDevice(name='/device:GPU:1', device_type='GPU'), LogicalDevice(name='/device:GPU:2', device_type='GPU'), LogicalDevice(name='/device:GPU:3', device_type='GPU'), LogicalDevice(name='/device:GPU:4', device_type='GPU')]
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4')


Number of devices: 5
160
0.0005
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4')


Number of devices: 5


# Load the dataset

In [6]:
train_merged_data_conversations_path = 's3a://capstone210/data/train_merged_data_conversations/'
test_merged_data_conversations_path = 's3a://capstone210/data/test_merged_data_conversations/'

#latest
df_train = spark.read.parquet(train_merged_data_conversations_path)
df_test = spark.read.parquet(test_merged_data_conversations_path)

#latest
df_train = df_train.filter((F.col('label') == 1) | (F.col('label') == 0))
df_test = df_test.filter((F.col('label') == 1) | (F.col('label') == 0))


24/03/23 16:26:16 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [7]:
# ################## CLASS IMBALANCE ############################
def undersample(df, outcome_col, seed=1234):
  # Split dataset based on outcome
  split0_df = df.filter(F.col(outcome_col) == 0)
  split1_df = df.filter(F.col(outcome_col) == 1)
  # determine which split is major vs minor
  if (split0_df.count() > split1_df.count()):
    major_df = split0_df
    minor_df = split1_df
  else:
    minor_df = split0_df
    major_df = split1_df
  ratio = major_df.count()/minor_df.count()
  print("Ratio of major vs minor before sampling: {}".format(ratio))
  # Start under-sampling with Spark
  sampled_majority_df = major_df.sample(False, 1/ratio, seed)
  combined_df = sampled_majority_df.unionAll(minor_df)
  print(f"Final sample size: {combined_df.count()}")
  return combined_df

# Perform undersampling technique
df_train = undersample(df_train, outcome_col='label')
####################################################################

                                                                                

Ratio of major vs minor before sampling: 33.05456656346749




Final sample size: 5239


                                                                                

In [8]:
df_train = df_train.select("merged_text","label")
df_test = df_test.select("merged_text","label")
(df_trainsplit, df_valsplit) = df_train.randomSplit([0.7, 0.3], seed = 100)

pandas_df_train = df_trainsplit.toPandas()
pandas_df_val = df_valsplit.toPandas()
pandas_df_test = df_test.toPandas()

from sklearn.utils import shuffle
pandas_df_train = shuffle(pandas_df_train)
pandas_df_val = shuffle(pandas_df_val)
pandas_df_test = shuffle(pandas_df_test)


del df_train
del df_valsplit
del df_test

                                                                                

# Data Processing

In [9]:
import tensorflow as tf
import numpy as np

text_column_name = "merged_text"
labels_column_name = "label"
# Extract texts and labels
train_texts = pandas_df_train[text_column_name].values
train_labels = pandas_df_train[labels_column_name].values

val_texts = pandas_df_val[text_column_name].values
val_labels = pandas_df_val[labels_column_name].values

test_texts = pandas_df_test[text_column_name].values
test_labels = pandas_df_test[labels_column_name].values

#keras expects specific dimension for binary, need to adjust the labels dim, extra dimension
train_labels = np.expand_dims(train_labels, axis=-1)
val_labels = np.expand_dims(val_labels, axis=-1)
test_labels = np.expand_dims(test_labels, axis=-1)

# Create TensorFlow Datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_texts, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_texts, val_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_texts, test_labels))

In [10]:
# import pandas as pd
# # Assuming train_labels is a 2D numpy array as indicated by the error
# # train_labels = pandas_df_train[labels_column_name].values
# # Convert the 2D numpy array to 1D
# train_labels_1d = train_labels.ravel()
# # Convert the numpy array to a pandas Series
# labels_series = pd.Series(train_labels_1d)
# # Get the count of unique values
# frequency_count = labels_series.value_counts()
# # Display the frequency count
# print(frequency_count)

In [11]:
!nvidia-smi --query-gpu=memory.total --format=csv,noheader

23028 MiB
23028 MiB
23028 MiB
23028 MiB


In [12]:
import keras_nlp
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Initialize BERT preprocessor and backbone from a preset
# preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_tiny_en_uncased")
# backbone = keras_nlp.models.BertBackbone.from_preset("bert_tiny_en_uncased")
preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_base_en_uncased")
backbone = keras_nlp.models.BertBackbone.from_preset("bert_base_en_uncased")

# Batch the dataset before preprocessing
train_dataset_batched = train_dataset.batch(scaled_batch_size)
val_dataset_batched = val_dataset.batch(scaled_batch_size)
test_dataset_batched = test_dataset.batch(scaled_batch_size)

# Apply preprocessing to the batched dataset
train_dataset_preprocessed = (
    train_dataset_batched.map(
        lambda x, y: (preprocessor(x), y),
        num_parallel_calls=tf.data.AUTOTUNE
    )
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)
val_dataset_preprocessed = (
    val_dataset_batched.map(
        lambda x, y: (preprocessor(x), y),
        num_parallel_calls=tf.data.AUTOTUNE
    )
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

test_dataset_preprocessed = (
    test_dataset_batched.map(
        lambda x, y: (preprocessor(x), y),
        num_parallel_calls=tf.data.AUTOTUNE
    )
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

In [16]:
# import numpy as np

# # Assuming `train_labels` contains your training dataset labels
# counts = np.bincount(train_labels)

# # Display the number of positive samples and the percentage of positive samples
# print("Number of positive samples in training data: {} ({:.2f}% of total)".format(
#     counts[1], 100 * float(counts[1]) / len(train_labels)))

# # Calculate the weights for each class
# weight_for_0 = 1.0 / counts[0]
# weight_for_1 = 1.0 / counts[1]
# pos = 2584
# neg = 85413
# # # # Calculate the weights for each class
# # weight_for_0 = 1.0 / neg
# # weight_for_1 = 1.0 / pos
# weight_for_0 = 0.515
# weight_for_1 = 17

# # Define class weights dictionary
# class_weight = {0: weight_for_0, 1: weight_for_1}
# print(class_weight.items())

# Build Model

In [13]:
METRICS = [
keras.metrics.BinaryCrossentropy(name='cross_entropy'),
            keras.metrics.TruePositives(name='tp'),
            keras.metrics.FalsePositives(name='fp'),
            keras.metrics.TrueNegatives(name='tn'),
            keras.metrics.FalseNegatives(name='fn'),
            keras.metrics.BinaryAccuracy(name='accuracy'),
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall'),
            keras.metrics.AUC(name='auc')
]

In [14]:
import tensorflow as tf
from tensorflow.keras import layers, Model, optimizers
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D, SeparableConv2D

os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
print(os.getenv('TF_GPU_ALLOCATOR'))

cuda_malloc_async


In [17]:
# Make the BERT backbone not trainable to freeze its weights
def make_model(backbone, num_classes=1, metrics=METRICS):
  backbone.trainable = False

  # Define the model architecture
  inputs = backbone.input
  sequence_output = backbone(inputs)["sequence_output"]

  # # Add additional transformer encoders
  # for _ in range(1):
  #     sequence_output = keras_nlp.layers.TransformerEncoder(
  #         num_heads=2,
  #         intermediate_dim=512,
  #         dropout=0.1,
  #     )(sequence_output)

  # Use the [CLS] token output to classify
  cls_output = sequence_output[:, backbone.cls_token_index, :]
  cls_output = layers.Dense(256, activation='relu')(cls_output)
  cls_output = layers.Dropout(0.1)(cls_output)
  # cls_output = layers.Dense(256, activation='relu')(cls_output)
  # cls_output = layers.Dropout(0.1)(cls_output)
  cls_output = layers.Dense(128, activation='relu')(cls_output)
  cls_output = layers.Dropout(0.1)(cls_output)
  outputs = keras.layers.Dense(units = 1, activation='sigmoid')(cls_output)  # Assuming `num_classes` is defined

  # Build the model
  model = keras.Model(inputs, outputs)
  sgd = optimizers.SGD(0.01)

  # Compile the model
  model.compile(
      #optimizer=keras.optimizers.Adam(0.001),
      optimizer=sgd,
      #loss=keras.losses.BinaryCrossentropy(from_logits=True),
      loss= tf.keras.losses.BinaryFocalCrossentropy(apply_class_balancing=True, gamma=2, from_logits=False),
      metrics=METRICS
  )
  return model

# Display the model summary
model = make_model(backbone=backbone, num_classes=1, metrics=METRICS)
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Define the EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_precision', # Monitor validation loss
    min_delta=0.001, # Minimum change to qualify as an improvement
    patience=3, # Number of epochs with no improvement after which training
    verbose=1,
    mode='min', # Stop training when the quantity monitored has stopped dec
    restore_best_weights=False # Restore model weights from the best epoch
)
# callbacks = keras.callbacks.ModelCheckpoint("childgrooming_NN_model_{epoch}.keras", save_best_only=True)
callbacks = keras.callbacks.ModelCheckpoint(
    "best_weights.weights.h5",  # Filename for the saved weights
    save_best_only=True,  # Save only the best model
    save_weights_only=True,  # Save only the weights, not the full model
)
# Train the model
model.fit(
    train_dataset_preprocessed,
    validation_data=val_dataset_preprocessed,
    # batch_size= 1000,
    epochs=2,  # Adjust the number of epochs according to your needs
    #class_weight=class_weight,  # Use class weights
    callbacks=[early_stopping, callbacks]
)

In [1]:
model

NameError: name 'model' is not defined

# Run Validation Set Eval

In [None]:
#predict = model.predict(validation_generator, steps=None, callbacks=None, max_queue_size=10, workers=1, use_multiprocessing=False, verbose=0 )


In [None]:
from sklearn.metrics import classification_report, fbeta_score, confusion_matrix
# Predict probabilities for test data
y_pred_prob = model.predict(val_dataset_preprocessed)
# Convert probabilities to binary labels based on 0.5 threshold
y_pred = np.where(y_pred_prob > 0.5, 1, 0)

# Assuming test_labels are your true binary labels for the test set
# Flatten y_pred to match the shape of test_labels if necessary
y_pred = y_pred.flatten()

# Calculate F_beta score with beta=3
f_beta3_score_test = fbeta_score(val_labels, y_pred, average='binary', beta=3, pos_label=1)
print("F1 Score (w/ Beta =3):", round((f_beta3_score_test*100), 4))


In [None]:
y_pred_prob

In [None]:
print(classification_report(val_labels, y_pred))

# Run Test Set Eval

In [39]:
from sklearn.metrics import fbeta_score
# Predict probabilities for test data
y_pred_prob_test = model.predict(test_dataset_preprocessed)
# Convert probabilities to binary labels based on 0.5 threshold
y_pred_test = np.where(y_pred_prob_test > 0.5, 1, 0)

# Assuming test_labels are your true binary labels for the test set
# Flatten y_pred to match the shape of test_labels if necessary
y_pred_test = y_pred_test.flatten()

# Calculate F_beta score with beta=3
f_beta3_score_test = fbeta_score(test_labels, y_pred_test, average='binary', beta=3, pos_label=1)
print("F1 Score (w/ Beta =3):", round((f_beta3_score_test*100), 4))

[1m2424/2424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m469s[0m 194ms/step
F1 Score (w/ Beta =3): 0.0


In [31]:
np.unique(y_pred_prob_test)

array([0.], dtype=float16)

In [40]:
print(classification_report(test_labels, y_pred_test))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    152233
           1       0.00      0.00      0.00      2895

    accuracy                           0.98    155128
   macro avg       0.49      0.50      0.50    155128
weighted avg       0.96      0.98      0.97    155128



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


which was solved by using more neurons, more layers and adding more dropout. Also lowering learning rate almost always helps.

* less.more layers?
* shuffle the training data
* try batch very small like 1-5
* add more dropout like 0.3
* try with leaky relu instead
   * https://datascience.stackexchange.com/questions/39042/how-to-use-leakyrelu-as-activation-function-in-sequence-dnn-in-keraswhen-it-per
    
    