# Familial DistilBERT Model Using Merged Data Batch 1 + Batch 2 Experiment 2

In [4]:
import ktrain
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from ktrain import text
import random
from sklearn.model_selection import StratifiedKFold
import warnings
from sklearn.utils import shuffle

# Set random seed
random.seed(18)
seed = 18

# Ignore warnings
warnings.filterwarnings('ignore')

# Display options
pd.set_option('display.max_colwidth', None)

## 1. Loading the data and quick exploratory data analysis

In [5]:
merged_familial_df_batch_1 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/merged_Familial_sentence_level_batch_1_jaccard.csv", encoding='utf-8')
merged_familial_df_batch_2 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/Familial Plus_sentence_level_batch_2_jaccard.csv", encoding='utf-8')

merged_familial_df = pd.concat([merged_familial_df_batch_1, merged_familial_df_batch_2])

# Shuffle the merged dataset
merged_familial_df = shuffle(merged_familial_df, random_state=seed)

# Train-test split 
training_df, test_df = train_test_split(merged_familial_df, test_size=0.1, random_state=42, stratify=merged_familial_df['label'])

training_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [6]:
print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")
pos_labels = len([n for n in training_df['label'] if n==1])
print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, len(training_df['label']), (pos_labels/len(training_df['label']))*100))
pos_labels = len([n for n in test_df['label'] if n==1])
print("Positive labels present in the test dataset : {}  out of {} or {}%".format(pos_labels, len(test_df['label']), (pos_labels/len(test_df['label']))*100))

Training dataset shape: (2380, 3) 
Test dataset shape: (265, 3)
Positive labels present in the dataset : 228  out of 2380 or 9.57983193277311%
Positive labels present in the test dataset : 25  out of 265 or 9.433962264150944%


In [7]:
print(training_df.shape)
print(test_df.shape)

(2380, 3)
(265, 3)


## 2. Experimental Design

In [8]:
MAXLEN = 150

In [9]:
X = training_df['sentence']
y = training_df['label']

In [10]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 18, stratify=y)
X_test.shape

(238,)

In [11]:
model_name = 'bert-base-uncased'

In [12]:
best_val_loss = np.inf  # Initialize with a very large value for minimum loss
best_val_acc = 0  # Initialize with a very low accuracy
best_model = None  # Placeholder to store the best model

In [14]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import gc

# Define classes and class labels
classes = np.array([0, 1])
class_labels = list(training_df.label)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)

# Print class weights
print(class_weights)

dict(zip(classes, class_weights))


[0.55297398 5.21929825]


{0: 0.5529739776951673, 1: 5.219298245614035}

In [15]:
# Number of folds for cross-validation
n_folds = 5
MAXLEN = 150
model_name = 'distilbert-base-uncased'

# Initialize stratified K-Fold
skf = StratifiedKFold(n_splits=n_folds)

# Placeholder for results
cv_results = []

# Loop over each fold
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1}/{n_folds}")

    # Split the data into training and validation sets using .iloc
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # Convert to list format (if they are pandas Series)
    X_train_fold = X_train_fold.tolist()
    X_val_fold = X_val_fold.tolist()
    y_train_fold = y_train_fold.tolist()
    y_val_fold = y_val_fold.tolist()

    # Define classes and class labels
    classes = np.array([0, 1])
    class_labels = list(training_df.label)

    # Compute class weights
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)


    # Initialize the distillbert transformer for this fold
    distillbert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0, 1])

    # Preprocess the training and validation sets for the current fold
    training_set = distillbert_transformer.preprocess_train(X_train_fold, y_train_fold)
    validation_set = distillbert_transformer.preprocess_test(X_val_fold, y_val_fold)

    # Build the model
    distillbert_base_model = distillbert_transformer.get_classifier()

    # Initialize the learner
    distilbert_learner = ktrain.get_learner(
        distillbert_base_model, train_data=training_set, val_data=validation_set, batch_size=6
    )

    # Apply weight decay
    distilbert_learner.set_weight_decay(0.001)

    # Train the model with autofit and early stopping
    distilbert_learner.autofit(2e-5, 2, class_weight=dict(zip(classes, class_weights)))

    distilbert_learner.validate(class_names=distillbert_transformer.get_classes())


    # Access the validation loss and accuracy from the last epoch
    val_loss = distilbert_learner.history.history['val_loss'][-1]
    val_acc = distilbert_learner.history.history['val_accuracy'][-1]

    # Store the result for this fold
    print(f"Fold {fold+1} - Validation Accuracy: {val_acc:.4f}, Validation Loss: {val_loss:.4f}")
    cv_results.append((val_loss, val_acc))

    

    # Update the best model if the current one has a lower validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_val_acc = val_acc
        best_model = distilbert_learner.model  # Save the best model

    gc.collect()
    del distilbert_learner.model

# After all folds are done, compute mean and std of the performance
mean_val_acc = np.mean([x[1] for x in cv_results])
std_val_acc = np.std([x[1] for x in cv_results])

mean_val_loss = np.mean([x[0] for x in cv_results])
std_val_loss = np.std([x[0] for x in cv_results])

print(f"Mean Validation Accuracy: {mean_val_acc:.4f} (+/- {std_val_acc:.4f})")
print(f"Mean Validation Loss: {mean_val_loss:.4f} (+/- {std_val_loss:.4f})")

# Print the best model's validation performance
print(f"Best Model - Validation Accuracy: {best_val_acc:.4f}, Validation Loss: {best_val_loss:.4f}")

Fold 1/5
Metal device set to: Apple M2 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

preprocessing train...
language: en
train sequence lengths:
	mean : 21
	95percentile : 42
	99percentile : 58


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 21
	95percentile : 40
	99percentile : 58




begin training using triangular learning rate policy with max lr of 2e-05...
Epoch 1/2
Epoch 2/2
              precision    recall  f1-score   support

           0       1.00      0.81      0.89       431
           1       0.35      0.98      0.51        45

    accuracy                           0.83       476
   macro avg       0.67      0.89      0.70       476
weighted avg       0.94      0.83      0.86       476

Fold 1 - Validation Accuracy: 0.8256, Validation Loss: 0.3721
Fold 2/5
preprocessing train...
language: en
train sequence lengths:
	mean : 21
	95percentile : 41
	99percentile : 57


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 22
	95percentile : 44
	99percentile : 58




begin training using triangular learning rate policy with max lr of 2e-05...
Epoch 1/2


In [52]:
import os
from datetime import datetime

# Get current time
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Create 'performance' folder if it doesn't exist
if not os.path.exists('performance'):
    os.makedirs('performance')

# Evaluate the best model on the holdout set
distillbert_test_data = test_df['sentence'].tolist()
distillbert_test_label = test_df['label'].tolist()

# Preprocess the holdout data
holdout_set = distillbert_transformer.preprocess_test(distillbert_test_data, distillbert_test_label)

# Create a new learner for the best model and evaluate it on the holdout set
best_predictor = ktrain.get_predictor(best_model, preproc=distillbert_transformer)


# Print the confusion matrix and classification report for the best model on the holdout set
y_pred_distillbert = best_predictor.predict(distillbert_test_data)
y_pred_distillbert = [int(x) for x in y_pred_distillbert]

# Classification report and confusion matrix for holdout set
holdout_report = classification_report(distillbert_test_label, y_pred_distillbert)
print(holdout_report)
tn, fp, fn, tp = confusion_matrix(distillbert_test_label, y_pred_distillbert).ravel()
holdout_matrix = 'True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp)

print(f"Holdout Set - Classification Report:\n{holdout_report}")
print(holdout_matrix)

# Filename for metrics
filename = 'performance/metrics.txt'  # Using a fixed filename to append to

# Save classification report and confusion matrix for holdout set
# Open file in append mode; it will create the file if it doesn't exist
with open(filename, 'a') as f:
    f.write("\n\n")
    f.write(current_time)
    f.write("\n\n")
    f.write("Holdout Set - Classification Report:\n")
    f.write(str(holdout_report))
    f.write("\n\n")
    f.write("Holdout Set - Confusion Matrix:\n")
    f.write(holdout_matrix)


preprocessing test...
language: en
test sequence lengths:
	mean : 21
	95percentile : 43
	99percentile : 64


              precision    recall  f1-score   support

           0       0.91      1.00      0.95       240
           1       0.00      0.00      0.00        25

    accuracy                           0.91       265
   macro avg       0.45      0.50      0.48       265
weighted avg       0.82      0.91      0.86       265

Holdout Set - Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       240
           1       0.00      0.00      0.00        25

    accuracy                           0.91       265
   macro avg       0.45      0.50      0.48       265
weighted avg       0.82      0.91      0.86       265

True Negative: 240, False Positive: 0, False Negative: 25, True Positive: 0


In [10]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Define classes and class labels
classes = np.array([0, 1])
class_labels = list(training_df.label)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)

# Print class weights
print(class_weights)


[0.55297398 5.21929825]


In [11]:
# Build BERT model
# model = text.text_classifier('distilbert', train_data=(X_train, y_train), preproc=distillbert_transformer)
distillbert_learner = ktrain.get_learner(distillbert_base_model, train_data=training_set, val_data=validation_set, batch_size=6)
# learner.fit_onecycle(2e-5, 4, class_weight=class_weights)
# learner.autofit(2.27E-06, early_stopping=4)
distillbert_learner.set_weight_decay(0.001)
distillbert_learner.autofit(2e-5, early_stopping=4)
# distillbert_learner.set_weight_decay(0.001)
# distillbert_learner.autofit(2.27E-06, early_stopping=4, class_weight=class_weights)

reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 2e-05...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 5/1024
Epoch 6/1024
Epoch 7/1024
Epoch 8/1024
Epoch 00008: Reducing Max LR on Plateau: new max lr will be 1e-05 (if not early_stopping).
Epoch 9/1024
Epoch 10/1024
Epoch 00010: Reducing Max LR on Plateau: new max lr will be 5e-06 (if not early_stopping).
Restoring model weights from the end of the best epoch: 6.
Epoch 10: early stopping
Weights from best epoch have been loaded into model.


<keras.src.callbacks.History at 0x359ddb7c0>

In [14]:
distillbert_learner.validate(class_names=distillbert_transformer.get_classes())

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       215
           1       0.62      0.70      0.65        23

    accuracy                           0.93       238
   macro avg       0.79      0.82      0.81       238
weighted avg       0.93      0.93      0.93       238



array([[205,  10],
       [  7,  16]])

In [16]:
distillbert_learner.model.summary()

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
distillbert_predictor = ktrain.get_predictor(distillbert_learner.model, preproc=distillbert_transformer)

In [18]:
distillbert_test_data = test_df['sentence'].tolist()
distillbert_test_label = test_df['label'].tolist()

In [19]:
y_pred_distillbert = distillbert_predictor.predict(distillbert_test_data)

In [20]:
y_pred_distillbert = [int(x) for x in y_pred_distillbert]

In [21]:
tn, fp, fn, tp = confusion_matrix(distillbert_test_label, y_pred_distillbert).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

True Negative: 229, False Positive: 11, False Negative: 19, True Positive: 6


In [22]:
print('  Classification Report:\n',classification_report(distillbert_test_label,y_pred_distillbert),'\n')

  Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.95      0.94       240
           1       0.35      0.24      0.29        25

    accuracy                           0.89       265
   macro avg       0.64      0.60      0.61       265
weighted avg       0.87      0.89      0.88       265
 



In [24]:
# distillbert_predictor.save('../../model/first_generation_distilbert_base_uncased_model_10102020') # 256 MB

In [23]:
print("AUC roc score for distillbert model: ", roc_auc_score(distillbert_test_label,y_pred_distillbert))

AUC roc score for distillbert model:  0.5970833333333333
