# Aspirational DistilBERT Model Using Merged Data Experiment 1.1

In [1]:
import ktrain
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from ktrain import text
import random
import warnings
from sklearn.utils import shuffle

# Set random seed
random.seed(18)
seed = 18

# Ignore warnings
warnings.filterwarnings('ignore')

# Display options
pd.set_option('display.max_colwidth', None)

## 1. Loading the data and quick exploratory data analysis

In [3]:
merged_aspirational_df = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/merged_Aspirational_sentence_level_batch_1_jaccard.csv", encoding='utf-8')

# Shuffle the merged dataset
merged_aspirational_df = shuffle(merged_aspirational_df, random_state=seed)

# Train-test split 
training_df, test_df = train_test_split(merged_aspirational_df, test_size=0.1, random_state=18, stratify=merged_aspirational_df['label'])

training_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

## 2. Experimental Design

In [4]:
MAXLEN = 150

In [5]:
X = training_df['sentence']
y = training_df['label']

In [9]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 18, stratify=y)

In [6]:
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt

ModuleNotFoundError: No module named 'keras_tuner'

In [18]:
import keras_tuner as kt

# Preprocess the text data
t = text.Transformer('distilbert-base-uncased', maxlen=512, classes=np.unique(y))
train_data = t.preprocess_train(X_train.tolist(), y_train.tolist())
val_data = t.preprocess_test(X_test.tolist(), y_test.tolist())
# Create the model
model = t.get_classifier()

# Define hyperparameter search space
search_space = {
    'epochs': [1, 2, 3, 4, 5],
    'batch_size': [8, 16, 32, 64],
    'learning_rate': [1e-5, 3e-5, 5e-5],
}

# Initialize the Ktrain Tuner
tuner = kt.Hyperband(model,
                               train_data,
                               val_data,
                               search_space,
                               max_epochs=10,
                               patience=3,
                               min_epochs=1)

# Start the tuning process
tuner.fit()

# Retrieve the best hyperparameters
best_hyperparams = tuner.get_best_hyperparameters()
print("Best Hyperparameters: ", best_hyperparams)


ModuleNotFoundError: No module named 'keras_tuner'

In [6]:
model_name = 'distilbert-base-uncased'

In [10]:
distillbert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0,1])
training_set = distillbert_transformer.preprocess_train(X_train.tolist(), y_train.tolist())
validation_set = distillbert_transformer.preprocess_test(X_test.tolist(), y_test.tolist())
distillbert_base_model = distillbert_transformer.get_classifier()

Metal device set to: Apple M2 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

preprocessing train...
language: en
train sequence lengths:
	mean : 21
	95percentile : 39
	99percentile : 53


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 20
	95percentile : 38
	99percentile : 63


In [11]:
distillbert_learner = ktrain.get_learner(distillbert_base_model, train_data=training_set, val_data=validation_set, batch_size=6)

In [12]:
distillbert_learner.autofit(2e-5, 5)



begin training using triangular learning rate policy with max lr of 2e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x3a63113d0>

In [13]:
distillbert_learner.validate(class_names=distillbert_transformer.get_classes())

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       741
           1       0.35      0.28      0.31       106

    accuracy                           0.84       847
   macro avg       0.62      0.60      0.61       847
weighted avg       0.83      0.84      0.84       847



array([[685,  56],
       [ 76,  30]])

In [14]:
distillbert_learner.model.summary()

Model: "tf_distil_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_39 (Dropout)        multiple                  0         
                                                                 
Total params: 66955010 (255.41 MB)
Trainable params: 66955010 (255.41 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
distillbert_predictor = ktrain.get_predictor(distillbert_learner.model, preproc=distillbert_transformer)

In [16]:
distillbert_test_data = test_df['sentence'].tolist()
distillbert_test_label = test_df['label'].tolist()

In [17]:
y_pred_distillbert = distillbert_predictor.predict(distillbert_test_data)

In [18]:
y_pred_distillbert = [int(x) for x in y_pred_distillbert]

In [19]:
tn, fp, fn, tp = confusion_matrix(distillbert_test_label, y_pred_distillbert).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

True Negative: 362, False Positive: 50, False Negative: 35, True Positive: 24


In [20]:
print('  Classification Report:\n',classification_report(distillbert_test_label,y_pred_distillbert),'\n')

  Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.88      0.89       412
           1       0.32      0.41      0.36        59

    accuracy                           0.82       471
   macro avg       0.62      0.64      0.63       471
weighted avg       0.84      0.82      0.83       471
 



In [24]:
# distillbert_predictor.save('../../model/first_generation_distilbert_base_uncased_model_10102020') # 256 MB

In [21]:
print("AUC roc score for distillbert model: ", roc_auc_score(distillbert_test_label,y_pred_distillbert))

AUC roc score for distillbert model:  0.6427102188579892
