In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
seed = 18

merged_aspirational_df = pd.read_csv("../../../../data/processed_for_model/merged_themes_using_jaccard_method/merged_Aspirational_sentence_level_batch_1_jaccard.csv", encoding='utf-8')

# Shuffle the merged dataset
merged_aspirational_df = shuffle(merged_aspirational_df, random_state=seed)

# Train-test split 
training_df, test_df = train_test_split(merged_aspirational_df, test_size=0.2, random_state=18, stratify=merged_aspirational_df['label'])

training_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [5]:
print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")
pos_labels = len([n for n in training_df['label'] if n==1])
print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, len(training_df['label']), (pos_labels/len(training_df['label']))*100))
pos_labels = len([n for n in test_df['label'] if n==1])
print("Positive labels present in the test dataset : {}  out of {} or {}%".format(pos_labels, len(test_df['label']), (pos_labels/len(test_df['label']))*100))

Training dataset shape: (3763, 3) 
Test dataset shape: (941, 3)
Positive labels present in the dataset : 470  out of 3763 or 12.490034546904067%
Positive labels present in the test dataset : 118  out of 941 or 12.539851222104145%


In [6]:
training_df

Unnamed: 0,sentence,label,phrase
0,this year has been extremely long and challeng...,0,"['So, im here to complete school and walk acro..."
1,i am here at sfsu to gain the knowledge and ex...,1,['I am here at SFSU to gain the knowledge and ...
2,i am taking this sci 115 course so it can help...,0,['Chem 115 is a requirement for me because I a...
3,i have never left the country too which i hate...,0,['I am here because the only thing i want fo b...
4,"with that in mind, i am taking this course so ...",0,['planning on declaring a concentration in zoo...
...,...,...,...
3758,i want to be able to help my family out with p...,0,['I want to get a job in the film industry. Im...
3759,i hope to graduate with my degree and run for ...,0,['I am here because I want to be as educated a...
3760,i am here to learn about physics that are rela...,1,['I am here to learn about physics that are re...
3761,im here to be a better student and a better ve...,1,"[""I'm here to be a better student and a better..."


In [7]:
MAXLEN = 128


In [8]:
X = training_df['sentence']
y = training_df['label']

In [9]:
import ktrain
from ktrain import text

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 18, stratify=y)

model_name = 'distilbert-base-cased'

distillbert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0,1])
training_set = distillbert_transformer.preprocess_train(X_train.tolist(), y_train.tolist())
validation_set = distillbert_transformer.preprocess_test(X_test.tolist(), y_test.tolist())
distillbert_base_model = distillbert_transformer.get_classifier()


Metal device set to: Apple M2 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

preprocessing train...
language: en
train sequence lengths:
	mean : 20
	95percentile : 39
	99percentile : 55


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 21
	95percentile : 39
	99percentile : 54


In [10]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Define classes and class labels
classes = np.array([0, 1])
class_labels = list(training_df.label)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)

# Print class weights
print(class_weights)

class_weights = {0:0.5713635,1:4.00319149}

[0.5713635  4.00319149]


In [11]:
import tensorflow as tf
import numpy as np
import os
import random
def reset_random_seeds(seed=2):
    os.environ['PYTHONHASHSEED']=str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

In [30]:
# import tensorflow as tf
# from sklearn.metrics import precision_recall_curve, average_precision_score
# import numpy as np

# # Custom PR AUC function using scikit-learn
# def pr_auc_sklearn(y_true, y_pred):
#     if len(y_true.shape) > 1 and y_true.shape[1] > 1:  # Convert one-hot labels to single class
#         y_true = np.argmax(y_true, axis=1)
#     return average_precision_score(y_true, y_pred, average='macro')  # For multilabel

# # Wrap sklearn PR AUC in tf.py_function
# def pr_auc(y_true, y_pred):
#     return tf.py_function(pr_auc_sklearn, [y_true, y_pred], tf.float32)

# # Custom Keras metric for PR AUC
# class PRAUC(tf.keras.metrics.Metric):
#     def __init__(self, name='pr_auc', **kwargs):
#         super().__init__(name=name, **kwargs)
#         self.pr_auc_val = self.add_weight(name='pr_auc_val', initializer='zeros')

#     def update_state(self, y_true, y_pred, sample_weight=None):
#         y_pred = tf.argmax(y_pred, axis=-1) if y_pred.shape[-1] > 1 else tf.round(y_pred)
#         self.pr_auc_val.assign(pr_auc(y_true, y_pred))

#     def result(self):
#         return self.pr_auc_val

#     def reset_states(self):
#         self.pr_auc_val.assign(0.0)


In [41]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Define classes and class labels
classes = np.array([0, 1])
class_labels = list(training_df.label)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)

# Print class weights
print(class_weights)

class_weights = dict(zip(classes, class_weights))


[0.5713635  4.00319149]


In [53]:
# Build BERT model
# model = text.text_classifier('distilbert', train_data=(X_train, y_train), preproc=distillbert_transformer)
learner = ktrain.get_learner(distillbert_base_model, train_data=training_set, val_data=validation_set, batch_size=6)

# # Pass the custom metric (PRAUC) to the learner
# learner.model.compile(
#     optimizer='adam',
#     loss='binary_crossentropy',
#     # metrics=[PRAUC()]
# )

# Train the model
learner.autofit(2e-5, early_stopping=4, class_weight=class_weights)

reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 2e-05...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 00003: Reducing Max LR on Plateau: new max lr will be 1e-05 (if not early_stopping).
Epoch 4/1024
Epoch 5/1024
Epoch 00005: Reducing Max LR on Plateau: new max lr will be 5e-06 (if not early_stopping).
Restoring model weights from the end of the best epoch: 1.
Epoch 5: early stopping
Weights from best epoch have been loaded into model.


<keras.src.callbacks.History at 0x5bbe12d60>

In [48]:
learner.validate(class_names=distillbert_transformer.get_classes())

              precision    recall  f1-score   support

           0       0.88      1.00      0.93       659
           1       0.25      0.93      0.40        94

   micro avg       0.68      0.99      0.81       753
   macro avg       0.56      0.96      0.67       753
weighted avg       0.80      0.99      0.87       753
 samples avg       0.76      0.99      0.84       753





In [49]:
distillbert_predictor = ktrain.get_predictor(learner.model, preproc=distillbert_transformer)

distillbert_test_data = test_df['sentence'].tolist()
distillbert_test_label = test_df['label'].tolist()

y_pred_distillbert = distillbert_predictor.predict(distillbert_test_data)

y_pred_distillbert = [np.argmax(pair[1]) for pair in y_pred_distillbert]


tn, fp, fn, tp = confusion_matrix(distillbert_test_label, y_pred_distillbert).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

print('  Classification Report:\n',classification_report(distillbert_test_label,y_pred_distillbert),'\n')

True Negative: 823, False Positive: 0, False Negative: 118, True Positive: 0
  Classification Report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       823
           1       0.00      0.00      0.00       118

    accuracy                           0.87       941
   macro avg       0.44      0.50      0.47       941
weighted avg       0.76      0.87      0.82       941
 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


True Negative: 681, False Positive: 142, False Negative: 40, True Positive: 78
  Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.83      0.88       823
           1       0.35      0.66      0.46       118

    accuracy                           0.81       941
   macro avg       0.65      0.74      0.67       941
weighted avg       0.87      0.81      0.83       941

In [32]:
distillbert_predictor.save('../../../../saved_models/aspirational_bert_base_cased_model_08012024_v1') # 0.67 

In [5]:
learner.validate(class_names=distillbert_transformer.get_classes())

              precision    recall  f1-score   support

    negative       0.98      0.95      0.97      1440
    positive       0.71      0.88      0.78       185

    accuracy                           0.94      1625
   macro avg       0.85      0.91      0.88      1625
weighted avg       0.95      0.94      0.95      1625



array([[1373,   67],
       [  23,  162]])