# Aspirational DistilBERT Model Using Single Batch 1 + Batch 2

In [2]:
import ktrain
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from ktrain import text
import random
import warnings
from sklearn.utils import shuffle

# Set random seed
random.seed(18)
seed = 18

# Ignore warnings
warnings.filterwarnings('ignore')

# Display options
pd.set_option('display.max_colwidth', None)

## 1. Loading the data and quick exploratory data analysis

In [9]:
import os

# Define the folder path and themes
folder_path = '/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method'
themes = [
    'Aspirational', 'Attainment', 'Community Consciousness', 'Familial', 'Filial Piety', 
    'First Gen', 'Navigational', 'Perseverance', 'Resistance', 'Social', 'Spiritual'
]

# Initialize an empty dictionary to store DataFrames
batch_1_theme_dataframes = {}
# Loop through each theme and load its corresponding file
for theme in themes:
    # Construct the filename without modifying the theme name
    file_name = f"{theme}_sentence_level_batch_1_jaccard.csv"
    file_path = os.path.join(folder_path, file_name)
    
    # Check if the file exists before attempting to load
    if os.path.exists(file_path):
        batch_1_theme_dataframes[theme] = pd.read_csv(file_path)
        print(f"Loaded {file_name}")
    else:
        print(f"File not found for theme: {theme}")

# Define the folder path and themes
folder_path = '/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/batch_2'
themes = [
    'Aspirational', 'Attainment', 'Community Consciouss', 'Familial', 'Filial Piety', 
    'First Generation', 'Navigational', 'Perseverance', 'Resistance', 'Social', 'Spiritual'
]

# Initialize an empty dictionary to store DataFrames
batch_2_theme_dataframes = {}
# Loop through each theme and load its corresponding file
for theme in themes:
    # Construct the filename without modifying the theme name
    file_name = f"{theme}_sentence_level_batch_2_jaccard.csv"
    file_path = os.path.join(folder_path, file_name)
    
    # Check if the file exists before attempting to load
    if os.path.exists(file_path):
        batch_2_theme_dataframes[theme] = pd.read_csv(file_path)
        print(f"Loaded {file_name}")
    else:
        print(f"File not found for theme: {theme}")

Loaded Aspirational_sentence_level_batch_1_jaccard.csv
Loaded Attainment_sentence_level_batch_1_jaccard.csv
Loaded Community Consciousness_sentence_level_batch_1_jaccard.csv
Loaded Familial_sentence_level_batch_1_jaccard.csv
Loaded Filial Piety_sentence_level_batch_1_jaccard.csv
Loaded First Gen_sentence_level_batch_1_jaccard.csv
Loaded Navigational_sentence_level_batch_1_jaccard.csv
Loaded Perseverance_sentence_level_batch_1_jaccard.csv
Loaded Resistance_sentence_level_batch_1_jaccard.csv
Loaded Social_sentence_level_batch_1_jaccard.csv
Loaded Spiritual_sentence_level_batch_1_jaccard.csv
Loaded Aspirational_sentence_level_batch_2_jaccard.csv
Loaded Attainment_sentence_level_batch_2_jaccard.csv
Loaded Community Consciouss_sentence_level_batch_2_jaccard.csv
Loaded Familial_sentence_level_batch_2_jaccard.csv
Loaded Filial Piety_sentence_level_batch_2_jaccard.csv
Loaded First Generation_sentence_level_batch_2_jaccard.csv
Loaded Navigational_sentence_level_batch_2_jaccard.csv
Loaded Persev

In [112]:
aspirational_df_batch_1 = batch_1_theme_dataframes["Attainment"]
aspirational_df_batch_2 = batch_2_theme_dataframes["Attainment"]


merged_aspirational_df = pd.concat([aspirational_df_batch_1, aspirational_df_batch_2])

# Shuffle the merged dataset
merged_aspirational_df = shuffle(merged_aspirational_df, random_state=seed)

# Train-test split 
training_df, test_df = train_test_split(merged_aspirational_df, test_size=0.1, random_state=42, stratify=merged_aspirational_df['label'])

training_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)



In [113]:
print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")
pos_labels = len([n for n in training_df['label'] if n==1])
print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, len(training_df['label']), (pos_labels/len(training_df['label']))*100))
pos_labels = len([n for n in test_df['label'] if n==1])
print("Positive labels present in the test dataset : {}  out of {} or {}%".format(pos_labels, len(test_df['label']), (pos_labels/len(test_df['label']))*100))

Training dataset shape: (4151, 3) 
Test dataset shape: (462, 3)
Positive labels present in the dataset : 295  out of 4151 or 7.106721271982655%
Positive labels present in the test dataset : 33  out of 462 or 7.142857142857142%


In [114]:
training_df

Unnamed: 0,sentence,label,phrase
0,"after working minimum wage part time jobs while in community college, i've learned that there is no way i can do that for the rest of my life.",0,['The bigger reason for why I am here as SF State is to get my degree to get a hopefully get a decent to well paying job when I graduate.']
1,i struggle in day to day life and classes because of the grief.,0,"['To be quite honest, this class is necessary for my degree and most likely to get into medical school. Its a shallow answer, but if I had to elaborate then I would say that this class is a step towards becoming a doctor. Ive wanted to become a doctor for as long as medical books have existed in libraries.']"
2,"it here to learn new things, meet new people, and to be able to learn the culture and how is the america dream that a lot of people from the different country dream of.",0,"['I am here to pursue my career of becoming an Electrical Engineering, and physics is one of the most important parts of it.']"
3,jordaan render phys 102 reflection 1 why am i here?,0,"['I am here because I want to receive a quality education around a new group of people and further my academic career as well as personal character development. I would like to use my background in a way that allows me to me travel and connect with many different people from differing cultures and really gain a broader view on life. My current path is to become a Physical Therapist because I enjoy communicating with and helping people and as a PT, I will have many opportunities to travel for work.']"
4,i am the first in my family to attempt to get into med school.,0,"['I am here because of my goal in trying to get a B.S. in biology concentrated physiology.', 'Im here to become a doctor.']"
...,...,...,...
4146,i was not sure if i wanted to take a sci class but i have never taken a physics class in my life so i knew that this class was supposed to help me out.,0,"[""I decided to take this because I want to be able to pass my physics class so I won't get farther behind in completing my major. The reason for wanting to work so hard is because I want to try my best to finish college in 4 years, even though I know that it is pretty much 5 years now. I am also taking this course so I can be able to go to PA school and this class is a requirement for my major which will get me into PA school.""]"
4147,on the other hand my fantasy football team is pretty good.,0,['I am here because I need it to move on to my other classes and hopefully in the long run become a successful engineer.']
4148,physics has always interested me because i am someone who is looking at how things work.,0,"['In addition, I want to become an engineer someday']"
4149,life works in mysterious ways and mannn am i learning a whole lot.,0,['I want to go to medical school']


In [138]:
from sklearn.utils import resample
import pandas as pd

# Assuming `training_df` already exists and has a 'label' column

# Check original class distribution
print("Original class distribution:")
print(training_df['label'].value_counts())

# Set target distribution
target_minority_ratio = 0.18

  # 20%
target_majority_ratio = 0.85  # 80%

# Separate majority and minority classes
minority_class = training_df[training_df['label'] == 1]
majority_class = training_df[training_df['label'] == 0]

# Use all samples from the minority class
target_minority_count = len(minority_class)

# Calculate the maximum possible number of samples for the majority class
target_majority_count = min(len(majority_class), int(target_minority_count / target_minority_ratio * target_majority_ratio))

# Debug information
print(f"Minority samples: {target_minority_count}, "
      f"Target majority count: {target_majority_count}, "
      f"Available majority samples: {len(majority_class)}")

# Perform undersampling
if target_majority_count < len(majority_class):
    undersampled_majority = resample(
        majority_class,
        replace=False,  # without replacement
        n_samples=target_majority_count,
        random_state=42
    )
else:
    print("Target majority count exceeds available samples. Using all majority samples.")
    undersampled_majority = majority_class

# Combine the resampled majority class with the full minority class
training_df = pd.concat([minority_class, undersampled_majority]).sample(frac=1, random_state=42).reset_index(drop=True)

# Check new class distribution
print("New class distribution:")
print(training_df['label'].value_counts())

Original class distribution:
label
0    3856
1     846
Name: count, dtype: int64
Minority samples: 846, Target majority count: 1973, Available majority samples: 3856
New class distribution:
label
0    1973
1     846
Name: count, dtype: int64


In [118]:
# Assuming `training_df` is your dataset
training_df = oversample_minority(
    training_df=training_df,
    minority_label=1,  # The label of the minority class
    majority_label=0,  # The label of the majority class
    target_minority_ratio=0.18  # Target ratio for the minority class
)

Minority samples: 295, Target minority count: 846, Available majority samples: 3856
New class distribution:
label
0    3856
1     846
Name: count, dtype: int64


In [119]:
print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")
pos_labels = len([n for n in training_df['label'] if n==1])
print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, len(training_df['label']), (pos_labels/len(training_df['label']))*100))
pos_labels = len([n for n in test_df['label'] if n==1])
print("Positive labels present in the test dataset : {}  out of {} or {}%".format(pos_labels, len(test_df['label']), (pos_labels/len(test_df['label']))*100))

Training dataset shape: (4702, 3) 
Test dataset shape: (462, 3)
Positive labels present in the dataset : 846  out of 4702 or 17.99234368353892%
Positive labels present in the test dataset : 33  out of 462 or 7.142857142857142%


In [120]:
print(training_df.shape)
print(test_df.shape)

(4702, 3)
(462, 3)


## 2. Experimental Design

(471,)

In [125]:
MAXLEN = 150

X = training_df['sentence']
y = training_df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 18, stratify=y)
X_test.shape

model_name = 'bert-base-uncased'

distillbert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0,1])
training_set = distillbert_transformer.preprocess_train(X_train.tolist(), y_train.tolist())
validation_set = distillbert_transformer.preprocess_test(X_test.tolist(), y_test.tolist())
distillbert_base_model = distillbert_transformer.get_classifier()

preprocessing train...
language: en
train sequence lengths:
	mean : 21
	95percentile : 41
	99percentile : 58


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 22
	95percentile : 42
	99percentile : 60


In [126]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Define classes and class labels
classes = np.array([0, 1])
class_labels = list(training_df.label)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)

# Print class weights
print(class_weights)

class_weights = dict(zip(classes, class_weights))

[0.60969917 2.77895981]


In [127]:
# Build BERT model
# model = text.text_classifier('distilbert', train_data=(X_train, y_train), preproc=distillbert_transformer)
distillbert_learner = ktrain.get_learner(distillbert_base_model, train_data=training_set, val_data=validation_set, batch_size=16)
# learner.fit_onecycle(2e-5, 4, class_weight=class_weights)
# learner.autofit(2.27E-06, early_stopping=4)
# distillbert_learner.set_weight_decay(0.001)
distillbert_learner.autofit(0.000012, early_stopping=4, class_weight=class_weights)
# distillbert_learner.set_weight_decay(0.001)
# distillbert_learner.autofit(2.27E-06, early_stopping=4, class_weight=class_weights)

reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 1.2e-05...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 5/1024
Epoch 6/1024
Epoch 00006: Reducing Max LR on Plateau: new max lr will be 6e-06 (if not early_stopping).
Epoch 7/1024
Epoch 8/1024
Epoch 00008: Reducing Max LR on Plateau: new max lr will be 3e-06 (if not early_stopping).
Restoring model weights from the end of the best epoch: 4.
Epoch 8: early stopping
Weights from best epoch have been loaded into model.


<keras.src.callbacks.History at 0x4477a3280>

In [128]:
distillbert_learner.validate(class_names=distillbert_transformer.get_classes())

              precision    recall  f1-score   support

           0       0.99      0.90      0.95       386
           1       0.69      0.96      0.80        85

    accuracy                           0.92       471
   macro avg       0.84      0.93      0.87       471
weighted avg       0.94      0.92      0.92       471



array([[349,  37],
       [  3,  82]])

In [129]:
distillbert_learner.model.summary()

Model: "tf_bert_for_sequence_classification_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_303 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [130]:
distillbert_predictor = ktrain.get_predictor(distillbert_learner.model, preproc=distillbert_transformer)

In [131]:
distillbert_test_data = test_df['sentence'].tolist()
distillbert_test_label = test_df['label'].tolist()

In [132]:
y_pred_distillbert = distillbert_predictor.predict(distillbert_test_data)

In [133]:
y_pred_distillbert = [int(x) for x in y_pred_distillbert]

In [134]:
tn, fp, fn, tp = confusion_matrix(distillbert_test_label, y_pred_distillbert).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

True Negative: 391, False Positive: 38, False Negative: 11, True Positive: 22


In [135]:
print('  Classification Report:\n',classification_report(distillbert_test_label,y_pred_distillbert),'\n')

  Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.91      0.94       429
           1       0.37      0.67      0.47        33

    accuracy                           0.89       462
   macro avg       0.67      0.79      0.71       462
weighted avg       0.93      0.89      0.91       462
 



In [24]:
# distillbert_predictor.save('../../model/first_generation_distilbert_base_uncased_model_10102020') # 256 MB

In [21]:
print("AUC roc score for distillbert model: ", roc_auc_score(distillbert_test_label,y_pred_distillbert))

AUC roc score for distillbert model:  0.6427102188579892
