# Aspirational DistilBERT Model Using Single Batch 1 + Batch 2

In [1]:
import ktrain
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from ktrain import text
import random
import warnings
from sklearn.utils import shuffle

# Set random seed
random.seed(18)
seed = 18

# Ignore warnings
warnings.filterwarnings('ignore')

# Display options
pd.set_option('display.max_colwidth', None)

## 1. Loading the data and quick exploratory data analysis

In [3]:
import os

# Define the folder path and themes
folder_path = '/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method'
themes = [
    'Aspirational', 'Attainment', 'Community Consciousness', 'Familial', 'Filial Piety', 
    'First Gen', 'Navigational', 'Perseverance', 'Resistance', 'Social', 'Spiritual'
]

# Initialize an empty dictionary to store DataFrames
batch_1_theme_dataframes = {}
# Loop through each theme and load its corresponding file
for theme in themes:
    # Construct the filename without modifying the theme name
    file_name = f"{theme}_sentence_level_batch_1_jaccard.csv"
    file_path = os.path.join(folder_path, file_name)
    
    # Check if the file exists before attempting to load
    if os.path.exists(file_path):
        batch_1_theme_dataframes[theme] = pd.read_csv(file_path)
        print(f"Loaded {file_name}")
    else:
        print(f"File not found for theme: {theme}")

# Define the folder path and themes
folder_path = '/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/batch_2'
themes = [
    'Aspirational', 'Attainment', 'Community Consciouss', 'Familial', 'Filial Piety', 
    'First Generation', 'Navigational', 'Perseverance', 'Resistance', 'Social', 'Spiritual'
]

# Initialize an empty dictionary to store DataFrames
batch_2_theme_dataframes = {}
# Loop through each theme and load its corresponding file
for theme in themes:
    # Construct the filename without modifying the theme name
    file_name = f"{theme}_sentence_level_batch_2_jaccard.csv"
    file_path = os.path.join(folder_path, file_name)
    
    # Check if the file exists before attempting to load
    if os.path.exists(file_path):
        batch_2_theme_dataframes[theme] = pd.read_csv(file_path)
        print(f"Loaded {file_name}")
    else:
        print(f"File not found for theme: {theme}")

Loaded Aspirational_sentence_level_batch_1_jaccard.csv
Loaded Attainment_sentence_level_batch_1_jaccard.csv
Loaded Community Consciousness_sentence_level_batch_1_jaccard.csv
Loaded Familial_sentence_level_batch_1_jaccard.csv
Loaded Filial Piety_sentence_level_batch_1_jaccard.csv
Loaded First Gen_sentence_level_batch_1_jaccard.csv
Loaded Navigational_sentence_level_batch_1_jaccard.csv
Loaded Perseverance_sentence_level_batch_1_jaccard.csv
Loaded Resistance_sentence_level_batch_1_jaccard.csv
Loaded Social_sentence_level_batch_1_jaccard.csv
Loaded Spiritual_sentence_level_batch_1_jaccard.csv
Loaded Aspirational_sentence_level_batch_2_jaccard.csv
Loaded Attainment_sentence_level_batch_2_jaccard.csv
Loaded Community Consciouss_sentence_level_batch_2_jaccard.csv
Loaded Familial_sentence_level_batch_2_jaccard.csv
Loaded Filial Piety_sentence_level_batch_2_jaccard.csv
Loaded First Generation_sentence_level_batch_2_jaccard.csv
Loaded Navigational_sentence_level_batch_2_jaccard.csv
Loaded Persev

In [45]:
familial_batch_1 = batch_1_theme_dataframes["Familial"]
familial_batch_2 = batch_2_theme_dataframes["Familial"]


merged_familial_df = pd.concat([familial_batch_1, familial_batch_2])

# Shuffle the merged dataset
merged_familial_df = shuffle(merged_familial_df, random_state=seed)

# Train-test split 
training_df, test_df = train_test_split(merged_familial_df, test_size=0.1, random_state=18, stratify=merged_familial_df['label'])

training_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)



In [8]:
merged_familial_df_batch_1 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/merged_Familial_sentence_level_batch_1_jaccard.csv", encoding='utf-8')
merged_familial_df_batch_2 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/Familial Plus_sentence_level_batch_2_jaccard.csv", encoding='utf-8')

merged_familial_df = pd.concat([merged_familial_df_batch_1, merged_familial_df_batch_2])

# Shuffle the merged dataset
merged_familial_df = shuffle(merged_familial_df, random_state=seed)

# Train-test split 
training_df, test_df = train_test_split(merged_familial_df, test_size=0.3, random_state=42, stratify=merged_familial_df['label'])

training_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [27]:
print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")
pos_labels = len([n for n in training_df['label'] if n==1])
print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, len(training_df['label']), (pos_labels/len(training_df['label']))*100))
pos_labels = len([n for n in test_df['label'] if n==1])
print("Positive labels present in the test dataset : {}  out of {} or {}%".format(pos_labels, len(test_df['label']), (pos_labels/len(test_df['label']))*100))

Training dataset shape: (1289, 3) 
Test dataset shape: (144, 3)
Positive labels present in the dataset : 106  out of 1289 or 8.223429014740109%
Positive labels present in the test dataset : 12  out of 144 or 8.333333333333332%


In [28]:
training_df

Unnamed: 0,sentence,label,phrase
0,if i was capable of doing my work while in labor i have tons of reasons why i am here!,0,"['But my mom raised me and my sister. Also I was given the opportunity to be the first one in college and prove my family wrong that even though I was raised by a single parent I still made it to college.', 'I am here now to teach my daughter the right and wrong as she gets older. I am her leader as well as her dad even though her father is from another background (Im Mexican, his Salvadorian).', '']"
1,"anyways, the path im referring to is basically the roadmap of the career im currently interested in, which is being a physician in emergency medicine; in other words, im unsure if i should pursue this career due to financial reasons and time.",0,"['Of course, an important reason as to why I am here is because of everything my parents have done for me from all the guidance, sacrifices, affection, time, and so on that theyve given me till now.']"
2,i find it therapeutic to deliberately and thoroughly research a single topic in the lab.,0,"['There is a certain pressure put upon many first generation children to thrive for the sake of the family. Ive been thinking about that a lot these past couple years. I wonder how this pressure has affected my development, and in what ways (if any) do I plan to fulfill these generational wishes.']"
3,i have a dream that is to learn things in computer science and then program a game myself.,0,"['When I was still in high school, the question that what major should I study was always brought up by my parents, teachers, and even my schoolmates.']"
4,i am also here for my parents who never got to finish college themselves.,0,"['', 'I am also here for my parents who never got to finish college themselves. I am here to make them proud. To one day use my education to make a bunch of money and repay them everything they have done for me because they deserve it.', '']"
...,...,...,...
1284,"ever since i was a little girl, ive dreamed of becoming a veterinarian.",0,"['Im an at SFSU because I come from a family that did not have the privilege of attending college, so I am doing it also on my familys behalf. I hope that one day when I graduate from college and become a vet, my family will be proud of me and know that I didnt only go it for my own benefit, but theirs as well. I also want to be able to give back to my family as much as possible, because they sacrificed so much to get me to where I am today.']"
1285,i am here to take this physics class so i can graduate with a degree from san francisco state university.,0,['It was hard but my parents always prioritized school first for my sister and me.']
1286,but currently right now i found another purpose of my years here.,0,['My support system is my parents. Yes they did encourage and force me to choose my major but I do find interest in it.']
1287,i want to help change our world for the better and i dont want to end up living a life that ill regret.,0,"['I wouldnt be here if it wasnt for my parents constant support financially, physically, and mentally. I am grateful for everything they have done and will do to get me where I need to be.']"


In [49]:
from sklearn.utils import resample
import pandas as pd

def oversample_minority(training_df, minority_label, majority_label, target_minority_ratio):
    """
    Oversample the minority class to achieve a desired class ratio.

    Parameters:
        training_df (pd.DataFrame): The input training dataset containing a 'label' column.
        minority_label (int): The label of the minority class (e.g., 1).
        majority_label (int): The label of the majority class (e.g., 0).
        target_minority_ratio (float): The desired ratio of the minority class (e.g., 0.18 for 18%).

    Returns:
        pd.DataFrame: The balanced dataset with oversampled minority class.
    """
    # Separate majority and minority classes
    minority_class = training_df[training_df['label'] == minority_label]
    majority_class = training_df[training_df['label'] == majority_label]

    # Calculate the target size for the minority class
    target_minority_count = int(len(majority_class) * (target_minority_ratio / (1 - target_minority_ratio)))

    # Debug information
    print(f"Minority samples: {len(minority_class)}, "
          f"Target minority count: {target_minority_count}, "
          f"Available majority samples: {len(majority_class)}")

    # Perform oversampling
    oversampled_minority = resample(
        minority_class,
        replace=True,  # with replacement
        n_samples=target_minority_count,
        random_state=42
    )

    # Combine the resampled minority class with the majority class
    balanced_training_df = pd.concat([oversampled_minority, majority_class]).sample(frac=1, random_state=42).reset_index(drop=True)

    # Check the new class distribution
    print("New class distribution:")
    print(balanced_training_df['label'].value_counts())

    return balanced_training_df

In [50]:
# Assuming `training_df` is your dataset
training_df = oversample_minority(
    training_df=training_df,
    minority_label=1,  # The label of the minority class
    majority_label=0,  # The label of the majority class
    target_minority_ratio=0.18  # Target ratio for the minority class
)

Minority samples: 106, Target minority count: 259, Available majority samples: 1183
New class distribution:
label
0    1183
1     259
Name: count, dtype: int64


In [32]:
print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")
pos_labels = len([n for n in training_df['label'] if n==1])
print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, len(training_df['label']), (pos_labels/len(training_df['label']))*100))
pos_labels = len([n for n in test_df['label'] if n==1])
print("Positive labels present in the test dataset : {}  out of {} or {}%".format(pos_labels, len(test_df['label']), (pos_labels/len(test_df['label']))*100))

Training dataset shape: (1442, 3) 
Test dataset shape: (144, 3)
Positive labels present in the dataset : 259  out of 1442 or 17.96116504854369%
Positive labels present in the test dataset : 12  out of 144 or 8.333333333333332%


In [33]:
print(training_df.shape)
print(test_df.shape)

(1442, 3)
(144, 3)


## 2. Experimental Design

In [53]:
MAXLEN = 150

X = merged_familial_df['sentence']
y = merged_familial_df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 18, stratify=y)


In [56]:
# Combine X_train and y_train into a single DataFrame
training_df = pd.DataFrame({
    'sentence': X_train,
    'label': y_train
})

# Reset the index to avoid misalignment issues
training_df.reset_index(drop=True, inplace=True)

training_df

Unnamed: 0,sentence,label
0,if i was capable of doing my work while in labor i have tons of reasons why i am here!,0
1,"anyways, the path im referring to is basically the roadmap of the career im currently interested in, which is being a physician in emergency medicine; in other words, im unsure if i should pursue this career due to financial reasons and time.",0
2,i find it therapeutic to deliberately and thoroughly research a single topic in the lab.,0
3,i have a dream that is to learn things in computer science and then program a game myself.,0
4,i am also here for my parents who never got to finish college themselves.,0
...,...,...
1284,"ever since i was a little girl, ive dreamed of becoming a veterinarian.",0
1285,i am here to take this physics class so i can graduate with a degree from san francisco state university.,0
1286,but currently right now i found another purpose of my years here.,0
1287,i want to help change our world for the better and i dont want to end up living a life that ill regret.,0


In [58]:
MAXLEN = 150

X = merged_familial_df['sentence']
y = merged_familial_df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 18, stratify=y)

# Combine X_train and y_train into a single DataFrame
training_df = pd.DataFrame({
    'sentence': X_train,
    'label': y_train
})

# Reset the index to avoid misalignment issues
training_df.reset_index(drop=True, inplace=True)

# Assuming `training_df` is your dataset
training_df = oversample_minority(
    training_df=training_df,
    minority_label=1,  # The label of the minority class
    majority_label=0,  # The label of the majority class
    target_minority_ratio=0.18  # Target ratio for the minority class
)

X_train = training_df["sentence"]
y_train = test_df["label"]

model_name = 'bert-base-uncased'

distillbert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0,1])
training_set = distillbert_transformer.preprocess_train(X_train.tolist(), y_train.tolist())
validation_set = distillbert_transformer.preprocess_test(X_test.tolist(), y_test.tolist())
distillbert_base_model = distillbert_transformer.get_classifier()

Minority samples: 106, Target minority count: 259, Available majority samples: 1183
New class distribution:
label
0    1183
1     259
Name: count, dtype: int64
preprocessing train...
language: en
train sequence lengths:
	mean : 22
	95percentile : 45
	99percentile : 62


IndexError: index 144 is out of bounds for axis 0 with size 144

In [47]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Define classes and class labels
classes = np.array([0, 1])
class_labels = list(training_df.label)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)

# Print class weights
print(class_weights)

class_weights = dict(zip(classes, class_weights))

[0.54480135 6.08018868]


In [48]:
# Build BERT model
# model = text.text_classifier('distilbert', train_data=(X_train, y_train), preproc=distillbert_transformer)
distillbert_learner = ktrain.get_learner(distillbert_base_model, train_data=training_set, val_data=validation_set, batch_size=6)
# learner.fit_onecycle(2e-5, 4, class_weight=class_weights)
# learner.autofit(2.27E-06, early_stopping=4)
# distillbert_learner.set_weight_decay(0.001)
distillbert_learner.set_weight_decay(0.001)
distillbert_learner.autofit(4.96E-05, epochs=11, early_stopping=4, class_weight=class_weights)
# distillbert_learner.set_weight_decay(0.001)
# distillbert_learner.autofit(2.27E-06, early_stopping=4, class_weight=class_weights)



begin training using triangular learning rate policy with max lr of 4.96e-05...
Epoch 1/11


KeyboardInterrupt: 

In [37]:
distillbert_learner.validate(class_names=distillbert_transformer.get_classes())

              precision    recall  f1-score   support

           0       0.99      0.96      0.97       119
           1       0.83      0.96      0.89        26

    accuracy                           0.96       145
   macro avg       0.91      0.96      0.93       145
weighted avg       0.96      0.96      0.96       145



array([[114,   5],
       [  1,  25]])

In [38]:
distillbert_learner.model.summary()

Model: "tf_bert_for_sequence_classification_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_151 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [39]:
distillbert_predictor = ktrain.get_predictor(distillbert_learner.model, preproc=distillbert_transformer)

In [40]:
distillbert_test_data = test_df['sentence'].tolist()
distillbert_test_label = test_df['label'].tolist()

In [41]:
y_pred_distillbert = distillbert_predictor.predict(distillbert_test_data)

In [42]:
y_pred_distillbert = [int(x) for x in y_pred_distillbert]

In [43]:
tn, fp, fn, tp = confusion_matrix(distillbert_test_label, y_pred_distillbert).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

True Negative: 122, False Positive: 10, False Negative: 7, True Positive: 5


In [44]:
print('  Classification Report:\n',classification_report(distillbert_test_label,y_pred_distillbert),'\n')

  Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.92      0.93       132
           1       0.33      0.42      0.37        12

    accuracy                           0.88       144
   macro avg       0.64      0.67      0.65       144
weighted avg       0.89      0.88      0.89       144
 



In [24]:
# distillbert_predictor.save('../../model/first_generation_distilbert_base_uncased_model_10102020') # 256 MB

In [21]:
print("AUC roc score for distillbert model: ", roc_auc_score(distillbert_test_label,y_pred_distillbert))

AUC roc score for distillbert model:  0.6427102188579892
