# Aspirational DistilBERT Model Using Single Batch 1 + Batch 2

In [2]:
import ktrain
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from ktrain import text
import random
import warnings
from sklearn.utils import shuffle

# Set random seed
random.seed(18)
seed = 18

# Ignore warnings
warnings.filterwarnings('ignore')

# Display options
pd.set_option('display.max_colwidth', None)

## 1. Loading the data and quick exploratory data analysis

In [None]:
# import os

# # Define the folder path and themes
# folder_path = '/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method'
# themes = [
#     'Aspirational', 'Attainment', 'Community Consciousness', 'Familial', 'Filial Piety', 
#     'First Gen', 'Navigational', 'Perseverance', 'Resistance', 'Social', 'Spiritual'
# ]

# # Initialize an empty dictionary to store DataFrames
# batch_1_theme_dataframes = {}
# # Loop through each theme and load its corresponding file
# for theme in themes:
#     # Construct the filename without modifying the theme name
#     file_name = f"{theme}_sentence_level_batch_1_jaccard.csv"
#     file_path = os.path.join(folder_path, file_name)
    
#     # Check if the file exists before attempting to load
#     if os.path.exists(file_path):
#         batch_1_theme_dataframes[theme] = pd.read_csv(file_path)
#         print(f"Loaded {file_name}")
#     else:
#         print(f"File not found for theme: {theme}")

# # Define the folder path and themes
# folder_path = '/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/batch_2'
# themes = [
#     'Aspirational', 'Attainment', 'Community Consciouss', 'Familial', 'Filial Piety', 
#     'First Generation', 'Navigational', 'Perseverance', 'Resistance', 'Social', 'Spiritual'
# ]

# # Initialize an empty dictionary to store DataFrames
# batch_2_theme_dataframes = {}
# # Loop through each theme and load its corresponding file
# for theme in themes:
#     # Construct the filename without modifying the theme name
#     file_name = f"{theme}_sentence_level_batch_2_jaccard.csv"
#     file_path = os.path.join(folder_path, file_name)
    
#     # Check if the file exists before attempting to load
#     if os.path.exists(file_path):
#         batch_2_theme_dataframes[theme] = pd.read_csv(file_path)
#         print(f"Loaded {file_name}")
#     else:
#         print(f"File not found for theme: {theme}")

In [3]:
attainment_df_batch_1 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Attainment/Attainment_sentence_level_batch_1_jaccard_reevaluated.csv", usecols=["sentence", "label", "phrase"])
attainment_df_batch_2 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Attainment/Attainment_sentence_level_batch_2_jaccard_reevaluated.csv", usecols=["sentence", "label", "phrase"])

aspirational_df_batch_1 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Aspirational/Aspirational_sentence_level_batch_1_jaccard_reevaluated.csv", usecols=["sentence", "label", "phrase"])
aspirational_df_batch_2 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Aspirational/Aspirational_sentence_level_batch_2_jaccard_reevaluated.csv", usecols=["sentence", "label", "phrase"])

asp_plus_df = [attainment_df_batch_1,
               attainment_df_batch_2,
               aspirational_df_batch_1,
               aspirational_df_batch_2,
               ]
merged_aspirational_df = pd.concat(asp_plus_df)

merged_aspirational_df.head()

Unnamed: 0,sentence,label,phrase
0,why am i here?,0,"[""Ever since I was little I wanted to be a doctor, so much so that I can't see myself doing anything else.""]"
1,well why does anyone pursue a higher education?,0,"[""Ever since I was little I wanted to be a doctor, so much so that I can't see myself doing anything else.""]"
2,to better one self and be able to succeed later on in life.,0,"[""Ever since I was little I wanted to be a doctor, so much so that I can't see myself doing anything else.""]"
3,"ever since i was little i wanted to be a doctor, so much so that i can't see myself doing anything else.",1,"[""Ever since I was little I wanted to be a doctor, so much so that I can't see myself doing anything else.""]"
4,i always wanted to be able to help people and i always had an intrest in medicine and the human body.,0,"[""Ever since I was little I wanted to be a doctor, so much so that I can't see myself doing anything else.""]"


In [4]:
# Shuffle the merged dataset
merged_aspirational_df = shuffle(merged_aspirational_df, random_state=seed)

# Train-test split 
training_df, test_df = train_test_split(merged_aspirational_df, test_size=0.1, random_state=18, stratify=merged_aspirational_df["label"])


training_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)


In [5]:
print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")
pos_labels = len([n for n in training_df['label'] if n==1])
print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, len(training_df['label']), (pos_labels/len(training_df['label']))*100))
pos_labels = len([n for n in test_df['label'] if n==1])
print("Positive labels present in the test dataset : {}  out of {} or {}%".format(pos_labels, len(test_df['label']), (pos_labels/len(test_df['label']))*100))

Training dataset shape: (10709, 3) 
Test dataset shape: (1190, 3)
Positive labels present in the dataset : 1864  out of 10709 or 17.40592025399197%
Positive labels present in the test dataset : 207  out of 1190 or 17.394957983193276%


In [32]:
# from sklearn.utils import resample
# import pandas as pd

# # Assuming `training_df` already exists and has a 'label' column

# # Check original class distribution
# print("Original class distribution:")
# print(training_df['label'].value_counts())

# # Set target distribution
# target_minority_ratio = 0.40  # 20%
# target_majority_ratio = 0.60  # 80%

# # Separate majority and minority classes
# minority_class = training_df[training_df['label'] == 1]
# majority_class = training_df[training_df['label'] == 0]

# # Use all samples from the minority class
# target_minority_count = len(minority_class)

# # Calculate the maximum possible number of samples for the majority class
# target_majority_count = min(len(majority_class), int(target_minority_count / target_minority_ratio * target_majority_ratio))

# # Debug information
# print(f"Minority samples: {target_minority_count}, "
#       f"Target majority count: {target_majority_count}, "
#       f"Available majority samples: {len(majority_class)}")

# # Perform undersampling
# if target_majority_count < len(majority_class):
#     undersampled_majority = resample(
#         majority_class,
#         replace=False,  # without replacement
#         n_samples=target_majority_count,
#         random_state=42
#     )
# else:
#     print("Target majority count exceeds available samples. Using all majority samples.")
#     undersampled_majority = majority_class

# # Combine the resampled majority class with the full minority class
# training_df = pd.concat([minority_class, undersampled_majority]).sample(frac=1, random_state=42).reset_index(drop=True)

# # Check new class distribution
# print("New class distribution:")
# print(training_df['label'].value_counts())

Original class distribution:
label
0    3864
1     848
Name: count, dtype: int64
Minority samples: 848, Target majority count: 1272, Available majority samples: 3864
New class distribution:
label
0    1272
1     848
Name: count, dtype: int64


In [5]:
from sklearn.utils import resample
import pandas as pd

def oversample_minority(training_df, minority_label, majority_label, target_minority_ratio):
    """
    Oversample the minority class to achieve a desired class ratio.

    Parameters:
        training_df (pd.DataFrame): The input training dataset containing a 'label' column.
        minority_label (int): The label of the minority class (e.g., 1).
        majority_label (int): The label of the majority class (e.g., 0).
        target_minority_ratio (float): The desired ratio of the minority class (e.g., 0.18 for 18%).

    Returns:
        pd.DataFrame: The balanced dataset with oversampled minority class.
    """
    # Separate majority and minority classes
    minority_class = training_df[training_df['label'] == minority_label]
    majority_class = training_df[training_df['label'] == majority_label]

    # Calculate the target size for the minority class
    target_minority_count = int(len(majority_class) * (target_minority_ratio / (1 - target_minority_ratio)))

    # Debug information
    print(f"Minority samples: {len(minority_class)}, "
          f"Target minority count: {target_minority_count}, "
          f"Available majority samples: {len(majority_class)}")

    # Perform oversampling
    oversampled_minority = resample(
        minority_class,
        replace=True,  # with replacement
        n_samples=target_minority_count,
        random_state=seed
    )

    # Combine the resampled minority class with the majority class
    balanced_training_df = pd.concat([oversampled_minority, majority_class]).sample(frac=1, random_state=42).reset_index(drop=True)

    # Check the new class distribution
    print("New class distribution:")
    print(balanced_training_df['label'].value_counts())

    return balanced_training_df

In [6]:
train_df, validate_df = train_test_split(training_df, test_size=0.1, random_state=18)

In [22]:
# Assuming `training_df` is your dataset
training_df = oversample_minority(
    training_df=train_df,
    minority_label=1,  # The label of the minority class
    majority_label=0,  # The label of the majority class
    target_minority_ratio=0.30  # Target ratio for the minority class
)

Minority samples: 257, Target minority count: 1490, Available majority samples: 3478
New class distribution:
label
0    3478
1    1490
Name: count, dtype: int64


In [9]:
print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")
pos_labels = len([n for n in training_df['label'] if n==1])
print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, len(training_df['label']), (pos_labels/len(training_df['label']))*100))
pos_labels = len([n for n in test_df['label'] if n==1])
print("Positive labels present in the test dataset : {}  out of {} or {}%".format(pos_labels, len(test_df['label']), (pos_labels/len(test_df['label']))*100))
pos_labels = len([n for n in validate_df['label'] if n==1])
print("Positive labels present in the validation dataset : {}  out of {} or {}%".format(pos_labels, len(validate_df['label']), (pos_labels/len(validate_df['label']))*100))

Training dataset shape: (4151, 3) 
Test dataset shape: (462, 3)
Positive labels present in the dataset : 724  out of 4151 or 17.441580342086244%
Positive labels present in the test dataset : 81  out of 462 or 17.532467532467532%


NameError: name 'validate_df' is not defined

In [24]:
print(training_df.shape)
print(test_df.shape)

(4968, 3)
(462, 3)


## 2. Experimental Design

In [25]:
validate_df

Unnamed: 0,sentence,label,phrase
1486,"i want to at least help animals, such as my bird and reptiles that need care, as there are not that many exotic vets, only primarily those who focus on cats and dogs.",0,"['I know I want to become an exotic Veterinarian, but I feel like I am not smart enough to get into Vet School. I want to at least help animals, such as my bird and reptiles that need care, as there are not that many exotic vets, only primarily those who focus on cats and dogs. If that is the path I eventually do not take, I am very interested in virology and would like to study them. Maybe that is why I am here, to at least provide a slight change to science, although it may not be a big one at least it is something I can contribute to the world.']"
2888,as i mentioned earlier im here because i need to take physics in as a course for my kinesiology degree.,0,['As I mentioned earlier Im here because I need to take physics in as a course for my Kinesiology degree. Another reason why I feel like Im here is to further my education and career . To keep striving at being a better person each and every day .']
4042,it's funny because ive never been happier than i ever been because of everything ive been doing lately.,0,['The only way that Im able to create a better future for my community is by getting a PhD']
2915,"science has always intrigued me, but i feel like the schooling system of stem classes kind of makes it harder for me to enjoy.",0,"[""In the context of school, is I am here for the reason I'm on the path to become a doctor.""]"
4081,"while in high school i was deciding between 3 school, but ultimately picked here.",0,"['I also wanted to go into civil engineer, and this was the only school with it, the others had mechanical but not civil. As to why I am currently here in this moment in time of writing this essay thing, I am a third year trying to become a civil engineer.']"
...,...,...,...
2402,"point and case, my cat sitting on my printer glaring at me because he wants something from me.",0,['This could help me simply better understand every day experiences but ultimately will help me in my future career as a physical therapist. Im at SFSU because I want a degree.. I want a degree in kinesiology because I want to pursue becoming a physical therapist.']
1107,"this class is also along side a lecture, which has been able to help me better understand concepts by doing experiments.",0,"['By the end of this class, I hope to have a better understanding of what physics is and that it will help me when I take my MCAT and enter medical school in the next couple of years after I graduate with my fouryear degree at San Francisco State.']"
1308,i am part of the first generation in my family to go to college and hopefully get a degree and i am trying to make my parents proud but also learn things for myself and get a career that will give me financial stability and also a career where i will help tons of people.,0,"['Why am I here, I am here because I want to become a doctor and go onto medical school. I am here to hopefully pass this class and come out with a good grade and complete the requirement in my major. I am here to get a degree to go on and become a doctor which is what I am studying for. I am part of the first generation in my family to go to college and hopefully get a degree and I am trying to make my parents proud but also learn things for myself and get a career that will give me financial stability and also a career where I will help tons of people.']"
3843,it may be helpful as future reference when there might be a situation or problem i may encounter in my life since some of the analysis and theories are relatable to use.,0,['I chose to be in this class because it is a requirement to fulfill in order for me to obtain my Cell and Molecular Biology bachelors degree.']


In [26]:
# MAXLEN = 256

# X_train = training_df['sentence']
# y_train = training_df['label']
# X_test = validate_df['sentence']
# y_test = validate_df['label']
# # Split the data
# # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 18, stratify=y)
# # X_test.shape

# model_name = 'bert-base-uncased'

# distillbert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0,1])
# training_set = distillbert_transformer.preprocess_train(X_train.tolist(), y_train.tolist())
# validation_set = distillbert_transformer.preprocess_test(X_test.tolist(), y_test.tolist())
# distillbert_base_model = distillbert_transformer.get_classifier()

preprocessing train...
language: en
train sequence lengths:
	mean : 22
	95percentile : 41
	99percentile : 60


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 22
	95percentile : 39
	99percentile : 58


In [6]:
MAXLEN = 150

X = training_df['sentence']
y = training_df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 18, stratify=y)


model_name = 'bert-base-uncased'

distillbert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0,1])
training_set = distillbert_transformer.preprocess_train(X_train.tolist(), y_train.tolist())
validation_set = distillbert_transformer.preprocess_test(X_test.tolist(), y_test.tolist())
distillbert_base_model = distillbert_transformer.get_classifier()

Metal device set to: Apple M2 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

preprocessing train...
language: en
train sequence lengths:
	mean : 21
	95percentile : 39
	99percentile : 55


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 21
	95percentile : 41
	99percentile : 61


In [7]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Define classes and class labels
classes = np.array([0, 1])
class_labels = list(training_df.label)

# Compute class weights

class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)

# Print class weights
print(class_weights)

class_weights = dict(zip(classes, class_weights))

[0.59887037 3.02856335]


In [None]:
# Build BERT model
distillbert_learner = ktrain.get_learner(distillbert_base_model, train_data=training_set, val_data=validation_set, batch_size=6)
distillbert_learner.set_weight_decay(0.001)
distillbert_learner.autofit(3.328132762062889e-05, epochs=12, early_stopping=4, class_weight=class_weights)



begin training using triangular learning rate policy with max lr of 3.328132762062889e-05...
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
 303/1607 [====>.........................] - ETA: 6:48 - loss: 0.2213 - accuracy: 0.8839

In [13]:
distillbert_learner.validate(class_names=distillbert_transformer.get_classes())

              precision    recall  f1-score   support

           0       0.97      0.92      0.94       343
           1       0.69      0.88      0.77        73

    accuracy                           0.91       416
   macro avg       0.83      0.90      0.86       416
weighted avg       0.92      0.91      0.91       416



array([[314,  29],
       [  9,  64]])

In [14]:
distillbert_learner.model.summary()

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
distillbert_predictor = ktrain.get_predictor(distillbert_learner.model, preproc=distillbert_transformer)

In [16]:
distillbert_test_data = test_df['sentence'].tolist()
distillbert_test_label = test_df['label'].tolist()

In [17]:
y_pred_distillbert = distillbert_predictor.predict(distillbert_test_data)

In [18]:
y_pred_distillbert = [int(x) for x in y_pred_distillbert]

In [19]:
tn, fp, fn, tp = confusion_matrix(distillbert_test_label, y_pred_distillbert).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

True Negative: 352, False Positive: 29, False Negative: 2, True Positive: 79


In [20]:
print('  Classification Report:\n',classification_report(distillbert_test_label,y_pred_distillbert),'\n')

  Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.92      0.96       381
           1       0.73      0.98      0.84        81

    accuracy                           0.93       462
   macro avg       0.86      0.95      0.90       462
weighted avg       0.95      0.93      0.94       462
 



In [24]:
# distillbert_predictor.save('../../model/first_generation_distilbert_base_uncased_model_10102020') # 256 MB

In [21]:
print("AUC roc score for distillbert model: ", roc_auc_score(distillbert_test_label,y_pred_distillbert))

AUC roc score for distillbert model:  0.6427102188579892
