# Navigational DistilBERT Model Using Single Batch 1 + Batch 2

In [2]:
import ktrain
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from ktrain import text
import random
import warnings
from sklearn.utils import shuffle

# Set random seed"
random.seed(18)
seed = 18

# Ignore warnings
warnings.filterwarnings('ignore')

# Display options
pd.set_option('display.max_colwidth', None)

## 1. Loading the data and quick exploratory data analysis

In [31]:
navigational_batch_1_df = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Navigational/Navigational_sentence_level_batch_1_jaccard_reevaluated.csv")
navigational_batch_2_df = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Navigational/Navigational_sentence_level_batch_2_jaccard_reevaluated.csv")

merged_navigational_df = pd.concat([navigational_batch_1_df, navigational_batch_2_df])


# Function to check for duplicates in merged data
def find_duplicates(merged_themes):
    duplicates = {}
    
    for theme, data in merged_themes.items():
        # Check for duplicates across all columns
        duplicate_rows = data[data.duplicated()]
        if not duplicate_rows.empty:
            duplicates[theme] = duplicate_rows
    
    return duplicates

find_duplicates(merged_navigational_df)

{'sentence': 107                                                                                                                             at the sci classes, we usually go over worksheets that answer my questions.
 108                                                                                                                                                                           im here because i want to be.
 274                                                                                                                                                                                          why am i here?
 319                                                                                                                                                                                          why am i here?
 450                                                                                                                                                                    

In [32]:
merged_navigational_df

Unnamed: 0,sentence,label,phrase,updated_label,comments,review
0,"i am here because i want to better myself my family, not only financially but in health.",0,['Being in this instituion will pave a way for me to become a professional.'],0,"Better fits Aspirational, Familial, or Social themes.",False
1,being in this instituion will pave a way for me to become a professional.,1,['Being in this instituion will pave a way for me to become a professional.'],1,Reviewed; no issues found.,True
2,i know that as a child i never thought of education and a career for someone who is undocumented.,0,['Being in this instituion will pave a way for me to become a professional.'],0,Reviewed; no issues found.,True
3,"i hope to reach a position in which i can inspire and prove to all of the ""immigrants"" in the usa that achieving and surpassing struggle is possible.",0,['Being in this instituion will pave a way for me to become a professional.'],0,Reviewed; no issues found.,True
4,"also, statistics have shown that people with higher education and wealth tend to be healthier.",0,['Being in this instituion will pave a way for me to become a professional.'],0,Reviewed; no issues found.,True
...,...,...,...,...,...,...
5001,physics will be very helpful in reallife situations that i may come across in the medical field.,0,"['My main reason for taking this class is to prepare myself for my future studies and eventually my future career goals. After receiving my bachelors degree I hope to continue on to medical school, where I will eventually become a doctor.']",0,Reviewed; no issues found.,True
5002,the physics lab specifically is showing me isolated scenarios and allowing me to analyze them so that i can fully understand them when i come across them in real life.,0,"['My main reason for taking this class is to prepare myself for my future studies and eventually my future career goals. After receiving my bachelors degree I hope to continue on to medical school, where I will eventually become a doctor.']",0,Reviewed; no issues found.,True
5003,an example of me using this once i become a doctor is if my patient was in a car accident.,0,"['My main reason for taking this class is to prepare myself for my future studies and eventually my future career goals. After receiving my bachelors degree I hope to continue on to medical school, where I will eventually become a doctor.']",0,Reviewed; no issues found.,True
5004,"if i understand the mechanics of the physics behind a car accident, i will be able to better assist my patients and help them to the best of my ability.",0,"['My main reason for taking this class is to prepare myself for my future studies and eventually my future career goals. After receiving my bachelors degree I hope to continue on to medical school, where I will eventually become a doctor.']",0,Reviewed; no issues found.,True


In [5]:
merged_navigational_df.drop_duplicates(keep='first')
merged_navigational_df.shape

(8202, 6)

In [33]:
merged_navigational_df.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Navigational/Navigational_merged.csv", index=False)

In [6]:
merged_navigational_df

Unnamed: 0,sentence,label,phrase,updated_label,comments,review
0,"i am here because i want to better myself my family, not only financially but in health.",0,['Being in this instituion will pave a way for me to become a professional.'],0,"Better fits Aspirational, Familial, or Social themes.",False
1,being in this instituion will pave a way for me to become a professional.,1,['Being in this instituion will pave a way for me to become a professional.'],1,Reviewed; no issues found.,True
2,i know that as a child i never thought of education and a career for someone who is undocumented.,0,['Being in this instituion will pave a way for me to become a professional.'],0,Reviewed; no issues found.,True
3,"i hope to reach a position in which i can inspire and prove to all of the ""immigrants"" in the usa that achieving and surpassing struggle is possible.",0,['Being in this instituion will pave a way for me to become a professional.'],0,Reviewed; no issues found.,True
4,"also, statistics have shown that people with higher education and wealth tend to be healthier.",0,['Being in this instituion will pave a way for me to become a professional.'],0,Reviewed; no issues found.,True
...,...,...,...,...,...,...
5001,physics will be very helpful in reallife situations that i may come across in the medical field.,0,"['My main reason for taking this class is to prepare myself for my future studies and eventually my future career goals. After receiving my bachelors degree I hope to continue on to medical school, where I will eventually become a doctor.']",0,Reviewed; no issues found.,True
5002,the physics lab specifically is showing me isolated scenarios and allowing me to analyze them so that i can fully understand them when i come across them in real life.,0,"['My main reason for taking this class is to prepare myself for my future studies and eventually my future career goals. After receiving my bachelors degree I hope to continue on to medical school, where I will eventually become a doctor.']",0,Reviewed; no issues found.,True
5003,an example of me using this once i become a doctor is if my patient was in a car accident.,0,"['My main reason for taking this class is to prepare myself for my future studies and eventually my future career goals. After receiving my bachelors degree I hope to continue on to medical school, where I will eventually become a doctor.']",0,Reviewed; no issues found.,True
5004,"if i understand the mechanics of the physics behind a car accident, i will be able to better assist my patients and help them to the best of my ability.",0,"['My main reason for taking this class is to prepare myself for my future studies and eventually my future career goals. After receiving my bachelors degree I hope to continue on to medical school, where I will eventually become a doctor.']",0,Reviewed; no issues found.,True


In [16]:
# Shuffle the merged dataset
merged_social_df = shuffle(merged_navigational_df, random_state=seed)

# Train-test split 
training_df, test_df = train_test_split(merged_social_df, test_size=0.1, random_state=18, stratify=merged_social_df["label"])


training_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)


In [17]:
print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")
pos_labels = len([n for n in training_df['label'] if n==1])
print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, len(training_df['label']), (pos_labels/len(training_df['label']))*100))
pos_labels = len([n for n in test_df['label'] if n==1])
print("Positive labels present in the test dataset : {}  out of {} or {}%".format(pos_labels, len(test_df['label']), (pos_labels/len(test_df['label']))*100))

Training dataset shape: (2664, 4) 
Test dataset shape: (296, 4)
Positive labels present in the dataset : 367  out of 2664 or 13.776276276276276%
Positive labels present in the test dataset : 41  out of 296 or 13.85135135135135%


In [10]:
training_df

Unnamed: 0,sentence,label,phrase,orig_label
0,"with the right push from friends, family and others, i have successfully made it this far into my studies in order to pursue my dream and make something of myself within my community. i am here on behalf of those cheering me on to accomplish the completion of higher education; an opportunity some are not given.",0,"['With the right push from friends, family and others, I have successfully made it this far into my studies in order to pursue my dream and make something of myself within my community.']",1
1,being able to work on worksheets and make mistakes on it and learn from it is a great environment.,0,['I learn best when I am in a class setting working with other students and peers.'],0
2,if something i can make or contribute my efforts to can make that happen that would be satisfying to me.,0,"[""I think that I'm here to contribute to the lives of others for the general good. I desire to be of general help and to help others who are more capable than I to do well. I don't want riches; just enough to not live in constant fear of failing to meet basic needs and want that for others.""]",0
3,i chose the major kinesiology because of my background in sports and my love for health and fitness.,0,['I wanted to pursue Physical Therapy because I love to help people and improve their quality of life through manipulating certain muscles that will relieve stress or pain.'],0
4,"i am here at school to prepare for my future, to learn, and to grow.",0,"['Since I was younger, I had always wanted to help other individuals. I wish to spread kindness throughout the world. I am grateful that I am able to be surrounded by classmates that are so passionate about their career choice and so kind in helping each other.']",0
...,...,...,...,...
2659,i came to san francisco to change the person that i am.,0,['These changes that have impacted my life has got me wanting to share it with the people back at home.'],0
2660,i am the friend you come to when you need advice.,0,"[""I believe I am here for a reason, only God knows. I think as I grow up, I'm starting to come to a better understanding of what my purpose is here on Earth, but I do not completely understand yet. Sometimes, I feel as if I am here simply to be an aid to others. Maybe I was placed on this Earth so I can help other people. I Never want anyone to feel that way, so I make sure that no matter who enters my life, I become someone they can trust and confide in.""]",0
2661,with a community i would feel comfortable enough to ask questions without having to worry about sounding unintelligent.,1,"['I would be able to connect more with my fellow classmates. I figured that if I felt like I created a little community with the supplemental course, I would feel more comfortable with physics problems.']",0
2662,i found sci 230 to be super helpful in my studying and getting me to really grasp concepts.,0,['I chose to take Sci 230 to have a larger community around me so I could have more structure and help with Bio 230.'],0


In [11]:
test_df

Unnamed: 0,sentence,label,phrase,orig_label
0,learn and apply my skills to progress towards my career goal.,0,"['Meet people who have (similar) aspirationsgoals', 'Network']",0
1,im here to really learn physics so that i dont have to catch up later.,0,['It will be nice to have friends outside of this group too.'],0
2,i'm here to make myself content as i can be and to travel.,0,"[""I'm here because I want to be a nurse and help people with their needs, and so that I am able to support myself without having to worry about how much rent is going to be a dent in my wallet, so that I can learn as much as I can, especially in university where I have access to knowledge and resources that I may not have again.""]",0
3,"i'm here in physics 111112 because i have to take it for my major, and i'm here in sci because my physics class is pretty rough.",0,"[""spiritually I feel like I'm here for a big reason, and I think it's mainly just to help other people.""]",0
4,those kids that i taught (ranged from the ages three through twelve) would hike three hours in the jungle every single week to stay in the dorms to get an education.,0,['Those kids that I taught (ranged from the ages three through twelve) would hike three hours in the jungle every single week to stay in the dorms to get an education. These students in Thailand have taught me to be thankful for the opportunity I have to be able to go to college.'],0
...,...,...,...,...
291,san francisco is a really good place to explore new concepts and get to know new people.,0,"['Secondly, I am here to make friends and experience new things.']",0
292,when i was younger i enjoyed maths because it came easy to me.,0,['We were really able to collaborate with our classmates to help each other along and I really enjoyed the field trips. I saw that this could bring the opportunity a balanced worklife in and out of the field.'],0
293,i am taking a sci helper class to better understand what i am learning in class.,0,"['I also like hearing from different people on how they are learning the material because it might also help me. It is very helpful to have someone in the same shoes as you, since you are taking the class together it is a much better time to get to know your class mates and forum study groups.']",0
294,"i am here xxx, yes i am here.",0,['I think that I was made to make a difference in the peoples lives around me to make their days better and just so they can have someone to be there for them.'],0


In [13]:
# Filter rows where the 'sentence' column contains a specific substring
substring = "i am taking a sci helper class to better understand what i am learning in class."  # Replace with your desired substring
filtered_df = training_df[training_df['sentence'].str.contains(substring, case=False, na=False)]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,sentence,label,phrase,orig_label


In [32]:
# from sklearn.utils import resample
# import pandas as pd

# # Assuming `training_df` already exists and has a 'label' column

# # Check original class distribution
# print("Original class distribution:")
# print(training_df['label'].value_counts())

# # Set target distribution
# target_minority_ratio = 0.40  # 20%
# target_majority_ratio = 0.60  # 80%

# # Separate majority and minority classes
# minority_class = training_df[training_df['label'] == 1]
# majority_class = training_df[training_df['label'] == 0]

# # Use all samples from the minority class
# target_minority_count = len(minority_class)

# # Calculate the maximum possible number of samples for the majority class
# target_majority_count = min(len(majority_class), int(target_minority_count / target_minority_ratio * target_majority_ratio))

# # Debug information
# print(f"Minority samples: {target_minority_count}, "
#       f"Target majority count: {target_majority_count}, "
#       f"Available majority samples: {len(majority_class)}")

# # Perform undersampling
# if target_majority_count < len(majority_class):
#     undersampled_majority = resample(
#         majority_class,
#         replace=False,  # without replacement
#         n_samples=target_majority_count,
#         random_state=42
#     )
# else:
#     print("Target majority count exceeds available samples. Using all majority samples.")
#     undersampled_majority = majority_class

# # Combine the resampled majority class with the full minority class
# training_df = pd.concat([minority_class, undersampled_majority]).sample(frac=1, random_state=42).reset_index(drop=True)

# # Check new class distribution
# print("New class distribution:")
# print(training_df['label'].value_counts())

Original class distribution:
label
0    3864
1     848
Name: count, dtype: int64
Minority samples: 848, Target majority count: 1272, Available majority samples: 3864
New class distribution:
label
0    1272
1     848
Name: count, dtype: int64


In [None]:
# Filter rows where the 'sentence' column contains a specific substring
substring = "i am here to learn"  # Replace with your desired substring
filtered_df = merged_df[merged_df['sentence'].str.contains(substring, case=False, na=False)]

# Display the filtered DataFrame
filtered_df

In [5]:
from sklearn.utils import resample
import pandas as pd

def oversample_minority(training_df, minority_label, majority_label, target_minority_ratio):
    """
    Oversample the minority class to achieve a desired class ratio.

    Parameters:
        training_df (pd.DataFrame): The input training dataset containing a 'label' column.
        minority_label (int): The label of the minority class (e.g., 1).
        majority_label (int): The label of the majority class (e.g., 0).
        target_minority_ratio (float): The desired ratio of the minority class (e.g., 0.18 for 18%).

    Returns:
        pd.DataFrame: The balanced dataset with oversampled minority class.
    """
    # Separate majority and minority classes
    minority_class = training_df[training_df['label'] == minority_label]
    majority_class = training_df[training_df['label'] == majority_label]

    # Calculate the target size for the minority class
    target_minority_count = int(len(majority_class) * (target_minority_ratio / (1 - target_minority_ratio)))

    # Debug information
    print(f"Minority samples: {len(minority_class)}, "
          f"Target minority count: {target_minority_count}, "
          f"Available majority samples: {len(majority_class)}")

    # Perform oversampling
    oversampled_minority = resample(
        minority_class,
        replace=True,  # with replacement
        n_samples=target_minority_count,
        random_state=seed
    )

    # Combine the resampled minority class with the majority class
    balanced_training_df = pd.concat([oversampled_minority, majority_class]).sample(frac=1, random_state=42).reset_index(drop=True)

    # Check the new class distribution
    print("New class distribution:")
    print(balanced_training_df['label'].value_counts())

    return balanced_training_df

In [14]:
train_df, validate_df = train_test_split(training_df, test_size=0.1, random_state=18)

In [22]:
# Assuming `training_df` is your dataset
training_df = oversample_minority(
    training_df=train_df,
    minority_label=1,  # The label of the minority class
    majority_label=0,  # The label of the majority class
    target_minority_ratio=0.30  # Target ratio for the minority class
)

Minority samples: 257, Target minority count: 1490, Available majority samples: 3478
New class distribution:
label
0    3478
1    1490
Name: count, dtype: int64


In [15]:
print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")
pos_labels = len([n for n in training_df['label'] if n==1])
print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, len(training_df['label']), (pos_labels/len(training_df['label']))*100))
pos_labels = len([n for n in test_df['label'] if n==1])
print("Positive labels present in the test dataset : {}  out of {} or {}%".format(pos_labels, len(test_df['label']), (pos_labels/len(test_df['label']))*100))
pos_labels = len([n for n in validate_df['label'] if n==1])
print("Positive labels present in the validation dataset : {}  out of {} or {}%".format(pos_labels, len(validate_df['label']), (pos_labels/len(validate_df['label']))*100))

Training dataset shape: (2664, 4) 
Test dataset shape: (296, 4)
Positive labels present in the dataset : 367  out of 2664 or 13.776276276276276%
Positive labels present in the test dataset : 41  out of 296 or 13.85135135135135%
Positive labels present in the validation dataset : 35  out of 267 or 13.108614232209737%


In [24]:
print(training_df.shape)
print(test_df.shape)

(4968, 3)
(462, 3)


## 2. Experimental Design

In [25]:
validate_df

Unnamed: 0,sentence,label,phrase
1486,"i want to at least help animals, such as my bird and reptiles that need care, as there are not that many exotic vets, only primarily those who focus on cats and dogs.",0,"['I know I want to become an exotic Veterinarian, but I feel like I am not smart enough to get into Vet School. I want to at least help animals, such as my bird and reptiles that need care, as there are not that many exotic vets, only primarily those who focus on cats and dogs. If that is the path I eventually do not take, I am very interested in virology and would like to study them. Maybe that is why I am here, to at least provide a slight change to science, although it may not be a big one at least it is something I can contribute to the world.']"
2888,as i mentioned earlier im here because i need to take physics in as a course for my kinesiology degree.,0,['As I mentioned earlier Im here because I need to take physics in as a course for my Kinesiology degree. Another reason why I feel like Im here is to further my education and career . To keep striving at being a better person each and every day .']
4042,it's funny because ive never been happier than i ever been because of everything ive been doing lately.,0,['The only way that Im able to create a better future for my community is by getting a PhD']
2915,"science has always intrigued me, but i feel like the schooling system of stem classes kind of makes it harder for me to enjoy.",0,"[""In the context of school, is I am here for the reason I'm on the path to become a doctor.""]"
4081,"while in high school i was deciding between 3 school, but ultimately picked here.",0,"['I also wanted to go into civil engineer, and this was the only school with it, the others had mechanical but not civil. As to why I am currently here in this moment in time of writing this essay thing, I am a third year trying to become a civil engineer.']"
...,...,...,...
2402,"point and case, my cat sitting on my printer glaring at me because he wants something from me.",0,['This could help me simply better understand every day experiences but ultimately will help me in my future career as a physical therapist. Im at SFSU because I want a degree.. I want a degree in kinesiology because I want to pursue becoming a physical therapist.']
1107,"this class is also along side a lecture, which has been able to help me better understand concepts by doing experiments.",0,"['By the end of this class, I hope to have a better understanding of what physics is and that it will help me when I take my MCAT and enter medical school in the next couple of years after I graduate with my fouryear degree at San Francisco State.']"
1308,i am part of the first generation in my family to go to college and hopefully get a degree and i am trying to make my parents proud but also learn things for myself and get a career that will give me financial stability and also a career where i will help tons of people.,0,"['Why am I here, I am here because I want to become a doctor and go onto medical school. I am here to hopefully pass this class and come out with a good grade and complete the requirement in my major. I am here to get a degree to go on and become a doctor which is what I am studying for. I am part of the first generation in my family to go to college and hopefully get a degree and I am trying to make my parents proud but also learn things for myself and get a career that will give me financial stability and also a career where I will help tons of people.']"
3843,it may be helpful as future reference when there might be a situation or problem i may encounter in my life since some of the analysis and theories are relatable to use.,0,['I chose to be in this class because it is a requirement to fulfill in order for me to obtain my Cell and Molecular Biology bachelors degree.']


In [26]:
# MAXLEN = 256

# X_train = training_df['sentence']
# y_train = training_df['label']
# X_test = validate_df['sentence']
# y_test = validate_df['label']
# # Split the data
# # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 18, stratify=y)
# # X_test.shape

# model_name = 'bert-base-uncased'

# distillbert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0,1])
# training_set = distillbert_transformer.preprocess_train(X_train.tolist(), y_train.tolist())
# validation_set = distillbert_transformer.preprocess_test(X_test.tolist(), y_test.tolist())
# distillbert_base_model = distillbert_transformer.get_classifier()

preprocessing train...
language: en
train sequence lengths:
	mean : 22
	95percentile : 41
	99percentile : 60


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 22
	95percentile : 39
	99percentile : 58


In [18]:
MAXLEN = 150

X = training_df['sentence']
y = training_df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 18, stratify=y)


model_name = 'bert-base-uncased'

distillbert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0,1])
training_set = distillbert_transformer.preprocess_train(X_train.tolist(), y_train.tolist())
validation_set = distillbert_transformer.preprocess_test(X_test.tolist(), y_test.tolist())
distillbert_base_model = distillbert_transformer.get_classifier()

Metal device set to: Apple M2 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

preprocessing train...
language: en
train sequence lengths:
	mean : 22
	95percentile : 40
	99percentile : 60


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 21
	95percentile : 37
	99percentile : 54


In [19]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Define classes and class labels
classes = np.array([0, 1])
class_labels = list(training_df.label)

# Compute class weights

class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)

# Print class weights
print(class_weights)

class_weights = dict(zip(classes, class_weights))

[0.57988681 3.62942779]


In [20]:
# Build BERT model
distillbert_learner = ktrain.get_learner(distillbert_base_model, train_data=training_set, val_data=validation_set, batch_size=6)
distillbert_learner.set_weight_decay(0.001)
distillbert_learner.autofit(3.328132762062889e-05, epochs=12, early_stopping=4, class_weight=class_weights)



begin training using triangular learning rate policy with max lr of 3.328132762062889e-05...
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Epoch 12: early stopping
Weights from best epoch have been loaded into model.


<keras.src.callbacks.History at 0x3a29c1130>

In [21]:
distillbert_learner.validate(class_names=distillbert_transformer.get_classes())

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       230
           1       0.79      0.89      0.84        37

    accuracy                           0.95       267
   macro avg       0.88      0.93      0.90       267
weighted avg       0.95      0.95      0.95       267



array([[221,   9],
       [  4,  33]])

In [22]:
distillbert_learner.model.summary()

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
distillbert_predictor = ktrain.get_predictor(distillbert_learner.model, preproc=distillbert_transformer)

In [24]:
distillbert_test_data = test_df['sentence'].tolist()
distillbert_test_label = test_df['label'].tolist()

In [25]:
y_pred_distillbert = distillbert_predictor.predict(distillbert_test_data)

In [26]:
y_pred_distillbert = [int(x) for x in y_pred_distillbert]

In [27]:
tn, fp, fn, tp = confusion_matrix(distillbert_test_label, y_pred_distillbert).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

True Negative: 242, False Positive: 13, False Negative: 2, True Positive: 39


In [28]:
print('  Classification Report:\n',classification_report(distillbert_test_label,y_pred_distillbert),'\n')

  Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97       255
           1       0.75      0.95      0.84        41

    accuracy                           0.95       296
   macro avg       0.87      0.95      0.90       296
weighted avg       0.96      0.95      0.95       296
 



In [16]:
training_df.to_csv("social_training.csv", index=False)
test_df.to_csv("social_test.csv", index=False)

In [29]:
test_df_copy = test_df.copy()
test_df_copy["predicted_label"] = y_pred_distillbert
test_df_copy

Unnamed: 0,sentence,label,phrase,orig_label,predicted_label
0,i am here because i want to be.,0,"['I have a passion for babies and I myself when I was born, was born in a car, so I was incubated for about a month because I did not have much oxygen, so it was a scary moment for both my parents']",0,0
1,"besides this class, i am here in sf state to learn and gain enough knowledge for me to apply into a nursing program after i graduate.",0,"['In terms of my presence, I believe that God has a time and place for myself here in this school, earth, and where I am in life.']",0,0
2,"all i know was, this country will give me a better life than my own country.",0,"['It here to learn new things, meet new people, and to be able to learn the culture and how is the AMERICA DREAM that a lot of people from the different country dream of.']",0,0
3,"the reason why im here is because my parents decided they want to have kids, and the sperm that won out of the other thousands that couldve made it, but ended up dying before they could, making me the winner and here right now on this day.",0,"['I think I am here to help people, and support them, and for the special people in my life I am here to bring them happiness, laughter and love.']",0,0
4,this class serves me in multiple ways which helps me learn without having to go to tutors all the time.,0,"['Furthermore, I get to learn and work with likeminded individuals who have similar goals.']",0,0
...,...,...,...,...,...
291,years later i am so glad that i made this decision because i would not be who i am today had i not.,0,['I have met the most amazing people and had some really awesome experiences and travels.'],0,0
292,"however, that is not the only medical research i would do, i would love to do more than that, thats why i am here.",0,"['I am here in this world to make an impact. I am here to make a change in the world. I want to make an impact in this world, even if it is a small dent.']",0,0
293,i have always wanted to become more involved in bettering people's lives.,0,"[""I have always wanted to become more involved in bettering people's lives. The field would allow to help other people quickly and effectively as well as be able to interact with them holistically. This is why I am here.""]",0,0
294,"i am here because for the past 17 years of my career, i've dedicated my literal blood, sweat and tears as a cook and now as a chefowner of a restaurant.",0,"['The goal is to become a vet and serve my community through animals, whether it be farm or small animal practice.']",0,0


In [31]:
pwd

'/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/notebooks/experiments/exp_4_single/Social'

In [30]:
test_df_copy.to_excel("Social_capital_error_analysis.xlsx")

In [24]:
# distillbert_predictor.save('../../model/first_generation_distilbert_base_uncased_model_10102020') # 256 MB

In [21]:
print("AUC roc score for distillbert model: ", roc_auc_score(distillbert_test_label,y_pred_distillbert))

AUC roc score for distillbert model:  0.9306334080717489


In [22]:
from sklearn.metrics import auc, precision_recall_curve

# Replace these with your actual data
# distillbert_test_label: Ground truth labels (list or numpy array)
# y_pred_distillbert: Predicted probabilities (list or numpy array)

# Calculate precision-recall curve
precision, recall, _ = precision_recall_curve(distillbert_test_label, y_pred_distillbert)

# Calculate PR AUC
pr_auc_score = auc(recall, precision)

# Print the result
print("AUC PR score for DistilBERT model: ", pr_auc_score)

AUC PR score for DistilBERT model:  0.7918205047976637
