# Resistance BERT (`bert-base-uncased`) Model Using Single Batch 1 + Batch 2

In [4]:
import os
import random
import warnings
from pathlib import Path

import ktrain
import numpy as np
import pandas as pd
from ktrain import text
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Single seed shared by every seeded operation in this notebook.
seed = 18

# Seed Python's and NumPy's global random number generators.
# NOTE(review): the TensorFlow/Keras RNG is not seeded here, so model training
# further down may still vary between runs - confirm whether that matters.
random.seed(seed)
np.random.seed(seed)

# Ignore warnings (blanket filter: hides every warning category for the session)
warnings.filterwarnings('ignore')

# Display options: never truncate long sentences when a DataFrame is rendered
pd.set_option('display.max_colwidth', None)

## 1. Loading the data and quick exploratory data analysis

In [5]:
# Directory holding the processed Resistance data. Set the RESISTANCE_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original hardcoded location is used so existing behaviour is kept.
RESISTANCE_DATA_DIR = Path(
    os.environ.get(
        "RESISTANCE_DATA_DIR",
        "/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Resistance",
    )
)

# Sentence-level table of labelled Resistance sentences.
merged_resistance_df = pd.read_csv(RESISTANCE_DATA_DIR / "Resistance_Plus_Merged.csv")

merged_resistance_df

Unnamed: 0,sentence,label,phrase,updated_label
0,"i am here because i want to better myself my family, not only financially but in health.",0,"['I know that as a child I never thought of education and a career for someone who is undocumented.', 'I Hope to reach a position in which I can inspire and prove to all of the ""immigrants"" in the USA that achieving and surpassing struggle is possible.']",0
1,being in this instituion will pave a way for me to become a professional.,0,"['I know that as a child I never thought of education and a career for someone who is undocumented.', 'I Hope to reach a position in which I can inspire and prove to all of the ""immigrants"" in the USA that achieving and surpassing struggle is possible.']",0
2,i know that as a child i never thought of education and a career for someone who is undocumented.,1,"['I know that as a child I never thought of education and a career for someone who is undocumented.', 'I Hope to reach a position in which I can inspire and prove to all of the ""immigrants"" in the USA that achieving and surpassing struggle is possible.']",0
3,"i hope to reach a position in which i can inspire and prove to all of the ""immigrants"" in the usa that achieving and surpassing struggle is possible.",1,"['I know that as a child I never thought of education and a career for someone who is undocumented.', 'I Hope to reach a position in which I can inspire and prove to all of the ""immigrants"" in the USA that achieving and surpassing struggle is possible.']",1
4,"also, statistics have shown that people with higher education and wealth tend to be healthier.",0,"['I know that as a child I never thought of education and a career for someone who is undocumented.', 'I Hope to reach a position in which I can inspire and prove to all of the ""immigrants"" in the USA that achieving and surpassing struggle is possible.']",0
...,...,...,...,...
2193,it sounds clich but it really is the truth.,0,"['They taught me the value of hard work and perseverance, that not everything comes easily that failing is inevitable but you can always try again.']",0
2194,"it isn't just due to the fact that they're helping me financially, which of course i am exponentially grateful for, but its more than that.",0,"['They taught me the value of hard work and perseverance, that not everything comes easily that failing is inevitable but you can always try again.']",0
2195,"they taught me the value of hard work and perseverance, that not everything comes easily that failing is inevitable but you can always try again.",1,"['They taught me the value of hard work and perseverance, that not everything comes easily that failing is inevitable but you can always try again.']",1
2196,"that is the reason as to ""why i am here""...",0,"['They taught me the value of hard work and perseverance, that not everything comes easily that failing is inevitable but you can always try again.']",0


In [6]:
# Collapse repeated sentences into one row each. Taking the maximum label per
# sentence means a sentence is positive (1) if any of its copies was labelled 1.
# Only `sentence` and `label` survive this step; rows come back sorted by sentence.
# NOTE(review): the loaded table also has an `updated_label` column that is
# dropped here - confirm `label` (not `updated_label`) is the intended target.
merged_resistance_df = merged_resistance_df.groupby('sentence', as_index=False)['label'].max()
merged_resistance_df

Unnamed: 0,sentence,label
0,"""why am i here"" is a very broad question.",0
1,"""why am i here?"" this essay is meant to connect you to your purpose.",0
2,"""why am i here?"" this question is super general but i'm here for a lot of reasons.",0
3,"""why am i here?""such an abstract question; why am i here on this earth right now?",0
4,"""why do i want to go into the stem field?""i dated this guy and he was such a tool.",0
...,...,...
2061,you learn more when you teach others how to do something or if youre learning by another student who is taking the same class.,0
2062,you may ask why i was born in a house?,0
2063,you meet great instructors that are positive and want everyone to learn and have a good grip on the concept in order to be confident when taking a quiz or exam.,0
2064,you so know that the time allotted for the professors are not enough to begin with.,0


In [7]:
# Shuffle the de-duplicated sentences with the notebook seed.
# NOTE(review): "social" in this variable name looks like a leftover from
# another notebook; the frame holds the Resistance data prepared above.
merged_social_df = shuffle(merged_resistance_df, random_state=seed)

# Stratified 90/10 split so both parts keep the same share of positive labels.
split_frames = train_test_split(
    merged_social_df,
    test_size=0.1,
    random_state=seed,
    stratify=merged_social_df["label"],
)

# Renumber the rows of each part from 0 (the shuffled index is discarded).
training_df, test_df = (frame.reset_index(drop=True) for frame in split_frames)


In [8]:
# Report the size of each split and how many of its rows carry the positive label.
print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")

# Positive share in the training split.
train_total = len(training_df['label'])
pos_labels = int((training_df['label'] == 1).sum())
print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, train_total, (pos_labels/train_total)*100))

# Positive share in the test split (pos_labels is rebound to the test count).
test_total = len(test_df['label'])
pos_labels = int((test_df['label'] == 1).sum())
print("Positive labels present in the test dataset : {}  out of {} or {}%".format(pos_labels, test_total, (pos_labels/test_total)*100))

Training dataset shape: (1859, 2) 
Test dataset shape: (207, 2)
Positive labels present in the dataset : 118  out of 1859 or 6.347498655190963%
Positive labels present in the test dataset : 13  out of 207 or 6.280193236714976%


## 2. Experimental Design

In [9]:
# Maximum sequence length handed to the ktrain preprocessor.
MAXLEN = 150

# Inputs (sentences) and targets (labels) taken from the training split.
X, y = training_df['sentence'], training_df['label']

# Carve a stratified 10% validation set out of the training split.
# Despite their names, X_test / y_test are this validation set; the held-out
# test data lives in test_df.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=seed,
    stratify=y,
)

# NOTE(review): the checkpoint is BERT base, while the variables below (and the
# notebook's print messages) say "distillbert" - confirm which model is intended.
model_name = 'bert-base-uncased'

# Preprocessor for the chosen checkpoint, configured for the two classes 0 and 1.
distillbert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0, 1])

# Preprocess the training data first, then the validation data.
train_texts, train_targets = X_train.tolist(), y_train.tolist()
training_set = distillbert_transformer.preprocess_train(train_texts, train_targets)

val_texts, val_targets = X_test.tolist(), y_test.tolist()
validation_set = distillbert_transformer.preprocess_test(val_texts, val_targets)

# Classifier to be fine-tuned by the learner.
distillbert_base_model = distillbert_transformer.get_classifier()

Metal device set to: Apple M2 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

preprocessing train...
language: en
train sequence lengths:
	mean : 22
	95percentile : 42
	99percentile : 59


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 23
	95percentile : 43
	99percentile : 56


In [10]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# The two target classes and every label of the training split
# (training_df, i.e. including the rows later used for validation).
classes = np.array([0, 1])
class_labels = training_df['label'].tolist()

# 'balanced' weighting gives the rarer class the larger weight.
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)

# Show the raw weight array before it is converted.
print(class_weights)

# Re-bind class_weights as a {class: weight} mapping for the training call.
class_weights = {cls: weight for cls, weight in zip(classes, class_weights)}

[0.53388857 7.87711864]


In [11]:
# Training hyper-parameters, named so they can be found and tuned in one place.
BATCH_SIZE = 6
WEIGHT_DECAY = 0.001
MAX_LEARNING_RATE = 3.328132762062889e-05  # passed to autofit as its learning rate
MAX_EPOCHS = 12
EARLY_STOPPING = 4  # passed to autofit as its early_stopping argument

# Wrap the classifier and the preprocessed train/validation data in a ktrain learner.
distillbert_learner = ktrain.get_learner(
    distillbert_base_model,
    train_data=training_set,
    val_data=validation_set,
    batch_size=BATCH_SIZE,
)
distillbert_learner.set_weight_decay(WEIGHT_DECAY)

# Fine-tune with the class weights computed earlier to counter label imbalance.
distillbert_learner.autofit(
    MAX_LEARNING_RATE,
    epochs=MAX_EPOCHS,
    early_stopping=EARLY_STOPPING,
    class_weight=class_weights,
)



begin training using triangular learning rate policy with max lr of 3.328132762062889e-05...
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Weights from best epoch have been loaded into model.


<keras.src.callbacks.History at 0x379edfe80>

In [12]:
# Evaluate on the validation data attached to the learner, labelling the
# report with the preprocessor's class names.
validation_class_names = distillbert_transformer.get_classes()
distillbert_learner.validate(class_names=validation_class_names)

              precision    recall  f1-score   support

           0       0.96      0.99      0.97       174
           1       0.67      0.33      0.44        12

    accuracy                           0.95       186
   macro avg       0.81      0.66      0.71       186
weighted avg       0.94      0.95      0.94       186



array([[172,   2],
       [  8,   4]])

In [13]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the trained model.
distillbert_learner.model.summary()

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
# Bundle the fine-tuned model with its preprocessor so raw sentences can be scored.
distillbert_predictor = ktrain.get_predictor(
    distillbert_learner.model,
    preproc=distillbert_transformer,
)

In [15]:
# Held-out test sentences and their ground-truth labels as plain Python lists.
distillbert_test_data, distillbert_test_label = (
    test_df[column].tolist() for column in ('sentence', 'label')
)

In [16]:
# Predict a class for every held-out test sentence.
y_pred_distillbert = distillbert_predictor.predict(distillbert_test_data)

In [17]:
# Convert each predicted class to int so it is comparable with the integer test labels.
y_pred_distillbert = list(map(int, y_pred_distillbert))

In [18]:
# Confusion matrix on the held-out test set.
# labels=[0, 1] pins the matrix to 2x2 in a fixed class order, so the four-way
# unpacking below still works if only one class appears in the labels or the
# predictions (likely with so few positives).
tn, fp, fn, tp = confusion_matrix(
    distillbert_test_label,
    y_pred_distillbert,
    labels=[0, 1],
).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

True Negative: 192, False Positive: 2, False Negative: 12, True Positive: 1


In [19]:
# Per-class precision / recall / F1 on the held-out test set.
test_report = classification_report(distillbert_test_label, y_pred_distillbert)
print('  Classification Report:\n', test_report, '\n')

  Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.96       194
           1       0.33      0.08      0.12        13

    accuracy                           0.93       207
   macro avg       0.64      0.53      0.54       207
weighted avg       0.90      0.93      0.91       207
 



In [16]:
# Persist the exact train/test splits used in this experiment.
# The files are named after the Resistance theme this notebook models; the
# previous "social_*" names were left over from another notebook and could
# overwrite that experiment's splits.
training_df.to_csv("resistance_training.csv", index=False)
test_df.to_csv("resistance_test.csv", index=False)

In [53]:
# Copy of the test split with the model's prediction beside the true label,
# for manual error analysis. test_df itself is left unchanged.
test_df_copy = test_df.assign(predicted_label=y_pred_distillbert)
test_df_copy

Unnamed: 0,sentence,label,predicted_label
0,so far this class has been surprisingly fun and i actually learned a lot that helps me with the class.,0,0
1,"unfortunately, i joined the sci class as we were beginning to go over unit three, so i am hoping there will be someone i can ask for assistance with the other units.",0,0
2,but i don't think that's the case at all.,0,0
3,i decided to enroll in this 1 unit course as a sort of precautionary measure.,0,0
4,"this makes me empathetic, understanding, and more mature.",0,0
...,...,...,...
291,"as for why i am here in san francisco, i moved away to isolate myself from any distractions that would cause me to not reach my highest potential.",0,0
292,"part of my major requires me to finish math classes, and i wanted to get help with that because classes can be challenging and hard to understand sometimes, and its helpful to have assistance from classes like this in order to feel more confident about my math skills and learn more.",0,0
293,"so far, i feel like i have better understanding with the moon phases, and ive also learned the difference between the big and little dipper (finally).",0,0
294,im taking the physics lab because im also enrolled in the physics lecture on mwf. i'm also taking it because in high school i took preap physics and i really enjoyed it but it was kind of difficult for me as well.,0,0


In [31]:
# Show the current working directory (IPython automagic, equivalent to %pwd);
# the relative output paths used in this notebook resolve against it.
pwd

'/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/notebooks/experiments/exp_4_single/Social'

In [54]:
# Export the label-vs-prediction table for manual error analysis.
# Named after the Resistance theme; the previous "Social_capital_*" file name
# was left over from another notebook and could overwrite that experiment's export.
test_df_copy.to_excel("Resistance_error_analysis.xlsx")

In [24]:
# distillbert_predictor.save('../../model/first_generation_distilbert_base_uncased_model_10102020') # 256 MB

In [21]:
# ROC AUC needs continuous scores: hard 0/1 predictions reduce the curve to a
# single operating point. Score with the predicted probability of the positive
# class, taken as column 1 (the second entry of class_names=[0, 1]).
y_proba_distillbert = distillbert_predictor.predict_proba(distillbert_test_data)[:, 1]
print("AUC roc score for distillbert model: ", roc_auc_score(distillbert_test_label, y_proba_distillbert))

AUC roc score for distillbert model:  0.9306334080717489


In [22]:
from sklearn.metrics import auc, precision_recall_curve

# distillbert_test_label: ground-truth labels of the held-out test set.
# pr_scores: predicted probability of the positive class for each test sentence,
# taken as column 1 (the second entry of class_names=[0, 1]). The curve needs
# these continuous scores; hard 0/1 predictions give only one threshold.
pr_scores = distillbert_predictor.predict_proba(distillbert_test_data)[:, 1]

# Calculate precision-recall curve
precision, recall, _ = precision_recall_curve(distillbert_test_label, pr_scores)

# Calculate PR AUC (area under precision as a function of recall)
pr_auc_score = auc(recall, precision)

# Print the result
print("AUC PR score for DistilBERT model: ", pr_auc_score)

AUC PR score for DistilBERT model:  0.7918205047976637
