In [15]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
seed = 18

merged_resistance_df = pd.read_csv("../../../../data/processed_for_model/merged_themes_using_jaccard_method/merged_Resistance_sentence_level_batch_1_jaccard.csv", encoding='utf-8')

# Shuffle the merged dataset
merged_resistance_df = shuffle(merged_resistance_df, random_state=seed)

# Train-test split 
training_df, test_df = train_test_split(merged_resistance_df, test_size=0.3, random_state=18, stratify=merged_resistance_df['label'])

training_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [16]:
merged_resistance_df["sentence"][1179]

'i am in this sci11502 supplemental course because i know that i will end up struggling with future topics in chem115.'

In [17]:
print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")
pos_labels = len([n for n in training_df['label'] if n==1])
print("Positive labels present in the dataset : {}  out of {} or {}%".format(pos_labels, len(training_df['label']), (pos_labels/len(training_df['label']))*100))
pos_labels = len([n for n in test_df['label'] if n==1])
print("Positive labels present in the test dataset : {}  out of {} or {}%".format(pos_labels, len(test_df['label']), (pos_labels/len(test_df['label']))*100))

Training dataset shape: (842, 3) 
Test dataset shape: (362, 3)
Positive labels present in the dataset : 61  out of 842 or 7.244655581947744%
Positive labels present in the test dataset : 26  out of 362 or 7.18232044198895%


In [18]:
training_df

Unnamed: 0,sentence,label,phrase
0,math always seems to give me a little bit of t...,1,['Math always seems to give me a little bit of...
1,after working minimum wage part time jobs whil...,0,"[""After working minimum wage part time jobs wh..."
2,this would cause a sharp pain in my elbow when...,0,"['When I went to physical therapy, my therapis..."
3,i believe i am here to better my life.,0,['I want to be able to say I went back and fin...
4,i am here because i want to better myself my f...,0,['I know that as a child I never thought of ed...
...,...,...,...
837,im here for the community i grew up in and the...,0,['I am here to disprove stereotypes.']
838,i'm here in san francisco mostly because it's ...,0,"['I really want to be an immigration lawyer, e..."
839,i see why they want me to go to school and i w...,0,['I know it is a lot of work that lies ahead o...
840,at least not that i've seen.part of the answer...,0,"[""There have been so many hurdles already, and..."


In [19]:
MAXLEN = 150

In [20]:
X = training_df['sentence']
y = training_df['label']

In [21]:
import ktrain
from ktrain import text

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 18, stratify=y)

model_name = 'bert-base-cased'

distillbert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0,1])
training_set = distillbert_transformer.preprocess_train(X_train.tolist(), y_train.tolist())
validation_set = distillbert_transformer.preprocess_test(X_test.tolist(), y_test.tolist())
distillbert_base_model = distillbert_transformer.get_classifier()


preprocessing train...
language: en
train sequence lengths:
	mean : 22
	95percentile : 42
	99percentile : 55


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 21
	95percentile : 40
	99percentile : 51


In [22]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Define classes and class labels
classes = np.array([0, 1])
class_labels = list(training_df.label)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_labels)

# Print class weights
print(class_weights)

class_weights = {0:0.5390525,1:6.87857143}

[0.5390525  6.90163934]


In [23]:
import tensorflow as tf
import numpy as np
import os
import random
def reset_random_seeds(seed=2):
    os.environ['PYTHONHASHSEED']=str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

In [24]:
# Build BERT model
# model = text.text_classifier('distilbert', train_data=(X_train, y_train), preproc=distillbert_transformer)
learner = ktrain.get_learner(distillbert_base_model, train_data=training_set, val_data=validation_set, batch_size=6)
# learner.fit_onecycle(2e-5, 4, class_weight=class_weights)
# learner.autofit(2.27E-06, early_stopping=4)
learner.autofit(2e-5, early_stopping=4, class_weight=class_weights)
# distillbert_learner.set_weight_decay(0.001)
# distillbert_learner.autofit(2.27E-06, early_stopping=4, class_weight=class_weights)

reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 2e-05...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 5/1024
Epoch 6/1024
Epoch 7/1024
Epoch 8/1024
Epoch 00008: Reducing Max LR on Plateau: new max lr will be 1e-05 (if not early_stopping).
Epoch 9/1024
Epoch 10/1024
Epoch 00010: Reducing Max LR on Plateau: new max lr will be 5e-06 (if not early_stopping).
Restoring model weights from the end of the best epoch: 6.
Epoch 10: early stopping
Weights from best epoch have been loaded into model.


<keras.src.callbacks.History at 0x56469d8b0>

In [25]:
learner.validate(class_names=distillbert_transformer.get_classes())

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       235
           1       0.83      0.28      0.42        18

    accuracy                           0.94       253
   macro avg       0.89      0.64      0.69       253
weighted avg       0.94      0.94      0.93       253



array([[234,   1],
       [ 13,   5]])

In [26]:
distillbert_predictor = ktrain.get_predictor(learner.model, preproc=distillbert_transformer)

distillbert_test_data = test_df['sentence'].tolist()
distillbert_test_label = test_df['label'].tolist()

y_pred_distillbert = distillbert_predictor.predict(distillbert_test_data)

y_pred_distillbert = [int(x) for x in y_pred_distillbert]

tn, fp, fn, tp = confusion_matrix(distillbert_test_label, y_pred_distillbert).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

print('  Classification Report:\n',classification_report(distillbert_test_label,y_pred_distillbert),'\n')

True Negative: 326, False Positive: 10, False Negative: 19, True Positive: 7
  Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96       336
           1       0.41      0.27      0.33        26

    accuracy                           0.92       362
   macro avg       0.68      0.62      0.64       362
weighted avg       0.91      0.92      0.91       362
 



: 

In [14]:
# distillbert_predictor.save('../../../../saved_models/resistance_bert_base_cased_model_08192024')

In [5]:
learner.validate(class_names=distillbert_transformer.get_classes())

              precision    recall  f1-score   support

    negative       0.98      0.95      0.97      1440
    positive       0.71      0.88      0.78       185

    accuracy                           0.94      1625
   macro avg       0.85      0.91      0.88      1625
weighted avg       0.95      0.94      0.95      1625



array([[1373,   67],
       [  23,  162]])