In [1]:
import ktrain
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [2]:
from ktrain import text

In [5]:
training_df = pd.read_csv('../../data-analysis/data/csv/attainment_sentence_level_training_data.csv',encoding='utf-8')

In [6]:
training_df.shape

(1271, 3)

In [7]:
training_df.head()

Unnamed: 0,sentence,label,phrase
0,why i am here is because i want to fulfill my ...,1,['Why I am here is because I want to fulfill m...
1,i like to relate what i learn to everyday acti...,0,['Im here because I want to pursue a degree in...
2,im here because i want to pursue a degree in b...,1,['Im here because I want to pursue a degree in...
3,i am in this class to learn about topics that ...,0,['Iím here because I want to pursue a career i...
4,right after 12th grade i knew i wanted to majo...,0,['I want to have a career as an epidemiologist...


In [8]:
test_df = pd.read_csv('../../data-analysis/data/csv/attainment_sentence_level_test_data.csv',encoding='utf-8')

In [9]:
test_df.shape

(142, 3)

In [10]:
test_df.head()

Unnamed: 0,sentence,label,phrase
0,also the reason why i am here in san francisco...,0,['I am in this class because I want to pass ph...
1,i am here to find myself and figure out my ide...,0,['I want to pursue a career in the medical fie...
2,i look forward to teaching history at the high...,1,['I look forward to teaching history at the hi...
3,i feel like they just do math all the time whi...,0,['I really want to be a video gamer programmer...
4,planning ahead is always fun but i have to tak...,0,['I am here because I need it to move on to my...


In [11]:
MAXLEN = 150

In [12]:
X = training_df['sentence']
y = training_df['label']

In [13]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 18, stratify=y)

In [14]:
model_name = 'distilbert-base-uncased'

In [15]:
distillbert_transformer = text.Transformer(model_name, maxlen=MAXLEN, class_names=[0,1])
training_set = distillbert_transformer.preprocess_train(X_train.tolist(), y_train.tolist())
validation_set = distillbert_transformer.preprocess_test(X_test.tolist(), y_test.tolist())
distillbert_base_model = distillbert_transformer.get_classifier()

Metal device set to: Apple M2 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

preprocessing train...
language: en
train sequence lengths:
	mean : 20
	95percentile : 37
	99percentile : 50


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 21
	95percentile : 37
	99percentile : 47


In [16]:
distillbert_learner = ktrain.get_learner(distillbert_base_model, train_data=training_set, val_data=validation_set, batch_size=6)

In [17]:
distillbert_learner.autofit(2e-5, 5)



begin training using triangular learning rate policy with max lr of 2e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x39938a850>

In [16]:
distillbert_learner.validate(class_names=distillbert_transformer.get_classes())

              precision    recall  f1-score   support

           0       0.94      0.97      0.96       209
           1       0.85      0.72      0.78        46

    accuracy                           0.93       255
   macro avg       0.89      0.84      0.87       255
weighted avg       0.92      0.93      0.92       255



array([[203,   6],
       [ 13,  33]])

In [18]:
distillbert_learner.validate(class_names=distillbert_transformer.get_classes())

              precision    recall  f1-score   support

           0       0.94      0.97      0.96       209
           1       0.83      0.74      0.78        46

    accuracy                           0.93       255
   macro avg       0.89      0.85      0.87       255
weighted avg       0.92      0.93      0.92       255



array([[202,   7],
       [ 12,  34]])

In [19]:
distillbert_learner.model.summary()

Model: "tf_distil_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_39 (Dropout)        multiple                  0         
                                                                 
Total params: 66955010 (255.41 MB)
Trainable params: 66955010 (255.41 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
distillbert_predictor = ktrain.get_predictor(distillbert_learner.model, preproc=distillbert_transformer)

In [21]:
distillbert_test_data = test_df['sentence'].tolist()
distillbert_test_label = test_df['label'].tolist()

In [22]:
y_pred_distillbert = distillbert_predictor.predict(distillbert_test_data)

In [23]:
y_pred_distillbert = [int(x) for x in y_pred_distillbert]

In [24]:
tn, fp, fn, tp = confusion_matrix(distillbert_test_label, y_pred_distillbert).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))

True Negative: 108, False Positive: 9, False Negative: 2, True Positive: 23


In [23]:
print('  Classification Report:\n',classification_report(distillbert_test_label,y_pred_distillbert),'\n')

  Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.97       117
           1       0.81      0.88      0.85        25

    accuracy                           0.94       142
   macro avg       0.89      0.92      0.91       142
weighted avg       0.95      0.94      0.94       142
 



In [25]:
print('  Classification Report:\n',classification_report(distillbert_test_label,y_pred_distillbert),'\n')

  Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.92      0.95       117
           1       0.72      0.92      0.81        25

    accuracy                           0.92       142
   macro avg       0.85      0.92      0.88       142
weighted avg       0.94      0.92      0.93       142
 



In [24]:
#distillbert_predictor.save('./model/distilbert_base_uncased_model') # 256 MB

In [26]:
print("AUC roc score for distillbert model: ", roc_auc_score(distillbert_test_label,y_pred_distillbert))

AUC roc score for distillbert model:  0.9215384615384615
