- Handling warnings

In [28]:
import warnings
warnings.filterwarnings("ignore")

- Loading modules

In [29]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, accuracy_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import numpy as np
import shap
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid")
plt.rc('font', size=12)

- Auxiliar functions

In [30]:
def get_metrics(y_true, y_predict, method):
    metrics_estimated = {
        "Method" : method,
        "Accuracy" : accuracy_score(y_true=y_true, y_pred=y_predict),
        "Precision" : precision_score(y_true=y_true, y_pred=y_predict),
        "Recall" : recall_score(y_true=y_true, y_pred=y_predict),
        "F1-score" : f1_score(y_true=y_true, y_pred=y_predict),
        "MCC" : matthews_corrcoef(y_true=y_true, y_pred=y_predict)
    }
    return metrics_estimated

In [31]:
def apply_model(model, X_train, X_test, X_val, y_train, y_val, y_test, name_model):
    model.fit(X=X_train, y=y_train)
    model_prediction_val = model.predict(X=X_val)
    model_prediction_test = model.predict(X=X_test)

    performances_val = get_metrics(y_true=y_val, y_predict=model_prediction_val, method=name_model)
    performances_test = get_metrics(y_true=y_test, y_predict=model_prediction_test, method=name_model)

    return model, performances_val, performances_test

- Preprocessing dataset

In [32]:
df_data = pd.read_csv("../processed_data/1_processed_data_IgA.csv")
df_data.head(5)

Unnamed: 0,ID,Gender,age_baby_1,age_baby_2,age_baby_5,sIgA_Average_Morning,sIgA_Average_Afternoon,sIgA_Average_Difference,sIgA_Average_Morning_log,sIgA_Average_Afternoon_log,sIgA_Average_Difference_log,Attachment
0,1,Male,5,7.0,15.0,21.76524,28.368055,6.602815,3.077296,3.338237,0.260941,Insecure
1,4,Male,10,10.0,16.0,28.473065,50.881961,22.408896,3.348951,3.687484,0.338533,Secure
2,5,Female,5,,14.0,161.424693,46.867951,-114.556742,5.084039,3.847334,-1.236705,Secure
3,7,Female,12,12.0,18.0,24.173487,35.942378,14.343241,3.179553,3.581917,0.402364,Insecure
4,11,Male,8,9.0,,78.013279,382.715268,345.590046,4.196295,5.947291,1.750996,Insecure


In [33]:
df_data.shape

(35, 12)

In [34]:
df_data_test = pd.read_csv("../raw_data/sIgA_data/new_information.csv")
df_data_test

Unnamed: 0,ID,Gender,Attachment,Attachment_predicted
0,11,Male,Secure,Insecure
1,15,Female,Insecure,Insecure
2,18,Female,Secure,Secure
3,36,Male,Insecure,Insecure
4,55,Female,Insecure,Insecure
5,57,Male,Secure,Secure
6,60,Male,Secure,Insecure
7,66,Female,Secure,Secure


In [35]:
df_data.columns

Index(['ID', 'Gender', 'age_baby_1', 'age_baby_2', 'age_baby_5',
       'sIgA_Average_Morning', 'sIgA_Average_Afternoon',
       'sIgA_Average_Difference', 'sIgA_Average_Morning_log',
       'sIgA_Average_Afternoon_log', 'sIgA_Average_Difference_log',
       'Attachment'],
      dtype='object')

In [36]:
df_data = df_data[['Gender', 'sIgA_Average_Morning_log', 'sIgA_Average_Afternoon_log',
       'sIgA_Average_Difference_log', 'Attachment', 'ID']]
df_data = df_data.dropna()
df_data["Attachment"] = df_data["Attachment"].replace({"Insecure": 1, "Secure":0, "Unknown":2})
print(df_data.shape)
df_data.head(5)

(35, 6)


Unnamed: 0,Gender,sIgA_Average_Morning_log,sIgA_Average_Afternoon_log,sIgA_Average_Difference_log,Attachment,ID
0,Male,3.077296,3.338237,0.260941,1,1
1,Male,3.348951,3.687484,0.338533,0,4
2,Female,5.084039,3.847334,-1.236705,0,5
3,Female,3.179553,3.581917,0.402364,1,7
4,Male,4.196295,5.947291,1.750996,1,11


In [37]:
df_data["Gender"] = df_data["Gender"].replace({"Female":0, "Male": 1})

In [38]:
df_data["is_for_test"] = df_data["ID"].isin(df_data_test["ID"].values)
df_data["is_for_test"].value_counts()

is_for_test
False    27
True      8
Name: count, dtype: int64

In [39]:
data_train = df_data[df_data["is_for_test"] == False]
data_test = df_data[df_data["is_for_test"]]

In [40]:
response = data_train["Attachment"].values
response_test = data_test["Attachment"].values
data_train = data_train.drop(columns=['Attachment', 'ID', "is_for_test"])
data_test = data_test.drop(columns=['Attachment', 'ID', "is_for_test"])

- Apply standardization process

In [41]:
scaler_instance = MinMaxScaler()
scaler_instance.fit(data_train.values)
scaled_data = scaler_instance.transform(data_train.values)

df_prepared = pd.DataFrame(data=scaled_data, columns=data_train.columns)
df_prepared["Attachment"] = response
df_prepared.head(5)

Unnamed: 0,Gender,sIgA_Average_Morning_log,sIgA_Average_Afternoon_log,sIgA_Average_Difference_log,Attachment
0,1.0,0.132332,0.264237,0.602167,1
1,1.0,0.249789,0.472386,0.633365,0
2,0.0,1.0,0.567656,0.0,0
3,0.0,0.176546,0.409469,0.65903,1
4,0.0,0.258437,0.413016,0.585271,0


- Using SMOTE for data augmentation

In [42]:
X = df_prepared.drop(columns=['Attachment'])
y = df_prepared['Attachment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
print(y_test.shape, y_train.shape)

(7,) (20,)


- 100 per class will be generated

In [43]:
smote = SMOTE(
    random_state=42,
    sampling_strategy={1: 100, 0: 100})

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
X_train_resampled["Gender"] = X_train_resampled["Gender"].apply(lambda x: 1 if x > 0.5 else 0)

In [44]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train_resampled, y_train_resampled, test_size=0.3, stratify=y_train_resampled, random_state=42
)

In [45]:
X_train.head()

Unnamed: 0,Gender,sIgA_Average_Morning_log,sIgA_Average_Afternoon_log,sIgA_Average_Difference_log
1,0,1.0,0.567656,0.0
148,1,0.151184,0.115082,0.484012
141,0,0.083031,0.175659,0.588256
88,0,0.261019,0.402972,0.576094
188,0,0.113901,0.186393,0.566791


In [46]:
X_val.head()

Unnamed: 0,Gender,sIgA_Average_Morning_log,sIgA_Average_Afternoon_log,sIgA_Average_Difference_log
107,1,0.165387,0.098842,0.459848
21,1,0.311089,0.481097,0.582238
18,1,0.372503,0.236477,0.3601
136,1,0.227594,0.608409,0.74577
133,0,0.254348,0.204714,0.448546


In [47]:
X_train.shape

(140, 4)

In [48]:
X_val.shape

(60, 4)

In [49]:
X_test.shape

(7, 4)

- Training models and get metrics

In [50]:
lr_model, lr_performances_val, lr_performances_test = apply_model(LogisticRegression(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "LogisticRegression")
rf_model, rf_performances_val, rf_performances_test = apply_model(RandomForestClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "RandomForestClassifier")
knn_model, knn_performances_val, knn_performances_test = apply_model(KNeighborsClassifier(), X_train, X_test, X_val, y_train, y_val, y_test, "KNeighborsClassifier")
dt_model, dt_performances_val, dt_performances_test = apply_model(DecisionTreeClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "DecisionTreeClassifier")
svm_model, svm_performances_val, svm_performances_test = apply_model(SVC(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "SVC")
adaboost_model, adaboost_performances_val, adaboost_performances_test = apply_model(AdaBoostClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "AdaBoostClassifier")
xgboost_model, xgboost_performances_val, xgboost_performances_test = apply_model(XGBClassifier(), X_train, X_test, X_val, y_train, y_val, y_test, "XGBClassifier")
lgbm_model, lgbm_performances_val, lgbm_performances_test = apply_model(LGBMClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "LGBMClassifier")
gradient_model, gradient_performances_val, gradient_performances_test = apply_model(GradientBoostingClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "GradientBoostingClassifier")
naive_model, naive_performances_val, naive_performances_test = apply_model(GaussianNB(), X_train, X_test, X_val, y_train, y_val, y_test, "GaussianNB")
gaussian_process_model, gaussian_process_performances_val, gaussian_process_performances_test = apply_model(GaussianProcessClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "GaussianProcessClassifier")
bagging_process_model, bagging_process_performances_val, bagging_process_performances_test = apply_model(BaggingClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "BaggingClassifier")


[LightGBM] [Info] Number of positive: 70, number of negative: 70
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000741 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 146
[LightGBM] [Info] Number of data points in the train set: 140, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


- Making summary of performances

In [51]:
data_performances_val = [
    lr_performances_val,
    knn_performances_val,
    dt_performances_val,
    rf_performances_val,
    adaboost_performances_val,
    gradient_performances_val,
    svm_performances_val,
    xgboost_performances_val,
    lgbm_performances_val,
    bagging_process_performances_val,
    naive_performances_val,
    gaussian_process_performances_val
]

df_perfomances_val = pd.DataFrame(data_performances_val)
df_perfomances_val

Unnamed: 0,Method,Accuracy,Precision,Recall,F1-score,MCC
0,LogisticRegression,0.65,0.628571,0.733333,0.676923,0.304256
1,KNeighborsClassifier,0.85,0.818182,0.9,0.857143,0.703526
2,DecisionTreeClassifier,0.866667,0.923077,0.8,0.857143,0.73994
3,RandomForestClassifier,0.9,0.875,0.933333,0.903226,0.801784
4,AdaBoostClassifier,0.816667,0.827586,0.8,0.813559,0.633685
5,GradientBoostingClassifier,0.95,0.935484,0.966667,0.95082,0.9005
6,SVC,0.683333,0.627907,0.9,0.739726,0.40685
7,XGBClassifier,0.883333,0.925926,0.833333,0.877193,0.770529
8,LGBMClassifier,0.866667,0.84375,0.9,0.870968,0.734968
9,BaggingClassifier,0.883333,0.896552,0.866667,0.881356,0.767093


In [52]:
df_perfomances_val.sort_values(by="MCC", ascending=False)

Unnamed: 0,Method,Accuracy,Precision,Recall,F1-score,MCC
5,GradientBoostingClassifier,0.95,0.935484,0.966667,0.95082,0.9005
3,RandomForestClassifier,0.9,0.875,0.933333,0.903226,0.801784
7,XGBClassifier,0.883333,0.925926,0.833333,0.877193,0.770529
9,BaggingClassifier,0.883333,0.896552,0.866667,0.881356,0.767093
2,DecisionTreeClassifier,0.866667,0.923077,0.8,0.857143,0.73994
8,LGBMClassifier,0.866667,0.84375,0.9,0.870968,0.734968
1,KNeighborsClassifier,0.85,0.818182,0.9,0.857143,0.703526
4,AdaBoostClassifier,0.816667,0.827586,0.8,0.813559,0.633685
6,SVC,0.683333,0.627907,0.9,0.739726,0.40685
10,GaussianNB,0.65,0.595745,0.933333,0.727273,0.364101


In [53]:
data_performances_test = [
lr_performances_test,
    knn_performances_test,
    dt_performances_test,
    rf_performances_test,
    adaboost_performances_test,
    gradient_performances_test,
    svm_performances_test,
    xgboost_performances_test,
    lgbm_performances_test,
    bagging_process_performances_test,
    naive_performances_test,
    gaussian_process_performances_test
]

df_perfomances_test = pd.DataFrame(data_performances_test)
df_perfomances_test

Unnamed: 0,Method,Accuracy,Precision,Recall,F1-score,MCC
0,LogisticRegression,0.571429,0.333333,0.5,0.4,0.091287
1,KNeighborsClassifier,0.428571,0.0,0.0,0.0,-0.4
2,DecisionTreeClassifier,0.714286,0.5,0.5,0.5,0.3
3,RandomForestClassifier,0.571429,0.333333,0.5,0.4,0.091287
4,AdaBoostClassifier,0.571429,0.333333,0.5,0.4,0.091287
5,GradientBoostingClassifier,0.714286,0.5,1.0,0.666667,0.547723
6,SVC,0.428571,0.25,0.5,0.333333,-0.091287
7,XGBClassifier,0.428571,0.25,0.5,0.333333,-0.091287
8,LGBMClassifier,0.285714,0.0,0.0,0.0,-0.547723
9,BaggingClassifier,0.428571,0.0,0.0,0.0,-0.4


In [54]:
df_perfomances_test.sort_values(by="MCC", ascending=False)

Unnamed: 0,Method,Accuracy,Precision,Recall,F1-score,MCC
10,GaussianNB,0.714286,0.5,1.0,0.666667,0.547723
5,GradientBoostingClassifier,0.714286,0.5,1.0,0.666667,0.547723
2,DecisionTreeClassifier,0.714286,0.5,0.5,0.5,0.3
0,LogisticRegression,0.571429,0.333333,0.5,0.4,0.091287
4,AdaBoostClassifier,0.571429,0.333333,0.5,0.4,0.091287
3,RandomForestClassifier,0.571429,0.333333,0.5,0.4,0.091287
7,XGBClassifier,0.428571,0.25,0.5,0.333333,-0.091287
6,SVC,0.428571,0.25,0.5,0.333333,-0.091287
11,GaussianProcessClassifier,0.428571,0.25,0.5,0.333333,-0.091287
1,KNeighborsClassifier,0.428571,0.0,0.0,0.0,-0.4


- Using the model to classify the unknown examples

In [55]:
model_to_use = gradient_model

In [57]:
model_to_use.predict(scaler_instance.transform(data_test.values))

array([1, 1, 0, 1, 1, 0, 1, 0])

In [61]:
response_test

array([1, 1, 0, 1, 1, 0, 1, 0])

In [58]:
model_to_use.predict_proba(scaler_instance.transform(data_test.values))

array([[0.00910049, 0.99089951],
       [0.40328016, 0.59671984],
       [0.98748682, 0.01251318],
       [0.00883063, 0.99116937],
       [0.01787666, 0.98212334],
       [0.68357143, 0.31642857],
       [0.13211631, 0.86788369],
       [0.99774317, 0.00225683]])

In [60]:
get_metrics(
    y_true=response_test, 
    y_predict=model_to_use.predict(scaler_instance.transform(data_test.values)),
    method="Testing")

{'Method': 'Testing',
 'Accuracy': 1.0,
 'Precision': 1.0,
 'Recall': 1.0,
 'F1-score': 1.0,
 'MCC': np.float64(1.0)}