- Handling warnings

In [187]:
import warnings
warnings.filterwarnings("ignore")

- Loading modules

In [188]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, accuracy_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

- Auxiliar functions

In [189]:
def get_metrics(y_true, y_predict, method):
    metrics_estimated = {
        "Method" : method,
        "Accuracy" : accuracy_score(y_true=y_true, y_pred=y_predict),
        "Precision" : precision_score(y_true=y_true, y_pred=y_predict),
        "Recall" : recall_score(y_true=y_true, y_pred=y_predict),
        "F1-score" : f1_score(y_true=y_true, y_pred=y_predict),
        "MCC" : matthews_corrcoef(y_true=y_true, y_pred=y_predict)
    }
    return metrics_estimated

In [190]:
def apply_model(model, X_train, X_test, X_val, y_train, y_val, y_test, name_model):
    model.fit(X=X_train, y=y_train)
    model_prediction_val = model.predict(X=X_val)
    model_prediction_test = model.predict(X=X_test)

    performances_val = get_metrics(y_true=y_val, y_predict=model_prediction_val, method=name_model)
    performances_test = get_metrics(y_true=y_test, y_predict=model_prediction_test, method=name_model)

    return model, performances_val, performances_test

- Preprocessing dataset

In [191]:
df_data = pd.read_csv("../processed_data/1_processed_data_IgA.csv")
df_data.head(5)

Unnamed: 0,ID,Gender,age_baby_1,age_baby_2,age_baby_5,sIgA_Average_Morning,sIgA_Average_Afternoon,sIgA_Average_Difference,sIgA_Average_Morning_log,sIgA_Average_Afternoon_log,sIgA_Average_Difference_log,Attachment
0,1,Male,5,7.0,15.0,21.76524,28.368055,6.602815,3.077296,3.338237,0.260941,Insecure
1,4,Male,10,10.0,16.0,28.473065,50.881961,22.408896,3.348951,3.687484,0.338533,Secure
2,5,Female,5,,14.0,161.424693,46.867951,-114.556742,5.084039,3.847334,-1.236705,Secure
3,7,Female,12,12.0,18.0,24.173487,35.942378,14.343241,3.179553,3.581917,0.402364,Insecure
4,11,Male,8,9.0,,78.013279,382.715268,345.590046,4.196295,5.947291,1.750996,Insecure


In [192]:
df_data.shape

(35, 12)

In [193]:
df_data.columns

Index(['ID', 'Gender', 'age_baby_1', 'age_baby_2', 'age_baby_5',
       'sIgA_Average_Morning', 'sIgA_Average_Afternoon',
       'sIgA_Average_Difference', 'sIgA_Average_Morning_log',
       'sIgA_Average_Afternoon_log', 'sIgA_Average_Difference_log',
       'Attachment'],
      dtype='object')

In [194]:
df_data = df_data[['Gender', 'sIgA_Average_Morning_log', 'sIgA_Average_Afternoon_log',
       'sIgA_Average_Difference_log', 'Attachment', 'ID']]
df_data = df_data.dropna()
df_data["Attachment"] = df_data["Attachment"].replace({"Insecure": 1, "Secure":0, "Unknown":2})
print(df_data.shape)
df_data.head(5)

(35, 6)


Unnamed: 0,Gender,sIgA_Average_Morning_log,sIgA_Average_Afternoon_log,sIgA_Average_Difference_log,Attachment,ID
0,Male,3.077296,3.338237,0.260941,1,1
1,Male,3.348951,3.687484,0.338533,0,4
2,Female,5.084039,3.847334,-1.236705,0,5
3,Female,3.179553,3.581917,0.402364,1,7
4,Male,4.196295,5.947291,1.750996,1,11


In [195]:
df_data["Gender"] = df_data["Gender"].replace({"Female":0, "Male": 1})

In [196]:
response = df_data["Attachment"].values
df_for_training = df_data.drop(columns=['Attachment', 'ID'])

In [197]:
X_train, X_test, y_train, y_test = train_test_split(df_for_training, response, 
                                                    random_state=42, test_size=.3,
                                                    stratify=response)

- Apply standardization process

In [198]:
scaler_instance = RobustScaler()
scaler_instance.fit(X_train.values)
X_train_scaled = scaler_instance.transform(X_train.values)
X_train_scaled = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)

- Using SMOTE for data augmentation

In [199]:
smote = SMOTE(
    random_state=42,
    sampling_strategy={1: 1000, 0: 1000})

X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_resampled["Gender"] = X_train_resampled["Gender"].apply(lambda x: 1 if x > 0.5 else 0)

In [200]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train_resampled, y_train_resampled, test_size=0.3, stratify=y_train_resampled, random_state=42
)

In [201]:
X_train.head()

Unnamed: 0,Gender,sIgA_Average_Morning_log,sIgA_Average_Afternoon_log,sIgA_Average_Difference_log
603,0,-0.152338,0.15453,0.472893
1785,0,-0.464931,0.39192,1.037741
1729,0,-0.571431,-0.264699,0.460946
1403,0,0.146852,-0.434657,-0.445292
1890,0,-0.563874,0.073539,0.806072


In [202]:
X_val.head()

Unnamed: 0,Gender,sIgA_Average_Morning_log,sIgA_Average_Afternoon_log,sIgA_Average_Difference_log
650,0,2.254123,0.409516,-1.703393
963,0,1.497668,0.396817,-0.948935
888,0,0.018778,-0.723749,-0.616847
1731,0,-0.53577,-0.540351,0.137239
340,0,0.085673,0.047068,0.119255


In [203]:
X_train.shape

(1400, 4)

In [204]:
X_val.shape

(600, 4)

In [205]:
X_test.shape

(11, 4)

- Training models and get metrics

In [206]:
X_test = scaler_instance.transform(X_test)

lr_model, lr_performances_val, lr_performances_test = apply_model(LogisticRegression(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "LogisticRegression")
rf_model, rf_performances_val, rf_performances_test = apply_model(RandomForestClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "RandomForestClassifier")
knn_model, knn_performances_val, knn_performances_test = apply_model(KNeighborsClassifier(), X_train, X_test, X_val, y_train, y_val, y_test, "KNeighborsClassifier")
dt_model, dt_performances_val, dt_performances_test = apply_model(DecisionTreeClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "DecisionTreeClassifier")
svm_model, svm_performances_val, svm_performances_test = apply_model(SVC(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "SVC")
adaboost_model, adaboost_performances_val, adaboost_performances_test = apply_model(AdaBoostClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "AdaBoostClassifier")
xgboost_model, xgboost_performances_val, xgboost_performances_test = apply_model(XGBClassifier(), X_train, X_test, X_val, y_train, y_val, y_test, "XGBClassifier")
lgbm_model, lgbm_performances_val, lgbm_performances_test = apply_model(LGBMClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "LGBMClassifier")
gradient_model, gradient_performances_val, gradient_performances_test = apply_model(GradientBoostingClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "GradientBoostingClassifier")
naive_model, naive_performances_val, naive_performances_test = apply_model(GaussianNB(), X_train, X_test, X_val, y_train, y_val, y_test, "GaussianNB")
gaussian_process_model, gaussian_process_performances_val, gaussian_process_performances_test = apply_model(GaussianProcessClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "GaussianProcessClassifier")
bagging_process_model, bagging_process_performances_val, bagging_process_performances_test = apply_model(BaggingClassifier(random_state=42), X_train, X_test, X_val, y_train, y_val, y_test, "BaggingClassifier")


[LightGBM] [Info] Number of positive: 700, number of negative: 700
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 1400, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


- Making summary of performances

In [207]:
data_performances_val = [
    lr_performances_val,
    knn_performances_val,
    dt_performances_val,
    rf_performances_val,
    adaboost_performances_val,
    gradient_performances_val,
    svm_performances_val,
    xgboost_performances_val,
    lgbm_performances_val,
    bagging_process_performances_val,
    naive_performances_val,
    gaussian_process_performances_val
]

df_perfomances_val = pd.DataFrame(data_performances_val)
df_perfomances_val

Unnamed: 0,Method,Accuracy,Precision,Recall,F1-score,MCC
0,LogisticRegression,0.673333,0.667742,0.69,0.678689,0.346859
1,KNeighborsClassifier,0.908333,0.912458,0.903333,0.907873,0.816708
2,DecisionTreeClassifier,0.913333,0.924658,0.9,0.912162,0.826961
3,RandomForestClassifier,0.928333,0.92691,0.93,0.928453,0.856671
4,AdaBoostClassifier,0.855,0.833856,0.886667,0.859451,0.711428
5,GradientBoostingClassifier,0.888333,0.889632,0.886667,0.888147,0.776671
6,SVC,0.811667,0.800643,0.83,0.815057,0.623753
7,XGBClassifier,0.923333,0.923333,0.923333,0.923333,0.846667
8,LGBMClassifier,0.916667,0.928082,0.903333,0.915541,0.83363
9,BaggingClassifier,0.918333,0.931271,0.903333,0.91709,0.837043


In [208]:
df_perfomances_val.sort_values(by="MCC", ascending=False)

Unnamed: 0,Method,Accuracy,Precision,Recall,F1-score,MCC
3,RandomForestClassifier,0.928333,0.92691,0.93,0.928453,0.856671
7,XGBClassifier,0.923333,0.923333,0.923333,0.923333,0.846667
9,BaggingClassifier,0.918333,0.931271,0.903333,0.91709,0.837043
8,LGBMClassifier,0.916667,0.928082,0.903333,0.915541,0.83363
2,DecisionTreeClassifier,0.913333,0.924658,0.9,0.912162,0.826961
1,KNeighborsClassifier,0.908333,0.912458,0.903333,0.907873,0.816708
5,GradientBoostingClassifier,0.888333,0.889632,0.886667,0.888147,0.776671
4,AdaBoostClassifier,0.855,0.833856,0.886667,0.859451,0.711428
11,GaussianProcessClassifier,0.836667,0.827922,0.85,0.838816,0.673573
6,SVC,0.811667,0.800643,0.83,0.815057,0.623753


In [209]:
data_performances_test = [
lr_performances_test,
    knn_performances_test,
    dt_performances_test,
    rf_performances_test,
    adaboost_performances_test,
    gradient_performances_test,
    svm_performances_test,
    xgboost_performances_test,
    lgbm_performances_test,
    bagging_process_performances_test,
    naive_performances_test,
    gaussian_process_performances_test
]

df_perfomances_test = pd.DataFrame(data_performances_test)
df_perfomances_test

Unnamed: 0,Method,Accuracy,Precision,Recall,F1-score,MCC
0,LogisticRegression,0.909091,0.8,1.0,0.888889,0.828079
1,KNeighborsClassifier,0.818182,0.75,0.75,0.75,0.607143
2,DecisionTreeClassifier,0.909091,1.0,0.75,0.857143,0.810093
3,RandomForestClassifier,0.909091,1.0,0.75,0.857143,0.810093
4,AdaBoostClassifier,0.818182,1.0,0.5,0.666667,0.62361
5,GradientBoostingClassifier,0.909091,1.0,0.75,0.857143,0.810093
6,SVC,0.636364,0.5,0.25,0.333333,0.133631
7,XGBClassifier,0.818182,0.75,0.75,0.75,0.607143
8,LGBMClassifier,0.727273,0.666667,0.5,0.571429,0.385758
9,BaggingClassifier,0.909091,1.0,0.75,0.857143,0.810093


In [210]:
df_perfomances_test.sort_values(by="MCC", ascending=False)

Unnamed: 0,Method,Accuracy,Precision,Recall,F1-score,MCC
0,LogisticRegression,0.909091,0.8,1.0,0.888889,0.828079
2,DecisionTreeClassifier,0.909091,1.0,0.75,0.857143,0.810093
3,RandomForestClassifier,0.909091,1.0,0.75,0.857143,0.810093
5,GradientBoostingClassifier,0.909091,1.0,0.75,0.857143,0.810093
9,BaggingClassifier,0.909091,1.0,0.75,0.857143,0.810093
4,AdaBoostClassifier,0.818182,1.0,0.5,0.666667,0.62361
1,KNeighborsClassifier,0.818182,0.75,0.75,0.75,0.607143
7,XGBClassifier,0.818182,0.75,0.75,0.75,0.607143
11,GaussianProcessClassifier,0.727273,0.666667,0.5,0.571429,0.385758
8,LGBMClassifier,0.727273,0.666667,0.5,0.571429,0.385758


In [211]:
df_perfomances_val.columns = ["Method", 'Accuracy-Val', 'Precision-Val', 'Recall-Val', 'F1-score-Val', 'MCC-Val']
df_perfomances_test.columns = ["Method", 'Accuracy-Test', 'Precision-Test', 'Recall-Test', 'F1-score-Test', 'MCC-Test']

In [212]:
df_perfomances = df_perfomances_test.merge(df_perfomances_val, on="Method")
df_perfomances

Unnamed: 0,Method,Accuracy-Test,Precision-Test,Recall-Test,F1-score-Test,MCC-Test,Accuracy-Val,Precision-Val,Recall-Val,F1-score-Val,MCC-Val
0,LogisticRegression,0.909091,0.8,1.0,0.888889,0.828079,0.673333,0.667742,0.69,0.678689,0.346859
1,KNeighborsClassifier,0.818182,0.75,0.75,0.75,0.607143,0.908333,0.912458,0.903333,0.907873,0.816708
2,DecisionTreeClassifier,0.909091,1.0,0.75,0.857143,0.810093,0.913333,0.924658,0.9,0.912162,0.826961
3,RandomForestClassifier,0.909091,1.0,0.75,0.857143,0.810093,0.928333,0.92691,0.93,0.928453,0.856671
4,AdaBoostClassifier,0.818182,1.0,0.5,0.666667,0.62361,0.855,0.833856,0.886667,0.859451,0.711428
5,GradientBoostingClassifier,0.909091,1.0,0.75,0.857143,0.810093,0.888333,0.889632,0.886667,0.888147,0.776671
6,SVC,0.636364,0.5,0.25,0.333333,0.133631,0.811667,0.800643,0.83,0.815057,0.623753
7,XGBClassifier,0.818182,0.75,0.75,0.75,0.607143,0.923333,0.923333,0.923333,0.923333,0.846667
8,LGBMClassifier,0.727273,0.666667,0.5,0.571429,0.385758,0.916667,0.928082,0.903333,0.915541,0.83363
9,BaggingClassifier,0.909091,1.0,0.75,0.857143,0.810093,0.918333,0.931271,0.903333,0.91709,0.837043


In [213]:
df_perfomances.sort_values(by=["Precision-Test", "Precision-Val"], ascending=[False, False])

Unnamed: 0,Method,Accuracy-Test,Precision-Test,Recall-Test,F1-score-Test,MCC-Test,Accuracy-Val,Precision-Val,Recall-Val,F1-score-Val,MCC-Val
9,BaggingClassifier,0.909091,1.0,0.75,0.857143,0.810093,0.918333,0.931271,0.903333,0.91709,0.837043
3,RandomForestClassifier,0.909091,1.0,0.75,0.857143,0.810093,0.928333,0.92691,0.93,0.928453,0.856671
2,DecisionTreeClassifier,0.909091,1.0,0.75,0.857143,0.810093,0.913333,0.924658,0.9,0.912162,0.826961
5,GradientBoostingClassifier,0.909091,1.0,0.75,0.857143,0.810093,0.888333,0.889632,0.886667,0.888147,0.776671
4,AdaBoostClassifier,0.818182,1.0,0.5,0.666667,0.62361,0.855,0.833856,0.886667,0.859451,0.711428
0,LogisticRegression,0.909091,0.8,1.0,0.888889,0.828079,0.673333,0.667742,0.69,0.678689,0.346859
7,XGBClassifier,0.818182,0.75,0.75,0.75,0.607143,0.923333,0.923333,0.923333,0.923333,0.846667
1,KNeighborsClassifier,0.818182,0.75,0.75,0.75,0.607143,0.908333,0.912458,0.903333,0.907873,0.816708
8,LGBMClassifier,0.727273,0.666667,0.5,0.571429,0.385758,0.916667,0.928082,0.903333,0.915541,0.83363
11,GaussianProcessClassifier,0.727273,0.666667,0.5,0.571429,0.385758,0.836667,0.827922,0.85,0.838816,0.673573


In [214]:
from joblib import dump

In [215]:
dump(rf_model, "../generated_models/rf_model.joblib")
dump(scaler_instance, "../generated_models/scaler_instance.joblib")

['../generated_models/scaler_instance.joblib']

In [219]:
X_train["label"] = y_train
X_val["label"] = y_val
X_train.to_csv("../data_for_figures/X_train.csv", index=False)
X_val.to_csv("../data_for_figures/X_val.csv", index=False)

X_test = pd.DataFrame(X_test, columns=X_train.columns)
X_test["label"] = y_test
X_test.to_csv("../data_for_figures/X_test.csv", index=False)