In [1]:
from IPython.display import display, HTML
# Inject a CSS override so the notebook's `.container` element spans the full browser width.
display(
    HTML("<style>.container { width:100% !important; }</style>")
)

In [30]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import plotly.express as px
import math

from sklearn import metrics, model_selection
from sklearn.preprocessing import scale, normalize
from sklearn import datasets, cluster
from sklearn import neighbors, ensemble, tree, linear_model
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE

import warnings
warnings.filterwarnings('ignore')

In [72]:
# Load the clinical training table (the `age_avg` imputation variant, per its path).
imputed_data_age_average = pd.read_csv(
    '../data/preprocessed_data/tables/age_avg/trainClinDataImputedAgeAvg.csv'
)

# Binary target: 'MILD' -> 0, every other value -> 1.
imputed_data_age_average['Prognosis'] = (
    imputed_data_age_average['Prognosis'].ne('MILD').astype('int64')
)

# Columns removed from the feature matrix before modelling (includes the target itself).
cols_top_drop = ['Prognosis', 'Death', 'ImageFile']

In [68]:
# Pre-split cross-validation tables: one training CSV and one validation CSV per fold (cv1..cv5).
# The individual variable names are kept because later cells reference them directly.
(
    trainClinDataImputedAgeAvg_cv1,
    trainClinDataImputedAgeAvg_cv2,
    trainClinDataImputedAgeAvg_cv3,
    trainClinDataImputedAgeAvg_cv4,
    trainClinDataImputedAgeAvg_cv5,
) = [
    pd.read_csv(f'../data/preprocessed_data/tables/age_avg/trainClinDataImputedAgeAvg_cv{fold}.csv')
    for fold in range(1, 6)
]

(
    validClinDataImputedAgeAvg_cv1,
    validClinDataImputedAgeAvg_cv2,
    validClinDataImputedAgeAvg_cv3,
    validClinDataImputedAgeAvg_cv4,
    validClinDataImputedAgeAvg_cv5,
) = [
    pd.read_csv(f'../data/preprocessed_data/tables/age_avg/validClinDataImputedAgeAvg_cv{fold}.csv')
    for fold in range(1, 6)
]

In [47]:
def get_explanining_variables( selector, original_feature_columns, X_test, y_test ):
    """Summarise a fitted feature selector on held-out data.

    Parameters
    ----------
    selector
        Fitted selector exposing a boolean ``support_`` mask plus ``score``
        and ``predict`` methods.
    original_feature_columns : array-like of labels
        Labels of the features the selector was fitted on, in the same order
        as ``support_``. A pandas Index, NumPy array or plain list all work.
    X_test, y_test
        Held-out features and labels, forwarded to ``selector.score`` and
        ``selector.predict``.

    Returns
    -------
    tuple
        ``(sorted list of selected labels,
        selector.score(X_test, y_test),
        selector.predict(X_test))``.
    """
    # np.asarray lets the boolean mask index any array-like of labels,
    # not only a pandas Index (a plain list would raise TypeError otherwise).
    selected = np.asarray(original_feature_columns)[selector.support_].tolist()
    return sorted(selected), selector.score(X_test, y_test), selector.predict(X_test)

In [48]:
def do_rfe_with_cv( regressor, X, y, n_features_to_select=20 ):
    """Fit recursive feature elimination (RFE) around `regressor` and return the fitted RFE.

    The RFE is configured with ``step=1`` and stops at `n_features_to_select`
    features. NOTE(review): despite the name, no cross-validation is performed
    here — this is plain ``RFE``, not ``RFECV``.
    """
    selector = RFE(
        estimator=regressor,
        n_features_to_select=n_features_to_select,
        step=1,
    )
    return selector.fit(X, y)

In [49]:
# Hold out 10% of the rows (fixed seed); features are every column except those in cols_top_drop.
X_train, X_test, y_train, y_test = train_test_split(
    imputed_data_age_average.drop(columns=cols_top_drop),
    imputed_data_age_average['Prognosis'],
    test_size=0.1,
    random_state=42,
)

In [56]:
# Recursive feature elimination around a random forest, keeping 30 features,
# then score the fitted selector on the hold-out split.
# The forest is seeded so the selected feature list (which is copied into
# `cols_to_keep` further down) is reproducible across runs.
# NOTE(review): the `logi` prefix presumably dates from a logistic-regression
# version of this cell; the estimator here is a random forest.
exp_logi, logi_test_score, logi_preds = get_explanining_variables(
    do_rfe_with_cv(
        ensemble.RandomForestClassifier(random_state=42),
        X_train,
        y_train,
        n_features_to_select=30,
    ),
    X_train.columns,  # labels of exactly the columns the selector was fitted on
    X_test,
    y_test,
)
exp_logi

['Age',
 'BPCO',
 'CRP',
 'CardiovascularDisease',
 'Cough',
 'D_dimer',
 'DaysFever',
 'Diabetes',
 'DifficultyInBreathing',
 'Fibrinogen',
 'Glucose',
 'HighBloodPressure',
 'Hospital_A',
 'Hospital_C',
 'Hospital_D',
 'Hospital_F',
 'INR',
 'IschemicHeartDisease',
 'LDH',
 'Ox_percentage',
 'PCT',
 'PaCO2',
 'PaO2',
 'Position',
 'RBC',
 'SaO2',
 'Sex',
 'Temp_C',
 'WBC',
 'pH']

### Calculating metrics

In [58]:

# init models — stochastic estimators get a fixed seed so the scores below are reproducible
kmeans = cluster.KMeans(n_clusters=2, random_state=42)  # NOTE(review): not used in this cell
rf     = ensemble.RandomForestClassifier(random_state=42)
dt     = tree.DecisionTreeClassifier(random_state=42)
lr     = linear_model.LogisticRegression()
knn    = neighbors.KNeighborsClassifier(5)
xgb    = XGBClassifier(n_estimators=50, max_depth=10, objective='binary:logistic', use_label_encoder=False)


# CV
# Alternative 20-feature selection, kept for reference:
#cols_to_keep = ['Age', 'CRP', 'ChronicKidneyDisease', 'Diabetes', 'DifficultyInBreathing', 'Hospital_A', 'Hospital_B', 'Hospital_C', 'Hospital_D', 'Hospital_E', 'Hospital_F', 'Ictus', 'LDH', 'Obesity', 'Ox_percentage', 'PCT', 'Position', 'PositivityAtAdmission', 'SaO2', 'Sex']
# The 30 features listed in the RFE output above.
cols_to_keep = ['Age',
 'BPCO',
 'CRP',
 'CardiovascularDisease',
 'Cough',
 'D_dimer',
 'DaysFever',
 'Diabetes',
 'DifficultyInBreathing',
 'Fibrinogen',
 'Glucose',
 'HighBloodPressure',
 'Hospital_A',
 'Hospital_C',
 'Hospital_D',
 'Hospital_F',
 'INR',
 'IschemicHeartDisease',
 'LDH',
 'Ox_percentage',
 'PCT',
 'PaCO2',
 'PaO2',
 'Position',
 'RBC',
 'SaO2',
 'Sex',
 'Temp_C',
 'WBC',
 'pH']

cv_features = imputed_data_age_average[cols_to_keep]
cv_target   = imputed_data_age_average['Prognosis']

# One label -> model mapping drives both the predictions and the result columns,
# so the column headers cannot drift out of sync with the model order.
cv_models = {
    'random forest': rf,
    'decision tree': dt,
    'logistic_regression': lr,
    'knn': knn,
    'xgb': xgb,
}

# Out-of-fold class predictions from 5-fold cross-validation, one array per model.
cv_preds = {
    label: model_selection.cross_val_predict(model, cv_features, cv_target, method='predict', cv=5)
    for label, model in cv_models.items()
}

# Keep the individual names that other cells may rely on.
rf_preds   = cv_preds['random forest']
tree_preds = cv_preds['decision tree']
lr_preds   = cv_preds['logistic_regression']
knn_preds  = cv_preds['knn']
xgb_preds  = cv_preds['xgb']

# Balanced accuracy of the out-of-fold predictions, rounded to 3 decimals.
balanced_accuracy_all_list = [
    np.round(metrics.balanced_accuracy_score(y_true=cv_target, y_pred=preds), 3)
    for preds in cv_preds.values()
]

balanced_accuracy_all = pd.DataFrame([balanced_accuracy_all_list], columns=list(cv_preds))
balanced_accuracy_all



Unnamed: 0,random forest,decision tree,logistic_regression,knn,xgb
0,0.736,0.628,0.728,0.67,0.719


In [70]:

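# Pasted result table (not code) — commented out so this cell no longer raises SyntaxError.
# NOTE(review): presumably balanced accuracies from a different run or feature set — confirm the source.
# random forest	decision tree	logistic_regression	knn	xgb
# 0	0.747	0.671	0.73	0.698	0.744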

In [90]:
# check on folds from janos server

In [95]:
# Fit a seeded random forest on the fold-1 training table (all columns except
# cols_top_drop; target 'MILD' -> 0, anything else -> 1) and predict the
# fold-1 validation rows.
rf_preds = (
    ensemble.RandomForestClassifier(random_state=42)
    .fit(
        trainClinDataImputedAgeAvg_cv1.drop(columns=cols_top_drop),
        trainClinDataImputedAgeAvg_cv1['Prognosis'].ne('MILD').astype('int64'),
    )
    .predict(validClinDataImputedAgeAvg_cv1.drop(columns=cols_top_drop))
)

In [96]:
# Balanced accuracy of rf_preds against the fold-1 validation labels
# (encoded 'MILD' -> 0, anything else -> 1), rounded to 3 decimals.
balanced_accuracy = np.round(
    metrics.balanced_accuracy_score(
        y_true=validClinDataImputedAgeAvg_cv1['Prognosis'].ne('MILD').astype('int64'),
        y_pred=rf_preds,
    ),
    3,
)
balanced_accuracy

0.724