In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score

In [3]:
# Load the cleaned, class-balanced dataset and discard the raw 'HbA1c'
# column, leaving 'HbA1c_category' as the only HbA1c-derived column.
diabetes = (
    pd.read_csv('diabetes_cleaned_balanced.csv')
    .drop(columns=['HbA1c'])
)

In [4]:
# Split the frame into a feature matrix and a target column using one
# boolean mask over the column labels. `y` keeps its 2-D column shape.
target_mask = diabetes.columns == 'HbA1c_category'
X = np.array(diabetes.loc[:, ~target_mask])
y = np.array(diabetes.loc[:, target_mask])

In [5]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=37,
)

In [6]:
# Base model for feature ranking: a 1000-tree random forest with a fixed seed.
estimator = RandomForestClassifier(
    n_estimators=1000,
    random_state=0,
)

In [7]:
# Recursive feature elimination wrapped around the random forest, keeping the
# 5 top-ranked features. `n_features_to_select` is passed by keyword: the
# positional form was deprecated in scikit-learn 0.23 and raises TypeError
# from 1.0 onwards, while the keyword form works on every version.
selector = RFE(estimator, n_features_to_select=5)

In [8]:
# Run the elimination on the training split. The target is flattened to 1-D
# because `y_train` is a single-column 2-D array. `fit` returns the selector
# itself, so no rebinding is needed.
selector.fit(X_train, np.ravel(y_train))

# Rank of every input column; rank 1 marks the features that were kept.
print(selector.ranking_)

[31 36 46 28 35 29 40 13 23 19  3  5  1  1  7 10 18 25 39 43 32 27  9  1 17
 12 20 11 34 16 21 41  1  4  1 42 47 44 33  2  6  8 30 26 37 38 45 24 22 15
 14]


In [10]:
# Selected non-medical attributes, in order (names are spelled exactly as the
# dataset's column labels):
# ['Weight', 'BMI', 'Taking medication for hypertenstion',
#  'Exercise more than 30 minutes-category', 'Waist circumference (cm)',
#  'Improve lifestyle habits', 'Walking or physical activity-category',
#  'Quick walking-category', 'Age', 'HbA1c_category']

In [8]:
# Narrow the frame to the chosen attributes plus the target column.
# Labels must match the dataset's column names character for character.
diabetes = diabetes.loc[:, [
    'Weight',
    'BMI',
    'Taking medication for hypertenstion',
    'Exercise more than 30 minutes-category',
    'Waist circumference (cm)',
    'Improve lifestyle habits',
    'Walking or physical activity-category',
    'Quick walking-category',
    'Age',
    'HbA1c_category',
]]

In [9]:
# Rebuild the feature matrix and target from the reduced frame using one
# boolean mask over the column labels. `y` keeps its 2-D column shape.
target_mask = diabetes.columns == 'HbA1c_category'
X = np.array(diabetes.loc[:, ~target_mask])
y = np.array(diabetes.loc[:, target_mask])

In [10]:
# Fresh 1000-tree random forest (fixed seed) to train on the reduced feature set.
estimator = RandomForestClassifier(
    n_estimators=1000,
    random_state=0,
)

In [11]:
# Re-split the reduced data with the same 25% test share and the same seed
# as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=37,
)

In [12]:
# Train the forest on the training split; the single-column target is
# flattened to 1-D before fitting.
estimator.fit(X_train, np.ravel(y_train))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [13]:
# Predicted labels for the held-out test rows from the fitted random forest.
pred = estimator.predict(X_test)

In [15]:
# Score the random-forest predictions against the test labels,
# printing one metric per line.
for label, scorer in (
    ("Recall:", recall_score),
    ("Precision:", precision_score),
    ("F1-Score:", f1_score),
    ("Accuracy:", accuracy_score),
):
    print(label, scorer(y_test, pred))

Recall: 0.796246648794
Precision: 0.962203023758
F1-Score: 0.871393643032
Accuracy: 0.881371222373


In [16]:
from xgboost import XGBClassifier

In [17]:
# Gradient-boosted classifier constructed with the library's default settings.
xgb = XGBClassifier()

In [18]:
# Train XGBoost on the same training split. `y_train` is a single-column 2-D
# array, so flatten it to 1-D: passing the column vector triggers a
# "column_or_1d" data-conversion warning, and flattening also matches how the
# random forest is fitted in this notebook.
xgb.fit(X_train, y_train.ravel())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [20]:
# Predicted labels for the held-out test rows from the fitted XGBoost model.
# NOTE: this rebinds `pred`, replacing the random-forest predictions.
pred = xgb.predict(X_test)

In [22]:
# Score the XGBoost predictions against the test labels,
# printing one metric per line.
for label, scorer in (
    ("Recall:", recall_score),
    ("Precision:", precision_score),
    ("F1-Score:", f1_score),
    ("Accuracy:", accuracy_score),
):
    print(label, scorer(y_test, pred))

Recall: 0.769436997319
Precision: 0.994226327945
F1-Score: 0.867506297229
Accuracy: 0.881371222373
