In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
%matplotlib inline

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV

In [None]:
# Read the dataset from `data.csv`; the path is relative to the working directory.
data = pd.read_csv('data.csv')

In [None]:
# Show the loaded frame as the cell output.
data

In [None]:
# Encode the outcome column as integers: 1 where the value equals True, 0 otherwise.
data['diabetes'] = data['diabetes'].eq(True).astype('int64')

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
data.info()

In [None]:
# (rows, columns) of the frame.
data.shape

In [None]:
# Scatter-plot matrix coloured by the `diabetes` outcome.
# `vars` is every column except the first and the last (`data.columns[1:-1]`).
# NOTE(review): presumably the last column is `diabetes` itself; the first
# column is excluded as well — confirm that is intended.
sns.pairplot(data, hue='diabetes',vars = data.columns[1:-1])

In [None]:
# Pairwise correlation matrix of the columns; its index lists the columns that
# took part in the computation.
corrmat = data.corr()
top_corr_features = corrmat.index

# Draw the annotated heat map on a large canvas so every cell label stays readable.
plt.figure(figsize=(20, 20))
g = sns.heatmap(
    corrmat,  # identical to data[top_corr_features].corr(), without recomputing it
    annot=True,
    cmap="RdYlGn",
)

In [None]:
# Bar chart of the number of rows per outcome class, to check class balance.
# The column is passed by keyword: seaborn >= 0.12 interprets the first
# positional argument as `data`, so a bare positional Series is no longer
# treated as the `x` variable.
sns.countplot(x='diabetes', data=data);

In [None]:
# Show the correlation matrix as a table (same computation as `corrmat` in the
# heat-map cell, displayed numerically here).
data.corr()

In [None]:
# Columns in which a value of 0 is treated as "missing".
# NOTE(review): presumably 0 is a placeholder for an unrecorded measurement in
# these columns — confirm against the data source.
na_columns = ['glucose_conc','thickness','insulin','bmi','diastolic_bp']

# Swap every 0 in those columns for NaN so later imputation can find them.
zeros_as_missing = data[na_columns].replace(to_replace=0, value=np.nan)
data[na_columns] = zeros_as_missing

In [None]:
# Work on a copy so `data` keeps its NaNs for the exploratory plots.
df = data.copy()

# Row masks for each outcome class; they do not change while imputing.
is_negative = df['diabetes'] == 0
is_positive = df['diabetes'] == 1

# Fill missing values with the median of the same outcome class.
# NOTE(review): this imputation uses the target label, which is unknown for
# unseen patients and leaks label information into the features — scores
# computed downstream are likely optimistic. Confirm this is intended.
for column in ['glucose_conc','thickness','insulin']:
    median_0 = data.loc[is_negative, column].median()
    median_1 = data.loc[is_positive, column].median()

    # One `.loc` assignment per class. The former chained form
    # `df[column][mask] = ...` writes to a temporary object: it triggers
    # SettingWithCopyWarning and leaves `df` unchanged under Copy-on-Write.
    df.loc[is_negative, column] = data.loc[is_negative, column].fillna(median_0)
    df.loc[is_positive, column] = data.loc[is_positive, column].fillna(median_1)

In [None]:
# Show the imputed frame as the cell output.
df

In [None]:
# Fill the remaining gaps with each column's overall median.
# Assign the result back explicitly: `df.col.fillna(..., inplace=True)` acts on
# an intermediate Series and does not update `df` under pandas Copy-on-Write.
df['diastolic_bp'] = df['diastolic_bp'].fillna(df['diastolic_bp'].median())
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

In [None]:
# Feature matrix: every column of the imputed frame except the target.
X = df.drop('diabetes',axis=1)
# Show the features as the cell output.
X

In [None]:
# Target vector: the `diabetes` column of the imputed frame.
y = df.diabetes
# Show the target as the cell output.
y

In [None]:
# Per-class histograms (one panel per `diabetes` value) for selected features.
# One loop replaces four copy-pasted cells that differed only in the column name.
# `data` is plotted rather than the imputed `df`, so filled-in values do not
# distort the distributions.
for column in ['bmi', 'age', 'glucose_conc', 'insulin']:
    data.hist(column=column, bins=50, by='diabetes', figsize=(12, 8))

In [None]:
from sklearn import metrics

# Scorer objects usable as `scoring=` in cross_validate / GridSearchCV.
# These four metrics are computed from the hard class predictions.
f1 = metrics.make_scorer(metrics.f1_score)
accuracy = metrics.make_scorer(metrics.accuracy_score)
precision = metrics.make_scorer(metrics.precision_score)
recall = metrics.make_scorer(metrics.recall_score)

# ROC AUC needs continuous scores, not 0/1 labels. A default
# make_scorer(roc_auc_score) would pass it `predict` output; the built-in
# "roc_auc" scorer uses decision_function / predict_proba instead.
auc = metrics.get_scorer('roc_auc')

In [None]:
# Metrics evaluated during cross-validation. Each name appears in the result as
# `test_<name>`, which is how `printResults` looks them up.
scoring = dict(
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1=f1,
)

In [None]:
def printResults(cv):
    """Print the mean and standard deviation of each cross-validation test metric.

    Parameters
    ----------
    cv : mapping
        Cross-validation results. Must hold, under the keys ``test_accuracy``,
        ``test_precision``, ``test_recall`` and ``test_f1``, objects that
        provide ``mean()`` and ``std()`` (e.g. NumPy arrays).

    Returns
    -------
    None
        One line per metric is written to stdout as ``<label><mean> (<std>)``
        with three decimals.

    Raises
    ------
    KeyError
        If one of the expected keys is missing from ``cv``.
    """
    # (printed label incl. alignment padding, key in the results mapping)
    rows = (
        ("Accuracy  ", "test_accuracy"),
        ("Precision ", "test_precision"),
        ("Recall    ", "test_recall"),
        ("F1        ", "test_f1"),
    )
    for label, key in rows:
        fold_scores = cv[key]
        print("{}{:.3f} ({:.3f})".format(label, fold_scores.mean(), fold_scores.std()))

In [None]:
# Hold out 25% of the rows for evaluation.
# `random_state` makes the split (and every score below) reproducible across
# kernel restarts; `stratify=y` keeps the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
# Baseline gradient-boosting model with default hyper-parameters.
# A fixed seed keeps the fitted model and its scores reproducible on re-run.
gbc = GradientBoostingClassifier(random_state=42)

In [None]:
# Train the baseline model on the training split.
gbc.fit(X_train,y_train)

In [None]:
# Predict class labels for the held-out rows with the baseline model.
y_pred = gbc.predict(X_test)

In [None]:
# Text report comparing the held-out labels with the current `y_pred`.
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the current predictions on the held-out rows
# (rows = true labels, columns = predicted labels).
cm = metrics.confusion_matrix(y_test, y_pred)

# fmt="d" prints the counts as plain integers; the default '.2g' format would
# render counts of 100 or more in scientific notation (e.g. 1.2e+02).
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion matrix");

In [None]:
# 5-fold cross-validation of the baseline model on the full dataset, scoring
# every metric listed in `scoring`.
cv_gbc = cross_validate(
    estimator=gbc,
    X=X,
    y=y,
    cv=5,
    scoring=scoring,
)

In [None]:
# Mean and standard deviation across folds for the baseline model.
printResults(cv_gbc)

In [None]:
# Hyper-parameter grid: 2 losses x 10 learning rates x 10 ensemble sizes.
# NOTE(review): newer scikit-learn releases name the 'deviance' loss
# 'log_loss' — confirm against the installed version.
params = {
    'loss': ['deviance','exponential'],
    'learning_rate': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
    # 100..1000 in steps of 100; the list previously repeated 800 and skipped
    # 900, which re-fitted one setting and never tried another.
    'n_estimators': [100,200,300,400,500,600,700,800,900,1000],
}

# Exhaustive 5-fold search over the grid (200 combinations, 1000 fits).
# n_jobs=-1 spreads the fits over all CPU cores; it does not change the result.
gs = GridSearchCV(estimator=gbc, param_grid=params, cv=5, n_jobs=-1)

In [None]:
# Run the grid search on the full dataset.
# NOTE(review): `X`/`y` include the rows held out as `X_test`/`y_test`, so the
# selected hyper-parameters have seen the test rows — consider searching on
# `X_train`/`y_train` only.
gs.fit(X,y)

In [None]:
# Mean cross-validated score of the best parameter combination.
gs.best_score_

In [None]:
# Parameter combination that achieved the best score.
gs.best_params_

In [None]:
 gbc_best = GradientBoostingClassifier(learning_rate=0.1,loss='deviance',n_estimators=400)
    gbc_best.fit(X_train,y_train)
    

In [None]:
# Predict class labels for the held-out rows with the tuned model
# (overwrites the baseline `y_pred`).
y_pred = gbc_best.predict(X_test)

In [None]:
# Text report comparing the held-out labels with the current `y_pred`.
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the current predictions on the held-out rows
# (rows = true labels, columns = predicted labels).
cm = metrics.confusion_matrix(y_test, y_pred)

# fmt="d" prints the counts as plain integers; the default '.2g' format would
# render counts of 100 or more in scientific notation (e.g. 1.2e+02).
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion matrix");

In [None]:
# 5-fold cross-validation of the tuned model on the full dataset, scoring
# every metric listed in `scoring`.
cv_gbc_best = cross_validate(
    estimator=gbc_best,
    X=X,
    y=y,
    scoring=scoring,
    cv=5,
)

In [None]:
# Mean and standard deviation across folds for the tuned model.
printResults(cv_gbc_best)

In [None]:
from sklearn.model_selection import cross_val_score

# Per-fold scores of the tuned model using the estimator's default scorer.
# `cv` is the number of folds the data passed in (here all of X, y) is split into.
accuracies = cross_val_score(
    estimator=gbc_best,
    X=X,
    y=y,
    cv=10,
)

In [None]:
# Persist the tuned model to disk (`pickle` is already imported in the first cell).
filename = 'diabetes-model.pkl'

# The context manager flushes and closes the file even if pickling fails; the
# previous bare `open(...)` handle was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(gbc_best, model_file)