# DIABETES PREDICTION

**This notebook's preprocessing covers checking for and replacing missing values, categorising the columns, encoding the non-numeric target (OUTCOME, object dtype), checking for duplicate rows, detecting and removing outliers, and replacing implausible zero values.**   

**The training data were compared with and without scaling.**  

**The models used are logistic regression, k-NN, random forest and MLP. PCA and the models were compared with and without scaling, and a confusion matrix was evaluated at each stage.**  

**Hyperparameters are tuned with grid search and randomised search, and the best model is selected by cross-validation.**

## Importing Modules

In [7]:
#import  Essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,roc_auc_score, precision_score,f1_score,plot_roc_curve,plot_roc_curve, plot_confusion_matrix,classification_report
from HF_Functions import correlated_map,label_encoder,grab_col_names,cat_summary,detect_outliers
import warnings 
warnings.filterwarnings("ignore", category=UserWarning)
%matplotlib inline

## Data Analysis and Preprocessing 

In [None]:
# Read the diabetes CSV into a DataFrame and preview it.
df = pd.read_csv(filepath_or_buffer="datasets/diabetes-dataset.csv")
df.head(n=5)  # first five rows of the dataset

In [None]:
df.shape  # (number of rows, number of columns) of the loaded frame

In [None]:
df.info()  # per-column dtype and non-null count

In [None]:
# Summary statistics with extra upper percentiles (useful for spotting heavy
# tails), transposed so that each feature is one row.
df.describe(percentiles=[0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]).transpose()

###  Checking Missing values and Handling Missing values

In [None]:
# Missing-value check: report whether the frame contains any NaN cell at all.
dataset = df.isnull().sum().sum()  # total number of NaN cells over all columns
print('Data has no missing values' if dataset == 0 else 'Data has missing values')

In [None]:
df.info()  # re-check dtypes and non-null counts
# isnull() finds no NaN records in the data set, but a large number of 0 values
# stand out in columns such as blood pressure, BMI and skin thickness.
# A reading of 0 is illogical for these measurements, so those zeros are
# treated as missing values in the following cells.

In [None]:
# A value of 0 is not a plausible reading for these measurements, so mark it as
# missing (NaN) and let the imputation step fill it in.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)


In [None]:
# Per-column count of NaN cells, which now includes the zeros replaced above.
df.isnull().sum()

In [None]:
df.head()  # preview the first rows; replaced zeros now show as NaN

In [None]:
# Split the column names into three lists using the project helper from HF_Functions.
# NOTE(review): judging by the names these are categorical, numeric and
# "categorical but high-cardinality" columns — confirm against grab_col_names.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
# Upper-case every column label, and mirror the change in the three name lists
# so that they keep matching df.columns.
df.columns = df.columns.str.upper()
cat_cols, num_cols, cat_but_car = (
    [name.upper() for name in group]
    for group in (cat_cols, num_cols, cat_but_car)
)

In [None]:
# Feature column labels: every column except the OUTCOME target.
columns = df.columns.drop("OUTCOME")

In [None]:

"""1 .Violent method: we can delete rows with missing values: missing values can be dealt with by deleting rows or columns with null values. 
The disadvantage is that a large amount of information is lost and the percentage of missing values is too large to be effective.""" 
"""2. We estimated the missing values using the mean/median.
Prevents data loss leading to deleted rows or columns and works well on a small data set and is easy to implement. """
'''To fill in the missing values, 
we will group the columns with empty values according to OUTCOME 
and add the median value of the target variable corresponding to the relevant blank value.'''
# Fill each feature's NaNs with the median of that feature within the row's
# OUTCOME class. The grouping is built once instead of once per column; OUTCOME
# itself is never modified in the loop, so the groups stay valid throughout.
# NOTE(review): these medians are conditioned on the target and computed on the
# full data set before the train/test split, so label information leaks into
# the features. Consider imputing inside the training fold only.
outcome_groups = df.groupby("OUTCOME")
for i in columns:
    df[i] = df[i].fillna(outcome_groups[i].transform("median"))

In [None]:
df.head()  # former zero/NaN cells now hold the per-OUTCOME median

In [None]:
df.nunique()  # number of distinct values in each column

###   Check duplicated values and Handling duplicate values

In [None]:
# Count rows that are exact copies of an earlier row.
Dplicated=df.duplicated().sum()   # number of duplicated rows in the frame
Dplicated

In [None]:
# Duplicates are deliberately kept; the string below records the author's reason.
# NOTE(review): duplicated rows can land on both sides of the train/test split
# and inflate the test scores — confirm that keeping them is acceptable.
'''    !!! IMPORTANT!!!
 Because the sample is less than 1000 after removing the duplicates values and to get a better model,
 only the code is shown here without removing the duplicate values'''
# NOTE: drop_duplicates() returns a new DataFrame, so the line below would not
# change df even if uncommented; assign the result or pass inplace=True.
#df.drop_duplicates()  

### Outlier Detection and Handling outliers

In [None]:
# Ask the project helper (HF_Functions.detect_outliers) which rows are outliers
# across the eight feature columns listed.
# NOTE(review): the second argument (2) is presumably the minimum number of
# outlying features a row needs before it is flagged — confirm in HF_Functions.
outliers_to_drop = detect_outliers(df, 2 ,["PREGNANCIES", 'GLUCOSE', 'BLOODPRESSURE', 'SKINTHICKNESS', 'INSULIN', 'BMI', 'DIABETESPEDIGREEFUNCTION', 'AGE'])

In [None]:
df.loc[outliers_to_drop]  # display the rows flagged as outliers

In [None]:
# Remove the flagged outlier rows from df in place. The surviving rows keep
# their original index labels, so the index has gaps afterwards.
outlier_index = df.loc[outliers_to_drop].index
df.drop(index=outlier_index, inplace=True)

In [None]:
df.describe().T  # summary statistics after outlier removal, one row per column

In [None]:
# Box plot of every column on one axes, to inspect the remaining outliers.
fig, axes = plt.subplots(figsize=(15, 10))
sns.boxplot(ax=axes, data=df, width=0.5)

### LabelEncoder 

In [None]:
# Columns that hold exactly two distinct values and whose dtype does not compare
# equal to int or float; these are the candidates for 0/1 label encoding.
binary_cols = [
    name
    for name in df.columns
    if df[name].nunique() == 2 and df[name].dtype not in [int, float]
]
len(binary_cols)  # how many columns will be label-encoded

In [None]:
# Encode every column collected in binary_cols with the project's label_encoder helper.
# NOTE(review): the return value is discarded, so the helper presumably mutates
# df in place — confirm in HF_Functions.
for col in binary_cols:
    label_encoder(df, col)

df.head()
# OUTCOME is expected to hold binary integers after this step (verify in the preview)

## Data Visualizations 

In [None]:
# One horizontally oriented histogram per numeric column.
features = df.hist(orientation='horizontal', figsize=(25,20)) # array of the created axes

In [None]:
 sns.pairplot(df, hue ='OUTCOME')# # Distribution of results on each feature

In [None]:
# Draw the feature-correlation map with the project helper (HF_Functions).
# The string literal that follows holds the author's reading of the plot.
correlated_map(df, plot=True)
'''
#If the correlation value is bigger than 0, there is a positive correlation. 
 While the value of one variable increases, the value of the other variable also increases.  
 When there is equality of Correlation = 0 means no correlation. 
 If the correlation is smaller than 0, there is a negative correlation. While one variable increases, the other variable decreases. 
 When the correlations are examined, there are 2 variables that act as a positive correlation to the Outcome dependent variable. 
 These variables are Glucose. As these increase, Outcome variable increases.
'''

In [None]:
# Bar chart of the class balance of the OUTCOME target.
plt.figure(figsize=(10, 6))
# Name the variable with x= and pass the frame as data=. A positional Series is
# deprecated in seaborn 0.11 and is interpreted as `data` from 0.12 onwards,
# which no longer produces the per-class counts.
sns.countplot(x="OUTCOME", data=df)
plt.title("Quantity of Diabetes", size=10)
plt.show()

## Model Creation and training datasets without scaling

In [None]:
# Split the frame positionally: every column except the last is a feature, and
# the last column (OUTCOME) is the target.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

### LogisticRegression

In [None]:
# Baseline logistic regression (default hyper-parameters) on the unscaled split.
model_LR = LogisticRegression()
model_LR.fit(X_train, y_train)
# score() returns the mean accuracy on the given data.
train_accuracy = model_LR.score(X_train, y_train)
test_accuracy = model_LR.score(X_test, y_test)
print("Logistic Regression model:")
print(f"Training model accuracy:{train_accuracy:.3f}")
print(f"Testing model accuracy :{test_accuracy:.3f}")

### KNearestNeighbors

In [None]:
# Baseline k-NN classifier (default hyper-parameters) on the unscaled split.
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)
# score() returns the mean accuracy on the given data.
train_accuracy = model_knn.score(X_train, y_train)
test_accuracy = model_knn.score(X_test, y_test)
print(" K-NN model:")
print(f"Training model_knn Accuracy:{train_accuracy:.3f}")
print(f"Testing model_knn Accuracy: {test_accuracy:.3f}")

### Random Forest

In [None]:
# Baseline random forest (100 trees, fixed seed) on the unscaled split.
model_RF = RandomForestClassifier(random_state=0, n_estimators=100)
model_RF.fit(X_train, y_train)
# score() returns the mean accuracy on the given data.
train_accuracy = model_RF.score(X_train, y_train)
test_accuracy = model_RF.score(X_test, y_test)
print(" RandomForestClassifier model:")
print(f"Training model_RF Accuracy:{train_accuracy:.3f}")
print(f"Testing model_RF Accuracy: {test_accuracy:.3f}")

### MLPClassifier

In [None]:
# Baseline multi-layer perceptron (fixed seed) on the unscaled split.
MLP = MLPClassifier(random_state=0)
MLP.fit(X_train, y_train)
# score() returns the mean accuracy on the given data.
MLP_train_accuracy = MLP.score(X_train, y_train)
MLP_test_accuracy = MLP.score(X_test, y_test)
# The printed labels previously said "model_tree" / "MLPClassifierr", a
# copy-paste slip that mislabelled these results.
print("MLPClassifier model:")
print("Training MLP without scalling Accuracy:{:.3f}".format(MLP_train_accuracy))
print("Testing MLP without scalling Accuracy: {:.3f}".format(MLP_test_accuracy))

### PCA

In [None]:
# Project the (unscaled) features onto their first two principal components and
# plot them coloured by OUTCOME.
X = df.iloc[:, :-1].values
pca = PCA(n_components=2)
pComp = pca.fit_transform(X)
# Give the component frame df's own index. Rows were dropped as outliers, so
# df's labels are no longer 0..n-1, and pd.concat(axis=1) aligns on labels. A
# default RangeIndex here would pair components with the wrong targets and add
# NaN rows.
PDF = pd.DataFrame(data=pComp, columns=['pc1', 'pc2'], index=df.index)
PCA_df = pd.concat([PDF, df['OUTCOME']], axis=1)
sns.relplot(data=PCA_df, x='pc1', y='pc2', hue='OUTCOME')

## Evaluation metrics and Confusion matrix (without Scaling)

### Logistic Regression

In [None]:
# Evaluate the unscaled logistic-regression model on the held-out test split.
y_test_LR = model_LR.predict(X_test)  # hard class predictions
# Positive-class probability: ROC AUC needs a ranking score. Feeding it 0/1
# predictions collapses the curve to a single operating point.
y_score_LR = model_LR.predict_proba(X_test)[:, 1]
cfm_LR = confusion_matrix(y_test, y_test_LR)
print("Logistic Regression model:")
print('confusion matrix:')
print(cfm_LR)
print('Evaluation metrics:')
PDF = pd.DataFrame(
    data=[[
        accuracy_score(y_test, y_test_LR),
        recall_score(y_test, y_test_LR),
        precision_score(y_test, y_test_LR),
        roc_auc_score(y_test, y_score_LR),
        # micro-averaged F1 equals accuracy for a single-label problem
        f1_score(y_test, y_test_LR, average='micro'),
    ]],
    columns=['accuracy', 'recall', 'precision', 'roc_auc_score', 'f1_score'],
    index=['Score'],
)
print(PDF)
Report = classification_report(y_test, y_test_LR)  # per-class precision/recall/F1
print('')
print(Report)
plt.figure(figsize=(10, 10))
# Heat map of the confusion matrix, annotated with the raw counts.
sns.heatmap(data=cfm_LR, cmap="GnBu", annot=True, fmt=".0f")


### K-NN model

In [None]:
# Evaluate the unscaled k-NN model on the held-out test split.
y_test_knn = model_knn.predict(X_test)  # hard class predictions
# Positive-class probability: ROC AUC needs a ranking score. Feeding it 0/1
# predictions collapses the curve to a single operating point.
y_score_knn = model_knn.predict_proba(X_test)[:, 1]
cfm_knn = confusion_matrix(y_test, y_test_knn)
print("K-NN model:")
print('confusion matrix:')
print(cfm_knn)
print('Evaluation metrics:')
PDF = pd.DataFrame(
    data=[[
        accuracy_score(y_test, y_test_knn),
        recall_score(y_test, y_test_knn),
        precision_score(y_test, y_test_knn),
        roc_auc_score(y_test, y_score_knn),
        # micro-averaged F1 equals accuracy for a single-label problem
        f1_score(y_test, y_test_knn, average='micro'),
    ]],
    columns=['accuracy', 'recall', 'precision', 'roc_auc_score', 'f1_score'],
    index=['Score'],
)
print(PDF)
Report = classification_report(y_test, y_test_knn)  # per-class precision/recall/F1
print('')
print(Report)
plt.figure(figsize=(10, 10))
# Heat map of the confusion matrix, annotated with the raw counts.
sns.heatmap(data=cfm_knn, cmap="GnBu", annot=True, fmt=".0f")


### MLPClassifier

In [None]:
# Evaluate the unscaled MLP model on the held-out test split.
y_test_MLP = MLP.predict(X_test)  # hard class predictions
# Positive-class probability: ROC AUC needs a ranking score. Feeding it 0/1
# predictions collapses the curve to a single operating point.
y_score_MLP = MLP.predict_proba(X_test)[:, 1]
cm_MLP = confusion_matrix(y_test, y_test_MLP)
print("MLPClassifier model: ")
print('confusion matrix:')
print(cm_MLP)
print('Evaluation metrics:')
PDF = pd.DataFrame(
    data=[[
        accuracy_score(y_test, y_test_MLP),
        recall_score(y_test, y_test_MLP),
        precision_score(y_test, y_test_MLP),
        roc_auc_score(y_test, y_score_MLP),
        # micro-averaged F1 equals accuracy for a single-label problem
        f1_score(y_test, y_test_MLP, average='micro'),
    ]],
    columns=['accuracy', 'recall', 'precision', 'roc_auc_score', 'f1_score'],
    index=['Score'],
)
print(PDF)
Report = classification_report(y_test, y_test_MLP)  # per-class precision/recall/F1
print('')
print(Report)
plt.figure(figsize=(10, 10))
# Heat map of the confusion matrix, annotated with the raw counts.
sns.heatmap(data=cm_MLP, cmap="GnBu", annot=True, fmt=".0f")


###   Random Forest Classifier 

In [None]:
# Evaluate the unscaled random-forest model on the held-out test split.
y_test_RF = model_RF.predict(X_test)  # hard class predictions
# Positive-class probability: ROC AUC needs a ranking score. Feeding it 0/1
# predictions collapses the curve to a single operating point.
y_score_RF = model_RF.predict_proba(X_test)[:, 1]
cm_RF = confusion_matrix(y_test, y_test_RF)
print("RandomForestClassifier  model: ")
print('confusion matrix:')
print(cm_RF)
print('Evaluation metrics:')
PDF = pd.DataFrame(
    data=[[
        accuracy_score(y_test, y_test_RF),
        recall_score(y_test, y_test_RF),
        precision_score(y_test, y_test_RF),
        roc_auc_score(y_test, y_score_RF),
        # micro-averaged F1 equals accuracy for a single-label problem
        f1_score(y_test, y_test_RF, average='micro'),
    ]],
    columns=['accuracy', 'recall', 'precision', 'roc_auc_score', 'f1_score'],
    index=['Score'],
)
print(PDF)
Report = classification_report(y_test, y_test_RF)  # per-class precision/recall/F1
print('')
print(Report)
plt.figure(figsize=(10, 10))
# Heat map of the confusion matrix, annotated with the raw counts.
sns.heatmap(data=cm_RF, cmap="GnBu", annot=True, fmt=".0f")


### ROC curve

In [None]:
# Overlay the test-set ROC curves of the four unscaled models on one axes.
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer RocCurveDisplay.from_estimator and fall back only on older releases
# that lack it.
try:
    from sklearn.metrics import RocCurveDisplay
    _draw_roc = RocCurveDisplay.from_estimator
except (ImportError, AttributeError):
    _draw_roc = plot_roc_curve

disp = _draw_roc(model_LR, X_test, y_test)  # creates the figure and axes
for _clf in (model_knn, MLP, model_RF):
    _draw_roc(_clf, X_test, y_test, ax=disp.ax_)  # draw onto the same axes


## Training datasets with scaling 

In [None]:
# Standardise the numeric columns of df, then rebuild X from the scaled frame.
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
# X must be extracted AFTER scaling. It used to be captured before the
# transform, so every "with scaling" model was trained on unscaled values.
X = df.iloc[:, :-1].values
# NOTE(review): the scaler is fitted on all rows before the split, so test-set
# statistics leak into training; fit on X_train only for an unbiased estimate.
# NOTE(review): this split uses test_size=0.25 while the unscaled run uses 0.3,
# so the two runs are not evaluated on the same rows.
# NOTE(review): assumes num_cols does not contain OUTCOME — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

### Logistic Regression algorithm

In [None]:
# Logistic regression for the scaling experiment: weak regularisation (C=100)
# and a raised iteration cap.
model_LRSC = LogisticRegression(max_iter=1000, C=100)
model_LRSC.fit(X_train, y_train)

# score() returns the mean accuracy on the given data.
LRSC_train_accuracy = model_LRSC.score(X_train, y_train)
LRSC_test_accuracy = model_LRSC.score(X_test, y_test)
print("Logistic Regression model:")
print(f"Training model with scalling accuracy:{LRSC_train_accuracy:.3f}")
print(f"Testing model with scalling accuracy :{LRSC_test_accuracy:.3f}")

### K-NN algorithm

In [None]:
# k-NN (default hyper-parameters) for the scaling experiment.
model_knnSC = KNeighborsClassifier()
model_knnSC.fit(X_train, y_train)
# score() returns the mean accuracy on the given data.
knnSC_train_accuracy = model_knnSC.score(X_train, y_train)
knnSC_test_accuracy = model_knnSC.score(X_test, y_test)
print(" K-NN model:")
print(f" Training model_knn with scalling Accuracy:{knnSC_train_accuracy:.3f}")
print(f" Testing model_knn with scalling Accuracy: {knnSC_test_accuracy:.3f}")

### MLPClassifier

In [None]:
# Multi-layer perceptron (fixed seed) for the scaling experiment.
MLPSC = MLPClassifier(random_state=0)
MLPSC.fit(X_train, y_train)
# score() returns the mean accuracy on the given data.
MLPSC_train_accuracy = MLPSC.score(X_train, y_train)
MLPSC_test_accuracy = MLPSC.score(X_test, y_test)
# The printed labels previously said "model_tree" / "MLPClassifierr", a
# copy-paste slip that mislabelled these results.
print("MLPClassifier model:")
print("Training MLP with scalling Accuracy:{:.3f}".format(MLPSC_train_accuracy))
print("Testing MLP with scalling Accuracy: {:.3f}".format(MLPSC_test_accuracy))

###  Random Forest Classifier model

In [None]:
# Random forest for the scaling experiment.
# random_state=0 matches the unscaled baseline forest. Without a seed, every
# run gives different scores and the scaled/unscaled comparison is confounded
# by random variation.
model_RFSC = RandomForestClassifier(random_state=0)
model_RFSC.fit(X_train, y_train)

# score() returns the mean accuracy on the given data.
RFSC_train_accuracy = model_RFSC.score(X_train, y_train)
RFSC_test_accuracy = model_RFSC.score(X_test, y_test)
print("Random Forest Classifier model:")
print("Training model_RF with scalling Accuracy:{:.3f}".format(RFSC_train_accuracy))
print("Testing model_RF with scalling Accuracy: {:.3f}".format(RFSC_test_accuracy))

### PCA

In [None]:
# Project the current feature values of df onto their first two principal
# components and plot them coloured by OUTCOME.
X = df.iloc[:, :-1].values
pca = PCA(n_components=2)
pComp = pca.fit_transform(X)
# Give the component frame df's own index. Rows were dropped as outliers, so
# df's labels are no longer 0..n-1, and pd.concat(axis=1) aligns on labels. A
# default RangeIndex here would pair components with the wrong targets and add
# NaN rows.
PDF = pd.DataFrame(data=pComp, columns=['pc1', 'pc2'], index=df.index)
PCA_df = pd.concat([PDF, df['OUTCOME']], axis=1)
sns.relplot(data=PCA_df, x='pc1', y='pc2', hue='OUTCOME')

## Evaluation metrics and Confusion matrix (Scaling)

###  Logistic Regression 

In [None]:
# Evaluate the scaling-experiment logistic regression on the held-out test split.
y_test_LRSC = model_LRSC.predict(X_test)  # hard class predictions
# Positive-class probability: ROC AUC needs a ranking score. Feeding it 0/1
# predictions collapses the curve to a single operating point.
y_score_LRSC = model_LRSC.predict_proba(X_test)[:, 1]
cfm_LRSC = confusion_matrix(y_test, y_test_LRSC)
print("Logistic Regression model:")
print('confusion matrix:')
print(cfm_LRSC)
print('Evaluation metrics:')
PDF = pd.DataFrame(
    data=[[
        accuracy_score(y_test, y_test_LRSC),
        recall_score(y_test, y_test_LRSC),
        precision_score(y_test, y_test_LRSC),
        roc_auc_score(y_test, y_score_LRSC),
        # micro-averaged F1 equals accuracy for a single-label problem
        f1_score(y_test, y_test_LRSC, average='micro'),
    ]],
    columns=['accuracy', 'recall', 'precision', 'roc_auc_score', 'f1_score'],
    index=['Score'],
)
print(PDF)
Report = classification_report(y_test, y_test_LRSC)  # per-class precision/recall/F1
print('')
print(Report)
plt.figure(figsize=(10, 10))
# Heat map of the confusion matrix, annotated with the raw counts.
sns.heatmap(data=cfm_LRSC, cmap="GnBu", annot=True, fmt=".0f")


### knn

In [None]:
# Evaluate the scaling-experiment k-NN model on the held-out test split.
y_test_knnSC = model_knnSC.predict(X_test)  # hard class predictions
# Positive-class probability: ROC AUC needs a ranking score. Feeding it 0/1
# predictions collapses the curve to a single operating point.
y_score_knnSC = model_knnSC.predict_proba(X_test)[:, 1]
cfm_KNNSC = confusion_matrix(y_test, y_test_knnSC)
print("K-NN model:")
print('confusion matrix:')
print(cfm_KNNSC)
print('Evaluation metrics:')
PDF = pd.DataFrame(
    data=[[
        accuracy_score(y_test, y_test_knnSC),
        recall_score(y_test, y_test_knnSC),
        precision_score(y_test, y_test_knnSC),
        roc_auc_score(y_test, y_score_knnSC),
        # micro-averaged F1 equals accuracy for a single-label problem
        f1_score(y_test, y_test_knnSC, average='micro'),
    ]],
    columns=['accuracy', 'recall', 'precision', 'roc_auc_score', 'f1_score'],
    index=['Score'],
)
print(PDF)
Report = classification_report(y_test, y_test_knnSC)  # per-class precision/recall/F1
print('')
print(Report)
plt.figure(figsize=(10, 10))
# Heat map of the confusion matrix, annotated with the raw counts.
sns.heatmap(data=cfm_KNNSC, cmap="GnBu", annot=True, fmt=".0f")


### MLPClassifier

In [None]:
# Evaluate the scaling-experiment MLP on the held-out test split.
y_test_MLPSC = MLPSC.predict(X_test)  # hard class predictions
# Positive-class probability: ROC AUC needs a ranking score. Feeding it 0/1
# predictions collapses the curve to a single operating point.
y_score_MLPSC = MLPSC.predict_proba(X_test)[:, 1]
cfm_MLPSC = confusion_matrix(y_test, y_test_MLPSC)
print("MLPClassifier model: ")
print('confusion matrix:')
print(cfm_MLPSC)
print('Evaluation metrics:')
PDF = pd.DataFrame(
    data=[[
        accuracy_score(y_test, y_test_MLPSC),
        recall_score(y_test, y_test_MLPSC),
        precision_score(y_test, y_test_MLPSC),
        roc_auc_score(y_test, y_score_MLPSC),
        # micro-averaged F1 equals accuracy for a single-label problem
        f1_score(y_test, y_test_MLPSC, average='micro'),
    ]],
    columns=['accuracy', 'recall', 'precision', 'roc_auc_score', 'f1_score'],
    index=['Score'],
)
print(PDF)
Report = classification_report(y_test, y_test_MLPSC)  # per-class precision/recall/F1
print('')
print(Report)
plt.figure(figsize=(10, 10))
# Heat map of the confusion matrix, annotated with the raw counts.
sns.heatmap(data=cfm_MLPSC, cmap="GnBu", annot=True, fmt=".0f")


### Random Forest Classifier model

In [None]:
# Evaluate the scaling-experiment random forest on the held-out test split.
y_test_RFSC = model_RFSC.predict(X_test)  # hard class predictions
# Positive-class probability: ROC AUC needs a ranking score. Feeding it 0/1
# predictions collapses the curve to a single operating point.
y_score_RFSC = model_RFSC.predict_proba(X_test)[:, 1]
cfm_RFSC = confusion_matrix(y_test, y_test_RFSC)
print("RandomForestClassifier  model: ")
print('confusion matrix:')
print(cfm_RFSC)
print('Evaluation metrics:')
PDF = pd.DataFrame(
    data=[[
        accuracy_score(y_test, y_test_RFSC),
        recall_score(y_test, y_test_RFSC),
        precision_score(y_test, y_test_RFSC),
        roc_auc_score(y_test, y_score_RFSC),
        # micro-averaged F1 equals accuracy for a single-label problem
        f1_score(y_test, y_test_RFSC, average='micro'),
    ]],
    columns=['accuracy', 'recall', 'precision', 'roc_auc_score', 'f1_score'],
    index=['Score'],
)
print(PDF)
Report = classification_report(y_test, y_test_RFSC)  # per-class precision/recall/F1
print('')
print(Report)
plt.figure(figsize=(10, 10))
# Heat map of the confusion matrix, annotated with the raw counts.
sns.heatmap(data=cfm_RFSC, cmap="GnBu", annot=True, fmt=".0f")


## Hyperparameter tuning and Optimisation models

### Logistic Regression 

In [None]:
# Base estimator and search space for logistic-regression tuning.
model = LogisticRegression()                      #Solution help by Mark Elshaw
# Not every solver/penalty pairing is valid; the search scores unsupported
# combinations as failed fits.
LR_Dic = {
    # 'multionmial' was a typo for 'multinomial'. The misspelt value is rejected
    # by LogisticRegression, so every candidate using it failed to fit.
    'multi_class': ['auto', 'ovr', 'multinomial'],
    'solver': ['liblinear', 'saga', 'newton-cg', 'lbfgs', 'sag'],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
}
# Regularisation strengths considered but not searched. If re-enabled, note that
# the estimator's parameter name is 'C', not 'c_values':
# LR_Dic['C'] = [100, 10, 1.0, 0.1, 0.01, 0.001]
print(LR_Dic)

 #### GridSearch

In [None]:
# Adapted from Dr. Trang Doan's week 6 solution.
# Exhaustive 5-fold grid search over LR_Dic, ranked by accuracy.
grid_LR = GridSearchCV(
    estimator=model,
    param_grid=LR_Dic,
    scoring="accuracy",
    cv=5,
    return_train_score=False,
)
grid_LR.fit(X, y)  # searches on the full X / y currently in scope

In [None]:
# Best mean CV accuracy, the parameter set that achieved it, and the refitted
# estimator, printed one per line.
print(
    grid_LR.best_score_,
    grid_LR.best_params_,
    grid_LR.best_estimator_,
    sep="\n",
)

In [None]:
# Table of every candidate's mean cross-validated score next to its parameters.
pd.DataFrame(grid_LR.cv_results_)[["mean_test_score","params"]] 

#### Random Search

In [None]:
# Adapted from Dr. Trang Doan's week 6 solution.
# Randomised search for the logistic regression: 20 combinations sampled from
# LR_Dic, 5-fold CV, ranked by accuracy, fixed seed for repeatability.
rand = RandomizedSearchCV(
    estimator=model,
    param_distributions=LR_Dic,
    n_iter=20,
    cv=5,
    scoring="accuracy",
    random_state=5,
    return_train_score=False,
)
rand.fit(X, y)  # searches on the full X / y currently in scope

In [None]:
# Best mean CV accuracy, the parameter set that achieved it, and the refitted
# estimator from the randomised search, printed one per line.
print(
    rand.best_score_,
    rand.best_params_,
    rand.best_estimator_,
    sep="\n",
)

#### Optimised Logistic Regression  model and cross-validation

In [None]:
# Logistic regression with the hand-picked "optimised" settings, scored on the
# current train/test split and with 5-fold cross-validation on the full data.
model_OP = LogisticRegression(
    C=100,
    multi_class='auto',
    penalty='l2',
    max_iter=1000,
    solver='liblinear',
)
model_OP.fit(X_train, y_train)
train_accuracy_OP = model_OP.score(X_train, y_train)
test_accuracy_OP = model_OP.score(X_test, y_test)
# cross_val_score clones the estimator, so the fit above is not reused here.
scores = cross_val_score(model_OP, X, y, cv=5, scoring='accuracy')
print("Logistic Regression model:")
print("Training model accuracy:{:.3f}".format(train_accuracy_OP))
print("Testing model accuracy :{:.3f}".format(test_accuracy_OP))
print(scores)
# The value printed is the mean of the fold scores; the label used to say "Max".
print("Mean CV score:{:.3f}".format(scores.mean()))


#### Evaluation metrics and Confusion matrix (Hyperparameter)

In [None]:
# Evaluate the tuned logistic regression on the held-out test split.
y_test_OP = model_OP.predict(X_test)  # hard class predictions
# Positive-class probability: ROC AUC needs a ranking score. Feeding it 0/1
# predictions collapses the curve to a single operating point.
y_score_OP = model_OP.predict_proba(X_test)[:, 1]
cfm_LROP = confusion_matrix(y_test, y_test_OP)
print("Logistic Regression model:")
print('confusion matrix:')
print(cfm_LROP)
print('Evaluation metrics:')
PDF = pd.DataFrame(
    data=[[
        accuracy_score(y_test, y_test_OP),
        recall_score(y_test, y_test_OP),
        precision_score(y_test, y_test_OP),
        roc_auc_score(y_test, y_score_OP),
        # micro-averaged F1 equals accuracy for a single-label problem
        f1_score(y_test, y_test_OP, average='micro'),
    ]],
    columns=['accuracy', 'recall', 'precision', 'roc_auc_score', 'f1_score'],
    index=['Score'],
)
print(PDF)
Report = classification_report(y_test, y_test_OP)  # per-class precision/recall/F1
print('')
print(Report)
plt.figure(figsize=(10, 10))
# Heat map of the confusion matrix, annotated with the raw counts.
sns.heatmap(data=cfm_LROP, cmap="GnBu", annot=True, fmt=".0f")


### KNN

In [None]:
# Base estimator and search space for k-NN tuning.
model = KNeighborsClassifier()
model.fit(X_train, y_train)
k_range = list(range(1, 31))  # candidate neighbour counts 1..30
weights_options = ['uniform', 'distance']
param_grid = {"n_neighbors": k_range, "weights": weights_options}
print(param_grid)

 #### GridSearch

In [None]:
# Adapted from Dr. Trang Doan's week 6 solution.
# Exhaustive 5-fold grid search over param_grid, ranked by accuracy.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    return_train_score=False,
)
grid.fit(X, y)  # searches on the full X / y currently in scope

In [None]:
# Best mean CV accuracy, the parameter set that achieved it, and the refitted
# estimator, printed one per line.
print(
    grid.best_score_,
    grid.best_params_,
    grid.best_estimator_,
    sep="\n",
)

In [None]:
# Table of every candidate's mean cross-validated score next to its parameters.
pd.DataFrame(grid.cv_results_)[["mean_test_score","params"]]

#### Random Search

In [None]:
# Randomised search for k-NN: 20 combinations sampled from param_grid,
# 5-fold CV, ranked by accuracy, fixed seed for repeatability.
rand = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    scoring="accuracy",
    random_state=5,
    return_train_score=False,
)
rand.fit(X, y)  # searches on the full X / y currently in scope

In [None]:
# Best mean CV accuracy, the parameter set that achieved it, and the refitted
# estimator from the randomised search, printed one per line.
print(
    rand.best_score_,
    rand.best_params_,
    rand.best_estimator_,
    sep="\n",
)

#### Optimised K-NN model and cross-validation 

In [None]:
# k-NN with the hand-picked neighbour count, scored on the current train/test
# split and with 5-fold cross-validation on the full data.
# NOTE(review): n_neighbors=15 is hard-coded rather than read from the search's
# best_params_ — confirm that it matches the search result.
model_knnOP = KNeighborsClassifier(n_neighbors=15)
model_knnOP.fit(X_train, y_train)
# cross_val_score clones the estimator, so the fit above is not reused here.
scores = cross_val_score(model_knnOP, X, y, cv=5, scoring='accuracy')
knnOP_train_accuracy = model_knnOP.score(X_train, y_train)
knnOP_test_accuracy = model_knnOP.score(X_test, y_test)
print(" K-NN model:")
print(scores)
print(" Training model_knn with scalling Accuracy:{:.3f}".format(knnOP_train_accuracy))
# Label repaired: it used to read "withing Accuracy".
print(" Testing model_knn with scalling Accuracy: {:.3f}".format(knnOP_test_accuracy))

#### Evaluation metrics and Confusion matrix (Hyperparameter)

In [None]:
# Evaluate the tuned k-NN model on the held-out test split.
y_test_knnOP = model_knnOP.predict(X_test)  # hard class predictions
# Positive-class probability: ROC AUC needs a ranking score. Feeding it 0/1
# predictions collapses the curve to a single operating point.
y_score_knnOP = model_knnOP.predict_proba(X_test)[:, 1]
cfm_KNNOP = confusion_matrix(y_test, y_test_knnOP)
print("K-NN model:")
print('confusion matrix:')
print(cfm_KNNOP)
print('Evaluation metrics:')
PDF = pd.DataFrame(
    data=[[
        accuracy_score(y_test, y_test_knnOP),
        recall_score(y_test, y_test_knnOP),
        precision_score(y_test, y_test_knnOP),
        roc_auc_score(y_test, y_score_knnOP),
        # micro-averaged F1 equals accuracy for a single-label problem
        f1_score(y_test, y_test_knnOP, average='micro'),
    ]],
    columns=['accuracy', 'recall', 'precision', 'roc_auc_score', 'f1_score'],
    index=['Score'],
)
print(PDF)
Report = classification_report(y_test, y_test_knnOP)  # per-class precision/recall/F1
print('')
print(Report)
plt.figure(figsize=(10, 10))
# Heat map of the confusion matrix, annotated with the raw counts.
sns.heatmap(data=cfm_KNNOP, cmap="GnBu", annot=True, fmt=".0f")



### Random Forest

In [None]:
# Base estimator and search space for random-forest tuning.
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
RF_DIC = {
    'class_weight': ['balanced', 'balanced_subsample'],
    'criterion': ['gini', 'entropy'],
    # 'auto' was dropped: for classifiers it was an alias of 'sqrt', so it only
    # duplicated work, and it was removed in scikit-learn 1.3.
    'max_features': ['sqrt', 'log2'],
    'max_leaf_nodes': range(2, 10),
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 3, 5],
    'max_depth': [3, 6, 10, None],
    'n_estimators': [100, 500, 700],
}
# NOTE(review): this is still several thousand combinations, each fitted five
# times by the exhaustive grid search — consider shrinking the grid.
print(RF_DIC)

 #### GridSearch

In [None]:
# Adapted from Dr. Trang Doan's week 6 solution.
# Exhaustive 5-fold grid search over RF_DIC, ranked by accuracy.
grid = GridSearchCV(
    estimator=model,
    param_grid=RF_DIC,
    scoring="accuracy",
    cv=5,
    return_train_score=False,
)
grid.fit(X, y)  # searches on the full X / y currently in scope

In [None]:
# Best mean CV accuracy, the parameter set that achieved it, and the refitted
# estimator, printed one per line.
print(
    grid.best_score_,
    grid.best_params_,
    grid.best_estimator_,
    sep="\n",
)

#### Random Search

In [None]:
# Adapted from Dr. Trang Doan's week 6 solution.
# Randomised search for the random forest: 20 combinations sampled from RF_DIC,
# 5-fold CV, ranked by accuracy, fixed seed for repeatability.
rand = RandomizedSearchCV(
    estimator=model,
    param_distributions=RF_DIC,
    n_iter=20,
    cv=5,
    scoring="accuracy",
    random_state=5,
    return_train_score=False,
)
rand.fit(X, y)  # searches on the full X / y currently in scope

In [None]:
# Best mean CV accuracy, the parameter set that achieved it, and the refitted
# estimator from the randomised search, printed one per line.
print(
    rand.best_score_,
    rand.best_params_,
    rand.best_estimator_,
    sep="\n",
)

#### Optimised RandomForestClassifier model and cross-validation

In [None]:
# Random forest with the hand-picked "optimised" settings, scored on the current
# train/test split and with 5-fold cross-validation on the full data.
model_RFOP = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=9,
    max_features='log2',
    class_weight='balanced',
    random_state=0,
)
model_RFOP.fit(X_train, y_train)
# cross_val_score clones the estimator, so the fit above is not reused here.
scores = cross_val_score(model_RFOP, X, y, scoring='accuracy', cv=5)
train_accuracy = model_RFOP.score(X_train, y_train)
test_accuracy = model_RFOP.score(X_test, y_test)
print(scores)
print(f"Training model_RFOP Accuracy:{train_accuracy:.3f}")
print(f"Testing model_RFOP Accuracy: {test_accuracy:.3f}")

#### Evaluation metrics and Confusion matrix (Hyperparameter)

In [None]:
# Confusion matrix and evaluation metrics for the tuned Random Forest (model_RFOP)
y_test_RFOP = model_RFOP.predict(X_test)  # predicted class labels for the test split

cfm_RFOP = confusion_matrix(y_test, y_test_RFOP)  # creates the confusion matrix
print("Random Forest model:")  # header must name this model, not Logistic Regression
print('confusion matrix:')
print(cfm_RFOP)
print('Evaluation metrics:')

# One-row dataframe holding accuracy, recall, precision, ROC AUC and F1 on the test split.
# NOTE(review): roc_auc_score is given hard class predictions rather than predicted
# probabilities - confirm this is intended before comparing AUC values.
PDF = pd.DataFrame(
    data=[
        [
            accuracy_score(y_test, y_test_RFOP),
            recall_score(y_test, y_test_RFOP),
            precision_score(y_test, y_test_RFOP),
            roc_auc_score(y_test, y_test_RFOP),
            f1_score(y_test, y_test_RFOP, average='micro'),  # micro-averaged F1
        ]
    ],
    columns=['accuracy', 'recall', 'precision', 'roc_auc_score', 'f1_score'],
    index=['Score'],
)

# Per-class precision / recall / F1 report, with readable class names
labels = ['Probability of NOT having diabetes', 'Probability of having diabetes']  # target names for classification report
Report = classification_report(y_test, y_test_RFOP, target_names=labels)
print(Report)  # prints the classification_report
print(PDF)  # prints the dataframe with the evaluation-metric scores

plt.figure(figsize=(10, 10))  # sets the size of the figure
sns.heatmap(data=cfm_RFOP, cmap="GnBu", annot=True, fmt=".0f")  # annotated heatmap of the confusion matrix


### MLPClassifier

In [None]:
# Base MLP estimator (constructor defaults) handed to the searches below.
model = MLPClassifier()

# Search space: candidate values for each tuned hyper-parameter.
MLP_DIC = {
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
}
print(MLP_DIC)  # show the search space

 #### GridSearch

In [None]:
#!!!  Dr. Trang Doan week6 solution
# Exhaustive search over every combination in MLP_DIC, ranked by 5-fold CV accuracy.
# NOTE(review): the search is fitted on X, y while the tuned model is later scored on
# X_test - if X still contains the test rows, tuning has already seen them; confirm.
grid = GridSearchCV(
    estimator=model,
    param_grid=MLP_DIC,
    scoring="accuracy",
    cv=5,
    return_train_score=False,
)
grid.fit(X, y)

In [None]:
# Report the grid-search winner, in order: best mean CV score,
# the parameter combination that achieved it, and the corresponding estimator.
for attr_name in ("best_score_", "best_params_", "best_estimator_"):
    print(getattr(grid, attr_name))

#### Random Search

In [None]:
#!!!  Dr. Trang Doan week6 solution
# Randomised search for the MLP: evaluates 20 parameter settings drawn from
# MLP_DIC (seeded with random_state=5), each ranked by 5-fold CV accuracy.
rand = RandomizedSearchCV(
    estimator=model,
    param_distributions=MLP_DIC,
    n_iter=20,
    scoring="accuracy",
    cv=5,
    random_state=5,
    return_train_score=False,
)
rand.fit(X, y)

In [None]:
# Report the random-search winner, in order: best mean CV score,
# the parameter combination that achieved it, and the corresponding estimator.
for attr_name in ("best_score_", "best_params_", "best_estimator_"):
    print(getattr(rand, attr_name))

#### Optimised MLPClassifier model and cross-validation

In [None]:
# MLP built with fixed hyperparameters (hard-coded here, not read from the search
# objects) and a fixed seed, then trained on the training split.
# NOTE(review): per the scikit-learn docs learning_rate only applies to solver='sgd',
# so 'invscaling' presumably has no effect with 'lbfgs' - confirm.
MLP_OP = MLPClassifier(
    solver='lbfgs',
    activation='tanh',
    learning_rate='invscaling',
    alpha=1,
    max_iter=1000,
    random_state=0,
)
MLP_OP.fit(X_train, y_train)

# 5-fold cross-validated accuracy of the same configuration on X, y
scores = cross_val_score(MLP_OP, X, y, scoring='accuracy', cv=5)

# Accuracy on the split the model was fitted on vs. the test split
MLPOP_train_accuracy = MLP_OP.score(X_train, y_train)
MLPOP_test_accuracy = MLP_OP.score(X_test, y_test)

print("MLPClassifierr model:")
print(scores)
print(f"Training MLP_OP  with scalling Accuracy:{MLPOP_train_accuracy:.3f}")
print(f"Testing MLP_OP  with scalling Accuracy: {MLPOP_test_accuracy:.3f}")

#### Evaluation metrics and Confusion matrix (Hyperparameter)

In [None]:
# Confusion matrix and evaluation metrics for the tuned MLP (MLP_OP)
y_test_MLPOP = MLP_OP.predict(X_test)  # predicted class labels for the test split

cfm_MLPOP = confusion_matrix(y_test, y_test_MLPOP)  # creates the confusion matrix
print("MLPClassifier model:")
print('confusion matrix:')
print(cfm_MLPOP)  # print this model's matrix (not the logistic-regression one)
print('Evaluation metrics:')

# One-row dataframe holding accuracy, recall, precision, ROC AUC and F1 on the test split.
# NOTE(review): roc_auc_score is given hard class predictions rather than predicted
# probabilities - confirm this is intended before comparing AUC values.
PDF = pd.DataFrame(
    data=[
        [
            accuracy_score(y_test, y_test_MLPOP),  # accuracy classification score
            recall_score(y_test, y_test_MLPOP),  # recall
            precision_score(y_test, y_test_MLPOP),  # precision
            roc_auc_score(y_test, y_test_MLPOP),  # area under the ROC curve
            f1_score(y_test, y_test_MLPOP, average='micro'),  # micro-averaged F1
        ]
    ],  # this comma was missing, which made the whole cell a SyntaxError
    columns=['accuracy', 'recall', 'precision', 'roc_auc_score', 'f1_score'],
    index=['Score'],
)
print(PDF)  # prints the dataframe with the evaluation-metric scores

Report = classification_report(y_test, y_test_MLPOP)  # per-class precision / recall / F1
print('')
print(Report)  # print the classification_report

plt.figure(figsize=(10, 10))  # sets the size of the figure
sns.heatmap(data=cfm_MLPOP, cmap="GnBu", annot=True, fmt=".0f")  # annotated heatmap of the confusion matrix
