In [None]:
#Task 1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report

#for task 11 confusion_matrix plot
import itertools

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Task 2: Import dataset
# Read the diabetes CSV (path is relative to the notebook's working directory)
# into a DataFrame, then preview the first five rows as the cell output.
diabetes = pd.read_csv(filepath_or_buffer='Diabetes_Data.csv')
diabetes.head(n=5)

In [None]:
# More on Task 2: Check the dimensions of the dataset
# (The label previously said "Cancer", which mislabelled this diabetes data.)
print("Diabetes data set dimensions : {}".format(diabetes.shape))
# The original author reported 768 observations/rows and 9 attributes/columns.

# Check for missing or null data points.
# `isnull` and `isna` are aliases in pandas, so one call is enough. Keeping it
# as the last expression of the cell makes the notebook display the counts.
diabetes.isna().sum()
# The original author reported no missing values in this dataset.

In [None]:
# Task 3: Set the input (X) and output/target (y)
# The target is column index 8; every column before it is used as a feature.
# The previous slice `1:8` started at index 1 and therefore silently dropped
# the first column, leaving 7 features instead of 8.
# NOTE(review): assumes column 0 is a genuine feature rather than a row id;
# the dataset is described elsewhere in this notebook as having 9 columns.
X = diabetes.iloc[:, :8].values
y = diabetes.iloc[:, 8].values

In [None]:
# Task 4.1: Count the samples in each 'Outcome' class
ax = sns.countplot(x="Outcome", data=diabetes, palette="flare")
ax.set_title('Number of samples in each class (0: Healthy, 1: Diabetic)', fontsize=20)
for bar in ax.patches:
    # Bar heights may come back as floats; show them as whole counts.
    count = int(bar.get_height())
    # Anchor the label on the bar's horizontal centre. The text is drawn with
    # ha='center', so anchoring at a fixed offset from the left edge would
    # leave it visibly off-centre.
    bar_centre_x = bar.get_x() + bar.get_width() / 2
    # The leading newline together with va='top' pushes the label just inside
    # the top of the bar, where the white text is readable.
    ax.annotate(f'\n{count}', (bar_centre_x, bar.get_height()),
                ha='center', va='top', color='white', size=18)
plt.show()

In [None]:
# Task 4.2: Correlation heatmap of all columns
# Open a 12 x 10 figure first so the annotated heatmap is drawn onto it.
plt.figure(figsize=(12, 10))
# Pairwise correlations, with the coefficient written in each cell.
p = sns.heatmap(
    data=diabetes.corr(),
    cmap='PiYG',
    annot=True,
)

In [None]:
# Task 5: Scale input data
# Print summary statistics. The min/max rows show that the feature ranges
# differ widely (e.g. DiabetesPedigreeFunction compared with Glucose level).
print(diabetes.describe())

# Standardise every feature so that the large differences between feature
# ranges do not dominate training.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling. Fitting on the training rows
# only would avoid that leakage; this needs changes across several cells.
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)

In [None]:
# Task 6: Splitting data
# Hold out 30% of the rows for testing. The split is stratified on y so both
# parts keep the class proportions, and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
    stratify=y,
)
# To inspect the result, print the .shape of X_train, X_test, y_train, y_test.

In [None]:
# Task 7.1 and 7.2: Build the model and fit it to the training set
# random_state pins the weight initialisation and batch shuffling. Without it
# every run of the notebook gives different scores, even though the
# train/test split itself is seeded.
mlp = MLPClassifier(
    max_iter=200,
    alpha=0.01,
    activation='logistic',
    solver='adam',
    random_state=3)

# Fit the model on the training split
mlp.fit(X_train, y_train)
# Predict the held-out test split
mlp_predict = mlp.predict(X_test)

In [None]:
# Task 7.3: Adjust the configuration

# `n_iter_` is the fitted attribute that holds the number of iterations the
# solver has run. (The previous placeholder `mlp.????` was a syntax error.)
print("Number of iterations: ", mlp.n_iter_)
# Q: Why did the iterations stop before 200?
# A: When that happens the solver reports: "Training loss did not improve more
#    than tol=0.000100 for 10 consecutive epochs. Stopping."

# Plot the training loss recorded at each iteration.
loss_values = mlp.loss_curve_
plt.plot(loss_values)
plt.xlabel('Iteration')
plt.ylabel('Training loss')
plt.title('MLP training loss curve')
plt.show()

In [None]:
# Task 8: Report how the fitted MLP performs
# Accuracy of the stored test-set predictions.
print(f'MLP Accuracy: {accuracy_score(y_test, mlp_predict) * 100:.2f}%')
# Per-class precision / recall / F1 table.
print('MLP Classification report:\n\n', classification_report(y_test, mlp_predict))
# Mean accuracy on the training and test splits, as computed by the estimator.
print(f'MLP Training set score: {mlp.score(X_train, y_train) * 100:.2f}%')
print(f'MLP Testing set score: {mlp.score(X_test, y_test) * 100:.2f}%')

In [None]:
# Task 9: Confusion matrix for MLPClassifier using the train/test split
outcome_labels = sorted(diabetes.Outcome.unique())
cm_ax = sns.heatmap(
    confusion_matrix(y_test, mlp_predict),
    annot=True,
    fmt="d",  # integer counts; use ".1f" for a normalised matrix
    xticklabels=outcome_labels,
    yticklabels=outcome_labels
)
# sklearn's confusion_matrix puts true classes on rows and predicted classes
# on columns. Label the axes so the plot can be read without knowing that.
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
cm_ax.set_title('Confusion matrix (hold-out test set)')
plt.show()

In [None]:
# Task 10: Stratified 10-fold cross-validation
# Seed the shuffle so fold membership, and every score below, is reproducible
# from one run of the notebook to the next.
kfold = StratifiedKFold(10, shuffle=True, random_state=3)

scores = list()                     # per-fold test accuracy
predicted_targets = np.array([])    # predictions pooled over all folds
actual_targets = np.array([])       # matching true labels pooled over all folds
mlp_predict_proba = np.array([])    # not filled in this cell; name kept in case a later cell uses it

# NOTE: the loop rebinds X_train / X_test / y_train / y_test / mlp_predict and
# refits `mlp`. After this cell they describe the LAST fold, not the Task 6
# hold-out split.
for train_ix, test_ix in kfold.split(X, y):
    # split data
    X_train, X_test, y_train, y_test = X[train_ix], X[test_ix], y[train_ix], y[test_ix]
    # fit the model on this fold's training part
    mlp.fit(X_train, y_train)
    # predict this fold's test part and pool the results
    mlp_predict = mlp.predict(X_test)
    predicted_targets = np.append(predicted_targets, mlp_predict)
    actual_targets = np.append(actual_targets, y_test)
    print('----------------------')

    # evaluate on predictions
    score_test = accuracy_score(y_test, mlp_predict)
    print('score_test: %.2f' % score_test)
    scores.append(score_test)

# summarize: accuracy over the pooled predictions, then the spread across folds
print('MLP Accuracy: {:.2f}%'.format(accuracy_score(actual_targets, predicted_targets) * 100))
print('MLP mean fold accuracy: {:.2f}% (+/- {:.2f}%)'.format(np.mean(scores) * 100, np.std(scores) * 100))

In [None]:
## Task 11: Function to generate Kfold's confusion_matrix
def generate_confusion_matrix(cnf_matrix, classes, normalize=False, title='Confusion matrix'):
    """Draw a confusion matrix on the current matplotlib axes and return it.

    Parameters
    ----------
    cnf_matrix : array-like of shape (n_classes, n_classes)
        Confusion matrix counts. Integer counts are expected when
        ``normalize`` is False because cells are formatted with ``'d'``.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        If True, divide every row by its row total before drawing.
    title : str, default 'Confusion matrix'
        Title placed above the plot.

    Returns
    -------
    numpy.ndarray
        The matrix that was drawn (row-normalised when ``normalize`` is True).
    """
    cnf_matrix = np.asarray(cnf_matrix)
    if normalize:
        cnf_matrix = cnf_matrix.astype('float')
        row_totals = cnf_matrix.sum(axis=1)[:, np.newaxis]
        # A row whose total is 0 would divide by zero and fill the row with
        # NaN. Leave such rows at 0 instead.
        cnf_matrix = np.divide(cnf_matrix, row_totals,
                               out=np.zeros_like(cnf_matrix),
                               where=row_totals != 0)

    plt.imshow(cnf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so the number stays
    # readable against the colour map.
    thresh = cnf_matrix.max() / 2.

    for i, j in itertools.product(range(cnf_matrix.shape[0]), range(cnf_matrix.shape[1])):
        plt.text(j, i, format(cnf_matrix[i, j], fmt), horizontalalignment="center",
                 color="white" if cnf_matrix[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass after the axis labels exist so it makes room for them.
    plt.tight_layout()

    return cnf_matrix

In [None]:
# Task 11: Function to plot Kfold's confusion_matrix
def plot_myconfusion_matrix(predicted_labels_list, y_test_list, classes=None):
    """Compute and plot the confusion matrix for pooled k-fold predictions.

    Parameters
    ----------
    predicted_labels_list : array-like
        Predicted labels pooled over all folds.
    y_test_list : array-like
        True labels, in the same order as ``predicted_labels_list``.
    classes : sequence, optional
        Tick labels for the classes. If omitted, a notebook-level
        ``class_names`` variable is used when it exists (the behaviour this
        function had before); otherwise the sorted unique labels found in the
        data are used, so the function no longer fails when ``class_names``
        has not been defined yet.
    """
    cnf_matrix = confusion_matrix(y_test_list, predicted_labels_list)
    # Note: this changes numpy's global print precision for the whole session.
    np.set_printoptions(precision=2)

    if classes is None:
        classes = globals().get('class_names')
    if classes is None:
        classes = np.unique(np.concatenate([np.ravel(y_test_list),
                                            np.ravel(predicted_labels_list)]))

    # Plot non-normalized confusion matrix
    plt.figure()
    generate_confusion_matrix(cnf_matrix, classes=classes, title='Confusion matrix from K-fold result')
    plt.show()

In [None]:
#Q1
# Tick labels for the two outcome classes (0: Healthy, 1: Diabetic).
class_names=[0, 1]

# Raw counts pooled over all folds (rows: true class, columns: predicted class).
print(confusion_matrix(actual_targets,predicted_targets))

# Call the Task 11 helper to generate and plot the confusion matrix.
# Its argument order is (predictions, true labels).
plot_myconfusion_matrix(predicted_targets, actual_targets)