#Data Import

In [None]:
#Load the diabetes CSV from its Google Drive direct-download link (the cell needs network access to run)
import pandas as pd
df = pd.read_csv("https://drive.google.com/uc?export=download&id=1L67Wr-K0xjJErTPqw7ByfUfJMmHVqPiw")
#The bare name as the last expression makes the notebook render the frame as the cell output
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


#Object-Oriented Programming

In [None]:
#creating a class called "Diabetes Dataset" with attributes and methods to be used in my file
class DiabetesDataset:
    """Hold the diabetes data as a DataFrame and expose cleaning and X/y accessors.

    Attributes:
        df: The pandas DataFrame read from ``file_path``.
    """

    def __init__(self, file_path):
        """Read the CSV at ``file_path`` (anything ``pd.read_csv`` accepts) into ``self.df``."""
        self.df = pd.read_csv(file_path)

    def clean_data(self):
        """Drop rows that contain missing values, then drop duplicate rows.

        The cleaned frame replaces ``self.df``; nothing is returned.
        """
        # dropna() and drop_duplicates() both remove ROWS by default; chaining
        # them avoids inplace=True.
        self.df = self.df.dropna().drop_duplicates()

    def get_features(self, target_column='Outcome'):
        """Return every column except ``target_column`` (the feature matrix X).

        Args:
            target_column: Name of the label column to leave out.

        Raises:
            KeyError: If ``target_column`` is not a column of ``self.df``.
        """
        # axis=1 tells drop() that the label names a column, not a row.
        return self.df.drop(target_column, axis=1)

    def get_target(self, target_column='Outcome'):
        """Return the ``target_column`` column as a Series (the target y).

        Args:
            target_column: Name of the label column to return.

        Raises:
            KeyError: If ``target_column`` is not a column of ``self.df``.
        """
        return self.df[target_column]

#Data Exploration

I used the pandas `.shape` attribute to show the number of rows and columns in the dataset.

##Data Cleaning using OOP


In [None]:
#Source CSV: the Google Sheets export link for the dataset
file_path = "https://docs.google.com/spreadsheets/d/15JHtRlAIu20NKTo5S-oqrQ_J24TBURN7C0fozCS-hRM/export?format=csv"  #file path
dataset = DiabetesDataset(file_path)
dataset.clean_data() #removes rows with missing values and duplicate rows, using the "clean_data" method of the DiabetesDataset class defined above

In [None]:
#Count missing values per column on dataset.df - the frame clean_data() actually cleaned -
#rather than on the separately loaded df, which the cleaning step never touched
print(dataset.df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [None]:
# (number of rows, number of columns) of the dataset loaded into df
df.shape

(768, 9)

In [None]:
# displaying the first 5 rows of the dataset (head() defaults to 5 rows)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
#counting the rows in each class of the "Outcome" column (0 = non-diabetic, 1 = diabetic)
df['Outcome'].value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


**0 --> Non-Diabetic**

**1 --> Diabetic**

##Data Preprocessing

##Data split as X features and y targets using OOP

In [None]:
y = dataset.get_target() #the "get_target" method of the class defined above returns the "Outcome" column as the target y
#displaying y as the cell output
y

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
...,...
763,0
764,0
765,0
766,1


In [None]:
#assigning all the features to X by leaving out the "Outcome" column of the dataset
X = dataset.get_features() #the "get_features" method of the class defined above returns every column except "Outcome"
#displaying X as the cell output
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


##Data Standardising

In [None]:
from sklearn.preprocessing import StandardScaler

#Learn each feature's mean and standard deviation from X and rescale X in a single call
#NOTE(review): in the cells shown in this notebook, standardized_data is only printed;
#the train/test split and the models below are given the unscaled X
scaler = StandardScaler()
standardized_data = scaler.fit_transform(X)
print(standardized_data)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


###Data Splitting

In [None]:
from sklearn.model_selection import train_test_split #"train_test_split" from the Scikit Learn package splits the data into training and test sets

#Holding out 20% of the X features & y targets ("Outcome" column) for testing; the remaining 80% is used for training.
#shuffle=False keeps the rows in their original order, so every run gives the same split: the first 80% of rows train, the last 20% test.
#NOTE(review): because shuffle=False, the random_state argument has no effect on this split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

In [None]:
X_train #displaying the training features: the first 80% of the rows, since the split was not shuffled

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
609,1,111,62,13,182,24.0,0.138,23
610,3,106,54,21,158,30.9,0.292,24
611,3,174,58,22,194,32.9,0.593,36
612,7,168,88,42,321,38.2,0.787,40


In [None]:
X_test #displaying the test features: the last 20% of the rows, since the split was not shuffled

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
614,11,138,74,26,144,36.1,0.557,50
615,3,106,72,0,0,25.8,0.207,27
616,6,117,96,0,0,28.7,0.157,30
617,2,68,62,13,15,20.1,0.257,23
618,9,112,82,24,0,28.2,1.282,50
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


#Models

##Logistic Regression

####**Training the Logistic Regression model**

In [None]:
from sklearn.linear_model import LogisticRegression #Importing the Logistic Regression model from Scikit Learn

#Training the Logistic Regression model

# random_state=42 fixes the seed; max_iter=1000 caps the solver at 1000 iterations
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train, y_train) #fitting the model on the training features and their target labels

####**Applying the model to make a prediction**

In [None]:
#predicting with the fitted Logistic Regression model on both splits
y_lr_train_pred = lr.predict(X_train) #predictions for the rows the model was trained on
y_lr_test_pred = lr.predict(X_test) #predictions for the held-out test rows

In [None]:
#NOTE(review): the saved output under this cell shows non-integer values, not 0/1 class labels - it looks stale; re-run the cell to refresh it
y_lr_train_pred #displaying the model's predictions for the training rows

array([ 6.13208723e-01,  2.81451226e-03,  7.07350903e-01, -6.23548170e-03,
        8.59181837e-01,  2.11846623e-01,  6.45847859e-02,  5.81443207e-01,
        6.23897399e-01, -5.39125395e-02,  2.97598905e-01,  8.41940519e-01,
        6.93703393e-01,  5.98425172e-01,  5.61074905e-01,  4.22317541e-01,
        4.09052000e-01,  2.71408831e-01,  3.79140797e-01,  2.96922654e-01,
        4.31728346e-01,  3.64123982e-01,  9.27564080e-01,  3.39159640e-01,
        6.12942550e-01,  4.43232154e-01,  6.50488015e-01,  1.84843883e-02,
        4.95267094e-01,  3.41245410e-01,  4.13911884e-01,  5.45482895e-01,
        2.39124374e-02, -2.40886524e-02,  4.26482652e-01,  2.16320533e-01,
        6.05048763e-01,  4.01294394e-01,  2.29027771e-01,  5.25214809e-01,
        6.52314932e-01,  6.31787005e-01,  1.44248286e-01,  8.99137874e-01,
        5.65666774e-01,  9.69243949e-01,  4.49315774e-01, -1.60567995e-02,
        4.03657588e-01, -3.11587432e-02, -2.24622624e-02,  1.10544495e-01,
        6.60790060e-02,  

In [None]:
#NOTE(review): the saved output under this cell shows non-integer values, not 0/1 class labels - it looks stale; re-run the cell to refresh it
y_lr_test_pred #displaying the model's predictions for the test rows

array([ 0.668565  ,  0.11956052,  0.23279981, -0.17983979,  0.48881625,
        0.35258373,  0.2598292 ,  0.22624878,  1.0135107 ,  0.24404459,
        0.18201879,  0.19470264,  0.14598038,  0.30832235,  0.41194578,
        0.0418547 ,  0.3798912 ,  0.12411682,  0.17224134,  0.15462173,
        0.20435421,  0.44855224,  0.2125507 ,  0.16044164,  0.45480888,
       -0.0688029 ,  0.1207369 ,  0.40042875,  0.47537711,  0.29637316,
        0.18355167,  0.50285201,  0.40056567,  0.67798089,  0.46023613,
        0.0666391 , -0.0018573 ,  0.29997447,  0.35538663,  0.26678818,
        0.15951478,  0.52679266, -0.00225509,  0.46495802,  0.55797113,
        0.23429596,  0.6114119 ,  0.99984579,  0.63334676,  0.67109915,
        0.37662949,  0.18941746,  0.49565022,  0.31369471,  0.3077593 ,
        0.56354583,  0.69937583,  0.10421767,  0.15435679,  0.67872909,
        0.38712006,  0.77506224,  0.51806213,  0.15603316,  0.37457254,
        0.09838215, -0.17394852,  0.71540753,  0.27745056,  0.37

####**Evaluate model performance**

In [None]:
from sklearn.metrics import accuracy_score # Importing accuracy_score for model evaluation

y_lr_test_pred = lr.predict(X_test) #predicting labels for the test features (the same call as in the prediction cell above)
lr_test_accuracy = accuracy_score(y_test, y_lr_test_pred) #accuracy = share of test rows whose predicted label equals the true label
print(f"The accuracy the optimal model of Logistic Regression is {lr_test_accuracy:.2f}")#printing the accuracy score to 2 decimal places

The accuracy the optimal model of Logistic Regression is 0.77


##Random Forest

####**Training the model**

In [None]:
#Importing the Random Forest Classifier model from Scikit Learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification #imported but not used in this cell
from sklearn.model_selection import train_test_split #already imported in the data-splitting cell; not used in this cell

#A shallow forest: max_depth=2 limits the depth of every tree; random_state=42 fixes the seed
rf = RandomForestClassifier(max_depth=2, random_state=42)
rf.fit(X_train, y_train) #fitting the forest on the training features and their target labels

####**Applying the model to make a prediction**

In [None]:
#predicting with the fitted Random Forest model on both splits
y_rf_train_pred = rf.predict(X_train) #predictions for the rows the model was trained on
y_rf_test_pred = rf.predict(X_test) #predictions for the held-out test rows

####**Evaluating the model**

In [None]:
from sklearn.metrics import mean_squared_error, r2_score #measuring how far the predictions are from the true labels

#NOTE(review): these are regression metrics applied to a classifier. With 0/1 labels each squared error is 0 or 1,
#so the MSE equals the fraction of misclassified rows; accuracy_score is the more usual measure here.

#for the training set
rf_train_mse = mean_squared_error(y_train, y_rf_train_pred)
rf_train_r2 = r2_score(y_train, y_rf_train_pred) #R2 (coefficient of determination) from the r2_score function; it can be negative, as the test value in the results table shows

#for the test set
rf_test_mse = mean_squared_error(y_test, y_rf_test_pred)
rf_test_r2 = r2_score(y_test, y_rf_test_pred)

In [None]:
#collecting the method name and its four scores in one list
rf_metrics = ['Random Forest', rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]
#a flat list becomes a single column, so .T turns it into a single row
rf_results = pd.DataFrame(rf_metrics).T
#labelling each value with a meaningful column name
rf_results.columns = ['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2']

In [None]:
rf_results #displaying the one-row Random Forest summary table

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Random Forest,0.224756,0.007973,0.246753,-0.074747


In [None]:
#A second Random Forest classifier with much deeper trees (max_depth=49).
#random_state=42 fixes the seed so the forest, and the accuracy printed below, is the same on every run;
#without a seed the score varies between runs.
regr = RandomForestClassifier(max_depth=49, random_state=42)
regr.fit(X_train, y_train)

#Getting the accuracy score of the model
from sklearn.metrics import accuracy_score

res = regr.predict(X_test) #predicted labels for the test rows
model_error = accuracy_score(y_test, res) #despite the name, this holds the accuracy (share of correct predictions)
print(f"The accuracy the optimal model is {model_error:.2f}")

The accuracy the optimal model is 0.75


##KNN

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split

#Separating the feature columns from the "Outcome" target column
col = "Outcome"
data = df.loc[:, df.columns != col]
target = df.loc[:, df.columns == col] #a one-column DataFrame, not a Series

#Unshuffled 80/20 split: the first 80% of rows train, the last 20% test
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42, shuffle=False)

neigh = KNeighborsClassifier(n_neighbors=33) #KNN classifier that votes among the 33 nearest neighbours
#y_train is a one-column DataFrame; .values.ravel() flattens it to the 1-D array fit() expects,
#which avoids the column-vector warning scikit-learn otherwise emits
neigh.fit(X_train, y_train.values.ravel())

from sklearn.metrics import accuracy_score

res = neigh.predict(X_test) #predicted labels for the test rows
model_error = accuracy_score(y_test, res) #despite the name, this holds the accuracy (share of correct predictions)
print(f"The accuracy of the optimal model is {model_error:.2f}")



The accuracy of the optimal model is 0.75


  return self._fit(X, y)


##Support Vector Machine

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Separating the feature columns from the "Outcome" target column
col = "Outcome"
data = df.loc[:, df.columns != col]
target = df.loc[:, df.columns == col] #a one-column DataFrame, not a Series

#Unshuffled 80/20 split: the first 80% of rows train, the last 20% test
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42, shuffle=False)

#Pipeline: standardise the features, then classify with an SVC (C=1.0, default kernel).
#Fitting the pipeline fits the scaler on the training rows only.
regr = make_pipeline(StandardScaler(), SVC(C=1.0))
#y_train is a one-column DataFrame; .values.ravel() flattens it to the 1-D array fit() expects,
#which avoids the column-vector warning scikit-learn otherwise emits
regr.fit(X_train, y_train.values.ravel())

from sklearn.metrics import accuracy_score

res = regr.predict(X_test) #predicted labels for the test rows
model_error = accuracy_score(y_test, res) #despite the name, this holds the accuracy (share of correct predictions)
print(f"The accuracy of the optimal model is {model_error:.2f}")


  y = column_or_1d(y, warn=True)


The accuracy of the optimal model is 0.77


Corresponding kernel type: Linear
*   The best accuracy:77

Corresponding kernel type: Poly
*   The best accuracy:77

Corresponding kernel type: Precomputed
*   The best accuracy:77

Corresponding kernel type: rbf
*   The best accuracy:77


**All the kernel types reach the same best accuracy of 0.77 (77%).**

#Program

In [None]:
import pandas as pd # Importing pandas for data manipulation
from sklearn.metrics import accuracy_score # Importing accuracy_score for model evaluation
from sklearn.model_selection import train_test_split   #imported the "train_test_split" the Scikit Learn package, to split the data
from sklearn.preprocessing import StandardScaler # Importing the StandardScaler class from sklearn.preprocessing for feature scaling
from sklearn.linear_model import LogisticRegression #Importing the Logistic Regression model from Scikit Learn
from sklearn.ensemble import RandomForestClassifier #Importing the Random Forest Classifier model from Scikit Learn
from sklearn.neighbors import KNeighborsClassifier #Importing the KNN model from Scikit Learn
from sklearn.svm import SVC #Importing the Support Vector Machine model from Scikit Learn
from sklearn.pipeline import make_pipeline, Pipeline # For SVM, I'm importing the make_pipeline and Pipeline classes from sklearn.pipeline for creating a pipeline of operations to chain multiple data transformations and a final estimator into a single object

#Alternative source (Google Sheets CSV export), kept commented out:
#df = pd.read_csv("https://docs.google.com/spreadsheets/d/15JHtRlAIu20NKTo5S-oqrQ_J24TBURN7C0fozCS-hRM/export?format=csv") #setting the dataframe as the csv file
#Module-level DataFrame loaded from the Google Drive download link
df = pd.read_csv("https://drive.google.com/uc?export=download&id=1L67Wr-K0xjJErTPqw7ByfUfJMmHVqPiw")


#creating a class called "Diabetes Dataset" with attributes and methods to be used in my file
class DiabetesDataset:
    """Hold the diabetes data as a DataFrame and expose cleaning and X/y accessors.

    Attributes:
        df: The pandas DataFrame read from ``file_path``.
    """

    def __init__(self, file_path):
        """Read the CSV at ``file_path`` (anything ``pd.read_csv`` accepts) into ``self.df``."""
        self.df = pd.read_csv(file_path)

    def clean_data(self):
        """Drop rows that contain missing values, then drop duplicate rows.

        The cleaned frame replaces ``self.df``; nothing is returned.
        """
        # dropna() and drop_duplicates() both remove ROWS by default; chaining
        # them avoids inplace=True.
        self.df = self.df.dropna().drop_duplicates()

    def get_features(self, target_column='Outcome'):
        """Return every column except ``target_column`` (the feature matrix X).

        Args:
            target_column: Name of the label column to leave out.

        Raises:
            KeyError: If ``target_column`` is not a column of ``self.df``.
        """
        # axis=1 tells drop() that the label names a column, not a row.
        return self.df.drop(target_column, axis=1)

    def get_target(self, target_column='Outcome'):
        """Return the ``target_column`` column as a Series (the target y).

        Args:
            target_column: Name of the label column to return.

        Raises:
            KeyError: If ``target_column`` is not a column of ``self.df``.
        """
        return self.df[target_column]

#Putting everything into a program that runs and gives output
def main():
    """Load and clean the diabetes data, then train and compare four classifiers.

    Prints, in order: the per-column missing-value counts of the cleaned
    data, the standardized feature matrix, one accuracy line per model
    (Logistic Regression, Random Forest, KNN, SVM) and a summary table of
    model name, test accuracy and the parameter used.

    Every model is trained and scored on the same split of the cleaned
    dataset, so the function does not depend on a module-level ``df``.

    Returns:
        None.
    """
    # ---- Data import ----
    file_path = "https://drive.google.com/uc?export=download&id=1L67Wr-K0xjJErTPqw7ByfUfJMmHVqPiw" #file path to the dataset
    #file_path = "https://docs.google.com/spreadsheets/d/15JHtRlAIu20NKTo5S-oqrQ_J24TBURN7C0fozCS-hRM/export?format=csv"  #use this file path if the above file path link doesn't work
    dataset = DiabetesDataset(file_path)

    # ---- Data cleaning ----
    dataset.clean_data() #removes rows with missing values and duplicate rows
    # Check the frame that was actually cleaned.
    print(dataset.df.isnull().sum())

    # ---- Features (X) and target (y) ----
    X = dataset.get_features() #every column except "Outcome"
    y = dataset.get_target() #the "Outcome" column: 0 = non-diabetic, 1 = diabetic

    # ---- Data standardising (printed for inspection only) ----
    # NOTE(review): this scaler is fitted on ALL rows, test rows included, so
    # standardized_data must not be used to train a model scored on the test
    # split. The models below receive the unscaled X; only the SVM scales its
    # input, inside its own pipeline.
    scaler = StandardScaler()
    standardized_data = scaler.fit_transform(X)
    print(standardized_data)

    # ---- Data splitting ----
    # 20% of the rows are held out for testing. shuffle=False keeps the
    # original row order, so the split is identical on every run (first 80%
    # of rows train, last 20% test); random_state has no effect when
    # shuffle=False.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

    model_results = [] #one [model, accuracy, parameter] row per model

    # ---- Logistic Regression ----
    lr = LogisticRegression(random_state=42, max_iter=1000) #max_iter caps the solver at 1000 iterations
    lr.fit(X_train, y_train)
    lr_test_accuracy = accuracy_score(y_test, lr.predict(X_test))
    print(f"The accuracy the optimal model of Logistic Regression is {lr_test_accuracy:.2f}")
    model_results.append(['Logistic Regression', lr_test_accuracy, 'N/A'])

    # ---- Random Forest ----
    # random_state fixes the seed so the reported accuracy is reproducible.
    forest = RandomForestClassifier(max_depth=49, random_state=42)
    forest.fit(X_train, y_train)
    rf_test_accuracy = accuracy_score(y_test, forest.predict(X_test))
    print(f"The accuracy the optimal model of Random Forest is {rf_test_accuracy:.2f}")
    model_results.append(['Random Forest', rf_test_accuracy, 'max_depth=49'])

    # ---- KNN ----
    neigh = KNeighborsClassifier(n_neighbors=33) #votes among the 33 nearest neighbours
    neigh.fit(X_train, y_train)
    knn_test_accuracy = accuracy_score(y_test, neigh.predict(X_test))
    print(f"The accuracy of the optimal model of KNN is {knn_test_accuracy:.2f}")
    model_results.append(['KNN', knn_test_accuracy, 'n_neighbors=33'])

    # ---- Support Vector Machine ----
    # The pipeline standardises the features and then applies the SVC;
    # fitting it fits the scaler on the training rows only.
    svm = make_pipeline(StandardScaler(), SVC(C=1.0))
    svm.fit(X_train, y_train)
    svm_test_accuracy = accuracy_score(y_test, svm.predict(X_test))
    print(f"The accuracy of the optimal model of SVM is {svm_test_accuracy:.2f}")
    model_results.append(['SVM', svm_test_accuracy, 'C=1.0, kernel="rbf"'])

    # ---- Summary table ----
    results_df = pd.DataFrame(model_results, columns=['Model', 'Optimal Accuracy', 'Optimal Parameter'])
    print(results_df)


#Running the whole program; everything it prints appears as this cell's output
main()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]
The accuracy the optimal model of Logistic Regression is 0.77
The accuracy the optimal model of Random Forest is 0.73
The accuracy of the optimal model of KNN is 0.75
The accuracy of the optimal model of SVM is 0.77
         