# Prerequisites

In [2]:
# Import numpy and pandas libraries to begin with
import pandas as pd
import numpy as np


# Data Importation

In [3]:
# Read the diabetes CSV from its hosted location into a DataFrame,
# then display the first few records as a quick sanity check.
DATA_URL = "https://bit.ly/DiabetesDS"
diabetes_df = pd.read_csv(DATA_URL)
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Data Exploration 

In [4]:
# Check dataframe structure: .shape is the (number of rows, number of columns) tuple
diabetes_df.shape

(768, 9)

In [6]:
# Check the column datatypes (one dtype per column) to see whether every field is numeric
diabetes_df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [7]:
# Report, for each column, whether it contains at least one missing value
# (True = the column has one or more nulls, False = no nulls at all).
diabetes_df.isna().any()

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool

In [11]:
# List the distinct values found in the Pregnancies column
diabetes_df["Pregnancies"].unique().tolist()

[6, 1, 8, 0, 5, 3, 10, 2, 4, 7, 9, 11, 13, 15, 17, 12, 14]

In [13]:
# List the distinct values found in the Outcome (target) column
diabetes_df["Outcome"].unique().tolist()

[1, 0]

In [12]:
# Show every row that repeats an earlier row across all columns;
# an empty result means the dataset has no duplicate rows.
is_repeated_row = diabetes_df.duplicated()
diabetes_df.loc[is_repeated_row]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


# Data Exploration Observations
- The dataset has 9 columns and 768 rows
- All columns are of integer or float datatype
- There are no duplicate rows
- There are no null values in any of the columns
- First 8 columns will form the features for our analysis while the Outcome column will be our target
- So far the dataset looks OK.



# Data Cleanup

We will undertake two clean up exercises.
- When modeling, it is important to clean the data sample to ensure that the observations best represent the problem.
- Sometimes a dataset can contain extreme values that are outside the range of what is expected and unlike the other data i.e. outliers.
- Outliers are known to cause models such as linear regression to learn a biased or skewed understanding of the problem, so removing them from the training set allows a more effective model to be learned.

Our two clean up exercises will be:
- Round off the Diabetes Pedigree Function to 2 decimal places
- Remove outliers from the dataset

In [17]:
# Keep only two decimal places of the DiabetesPedigreeFunction column.
# Note: this overwrites the column inside diabetes_df itself.
pedigree_rounded = diabetes_df['DiabetesPedigreeFunction'].round(2)
diabetes_df['DiabetesPedigreeFunction'] = pedigree_rounded
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.63,50,1
1,1,85,66,29,0,26.6,0.35,31,0
2,8,183,64,0,0,23.3,0.67,32,1
3,1,89,66,23,94,28.1,0.17,21,0
4,0,137,40,35,168,43.1,2.29,33,1


In [18]:
# Identify outlier rows with the 1.5 * IQR rule
# ---
# Per-column first and third quartiles, and the interquartile range between them.
# Q1, Q3 and IQR are kept as notebook-level names because later cells reuse them.
Q1, Q3 = diabetes_df.quantile(0.25), diabetes_df.quantile(0.75)
IQR = Q3 - Q1

# Per-column fences: anything below the lower fence or above the upper fence
# counts as an outlier value. The comparison covers every column of diabetes_df.
# ---
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier_value = (diabetes_df < lower_fence) | (diabetes_df > upper_fence)

# A row is flagged as soon as any one of its values is an outlier.
is_outlier_row = is_outlier_value.any(axis=1)
diabetes_df_iqr = diabetes_df[is_outlier_row]

# Size of the outlier subset, to see how many rows removal would cost
# ---
diabetes_df_iqr.shape

(128, 9)

In [19]:
# Explore the outliers before deleting them: display the rows flagged by the IQR filter
diabetes_df_iqr

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
4,0,137,40,35,168,43.1,2.29,33,1
7,10,115,0,0,0,35.3,0.13,29,0
8,2,197,70,45,543,30.5,0.16,53,1
9,8,125,96,0,0,0.0,0.23,54,1
12,10,139,80,0,0,27.1,1.44,57,0
...,...,...,...,...,...,...,...,...,...
706,10,115,0,0,0,0.0,0.26,30,1
707,2,127,46,21,335,34.4,0.18,22,0
710,3,158,64,13,387,31.2,0.30,24,0
715,7,187,50,33,392,33.9,0.83,34,1


We will omit the 128 rows that contain outliers so that we have a dataset that helps create a more effective model.

In [22]:
# Build the clean dataframe by discarding every row that holds an outlier.
# A value is an outlier when it falls below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR for its column; one such value is enough to drop the row.
value_too_low = diabetes_df < (Q1 - 1.5 * IQR)
value_too_high = diabetes_df > (Q3 + 1.5 * IQR)
row_has_outlier = (value_too_low | value_too_high).any(axis=1)
clean_df = diabetes_df[~row_has_outlier]

# Size of the final dataset.
clean_df.shape

(640, 9)

Our clean dataframe has 640 rows and 9 columns

# Data Preparation and Modeling

# Decision Tree

Decision Tree: 1- Test the max_depth parameter that gives us the highest accuracy for our model

In [69]:
# Bring in the classifier, the accuracy metric and the train/validation splitter
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Outcome is the target; every other column is a feature
target = clean_df['Outcome']
features = clean_df.drop(columns=['Outcome'])

# Hold out 25% of the rows as the validation set (fixed seed so the split is repeatable)
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=123456789
)

# Try every max_depth from 1 to 19 and report the validation accuracy of each tree
for depth in range(1, 20):
    # Fresh tree capped at the current depth, trained on the training split
    model = DecisionTreeClassifier(max_depth=depth, random_state=123456789)
    model.fit(features_train, target_train)

    # Predict the held-out rows and compare against their true outcomes
    predictions_valid = model.predict(features_valid)
    depth_accuracy = accuracy_score(target_valid, predictions_valid)

    print("max_depth =", depth, ": ", end='')
    print(depth_accuracy)

max_depth = 1 : 0.7875
max_depth = 2 : 0.7625
max_depth = 3 : 0.7625
max_depth = 4 : 0.725
max_depth = 5 : 0.73125
max_depth = 6 : 0.76875
max_depth = 7 : 0.775
max_depth = 8 : 0.7875
max_depth = 9 : 0.78125
max_depth = 10 : 0.76875
max_depth = 11 : 0.7625
max_depth = 12 : 0.74375
max_depth = 13 : 0.76875
max_depth = 14 : 0.775
max_depth = 15 : 0.75
max_depth = 16 : 0.75
max_depth = 17 : 0.75
max_depth = 18 : 0.75
max_depth = 19 : 0.75


Implement using a max_depth of 8, which (tied with a max_depth of 1) gives the best accuracy of 79% when random state is set to 123456789

In [70]:
# import DecisionTreeClassifier, accuracy_score and train_test_split from sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# define features (every column except Outcome) and target (Outcome)
features = clean_df.drop(['Outcome'], axis=1)
target = clean_df['Outcome']

# split the dataset between train set and validation set, with the validation set being 25% of the dataset
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=123456789
)

# create a decision tree classifier model and set max_depth to 8
# (same seed as the split so the run is repeatable)
model = DecisionTreeClassifier(random_state=123456789, max_depth=8)

# train the model on the training split
model.fit(features_train, target_train)

# predict the validation outcomes and wrap them in a Series;
# note the Series gets a fresh 0..n-1 index, not the row labels of target_valid
predicted_valid = pd.Series(model.predict(features_valid))

# check model accuracy and print the outcome
# (true labels first, predictions second, matching accuracy_score's signature)
accuracy_valid = accuracy_score(target_valid, predicted_valid)
print(accuracy_valid)

0.7875


# Random Forest

Random Forest: 1- Test the n_estimators parameter that gives us the highest accuracy for our model

In [94]:
# import RandomForestClassifier from sklearn & train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# define features (every column except Outcome) and target (Outcome)
features = clean_df.drop(['Outcome'], axis=1)
target = clean_df['Outcome']

# split the dataset between train set and validation set, with the validation set being 25% of the dataset
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

# try every n_estimators value from 1 to 24 and report the validation accuracy of each forest
for estimator in range(1, 25):
    # create a forest with the current number of trees (fixed seed for repeatability)
    model = RandomForestClassifier(random_state=12345, n_estimators=estimator)

    # train the model on the training split
    model.fit(features_train, target_train)

    # score() evaluates the model on the validation split and returns its
    # mean accuracy (a single number), not the individual predictions
    accuracy_valid = model.score(features_valid, target_valid)

    print("n_estimators =", estimator, ": ", end='')
    print(accuracy_valid)

n_estimators = 1 : 0.675
n_estimators = 2 : 0.7375
n_estimators = 3 : 0.73125
n_estimators = 4 : 0.7625
n_estimators = 5 : 0.74375
n_estimators = 6 : 0.74375
n_estimators = 7 : 0.76875
n_estimators = 8 : 0.775
n_estimators = 9 : 0.775
n_estimators = 10 : 0.7875
n_estimators = 11 : 0.7875
n_estimators = 12 : 0.7875
n_estimators = 13 : 0.8
n_estimators = 14 : 0.79375
n_estimators = 15 : 0.79375
n_estimators = 16 : 0.79375
n_estimators = 17 : 0.79375
n_estimators = 18 : 0.8125
n_estimators = 19 : 0.80625
n_estimators = 20 : 0.80625
n_estimators = 21 : 0.81875
n_estimators = 22 : 0.80625
n_estimators = 23 : 0.8
n_estimators = 24 : 0.79375


2: Implement using an n_estimators value of 21 for the best accuracy of 82% when random state is set to 12345

In [78]:
# import RandomForestClassifier, accuracy_score and train_test_split from sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# define features (every column except Outcome) and target (Outcome)
features = clean_df.drop(['Outcome'], axis=1)
target = clean_df['Outcome']

# split the dataset between train set and validation set, with the validation set being 25% of the dataset
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

# create a random forest classifier model and set n_estimators to 21
# (same seed as the split so the run is repeatable)
model = RandomForestClassifier(random_state=12345, n_estimators=21)

# train the model on the training split
model.fit(features_train, target_train)

# predict the validation outcomes and wrap them in a Series;
# note the Series gets a fresh 0..n-1 index, not the row labels of target_valid
predicted_valid = pd.Series(model.predict(features_valid))

# check model accuracy and print the outcome
# (true labels first, predictions second, matching accuracy_score's signature)
accuracy_valid = accuracy_score(target_valid, predicted_valid)
print(accuracy_valid)

0.81875


# Logistic Regression

In [84]:
# Bring in the classifier, the accuracy metric and the train/validation splitter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Outcome is the target; every other column is a feature
target = clean_df['Outcome']
features = clean_df.drop(columns=['Outcome'])

# Hold out 25% of the rows as the validation set (fixed seed so the split is repeatable)
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

# Logistic regression using the liblinear solver, with random state 12345
model = LogisticRegression(solver='liblinear', random_state=12345)

# Train on the training split
model.fit(features_train, target_train)

# Despite its name, predictions_valid holds the value returned by score()
# for the validation split (the model's mean accuracy), not the predicted labels
predictions_valid = model.score(features_valid, target_valid)

print(predictions_valid)

0.76875
