In [None]:
#Problem Statement
#As a data professional working for a pharmaceutical company, you need to develop a
#model that predicts whether a patient will be diagnosed with diabetes. The model needs
#to have an accuracy score greater than 0.85.


In [1]:
# Data Importation
import pandas as pd
df = pd.read_csv('https://bit.ly/DiabetesDS')


In [2]:
# Data Exploration
df.shape


(768, 9)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
df.head(4)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0


In [5]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
# Data Cleaning
df = df.drop_duplicates()

In [7]:
df.shape

(768, 9)

In [6]:
#missing data
#missing data
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
Pregnancies,0,0.0
Glucose,0,0.0
BloodPressure,0,0.0
SkinThickness,0,0.0
Insulin,0,0.0
BMI,0,0.0
DiabetesPedigreeFunction,0,0.0
Age,0,0.0
Outcome,0,0.0


In [23]:
# Data Preparation - Spliting to train and test data
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.25, random_state=111)


df_train_features = df_train.drop(["Outcome"], axis=1)
df_train_target = df_train["Outcome"]

df_test_features = df_test.drop(["Outcome"], axis=1)
df_test_target = df_test["Outcome"]


In [45]:
#transforming the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_train_features = scaler.fit_transform(df_train_features)
df_test_features = scaler.fit_transform(df_test_features)


In [46]:
#models - RandomForest
from sklearn.ensemble import RandomForestClassifier
model_1 = RandomForestClassifier(random_state=222, max_depth=3, n_estimators=3)
model_1.fit(df_train_features,df_train_target)
pred_1 = model_1.predict(df_test_features)


In [47]:
#models - DecisionTree
from sklearn.tree import DecisionTreeClassifier
model_2 = DecisionTreeClassifier(random_state=111,max_depth = 2)
model_2.fit(df_train_features,df_train_target)
pred_2 = model_2.predict(df_test_features)

In [48]:
#logisticregression
from sklearn.linear_model import LogisticRegression
model_3 = LogisticRegression(random_state=12345, solver='liblinear')
model_3.fit(df_train_features,df_train_target)
pred_3 = model_3.predict(df_test_features)

In [49]:
#Accuracy
from sklearn.metrics import accuracy_score

acc_1 = accuracy_score(df_test_target,pred_1)
acc_2 = accuracy_score(df_test_target,pred_2)
acc_3 = accuracy_score(df_test_target,pred_3)

print('Accuracy: Random_Forest: %.3f' % acc_1)
print('Accuracy: Decision Tree: %.3f' % acc_2)
print('Accuracy:Logistic Regression: %.3f' % acc_3)

Accuracy: Random_Forest: 0.781
Accuracy: Decision Tree: 0.734
Accuracy:Logistic Regression: 0.776


In [None]:
#the best accuracy attained has been 78.1% from the Random Forest