# Model Quality and Improvements

**Problem Statement**

As a data professional working for a pharmaceutical company, you need to develop a model that predicts whether a patient will be diagnosed with diabetes. The model needs to have an accuracy score greater than 0.85.


# Question 1

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# Load the diabetes dataset from the hosted CSV (a remote short link, so this
# cell needs network access) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='https://bit.ly/DiabetesDS')
df.head(n=5)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape


(768, 9)

In [None]:
# Data type of every column, to confirm all columns are numeric before modelling.
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [None]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [None]:
# Number of rows that pandas flags as duplicates of another row.
df.duplicated().sum()

0

In [None]:
# Baseline: decision tree fitted and scored on the same (full) dataset.
# NOTE: both printed numbers are accuracy on the rows the model was trained
# on, not an estimate of how it performs on unseen patients.
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# 'Outcome' is the label; every other column is used as a feature.
target = df['Outcome']
features = df.drop(columns=['Outcome'])

# fit() returns the fitted estimator, so construction and training can be chained.
model = DecisionTreeClassifier(random_state=12345).fit(features, target)
predictions = model.predict(features)

print(model.score(features, target))
print("Accuracy :", accuracy_score(target, predictions))

1.0
1.0


In [None]:
# Baseline: random forest fitted and scored on the same (full) dataset.
# NOTE: both printed numbers are accuracy on the training rows themselves.
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
# accuracy_score is called below; import it here so this cell does not depend
# on another modelling cell having been executed first.
from sklearn.metrics import accuracy_score

# 'Outcome' is the label; every other column is used as a feature.
features = df.drop(['Outcome'], axis=1)
target = df['Outcome']

# A forest of only 3 trees; random_state fixes the seed for repeatable results.
model = RandomForestClassifier(random_state=12345, n_estimators=3)

model.fit(features, target)
predictions = model.predict(features)

# model.score and accuracy_score report the same metric; both are printed.
print(model.score(features, target))
print("Accuracy :", accuracy_score(target, predictions))

0.9466145833333334
Accuracy : 0.9466145833333334


In [None]:
# Baseline: logistic regression fitted and scored on the same (full) dataset.
# NOTE: both printed numbers are accuracy on the training rows themselves.
import pandas as pd

from sklearn.linear_model import LogisticRegression
# accuracy_score is called below; import it here so this cell does not depend
# on another modelling cell having been executed first.
from sklearn.metrics import accuracy_score

# 'Outcome' is the label; every other column is used as a feature.
features = df.drop(['Outcome'], axis=1)
target = df['Outcome']

# NOTE(review): this cell seeds with 42 while the other models in the notebook
# use 12345 - confirm whether that difference is intentional.
model = LogisticRegression(random_state=42, solver='liblinear')
model.fit(features, target)

predictions = model.predict(features)

# model.score and accuracy_score report the same metric; both are printed.
print(model.score(features, target))
print("Accuracy :", accuracy_score(target, predictions))

0.7747395833333334
Accuracy : 0.7747395833333334


# Model Improvements

In [None]:
# Decision tree evaluated on a held-out validation split, sweeping max_depth.
import pandas as pd
# accuracy_score is called in the loop below; import it here so this cell does
# not depend on another modelling cell having been executed first.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Hold out 25% of the rows for validation; the fixed seed makes the split repeatable.
df_train, df_valid = train_test_split(df, test_size=0.25, random_state=12345)

# Separate the 'Outcome' label from the feature columns in each split.
features_train = df_train.drop(['Outcome'], axis=1)
target_train = df_train['Outcome']
features_valid = df_valid.drop(['Outcome'], axis=1)
target_valid = df_valid['Outcome']

# Fit one tree per depth 1..5 and report its accuracy on the validation rows.
for depth in range(1, 6):
    model = DecisionTreeClassifier(random_state=12345, max_depth=depth)
    model.fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    print('max_depth =', depth, ': ', end='')
    print(accuracy_score(target_valid, predictions_valid))

max_depth = 1 : 0.7708333333333334
max_depth = 2 : 0.7708333333333334
max_depth = 3 : 0.7604166666666666
max_depth = 4 : 0.75
max_depth = 5 : 0.8177083333333334


In [None]:
# Random forest scored on a held-out validation split.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 25% of the rows for validation; the fixed seed makes the split repeatable.
df_train, df_valid = train_test_split(df, test_size=0.25, random_state=12345)

# Separate the 'Outcome' label from the feature columns in each split.
target_train, target_valid = df_train['Outcome'], df_valid['Outcome']
features_train = df_train.drop(columns=['Outcome'])
features_valid = df_valid.drop(columns=['Outcome'])

# fit() returns the fitted estimator, so construction and training can be chained.
model = RandomForestClassifier(n_estimators=3, random_state=12345).fit(
    features_train, target_train
)

print("Accuracy: ", model.score(features_valid, target_valid))

Accuracy:  0.7083333333333334


In [None]:
# Logistic regression scored on a held-out validation split.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 25% of the rows for validation; the fixed seed makes the split repeatable.
df_train, df_valid = train_test_split(df, test_size=0.25, random_state=12345)

# Separate the 'Outcome' label from the feature columns in each split.
target_train, target_valid = df_train['Outcome'], df_valid['Outcome']
features_train = df_train.drop(columns=['Outcome'])
features_valid = df_valid.drop(columns=['Outcome'])

# fit() returns the fitted estimator, so construction and training can be chained.
model = LogisticRegression(solver='liblinear', random_state=12345).fit(
    features_train, target_train
)

print("Accuracy: ", model.score(features_valid, target_valid))

Accuracy:  0.7916666666666666
