In [96]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [97]:
url = "diabetes.csv"
df = pd.read_csv(url)
columns = list(df.columns)
columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [98]:
df.shape

(768, 9)

In [99]:
df.describe()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [100]:
z_c = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for i in z_c:
    df[i].replace(0,np.NAN,inplace = True)
df.describe()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [101]:
df.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [102]:
df.dropna(inplace=True)
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,3.30102,122.627551,70.663265,29.145408,156.056122,33.086224,0.523046,30.864796,0.331633
std,3.211424,30.860781,12.496092,10.516424,118.84169,7.027659,0.345488,10.200777,0.471401
min,0.0,56.0,24.0,7.0,14.0,18.2,0.085,21.0,0.0
25%,1.0,99.0,62.0,21.0,76.75,28.4,0.26975,23.0,0.0
50%,2.0,119.0,70.0,29.0,125.5,33.2,0.4495,27.0,0.0
75%,5.0,143.0,78.0,37.0,190.0,37.1,0.687,36.0,1.0
max,17.0,198.0,110.0,63.0,846.0,67.1,2.42,81.0,1.0


In [103]:
dataset = np.array(df)
dataset.shape

(392, 9)

In [104]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Outcome'], axis = 1), 
                                        df['Outcome'], test_size = 0.2, random_state = 2)

In [105]:
X_train.shape

(313, 8)

In [106]:
logisticRegression = LogisticRegression(max_iter = 10000, solver='lbfgs')

In [107]:
logisticRegression.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [108]:
y_pred = logisticRegression.predict(X_test)

In [109]:
from sklearn.metrics import  classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [110]:
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.759493670886076
[[48  7]
 [12 12]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.83        55
           1       0.63      0.50      0.56        24

    accuracy                           0.76        79
   macro avg       0.72      0.69      0.70        79
weighted avg       0.75      0.76      0.75        79



In [111]:
randomForest = RandomForestClassifier(random_state = 2)
param_grid = {
    'criterion' : ['gini', 'entropy'],
    'n_estimators': [100, 300, 500],
    'max_features': ['auto', 'log2'],
    'max_depth' : [3, 5, 7]    
}


# Grid search
randomForest_CV = GridSearchCV(estimator = randomForest, param_grid = param_grid, cv = 5)
randomForest_CV.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=2,
                                   

In [112]:
print(randomForest_CV.best_params_)

{'criterion': 'entropy', 'max_depth': 3, 'max_features': 'log2', 'n_estimators': 300}


In [113]:
randomForestFinalModel = RandomForestClassifier(random_state = 2, criterion = 'gini', max_depth = 3, max_features = 'log2', n_estimators = 300)
randomForestFinalModel.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=2, verbose=0,
                       warm_start=False)

In [114]:
y_pred = randomForestFinalModel.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.7974683544303798
[[49  6]
 [10 14]]
              precision    recall  f1-score   support

           0       0.83      0.89      0.86        55
           1       0.70      0.58      0.64        24

    accuracy                           0.80        79
   macro avg       0.77      0.74      0.75        79
weighted avg       0.79      0.80      0.79        79



In [126]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
paramgrid = {
    'n_neighbors' : [3,4,5]
}

knn_cv = GridSearchCV(estimator = knn, param_grid = paramgrid, cv = 5)
knn_cv.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [3, 4, 5]}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [127]:
knn_cv.best_params_

{'n_neighbors': 4}

In [128]:
knn = KNeighborsClassifier(n_neighbors =4)
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

In [129]:
y_pred = knn.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.7721518987341772
[[51  4]
 [14 10]]
              precision    recall  f1-score   support

           0       0.78      0.93      0.85        55
           1       0.71      0.42      0.53        24

    accuracy                           0.77        79
   macro avg       0.75      0.67      0.69        79
weighted avg       0.76      0.77      0.75        79



In [119]:
from sklearn.svm import SVC

In [131]:
svm = SVC(kernel='rbf',random_state=2)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.759493670886076
[[49  6]
 [13 11]]
              precision    recall  f1-score   support

           0       0.79      0.89      0.84        55
           1       0.65      0.46      0.54        24

    accuracy                           0.76        79
   macro avg       0.72      0.67      0.69        79
weighted avg       0.75      0.76      0.75        79



In [121]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X, y)

y_pred = xgb_model.predict(X)

print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

NameError: name 'X' is not defined

In [None]:
pip install xgboost

In [None]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))