In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import BaggingClassifier,ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
data = pd.read_csv("train.csv")
data.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,5,189,64,33,325,31.2,0.583,29,1
1,2,155,52,27,540,38.7,0.24,25,1
2,13,152,90,33,29,26.8,0.731,43,1


In [3]:
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

So No missing value is found

In [4]:
data.keys()

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [5]:
x_data = data.drop(columns=["Outcome"])
y_data = data.Outcome

In [6]:
classification_models = []
classification_models.append(('Logistic Regression', LogisticRegression(solver="liblinear")))
classification_models.append(('K Nearest Neighbor', KNeighborsClassifier(n_neighbors=5, metric="minkowski",p=2)))
classification_models.append(('Kernel SVM', SVC(kernel = 'rbf',gamma='scale')))
classification_models.append(('Naive Bayes', GaussianNB()))
classification_models.append(('Decision Tree', DecisionTreeClassifier(criterion = "entropy")))
classification_models.append(('Random Forest', RandomForestClassifier(n_estimators=100, criterion="entropy", max_features = 3)))
classification_models.append(('Extra Trees', ExtraTreesClassifier(n_estimators=100, criterion="entropy", max_features = 3)))
classification_models.append(('Ada Boost', AdaBoostClassifier(n_estimators=100, random_state=100)))
classification_models.append(('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, random_state=100)))

In [7]:
for name, model in classification_models:
  kfold = KFold(n_splits=10, random_state=(7), shuffle=(True))
  result = cross_val_score(model, x_data, y_data, cv=kfold, scoring='accuracy')
  print("%s: Mean Accuracy = %.2f%% - SD Accuracy = %.2f%%" % (name, result.mean()*100, result.std()*100))

Logistic Regression: Mean Accuracy = 75.80% - SD Accuracy = 4.73%
K Nearest Neighbor: Mean Accuracy = 71.73% - SD Accuracy = 7.42%
Kernel SVM: Mean Accuracy = 75.05% - SD Accuracy = 7.47%
Naive Bayes: Mean Accuracy = 75.62% - SD Accuracy = 5.84%
Decision Tree: Mean Accuracy = 66.68% - SD Accuracy = 6.96%
Random Forest: Mean Accuracy = 75.05% - SD Accuracy = 7.51%
Extra Trees: Mean Accuracy = 75.06% - SD Accuracy = 6.58%
Ada Boost: Mean Accuracy = 74.32% - SD Accuracy = 6.72%
Gradient Boosting: Mean Accuracy = 74.68% - SD Accuracy = 5.01%


In [8]:
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.30, random_state=101)

In [10]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [13]:
from catboost import *
from catboost import CatBoostClassifier

model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.001
)

model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose=True
)
print('Model is fitted: ' + str(model.is_fitted()))
print('Model params:')
print(model.get_params())

0:	learn: 0.6926027	test: 0.6927855	best: 0.6927855 (0)	total: 1.52ms	remaining: 761ms
1:	learn: 0.6922048	test: 0.6925022	best: 0.6925022 (1)	total: 2.84ms	remaining: 707ms
2:	learn: 0.6916551	test: 0.6920827	best: 0.6920827 (2)	total: 4.44ms	remaining: 736ms
3:	learn: 0.6911620	test: 0.6917261	best: 0.6917261 (3)	total: 5.99ms	remaining: 743ms
4:	learn: 0.6906288	test: 0.6913222	best: 0.6913222 (4)	total: 7.54ms	remaining: 746ms
5:	learn: 0.6901052	test: 0.6908670	best: 0.6908670 (5)	total: 9.02ms	remaining: 743ms
6:	learn: 0.6896831	test: 0.6905581	best: 0.6905581 (6)	total: 10.5ms	remaining: 738ms
7:	learn: 0.6891928	test: 0.6901723	best: 0.6901723 (7)	total: 12ms	remaining: 738ms
8:	learn: 0.6887298	test: 0.6897940	best: 0.6897940 (8)	total: 13.5ms	remaining: 736ms
9:	learn: 0.6882590	test: 0.6893870	best: 0.6893870 (9)	total: 15ms	remaining: 735ms
10:	learn: 0.6876362	test: 0.6889327	best: 0.6889327 (10)	total: 16.5ms	remaining: 732ms
11:	learn: 0.6871304	test: 0.6885136	best: 0.

In [14]:
#Let us calculate contest predictions
test_pool = Pool(data=X_test)
contest_predictions = model.predict_proba(test_pool)
contest_predictions = [np.argmax(i) for i in contest_predictions]

In [15]:
print("\nClassification Report: \n",classification_report(y_test,contest_predictions))
print("\nConfusion Matrix: \n",confusion_matrix(y_test,contest_predictions))
print("\nAccuracy Score: ",str(round(accuracy_score(y_test,contest_predictions),2)*100)+"%")


Classification Report: 
               precision    recall  f1-score   support

           0       0.81      0.85      0.83       107
           1       0.67      0.60      0.63        55

    accuracy                           0.77       162
   macro avg       0.74      0.73      0.73       162
weighted avg       0.76      0.77      0.76       162


Confusion Matrix: 
 [[91 16]
 [22 33]]

Accuracy Score:  77.0%
