In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv("C:/Users/sai/Downloads/UahZCQ.csv")
df=df.sample(frac=1,random_state=4).reset_index(drop=True)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,100,66,29,196,32.0,0.444,42,0
1,9,57,80,37,0,32.8,0.096,41,0
2,0,100,70,26,50,30.8,0.597,21,0
3,1,119,88,41,170,45.3,0.507,26,0
4,2,102,86,36,120,45.5,0.127,23,1


In [3]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
y=df['Outcome']
x=df.drop('Outcome',axis=1)

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [1]:
from sklearn.ensemble import AdaBoostClassifier

In [7]:
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,roc_curve

ada=AdaBoostClassifier(random_state=4)
ada.fit(x_train,y_train)

y_train_pred=ada.predict(x_train)
y_train_prob=ada.predict_proba(x_train)[:,1]
print('train - confusion matrix : ','\n',confusion_matrix(y_train,y_train_pred))
print('train - accuracy score : ','\n', accuracy_score(y_train,y_train_pred))
print('train - AUC : ', roc_auc_score(y_train,y_train_prob))

train - confusion matrix :  
 [[320  29]
 [ 49 139]]
train - accuracy score :  
 0.8547486033519553
train - AUC :  0.9358425288057063


In [8]:
y_test_pred=ada.predict(x_test)
y_test_prob=ada.predict_proba(x_test)[:,1]
print('test - confusion matrix : ','\n',confusion_matrix(y_test,y_test_pred))
print('test - accuracy score : ','\n', accuracy_score(y_test,y_test_pred))
print('test - AUC : ', roc_auc_score(y_test,y_test_prob))

test - confusion matrix :  
 [[128  23]
 [ 40  40]]
test - accuracy score :  
 0.7272727272727273
test - AUC :  0.7894867549668874


# lgbm

In [9]:
# !pip install lightgbm

In [10]:
import lightgbm as lgb

In [11]:
lgbmc=lgb.LGBMClassifier(random_state=4)
lgbmc.fit(x_train,y_train)

LGBMClassifier(random_state=4)

In [12]:
y_train_pred=lgbmc.predict(x_train)
y_train_prob=lgbmc.predict_proba(x_train)[:,1]
print('train - confusion matrix : ','\n',confusion_matrix(y_train,y_train_pred))
print('train - accuracy score : ','\n', accuracy_score(y_train,y_train_pred))
print('train - AUC : ', roc_auc_score(y_train,y_train_prob))

train - confusion matrix :  
 [[349   0]
 [  0 188]]
train - accuracy score :  
 1.0
train - AUC :  1.0


In [13]:
y_test_pred=lgbmc.predict(x_test)
y_test_prob=lgbmc.predict_proba(x_test)[:,1]
print('test - confusion matrix : ','\n',confusion_matrix(y_test,y_test_pred))
print('test - accuracy score : ','\n', accuracy_score(y_test,y_test_pred))
print('test - AUC : ', roc_auc_score(y_test,y_test_prob))

test - confusion matrix :  
 [[120  31]
 [ 35  45]]
test - accuracy score :  
 0.7142857142857143
test - AUC :  0.7836092715231788


# hyper parameter tuning LGBM

In [14]:
from sklearn.model_selection import RandomizedSearchCV

In [15]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

lgbmc=lgb.LGBMClassifier(random_state=4)
params={'n_estimators':sp_randint(50,250),'max_depth':sp_randint(1,15),
        'learning_rate':sp_uniform(0,0.5)}

r_search=RandomizedSearchCV(estimator=lgbmc,param_distributions=params,cv=3,n_iter=10,scoring='roc_auc',
                           random_state=4,n_jobs=-1)

print(r_search.fit(x,y))
print(r_search.best_params_)

RandomizedSearchCV(cv=3, estimator=LGBMClassifier(random_state=4), n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028123D00808>,
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028123D00608>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028123D00488>},
                   random_state=4, scoring='roc_auc')
{'learning_rate': 0.022080028965749787, 'max_depth': 6, 'n_estimators': 80}


In [16]:
lgbmc=lgb.LGBMClassifier(random_state=4)
params={'n_estimators':sp_randint(50,250),'max_depth':sp_randint(1,15),
        'learning_rate':sp_uniform(0,0.5)}

r_search=RandomizedSearchCV(estimator=lgbmc,param_distributions=params,cv=3,n_iter=100,scoring='roc_auc',
                           random_state=4,n_jobs=-1)

print(r_search.fit(x,y))
print(r_search.best_params_)

RandomizedSearchCV(cv=3, estimator=LGBMClassifier(random_state=4), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028123D4A408>,
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028123CD0308>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028123D00448>},
                   random_state=4, scoring='roc_auc')
{'learning_rate': 0.1822765876628779, 'max_depth': 1, 'n_estimators': 57}


In [17]:
lgbm=lgb.LGBMClassifier(**r_search.best_params_,importance_type='gain',random_state=4)
lgbm.fit(x_train,y_train)
y_train_pred=lgbm.predict(x_train)
y_train_prob=lgbm.predict_proba(x_train)[:,1]
print('train - confusion matrix : ','\n',confusion_matrix(y_train,y_train_pred))
print('train - accuracy score : ','\n', accuracy_score(y_train,y_train_pred))
print('train - AUC : ', roc_auc_score(y_train,y_train_prob))

train - confusion matrix :  
 [[318  31]
 [ 62 126]]
train - accuracy score :  
 0.8268156424581006
train - AUC :  0.8987532768396025


In [18]:
y_train_pred=lgbm.predict(x_test)
y_train_prob=lgbm.predict_proba(x_test)[:,1]
print('train - confusion matrix : ','\n',confusion_matrix(y_test,y_test_pred))
print('train - accuracy score : ','\n', accuracy_score(y_test,y_test_pred))
print('train - AUC : ', roc_auc_score(y_test,y_test_prob))

train - confusion matrix :  
 [[120  31]
 [ 35  45]]
train - accuracy score :  
 0.7142857142857143
train - AUC :  0.7836092715231788


In [19]:
pd.DataFrame(lgbm.feature_importances_,index=x.columns,columns=['imp'])

Unnamed: 0,imp
Pregnancies,29.983
Glucose,514.958633
BloodPressure,9.57662
SkinThickness,0.0
Insulin,0.0
BMI,140.735138
DiabetesPedigreeFunction,29.7579
Age,99.068299


In [20]:
# we can drop the insignificant variables

In [21]:
y=df['Outcome']
x=df.drop(['Outcome','SkinThickness','Insulin'],axis=1)

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [22]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

lgbmc=lgb.LGBMClassifier(random_state=4)
params={'n_estimators':sp_randint(50,250),'max_depth':sp_randint(1,15),
        'learning_rate':sp_uniform(0,0.5)}

r_search=RandomizedSearchCV(estimator=lgbmc,param_distributions=params,cv=3,n_iter=10,scoring='roc_auc',
                           random_state=4,n_jobs=-1)

print(r_search.fit(x,y))
print(r_search.best_params_)

RandomizedSearchCV(cv=3, estimator=LGBMClassifier(random_state=4), n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028123D06CC8>,
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028123D55608>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028123CF81C8>},
                   random_state=4, scoring='roc_auc')
{'learning_rate': 0.022080028965749787, 'max_depth': 6, 'n_estimators': 80}
