In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report


### Load the input dataset

In [2]:
def loadData(path="./input/diabetes.csv"):
    """Read the header-less diabetes CSV and split it into features and target.

    The file is parsed with ``header=None``, so the resulting columns are
    labelled by integer position (0, 1, 2, ...).

    Parameters
    ----------
    path : str or path-like, default "./input/diabetes.csv"
        Location of the CSV file to read.

    Returns
    -------
    X : pandas.DataFrame
        Columns 0 through 7 of the file (the features).
    y : pandas.DataFrame
        Column 8 of the file (the target), kept as a single-column
        DataFrame, i.e. 2-D with shape ``(n_rows, 1)``.
    """
    input_df = pd.read_csv(path, header=None)
    # Positional slicing: first eight columns are features, the ninth is the
    # target. The `8:9` slice (rather than `8`) keeps `y` two-dimensional.
    X = input_df.iloc[:, 0:8]
    y = input_df.iloc[:, 8:9]
    return X, y

In [3]:
# Load the features and target, then display summary statistics of the features.
X, y = loadData()
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,759.0,759.0,759.0,759.0,759.0,759.0,759.0,759.0
mean,-0.407657,0.218563,0.176505,-0.289735,-0.323534,-0.032245,-0.663253,-0.516162
std,0.38626,0.306419,0.201287,0.25848,0.375544,0.205376,0.283056,0.400794
min,-0.882353,-0.557789,-0.606557,-0.858586,-0.966903,-0.457526,-0.994876,-0.966667
25%,-0.764706,-0.005025,0.016393,-0.494949,-0.716312,-0.178837,-0.858241,-0.866667
50%,-0.529412,0.165829,0.180328,-0.292929,0.0,-0.034277,-0.747225,-0.633333
75%,0.0,0.407035,0.311475,0.0,0.0,0.087928,-0.531597,-0.233333
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Split into train and test sets

In [4]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [5]:
# Inspect the training features: row count, per-column non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 508 entries, 95 to 37
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       508 non-null    float64
 1   1       508 non-null    float64
 2   2       508 non-null    float64
 3   3       508 non-null    float64
 4   4       508 non-null    float64
 5   5       508 non-null    float64
 6   6       508 non-null    float64
 7   7       508 non-null    float64
dtypes: float64(8)
memory usage: 35.7 KB


### Model Evaluation

In [6]:
# Two-step pipeline: univariate feature selection scored with `f_classif`,
# followed by a logistic-regression classifier using the liblinear solver.
# The selector's `k` is left at its default here; it is the parameter varied
# by the grid search below.
pipeline = Pipeline(
    steps=[
        ("selector", SelectKBest(score_func=f_classif)),
        ("model", LogisticRegression(solver="liblinear")),
    ]
)


In [7]:
# Candidate values (1 through 8) for the number of features the selector keeps.
k_grid = {"selector__k": range(1, 9)}

# Grid search over `k`, scored by accuracy with 20-fold cross-validation,
# running the fits in parallel on all available cores.
search = GridSearchCV(
    estimator=pipeline,
    param_grid=k_grid,
    scoring="accuracy",
    cv=20,
    n_jobs=-1,
    verbose=1,
)

In [8]:
# `y_train` is a single-column DataFrame (2-D). Flatten it to a 1-D array so
# scikit-learn does not emit a DataConversionWarning ("A column-vector y was
# passed when a 1d array was expected") during fitting.
search.fit(X_train, np.ravel(y_train))

Fitting 20 folds for each of 8 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 145 out of 160 | elapsed:    2.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    2.6s finished
  return f(**kwargs)
  return f(**kwargs)


GridSearchCV(cv=20,
             estimator=Pipeline(steps=[('selector', SelectKBest()),
                                       ('model',
                                        LogisticRegression(solver='liblinear'))]),
             n_jobs=-1, param_grid={'selector__k': range(1, 9)},
             scoring='accuracy', verbose=1)

In [9]:
# Parameter setting (value of `selector__k`) that scored best in the grid search.
search.best_params_

{'selector__k': 6}

In [10]:
# Cross-validated accuracy achieved by the best parameter setting.
search.best_score_

0.7536923076923079

In [11]:
# Ground-truth labels of the held-out rows, and the fitted search's predictions for them.
y_true = y_test
y_pred = search.predict(X_test)

In [12]:
# Per-class precision / recall / F1 on the test set, followed by a blank line.
report = classification_report(y_true, y_pred)
print(report)
print()

              precision    recall  f1-score   support

           0       0.78      0.57      0.66        87
           1       0.80      0.91      0.85       164

    accuracy                           0.80       251
   macro avg       0.79      0.74      0.76       251
weighted avg       0.79      0.80      0.79       251


