In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV

### Data Transformation

In [2]:
# Load the raw dataset. Forward slashes are used so the relative path resolves
# on both Windows and POSIX systems; the previous raw string held literal
# doubled backslashes, which only Windows path handling tolerates.
df = pd.read_csv('../datasets/diabetic_data.csv')

In [3]:
# Rows to keep: race and payer_code must not be the '?' placeholder and gender
# must not be 'Unknown/Invalid'.
rows_to_keep = (
    (df.race != '?')
    & (df.gender != 'Unknown/Invalid')
    & (df.payer_code != '?')
)

# Build the cleaned frame without in-place mutation so the cell can be re-run:
# errors='ignore' lets the drop succeed when the columns are already gone,
# and the row filter and replace below give the same result when repeated.
df = (
    df.loc[rows_to_keep]
    .drop(
        columns=['encounter_id', 'patient_nbr', 'weight', 'medical_specialty'],
        errors='ignore',
    )
    # Frame-wide replace: each listed value becomes 'Yes' in whichever column it
    # occurs. This includes '>30'/'<30', so any column holding them (presumably
    # `readmitted` -- confirm) ends up with 'Yes' instead of the raw categories.
    .replace(to_replace=['Steady', 'Down', 'Up', '>30', '<30'], value='Yes')
)

In [4]:
# Columns to expand into indicator (dummy) variables, listed in their original order.
categorical_columns = [
    "race", "gender", "age",
    "admission_type_id", "discharge_disposition_id", "admission_source_id",
    "payer_code",
    "diag_1", "diag_2", "diag_3",
    "max_glu_serum", "A1Cresult",
    "metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride",
    "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone",
    "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide",
    "examide", "citoglipton", "insulin",
    "glyburide-metformin", "glipizide-metformin", "glimepiride-pioglitazone",
    "metformin-rosiglitazone", "metformin-pioglitazone",
    "change", "diabetesMed",
]

# drop_first=True omits the first level of every encoded column.
df_breakout = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

**Split dataset into train and test sets**

In [5]:
# Features are every column except the target.
X = df_breakout.drop(columns='readmitted')

# Target labels. replace() returns a new Series here instead of mutating a
# column taken from df_breakout in place.
# NOTE(review): the cleaning cell's frame-wide replace already turns '>30'/'<30'
# into 'Yes', so this mapping presumably matches nothing and the labels stay as
# produced upstream -- confirm which spelling ('Yes' vs 'YES') is intended.
y = df_breakout['readmitted'].replace(to_replace=['>30', '<30'], value='YES')

In [6]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

### Ridge Classifier Tuning

In [None]:
# Candidate values for RidgeClassifier's `alpha` parameter.
alpha = [0.1, 1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Search space: `alpha` is the only hyperparameter being tuned.
grid = {'alpha': alpha}

ridge = RidgeClassifier()

# 5-fold cross-validated grid search scored on accuracy, with n_jobs=-1 for
# parallel fits. error_score=0 records a score of 0 for a candidate whose fit
# fails instead of raising.
clf = GridSearchCV(
    estimator=ridge,
    param_grid=grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=3,
    error_score=0,
)
grid_result = clf.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that achieved it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
