In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_roc_curve
import matplotlib.pyplot as plt

### Data Transformation

In [2]:
df = pd.read_csv(r'..\\code\\datasets\\diabetic_data.csv')

In [3]:
df.drop(columns = ['encounter_id','patient_nbr','weight', 'medical_specialty'], inplace=True)
df = df[df.race!='?']
df = df[df.gender!='Unknown/Invalid']
df = df[df.payer_code!='?']
df.replace(to_replace=['Steady', 'Down', 'Up', '>30', '<30'], value='Yes', inplace=True)

In [4]:
df_breakout = pd.get_dummies(df, columns=["race","gender", "age", "admission_type_id", "discharge_disposition_id","admission_source_id",
                                          "payer_code", "diag_1", "diag_2", "diag_3", "max_glu_serum", "A1Cresult", "metformin", "repaglinide",
                                          "nateglinide", "chlorpropamide", "glimepiride", "acetohexamide", "glipizide", "glyburide", "tolbutamide",
                                          "pioglitazone", "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide", "examide", "citoglipton",
                                          "insulin", "glyburide-metformin", "glipizide-metformin", "glimepiride-pioglitazone", "metformin-rosiglitazone",
                                          "metformin-pioglitazone", "change", "diabetesMed"], drop_first=True)

**Split dataset into train and test sets**

In [5]:
X = df_breakout.drop(columns = 'readmitted')
y = df_breakout.readmitted
y.replace(to_replace=['>30', '<30'], value='YES', inplace=True)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

**Ensemble Voting Classifier**

In [7]:
rdg = RidgeClassifier(alpha = 88)
rfc = RandomForestClassifier(n_estimators = 500, criterion = 'gini', max_features = 'sqrt', random_state=0, n_jobs = -1)
sgd = make_pipeline(StandardScaler(),SGDClassifier(loss = 'log', penalty = 'l1', max_iter = 1000, n_jobs = -1, n_iter_no_change = 5, alpha = 0.0001))
knn = KNeighborsClassifier(n_neighbors=11, leaf_size=1, p=1,n_jobs=-1)

eclf1 = VotingClassifier(estimators=[('rdg', rdg), ('rfc', rfc), ('sgd', sgd), ('knn', knn)], voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
prediction = eclf1.predict(X_test)
predSeries = pd.Series(prediction)
print(eclf1.score(X_test, y_test))

y_test = y_test.reset_index(drop = 'True')
ML_result = pd.concat([y_test, predSeries], axis=1)
ML_result.columns = ['True','Predicted']
confusion_matrix = pd.crosstab(ML_result['True'], ML_result['Predicted'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)





0.6517462866318747
Predicted    NO   Yes
Actual               
NO         6272  1675
Yes        3530  3469


NameError: name 'ax' is not defined

In [10]:
eclf2 = VotingClassifier(estimators=[('rdg', rdg), ('rfc', rfc), ('sgd', sgd), ('knn', knn)], voting='soft')
eclf2 = eclf2.fit(X_train, y_train)
prediction = eclf1.predict(X_test)
predSeries = pd.Series(prediction)
print(eclf2.score(X_test, y_test))

y_test = y_test.reset_index(drop = 'True')
ML_result = pd.concat([y_test, predSeries], axis=1)
ML_result.columns = ['True','Predicted']
confusion_matrix = pd.crosstab(ML_result['True'], ML_result['Predicted'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)



AttributeError: 'RidgeClassifier' object has no attribute 'predict_proba'