In [1]:
import pandas as pd 
from scipy.stats import chi2_contingency
from scipy.stats.contingency import association
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
df2 = pd.read_csv("../data/clean/df_processed.csv")
df2.drop(['Unnamed: 0'],inplace=True,axis=1)

In [5]:
# Group df2 by the values of the 'readmitted' column
groups = df2.groupby('readmitted')

# Sample equally from each group to create the balanced DataFrame
balanced_dfs = []
for _, group in groups:
    balanced_sample = resample(group, n_samples=len(df2) // 2 // len(groups), random_state=42)
    balanced_dfs.append(balanced_sample)

# Concatenate the sampled DataFrames
df = pd.concat(balanced_dfs)

# Shuffle the concatenated DataFrame
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


In [6]:
df

Unnamed: 0,race,gender,age,weight,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,...,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,insulin,change,diabetesMed,readmitted
0,caucasian,male,[50-60),[100-125),1,69,0,10,0,0,...,no,no,steady,no,no,no,no,no,yes,>30
1,africanamerican,female,[60-70),[50-75),6,56,3,13,1,3,...,no,no,no,no,no,no,no,no,no,<30
2,caucasian,female,[50-60),[100-125),2,36,2,18,1,0,...,no,no,no,no,no,no,no,no,yes,<30
3,caucasian,male,[60-70),[100-125),2,24,2,22,0,0,...,no,steady,no,no,steady,no,no,ch,yes,<30
4,caucasian,female,[60-70),[50-75),3,62,0,12,1,0,...,no,no,no,no,no,no,no,no,yes,<30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514,caucasian,female,[50-60),[100-125),3,53,1,25,2,0,...,no,no,no,steady,no,no,no,ch,yes,<30
515,caucasian,female,[80-90),[75-100),3,18,2,6,0,0,...,no,no,no,no,no,no,no,no,no,<30
516,caucasian,male,[70-80),[75-100),3,68,6,21,0,0,...,no,no,steady,steady,no,no,no,ch,yes,>30
517,caucasian,male,[70-80),[125-150),3,10,1,12,0,0,...,no,no,no,no,no,no,steady,ch,yes,no


In [7]:
features = df.drop(['readmitted'],axis=1)
target = df[['readmitted']]

## 1st Try with all data though distribution of target values are not equal

In [77]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.31, random_state=23)

In [78]:
X_train_cat = X_train.select_dtypes('object')
X_test_cat  = X_test.select_dtypes('object')
#
X_train_num = X_train.select_dtypes('number')
X_test_num  = X_test.select_dtypes('number')

In [79]:
categorical_column_values = [ X_train[col].unique() for col in X_train.select_dtypes('object').columns ]

In [84]:
encoder = OneHotEncoder(drop='first',handle_unknown='ignore', sparse_output=False)

In [85]:
encoder.fit(X_train_cat)

In [86]:
X_train_cat_encoded_np = encoder.transform(X_train_cat)
X_test_cat_encoded_np  = encoder.transform(X_test_cat)




In [87]:
X_train_cat_encoded_df = pd.DataFrame(X_train_cat_encoded_np, columns=encoder.get_feature_names_out(), index=X_train.index)
X_test_cat_encoded_df = pd.DataFrame(X_test_cat_encoded_np, columns=encoder.get_feature_names_out(), index=X_test.index)

In [88]:
X_test_cat_encoded_df

Unnamed: 0,race_asian,race_caucasian,race_other,gender_male,age_[30-40),age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),...,glyburide_up,pioglitazone_no,pioglitazone_steady,rosiglitazone_steady,acarbose_steady,insulin_no,insulin_steady,insulin_up,change_no,diabetesMed_yes
411,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
368,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
233,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
203,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
376,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
391,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
179,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
161,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [89]:
X_train = pd.concat([X_train_num, X_train_cat_encoded_df], axis=1)
X_test = pd.concat([X_test_num, X_test_cat_encoded_df], axis=1)

In [90]:
X_train

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_asian,race_caucasian,...,glyburide_up,pioglitazone_no,pioglitazone_steady,rosiglitazone_steady,acarbose_steady,insulin_no,insulin_steady,insulin_up,change_no,diabetesMed_yes
463,2,4,2,12,0,0,0,9,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
483,12,53,0,20,0,6,1,9,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
500,3,56,0,12,3,0,1,9,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
292,2,52,0,23,9,1,0,9,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
274,4,69,2,15,3,0,1,7,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,2,20,1,21,2,0,0,9,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
237,2,31,1,14,0,0,0,7,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
31,3,45,1,12,2,1,2,7,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
488,6,41,0,15,4,1,2,9,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0


In [91]:
normalizer = MinMaxScaler()
normalizer.fit(X_train)

In [92]:
X_train_norm_np = normalizer.transform(X_train)
X_test_norm_np = normalizer.transform(X_test)

In [93]:
X_train_norm_df = pd.DataFrame(X_train_norm_np, columns=X_train.columns, index=X_train.index)
X_test_norm_df  = pd.DataFrame(X_test_norm_np,  columns=X_test.columns,  index=X_test.index)

In [97]:
knn = KNeighborsClassifier(n_neighbors=3)  
knn.fit(X_train_norm_df, y_train)
# Save the model with pickle

  return self._fit(X, y)


In [98]:
y_pred = knn.predict(X_test_norm_df)

In [99]:
accuracy = knn.score(X_test_norm_df, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.4906832298136646


In [101]:
dt_classifier = DecisionTreeClassifier(random_state=3)
dt_classifier.fit(X_train, y_train)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=5)
rf_classifier.fit(X_train, y_train)
# Evaluate Decision Tree Classifier
dt_predictions = dt_classifier.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)


  return fit_method(estimator, *args, **kwargs)


Decision Tree Accuracy: 0.5652173913043478
Random Forest Accuracy: 0.577639751552795


In [102]:
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

In [103]:
rf_accuracy

0.577639751552795

In [104]:
df2.shape

(1043, 25)