In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle

In [42]:
df = pd.read_csv("../data/clean/df_processed.csv")
df.drop(['Unnamed: 0'],inplace=True,axis=1)

In [43]:
df

Unnamed: 0,gender,age,weight,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,insulin,change,diabetesMed,readmitted
0,female,[70-80),[75-100),10,65,1,28,1,1,3,...,no,steady,no,no,no,no,steady,ch,yes,<30
1,male,[80-90),[50-75),6,73,0,16,0,0,0,...,no,no,no,no,no,no,steady,no,yes,no
2,male,[60-70),[100-125),2,58,3,12,0,0,0,...,no,no,no,steady,no,no,no,no,yes,>30
3,male,[40-50),[75-100),3,33,0,7,4,3,6,...,no,no,no,steady,no,no,no,no,yes,>30
4,male,[50-60),[100-125),2,5,4,11,0,0,0,...,no,no,steady,no,no,no,no,no,yes,>30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
877,male,[80-90),[50-75),3,82,0,19,0,0,0,...,no,no,no,no,no,no,no,no,no,no
878,male,[70-80),[75-100),1,69,1,20,0,0,0,...,no,no,no,no,no,no,steady,no,yes,no
879,male,[60-70),[100-125),3,48,0,11,0,0,1,...,no,steady,no,no,no,no,no,no,yes,no
880,female,[70-80),[50-75),13,82,1,22,7,0,0,...,no,no,down,no,no,no,no,ch,yes,no


In [44]:
features = df.drop(['readmitted'],axis=1)
target = df[['readmitted']]

In [45]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=5)

In [46]:
# Split the dataset into numerical and categorical column
X_train_cat = X_train.select_dtypes('object')
X_test_cat  = X_test.select_dtypes('object')

X_train_num = X_train.select_dtypes('number')
X_test_num  = X_test.select_dtypes('number')

In [47]:
#Encoding. 


#Decided to use one hot encoding for weight and age  as 
#there's perhaps an ordinal relationship but not well/equally distributed

In [48]:
categorical_column_values = [ df[col].unique() for col in df.select_dtypes('object').columns ]

In [49]:
encoder = OneHotEncoder(drop='first', sparse_output=False)

In [50]:
encoder.fit(X_train_cat)

with open('../transformers/transformer_d.pkl', 'wb') as f:
    pickle.dump(encoder, f)

In [53]:
X_train_cat_encoded_np = encoder.transform(X_train_cat)
X_test_cat_encoded_np  = encoder.transform(X_test_cat)

In [54]:
X_train_cat_encoded_df = pd.DataFrame(X_train_cat_encoded_np, columns=encoder.get_feature_names_out(), index=X_train.index)
X_test_cat_encoded_df = pd.DataFrame(X_test_cat_encoded_np, columns=encoder.get_feature_names_out(), index=X_test.index)

In [55]:
X_train_encoded = pd.concat([X_train_num, X_train_cat_encoded_df], axis=1)
X_test_encoded = pd.concat([X_test_num, X_test_cat_encoded_df], axis=1)

In [56]:
normalizer = MinMaxScaler()
normalizer.fit(X_train_encoded)

with open('../scalers/scaler_d.pkl', 'wb') as f:
    pickle.dump(encoder, f)

In [57]:
X_train_norm_np = normalizer.transform(X_train_encoded)
X_test_norm_np = normalizer.transform(X_test_encoded)

In [58]:
X_train_norm_df = pd.DataFrame(X_train_norm_np, columns=X_train_encoded.columns, index=X_train_encoded.index)
X_test_norm_df  = pd.DataFrame(X_test_norm_np,  columns=X_test_encoded.columns,  index=X_test_encoded.index)

In [None]:
###KNN

In [74]:
knn = KNeighborsClassifier(n_neighbors=15)  
knn.fit(X_train_norm_df, y_train)

with open('../models/d_KNN.pkl', 'wb') as f:
    pickle.dump(knn, f)


  return self._fit(X, y)


In [None]:
y_pred = knn.predict(X_test_norm_df)

In [None]:
accuracy = knn.score(X_test_norm_df, y_test)
print("Accuracy:", accuracy)

In [None]:
##Decision Tree 

In [73]:
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train_norm_df, y_train)

with open('../models/d_DT.pkl', 'wb') as d:
    pickle.dump(dt_classifier, d)

In [None]:
dt_predictions = dt_classifier.predict(X_test_norm_df)
dt_accuracy = accuracy_score(y_test, dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)

In [None]:
##RF Classifier 

In [76]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_norm_df, y_train)

with open('../models/d_RF.pkl', 'wb') as r:
    pickle.dump(rf_classifier, r)

  return fit_method(estimator, *args, **kwargs)


In [None]:
rf_predictions = rf_classifier.predict(X_test_norm_df)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)