In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

In [29]:
# Load the pre-processed dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv -- TODO confirm).
df = pd.read_csv("../data/clean/df_processed.csv").drop(columns=['Unnamed: 0'])

In [30]:
# Accuracy experiment: collapse the target so that both '<30' and '>30'
# become 'yes', turning the task into "readmitted or not" to see whether
# the models do better on that simpler question.
df['readmitted'] = df['readmitted'].replace(to_replace=['<30', '>30'], value='yes')

In [31]:
# Everything except the label is a predictor; the label is kept as a
# one-column DataFrame (note the list selector), not a Series.
features = df.drop(columns=['readmitted'])
target = df.loc[:, ['readmitted']]

In [32]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=5)

In [33]:
# Separate each split into its categorical (object dtype) and numerical
# columns so they can be pre-processed differently.
X_train_cat, X_train_num = (
    X_train.select_dtypes(include='object'),
    X_train.select_dtypes(include='number'),
)
X_test_cat, X_test_num = (
    X_test.select_dtypes(include='object'),
    X_test.select_dtypes(include='number'),
)

In [34]:
#Encoding. 


# Decided to use one-hot encoding for weight and age: there may be an
# ordinal relationship, but the categories are not well/equally distributed.

In [35]:
categorical_column_values = [ df[col].unique() for col in df.select_dtypes('object').columns ]

In [36]:
encoder = OneHotEncoder(drop='first', sparse_output=False)

In [54]:
# Learn the categories from the training split only.
encoder.fit(X_train_cat)

# Persist the fitted encoder to disk.
with open('../transformers/transformer_e.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

In [38]:
# Apply the fitted encoder to the categorical part of both splits.
X_train_cat_encoded_np, X_test_cat_encoded_np = map(
    encoder.transform, (X_train_cat, X_test_cat)
)

In [39]:
# Wrap the encoded arrays back into DataFrames, labelled with the encoder's
# output feature names and aligned to the original row index of each split.
encoded_columns = encoder.get_feature_names_out()
X_train_cat_encoded_df = pd.DataFrame(
    data=X_train_cat_encoded_np,
    index=X_train.index,
    columns=encoded_columns,
)
X_test_cat_encoded_df = pd.DataFrame(
    data=X_test_cat_encoded_np,
    index=X_test.index,
    columns=encoded_columns,
)

In [40]:
# Re-assemble each split: numerical columns first, then the one-hot columns,
# joined side by side on the shared row index.
X_train_encoded = pd.concat(
    [X_train_num, X_train_cat_encoded_df],
    axis='columns',
)
X_test_encoded = pd.concat(
    [X_test_num, X_test_cat_encoded_df],
    axis='columns',
)

In [55]:
# Fit the min-max scaler on the encoded training features only, so the
# scaling ranges are learned without looking at the test split.
normalizer = MinMaxScaler()
normalizer.fit(X_train_encoded)

# Persist the fitted scaler. The object written here must be `normalizer`:
# dumping `encoder` would store a second copy of the one-hot encoder under
# the scaler's file name, and anything loading it later would not scale.
with open('../scalers/scaler_e.pkl', 'wb') as f:
    pickle.dump(normalizer, f)

In [42]:
# Scale both splits with the scaler that was fitted on the training data.
X_train_norm_np, X_test_norm_np = (
    normalizer.transform(X_train_encoded),
    normalizer.transform(X_test_encoded),
)

In [43]:
def _as_frame(values, template):
    """Wrap ``values`` in a DataFrame that reuses ``template``'s columns and index."""
    return pd.DataFrame(values, columns=template.columns, index=template.index)


# Restore column names and row index on the scaled arrays.
X_train_norm_df = _as_frame(X_train_norm_np, X_train_encoded)
X_test_norm_df = _as_frame(X_test_norm_np, X_test_encoded)

In [44]:
###KNN

In [56]:
# k-nearest-neighbours classifier using 16 neighbours.
knn = KNeighborsClassifier(n_neighbors=16)

# `y_train` is a one-column DataFrame (the target was selected with a list),
# which makes scikit-learn warn that a column vector was passed where a 1-D
# array was expected. Flatten it explicitly so the fit is warning-free.
knn.fit(X_train_norm_df, y_train.to_numpy().ravel())

# Persist the trained model to disk.
with open('../models/e_KNN.pkl', 'wb') as f:
    pickle.dump(knn, f)


  return self._fit(X, y)


In [46]:
y_pred = knn.predict(X_test_norm_df)

In [47]:
# Mean accuracy of the KNN model on the held-out split.
accuracy = knn.score(X=X_test_norm_df, y=y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5875706214689266


In [48]:
## Decision Tree

In [57]:
# Decision tree with a fixed seed so repeated runs build the same tree.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X=X_train_norm_df, y=y_train)

# Persist the trained tree to disk.
with open('../models/e_DT.pkl', 'wb') as tree_file:
    pickle.dump(dt_classifier, tree_file)

In [50]:
# Score the decision tree on the held-out split.
dt_predictions = dt_classifier.predict(X=X_test_norm_df)
dt_accuracy = accuracy_score(y_true=y_test, y_pred=dt_predictions)
print(f"Decision Tree Accuracy: {dt_accuracy}")

Decision Tree Accuracy: 0.5706214689265536


In [51]:
##RF Classifier 

In [58]:
# Random forest of 100 trees with a fixed seed for reproducibility.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# `y_train` is a one-column DataFrame (the target was selected with a list),
# which makes scikit-learn warn that a column vector was passed where a 1-D
# array was expected. Flatten it explicitly so the fit is warning-free.
rf_classifier.fit(X_train_norm_df, y_train.to_numpy().ravel())

# Persist the trained forest to disk.
with open('../models/e_RF.pkl', 'wb') as r:
    pickle.dump(rf_classifier, r)

  return fit_method(estimator, *args, **kwargs)


In [53]:
# Score the random forest on the held-out split.
rf_predictions = rf_classifier.predict(X=X_test_norm_df)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy}")

Random Forest Accuracy: 0.615819209039548
