# Readmission Prediction with FasterRisk risk score

In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from fasterrisk.fasterrisk import RiskScoreOptimizer, RiskScoreClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import confusion_matrix,roc_auc_score
import joblib

## Load data

In [2]:
# Read each pre-split CSV (first column is the row index) and replace missing
# ICU_LOS values with 0 so the risk score computation does not hit NaN
training_data = pd.read_csv("data/Training_Data.csv",index_col=0).fillna(value={'ICU_LOS':0})
testing_data = pd.read_csv("data/Testing_Data.csv",index_col=0).fillna(value={'ICU_LOS':0})
# Stack both splits row-wise; the discretizers for the continuous features
# are fitted on this combined frame
all_data = pd.concat([training_data,testing_data])

## Discretize features with continuous values

In [3]:
def fit_discretizer(feature_name,all_data):
    """
    Fit a 10-bin quantile discretizer on one continuous column and name its bins.

    Side effects: the fitted discretizer is pickled to
    ``data/{feature_name}_discretizer.pkl`` and the list of bin names to
    ``data/{feature_name}_discretized_names.pkl``.

    Parameters
    ----------
    feature_name : str
        Name of the column in ``all_data`` to discretize.
    all_data : pandas.DataFrame
        Frame whose ``feature_name`` column is used to fit the bin edges.

    Returns
    -------
    tuple
        ``(discretizer, discretized_feature_names)`` - the fitted
        ``KBinsDiscretizer`` and the list of human-readable bin names
        built from its bin edges.
    """
    # The discretizer works on 2-D input, so present the column as (n_rows, 1)
    column_values = all_data[feature_name].values.reshape((-1,1))
    discretizer = KBinsDiscretizer(n_bins=10,encode = 'onehot-dense',strategy='quantile',random_state=42)
    discretizer.fit(column_values)
    joblib.dump(discretizer, f'data/{feature_name}_discretizer.pkl')

    # Edges of the single fitted feature; the outermost edges are not used in the
    # names because the first and last bins are labelled as open-ended
    edges = discretizer.bin_edges_[0]
    lowest_bin = f"{feature_name}<{edges[1]:.2f}"
    inner_bins = [
        f"{feature_name}_in_[{lower:.2f},{upper:.2f})"
        for lower, upper in zip(edges[1:-2], edges[2:-1])
    ]
    highest_bin = f"{feature_name}>={edges[-2]:.2f}"
    discretized_feature_names = [lowest_bin, *inner_bins, highest_bin]

    joblib.dump(discretized_feature_names, f'data/{feature_name}_discretized_names.pkl')
    return discretizer,discretized_feature_names

In [4]:
def discretize(discretizer,discretized_feature_names,array_to_discretize):
    """
    Apply an already-fitted discretizer to a column of continuous values.

    Parameters
    ----------
    discretizer : object
        Fitted transformer exposing ``transform``.
    discretized_feature_names : list of str
        Column labels for the transformed output, one per output column.
    array_to_discretize : array-like
        Values handed unchanged to ``discretizer.transform``.

    Returns
    -------
    pandas.DataFrame
        The transformed values with ``discretized_feature_names`` as columns
        and a default integer index.
    """
    return pd.DataFrame(
        discretizer.transform(array_to_discretize),
        columns=discretized_feature_names,
    )

In [5]:
def discretize_continuous_feature(feature_name,all_data,df_to_discretize):
    """
    Fit a discretizer for one continuous column and apply it to a dataframe.

    Parameters
    ----------
    feature_name : str
        Column to discretize; must exist in both frames.
    all_data : pandas.DataFrame
        Frame used to fit the bin edges (passed to ``fit_discretizer``).
    df_to_discretize : pandas.DataFrame
        Frame whose ``feature_name`` column is transformed.

    Returns
    -------
    pandas.DataFrame
        Discretized columns for ``feature_name``, indexed like
        ``df_to_discretize``.
    """
    fitted_discretizer,bin_names = fit_discretizer(feature_name,all_data)
    # The discretizer works on 2-D input, so present the column as (n_rows, 1)
    column_as_2d = df_to_discretize[feature_name].values.reshape((-1,1))
    binned_df = discretize(fitted_discretizer,bin_names,column_as_2d)
    # Carry over the source index so the result lines up row-for-row when
    # concatenated column-wise with the original frame
    binned_df.index = df_to_discretize.index
    return binned_df

In [6]:
# Continuous columns that are replaced by their discretized bin columns in both splits
continuous_columns = ['LOS','age','ICU_LOS']

# Training split: discretize each continuous column, append the bin columns,
# then drop the original continuous columns
LOS_train_discretized, age_train_discretized, ICU_LOS_train_discretized = [
    discretize_continuous_feature(column,all_data,training_data)
    for column in continuous_columns
]
train_discretized = pd.concat(
    [training_data,LOS_train_discretized,age_train_discretized,ICU_LOS_train_discretized],
    axis=1,
).drop(columns=continuous_columns)

# Testing split: same treatment, with discretizers fitted on the same combined data
LOS_test_discretized, age_test_discretized, ICU_LOS_test_discretized = [
    discretize_continuous_feature(column,all_data,testing_data)
    for column in continuous_columns
]
test_discretized = pd.concat(
    [testing_data,LOS_test_discretized,age_test_discretized,ICU_LOS_test_discretized],
    axis=1,
).drop(columns=continuous_columns)

# Save the discretized test data for easy inference by the LLM assistant
test_discretized.to_csv('data/Testing_Data_Discretized.csv',header=True,index=True)

## Get features and labels from the discretized training and test data

In [7]:
# Split each discretized frame into a float feature matrix (every column except
# "Readmission") and a float label vector (the "Readmission" column).
X_train = train_discretized.drop("Readmission",axis=1).astype(float).to_numpy()
X_test = test_discretized.drop("Readmission",axis=1).astype(float).to_numpy()
# Recode the negative class from 0 to -1 (FasterRisk takes labels in {-1, +1}).
# np.where builds a fresh array instead of assigning into the result of
# Series.to_numpy(), which is a read-only view when pandas Copy-on-Write is
# active and would make an in-place `y[y == 0] = -1` raise ValueError.
y_train = train_discretized['Readmission'].astype(float).to_numpy()
y_train = np.where(y_train == 0, -1.0, y_train)
y_test = test_discretized['Readmission'].astype(float).to_numpy()
y_test = np.where(y_test == 0, -1.0, y_test)

## Oversample the minority class to balance the training data

In [8]:
# Oversample the minority class with a fixed seed so the resampled training set is reproducible.
# Only the training split is resampled; X_test / y_test are left as they are.
sampler = RandomOverSampler(random_state=42)
X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)

In [9]:
# Class distribution before oversampling: the distinct label values and the number of rows with each
np.unique(y_train,return_counts=True)

(array([-1.,  1.]), array([30497,  1676]))

In [10]:
# Class distribution after oversampling: the distinct label values and the number of rows with each
np.unique(y_train_res,return_counts=True)

(array([-1.,  1.]), array([30497, 30497]))

In [11]:
# Shapes of the resampled training matrix and the test matrix, one per line
print(X_train_res.shape, X_test.shape, sep="\n")

(60994, 66)
(13789, 66)


## Fit the FasterRisk risk score model

In [12]:
# Fit a risk score model with 10 items (k = 10) on the oversampled training data
optimizer = RiskScoreOptimizer(X = X_train_res, y = y_train_res, k = 10, parent_size = 10)
optimizer.optimize()
# get_models() returns parallel collections: one multiplier, intercept and
# coefficient vector per candidate model
all_multipliers, all_intercepts, all_coefficients = optimizer.get_models()
# Keep the first model of the candidate pool (50 candidates per the original note)
multiplier, intercept, coefficients = all_multipliers[0], all_intercepts[0], all_coefficients[0]

## Use the risk score model as a readmission classifier

In [13]:
# Wrap the selected multiplier, intercept and coefficients in a classifier;
# the resampled training matrix is supplied as X_train
classifier = RiskScoreClassifier(multiplier, intercept, coefficients, X_train = X_train_res)
# Class predictions for the test features (fed to the confusion matrix)
y_test_pred = classifier.predict(X_test)
# Predicted probabilities for the same rows (fed to the AUC computation)
y_test_pred_prob = classifier.predict_prob(X_test)

In [14]:
# Evaluate on the untouched test split.
# Confusion matrix rows are the true labels and columns the predicted labels,
# both in sorted label order (-1 first, then 1).
test_confusion = confusion_matrix(y_test,y_test_pred)
print("Confusion matrix:",test_confusion)
# AUC is computed from the predicted probabilities rather than the hard predictions
test_auc = roc_auc_score(y_test,y_test_pred_prob)
print("AUC:",test_auc)

Confusion matrix: [[7184 5886]
 [ 257  462]]
AUC: 0.6247364410955026


## Print and save the risk score model 

In [15]:
# Feature names in the same column order as X_train: every column except the label
feature_names = train_discretized.drop(columns="Readmission").columns.tolist()
# Attach the names so the model card shows them for the selected items
classifier.reset_featureNames(feature_names)
classifier.print_model_card()

The Risk Score is:
1.                circulatory      2 point(s) |   ...
2.                      supp1      2 point(s) | + ...
3.                        ICU     -4 point(s) | + ...
4.              ADM_EMERGENCY      2 point(s) | + ...
5. ETH_BLACK/AFRICAN AMERICAN      2 point(s) | + ...
6.          ETH_OTHER/UNKNOWN     -3 point(s) | + ...
7.                   LOS<2.27     -2 point(s) | + ...
8.       LOS_in_[11.57,15.61)      2 point(s) | + ...
9.       LOS_in_[15.61,23.71)      2 point(s) | + ...
10.                 LOS>=23.71      4 point(s) | + ...
                                        SCORE | =    
SCORE |  -9.0  |  -7.0  |  -6.0  |  -5.0  |  -4.0  |  -3.0  |  -2.0  |  -1.0  |   0.0  |   1.0  |   2.0  |   3.0  |
RISK  |  11.2% |  16.1% |  19.0% |  22.4% |  26.2% |  30.4% |  35.0% |  39.8% |  44.9% |  50.0% |  55.1% |  60.2% |
SCORE |   4.0  |   5.0  |   6.0  |   7.0  |   8.0  |   9.0  |  10.0  |  11.0  |  12.0  |  13.0  |  14.0  |  16.0  |
RISK  |  65.0% |  69.6% |  73.8% |  77

In [16]:
# Save model: persist the fitted classifier (with the feature names set above) to disk.
# joblib.dump returns the list of written file paths, which is what this cell displays.
joblib.dump(classifier, 'data/faster_risk.pkl')

['data/faster_risk.pkl']