# Readmission Prediction with FasterRisk risk score

In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from fasterrisk.fasterrisk import RiskScoreOptimizer, RiskScoreClassifier
from fasterrisk.binarization_util import convert_continuous_df_to_binary_df
from sklearn.metrics import confusion_matrix,roc_auc_score
import joblib

## Load data

In [2]:
# Load data
training_data = pd.read_csv("data/Training_Data.csv",index_col=0)
testing_data = pd.read_csv("data/Testing_Data.csv",index_col=0)
# Fill nan ICU_LOS with 0 to avoid error in risk score computation
training_data.fillna(value={'ICU_LOS':0},inplace=True)
testing_data.fillna(value={'ICU_LOS':0},inplace=True)
# Binarize continuous columns
training_data_binarized = convert_continuous_df_to_binary_df(training_data)
testing_data_binarized = convert_continuous_df_to_binary_df(testing_data)
# Get features and labels
X_train = training_data_binarized.iloc[:,:-1].astype(float).to_numpy()
y_train = training_data_binarized.iloc[:,-1].astype(float).to_numpy()
y_train[y_train == 0] = -1
X_test = testing_data_binarized.iloc[:,:-1].astype(float).to_numpy()
y_test = testing_data_binarized.iloc[:,-1].astype(float).to_numpy()
y_test[y_test == 0] = -1

Converting continuous features to binary features in the dataframe......
If a feature has more than 100 unqiue values, we pick the threasholds by selecting 100 quantile points. You can change the number of thresholds by passing another specified number: convert_continuous_df_to_binary_df(df, num_quantiles=50).
Finish converting continuous features to binary features......
Converting continuous features to binary features in the dataframe......
If a feature has more than 100 unqiue values, we pick the threasholds by selecting 100 quantile points. You can change the number of thresholds by passing another specified number: convert_continuous_df_to_binary_df(df, num_quantiles=50).
Finish converting continuous features to binary features......


## Oversample the minority class to balance the training data

In [3]:
# Oversample the minority class
sampler = RandomOverSampler(random_state=42)
X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)

In [4]:
# Class distribution before oversampling
np.unique(y_train,return_counts=True)

(array([-1.,  1.]), array([30497,  1684]))

In [5]:
# Class distribution after oversampling
np.unique(y_train_res,return_counts=True)

(array([-1.,  1.]), array([30497, 30497]))

In [6]:
# Training and testing fata shape
print(X_train_res.shape)
print(X_test.shape)

(60994, 333)
(13792, 333)


## Fit the FasterRisk risk score model

In [7]:
# Fit a risk score model with 10 items
optimizer = RiskScoreOptimizer(X = X_train_res, y = y_train_res, k = 10, parent_size = 10)
optimizer.optimize()
all_multipliers, all_intercepts, all_coefficients = optimizer.get_models()
# Get the first model from 50 candidates 
multiplier = all_multipliers[0]
intercept = all_intercepts[0]
coefficients = all_coefficients[0]

## Use the risk score model as a readmission classifier

In [8]:
classifier = RiskScoreClassifier(multiplier, intercept, coefficients, X_train = X_train_res)
y_test_pred = classifier.predict(X_test)
y_test_pred_prob = classifier.predict_prob(X_test)

In [9]:
# Get performance metrics
print("Confusion matrix:",confusion_matrix(y_test,y_test_pred))
print("AUC:",roc_auc_score(y_test,y_test_pred_prob))

Confusion matrix: [[8052 5018]
 [ 316  406]]
AUC: 0.625065119206828


## Print and save the risk score model 

In [10]:
feature_names = list(training_data_binarized.columns[:-1])
classifier.reset_featureNames(feature_names)
classifier.print_model_card()

The Risk Score is:
1.      LOS<=4.642013888888889     -2 point(s) |   ...
2.     LOS<=10.227694444444445     -2 point(s) | + ...
3.                       supp1      2 point(s) | + ...
4.                         ICU     -5 point(s) | + ...
5. ICU_LOS<=12.045539999999999     -3 point(s) | + ...
6.               ADM_EMERGENCY      2 point(s) | + ...
7.                 INS_Private     -1 point(s) | + ...
8.  ETH_BLACK/AFRICAN AMERICAN      2 point(s) | + ...
9.           ETH_OTHER/UNKNOWN     -3 point(s) | + ...
10.       MAR_UNKNOWN (DEFAULT)     -4 point(s) | + ...
                                         SCORE | =    
SCORE |  -20.0  |  -19.0  |  -18.0  |  -17.0  |  -16.0  |  -15.0  |  -14.0  |  -13.0  |  -12.0  |  -11.0  |  -10.0  |  -9.0  |  -8.0  |  -7.0  |
RISK  |   7.3% |   8.7% |  10.4% |  12.4% |  14.6% |  17.3% |  20.2% |  23.6% |  27.3% |  31.3% |  35.7% |  40.3% |  45.1% |  50.0% |
SCORE |  -6.0  |  -5.0  |  -4.0  |  -3.0  |  -2.0  |  -1.0  |   0.0  |   1.0  |   2.0  |   3.0  

In [11]:
# Save model
joblib.dump(classifier, 'data/faster_risk.pkl')

['data/faster_risk.pkl']