# Predictive modeling for length of stay

Here, I will demonstrate how to perform supervised predictive modeling, using spatial parameters as the inputs and length of stay as the output.

In [1]:
#standard
import pandas as pd
import janitor
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot as skplt

#Model
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support, r2_score

#interpretation
import eli5
from eli5.sklearn import PermutationImportance
from eli5.formatters.as_dataframe import format_as_dataframes

#Computational performance
from timeit import default_timer as timer

#Sampling
from scipy.stats import gamma

#custom functions
def get_feature_importances(fitted_model, X, y, labels, random_state=None):
    """Compute permutation feature importances and return them as a DataFrame.

    Parameters
    ----------
    fitted_model : estimator
        Model handed to eli5's ``PermutationImportance``. Presumably already
        fitted, as the parameter name implies -- confirm against callers.
    X, y : array-like
        Features and target the permutation importances are evaluated on.
    labels : sequence of str
        Feature names passed to ``eli5.explain_weights``.
    random_state : int, RandomState or None, optional
        Forwarded to ``PermutationImportance`` so the shuffling can be made
        reproducible. The default ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        eli5's ``feature_importances`` table with its ``weight`` column
        renamed to ``importance``.
    """
    perm = PermutationImportance(fitted_model, random_state=random_state).fit(X, y)
    explanation = eli5.explain_weights(perm, feature_names=labels)
    importance_df = format_as_dataframes(explanation)['feature_importances']
    # Plain pandas rename: no reliance on the ``rename_column`` method that
    # ``import janitor`` patches onto DataFrames as an import side effect.
    return importance_df.rename(columns={'weight': 'importance'})

def plot_feature_importances(importance_df):
    """Draw a bar chart of feature importances with error bars.

    Parameters
    ----------
    importance_df : pandas.DataFrame
        Table whose ``feature`` column supplies the x-axis categories,
        ``importance`` the bar heights and ``std`` the error-bar sizes.

    Returns
    -------
    The object returned by ``importance_df.plot`` (a matplotlib Axes for a
    pandas DataFrame), so callers can add a title, restyle or save the figure.
    """
    ax = importance_df.plot(x='feature', y='importance', yerr='std', kind='bar')
    ax.set_ylabel('importance')
    # Hand the axes back instead of discarding them; callers that ignore the
    # return value are unaffected.
    return ax



First, I'll load the data. This dataset has already been partially cleaned: irrelevant features were removed and the column titles were tidied up.

In [4]:
# Read the aggregated simulation outcomes and discard the 'index' column in a
# single step, then preview the first rows.
df = pd.read_csv('Data/aggregated_simulated_outcomes.csv').drop(columns='index')
df.head()

Unnamed: 0,room_depth,dist_to_elevator,dist_to_nurse_station,head_isovist,door_isovist,sink_isovist,fall_prob,fall,delirium_prob,has_delirium,summed_mean,sampled_LOS,view_type
0,0.294253,236.617906,110.440005,426.676555,480.776509,336.318056,0.018,0.0125,0.132,0.1625,3.761661,3.6657,hardscape
1,0.464286,99.592921,19.918584,548.296155,636.07136,374.817764,0.009,0.0125,0.176,0.1625,3.761661,3.795298,building
2,0.257143,153.002567,67.301283,436.017976,493.723516,343.912753,0.018,0.0125,0.132,0.1375,3.311095,3.370062,greenery
3,0.300265,201.873097,119.523355,445.453332,498.349012,347.664951,0.018,0.0625,0.132,0.1375,3.394298,3.473077,greenery
4,0.301149,220.30385,122.501283,436.017976,493.723516,343.912753,0.018,0.025,0.132,0.1875,3.833269,3.585154,hardscape


I thought that either Random Forest Regression or Gradient Boosting regression would work well for our objective: identifying 