In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model, ensemble, metrics, preprocessing, model_selection
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# import data
def transform_gender(gender):
    # unifies data for later label encoding
    try:
        float(gender)
        return 'u'
    except ValueError:
        return gender
    
def transform_weekday(weekday):
    # unifies data for later label encoding
    weekdays = np.concatenate(([str(float(i)) for i in range(0,7)],[str(i) for i in range(0,7)]))
    for elem in weekdays:
        if elem == weekday:
            return int(weekday[0])    
    return int(7)

# read .csv with converters to unifiy for label encoding
DATAFRAME = pd.read_csv('LoS_data.csv', 
                        converters={'geschlecht': transform_gender, 
                                    'wochentag_aufnahmedatum': transform_weekday, 
                                    'wochentag_ops_datum': transform_weekday })

In [2]:
## This part can be done once only
# drop cases where LoS is missing
DATAFRAME.dropna(axis=0, inplace=True)
#DATAFRAME = DATAFRAME[DATAFRAME['absolute_verweildauer_in_tagen'] != np.nan]

# delete DATETIME - too broad feature
DATAFRAME.drop('ops_datum', axis=1, inplace=True)
DATAFRAME.drop('aufnahmedatum', axis=1, inplace=True)
DATAFRAME.drop('record_id', axis=1, inplace=True)


DATAFRAME

Unnamed: 0,ops_kode,ops_version,icd_three,geschlecht,alter_in_jahren_am_aufnahmetag,clinic_id,aufnahmeanlass,aufnahmegrund,absolute_verweildauer_in_tagen,wochentag_ops_datum,wochentag_aufnahmedatum
0,162000,2011,A19,m,34,1,E,101,228.0,5,3
1,3202,2011,A19,m,34,1,E,101,228.0,1,3
2,3222,2011,A19,m,34,1,E,101,228.0,0,3
3,8930,2011,A19,m,34,1,E,101,228.0,4,3
4,568302,2011,D25,w,31,1,E,101,9.0,2,1
5,557801,2011,D25,w,31,1,E,101,9.0,2,1
6,5822a2,2011,M17,w,73,1,E,101,13.0,3,2
7,85500,2011,M17,w,73,1,E,101,13.0,4,2
8,5793af,2011,S72,w,75,1,N,107,21.0,2,1
9,579426,2011,S72,w,75,1,N,107,21.0,2,1


In [3]:
# separation between features and target dataframes
TARGETS = DATAFRAME['absolute_verweildauer_in_tagen'].values
DATAFRAME.drop('absolute_verweildauer_in_tagen', axis=1, inplace=True)
FEATURES = DATAFRAME

In [4]:
## Final Feature Set
#FEATURES.shape  # (988459 examples, 10 features)
#TARGETS.shape  #  (988459,)

In [5]:
## Encode data with multiple labels

def DF_encode_multilabels(feature, dataframe):
    dataframe[feature] = LabelEncoder().fit_transform(dataframe[feature])
    return dataframe
multilabels = ['ops_kode', 'ops_version', 'icd_three', 'geschlecht', 'aufnahmeanlass']

for feature in multilabels:
    FEATURES = DF_encode_multilabels(feature, FEATURES)



In [17]:
## Check for any NaN values in the Feature Dataframe
print("List features:")
print(FEATURES.columns)

print("Check data type:")
print(FEATURES.dtypes)

print("Check for finite values:")
for feature in FEATURES.columns:
    print(feature)   
    if not np.isfinite(FEATURES[feature].values).all():
        print(feature)    
        
FEATURES_Train, FEATURES_Test, TARGETS_Train, TARGETS_Test = model_selection.train_test_split(FEATURES.values, TARGETS, test_size=0.3, shuffle=True)


print('Number of observations in the training data:', len(FEATURES_Train))
print('Number of observations in the testing data:', len(FEATURES_Test))

List features:
Index(['ops_kode', 'ops_version', 'icd_three', 'geschlecht',
       'alter_in_jahren_am_aufnahmetag', 'clinic_id', 'aufnahmeanlass',
       'aufnahmegrund', 'wochentag_ops_datum', 'wochentag_aufnahmedatum'],
      dtype='object')
Check data type:
ops_kode                          int64
ops_version                       int64
icd_three                         int64
geschlecht                        int64
alter_in_jahren_am_aufnahmetag    int64
clinic_id                         int64
aufnahmeanlass                    int64
aufnahmegrund                     int64
wochentag_ops_datum               int64
wochentag_aufnahmedatum           int64
dtype: object
Check for finite values:
ops_kode
ops_version
icd_three
geschlecht
alter_in_jahren_am_aufnahmetag
clinic_id
aufnahmeanlass
aufnahmegrund
wochentag_ops_datum
wochentag_aufnahmedatum
Number of observations in the training data: 691921
Number of observations in the testing data: 296538


In [20]:
regr = ensemble.RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=1e-07, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=210, n_jobs=-1,
           oob_score=False, random_state=7, verbose=0, warm_start=False)
regr.fit(FEATURES_Train,TARGETS_Train)

print(regr)
print('Error on Train: %0.2f' % (metrics.mean_absolute_error(TARGETS_Train,  regr.predict(FEATURES_Train))))
print('Error on Test: %0.2f' % (metrics.mean_absolute_error(TARGETS_Test,  regr.predict(FEATURES_Test))))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=1e-07, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=210, n_jobs=-1,
           oob_score=False, random_state=7, verbose=0, warm_start=False)
Error on Train: 1.55
Error on Test: 3.88


In [21]:
score = metrics.r2_score(TARGETS_Test,  regr.predict(FEATURES_Test))
print("R^2 accuracy score: %0.2f" % score)

print(pd.DataFrame(data=regr.feature_importances_, index=FEATURES.columns, columns=["Feature Importances"]))

R^2 accuracy score: 0.93
                                Feature Importances
ops_kode                                   0.268571
ops_version                                0.101929
icd_three                                  0.235362
geschlecht                                 0.021907
alter_in_jahren_am_aufnahmetag             0.163602
clinic_id                                  0.050330
aufnahmeanlass                             0.036142
aufnahmegrund                              0.028364
wochentag_ops_datum                        0.033128
wochentag_aufnahmedatum                    0.060666


In [24]:
#evaluating the performance of this chosen model
FEATURES_Train, FEATURES_Test, TARGETS_Train, TARGETS_Test = model_selection.train_test_split(FEATURES.values, TARGETS, test_size=0.30)
      
regr.fit(FEATURES_Train, TARGETS_Train)


test_x1 = FEATURES_Test[TARGETS_Test <=5]
test_y1 = TARGETS_Test[TARGETS_Test <=5]

test_x2 = FEATURES_Test[np.logical_and(TARGETS_Test > 5, TARGETS_Test <=14)]
test_y2 = TARGETS_Test[np.logical_and(TARGETS_Test > 5, TARGETS_Test <=14)]

test_x3 = FEATURES_Test[np.logical_and(TARGETS_Test > 14, TARGETS_Test <=61)]
test_y3 = TARGETS_Test[np.logical_and(TARGETS_Test > 14, TARGETS_Test <=61)]

test_x4 = FEATURES_Test[TARGETS_Test > 61]
test_y4 = TARGETS_Test[TARGETS_Test > 61]


err_all_absolute = metrics.mean_absolute_error(TARGETS_Test, regr.predict(FEATURES_Test))
err_5days_absolute = metrics.mean_absolute_error(test_y1, regr.predict(test_x1))
err_2weeks_absolute = metrics.mean_absolute_error(test_y2, regr.predict(test_x2))
err_2month_absolute = metrics.mean_absolute_error(test_y3, regr.predict(test_x3))
err_over2month_absolute = metrics.mean_absolute_error(test_y4, regr.predict(test_x4))

print("Sample size of test data: ", len(TARGETS_Test))    
print('Mean absolute error on all data', err_all_absolute)

print("Sample size of subgroup 'less than five days':", len(test_x1))
print('Mean absolute error of up to 5 days', err_5days_absolute)

print("Sample size of subgroup 'five days up to two weeks':", len(test_x2))
print('Mean absolute error of 6 days to 2 weeks', err_2weeks_absolute)

print("Sample size of subgroup 'more than two weeks up to two month':", len(test_x3))
print('Mean absolute error of 2 weeks up to 2 months', err_2month_absolute)

print("Sample size of subgroup 'more than two month':", len(test_x4))
print('Mean absolute error of more than 2 months', err_over2month_absolute)


Sample size of test data:  296538
Mean absolute error on all data 3.90219150935
Sample size of subgroup 'less than five days': 115801
Mean absolute error of up to 5 days 2.87042914242
Sample size of subgroup 'five days up to two weeks': 98528
Mean absolute error of 6 days to 2 weeks 2.83330581284
Sample size of subgroup 'more than two weeks up to two month': 66347
Mean absolute error of 2 weeks up to 2 months 6.06931226247
Sample size of subgroup 'more than two month': 15862
Mean absolute error of more than 2 months 9.00951234862
