In [1]:
import src.wrangle
import src.features
import src.preprocessing
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Getting and Splitting the Data

In [2]:
# Load the raw data and hold out 20% of the rows as a test set. The split is
# stratified on hospital_death and seeded so it can be reproduced.
df = src.wrangle.get_raw_data()
train, test = train_test_split(
    df,
    train_size=0.8,
    stratify=df['hospital_death'],
    random_state=42,
)

# Size of the training split, followed by missing values per column (worst first).
print(train.shape)
train.isna().sum().sort_values(ascending=False)

(73370, 185)


h1_bilirubin_min    67708
h1_bilirubin_max    67708
h1_lactate_max      67493
h1_lactate_min      67493
h1_albumin_min      67058
                    ...  
icu_id                  0
icu_stay_type           0
icu_type                0
pre_icu_los_days        0
encounter_id            0
Length: 185, dtype: int64

# Cleaning up the Train Data

We only clean the training set here, because the data was split before any exploration. The test set is not used in this notebook.

In [3]:
# Run the project's preparation step on the training split only. Because the
# result is bound back to `train`, re-running this cell on its own passes
# already-prepared data into prepare_data again.
train = src.wrangle.prepare_data(train)

In [4]:
# Same check as before preparation: shape first, then missing values per column
# ordered from most to fewest.
print(train.shape)
null_counts = train.isna().sum()
null_counts.sort_values(ascending=False)

(73370, 185)


apache_2_bodysystem       0
d1_mbp_noninvasive_max    0
d1_sysbp_min              0
d1_sysbp_max              0
d1_sysbp_invasive_min     0
                         ..
d1_platelets_min          0
d1_platelets_max          0
d1_lactate_min            0
d1_lactate_max            0
encounter_id              0
Length: 185, dtype: int64

# Are certain hospitals better at data collection, and does that have an impact on patient outcomes?

## How many hospitals are in the data?

In [5]:
# Distinct hospital ids present in the training split.
n_hospitals = train['hospital_id'].nunique()
print('Number of hospitals: {}'.format(n_hospitals))

Number of hospitals: 147


## What is the average percentage of deaths for each hospital?

In [6]:
# One row per hospital: the mean of hospital_death within each hospital_id.
hospital_deaths = train.groupby('hospital_id', as_index=False)['hospital_death'].mean()

# Build the lookup column by column. Passing the two-column frame through
# `.values` forces both columns into a single dtype, which turns integer
# hospital ids into floats before they become dictionary keys.
mapping = dict(zip(hospital_deaths['hospital_id'], hospital_deaths['hospital_death']))

# Broadcast each hospital's rate back onto every one of its patient rows.
train['hospital_death_avg'] = train['hospital_id'].map(mapping)

In [7]:
# Mean of the per-row hospital_death_avg column, printed with two decimals.
# NOTE(review): the label says "percentage" but the value is a fraction
# (the recorded output is 0.09); the message is left unchanged here.
overall_death_rate = train['hospital_death_avg'].mean()
print('Average percentage of patients dying over all ICUs is: {: .2f}'.format(overall_death_rate))

Average percentage of patients dying over all ICUs is:  0.09


## Looking at hospitals with an above average death rate

We add a count of the number of patients for each hospital to get a better perspective on the percentages.

In [8]:
# Patients per hospital, one row per hospital_id. Only hospital_death is counted:
# counting every column of train is wasted work when a single column is read.
# count() skips missing values, so this equals the row count only because the
# null check above reports no missing values in the prepared training data.
num_patients_by_hospital = (
    train.groupby('hospital_id', as_index=False)['hospital_death'].count()
)

# Column-wise lookup (hospital id -> patient count), then broadcast onto patient rows.
mapping = dict(zip(num_patients_by_hospital['hospital_id'],
                   num_patients_by_hospital['hospital_death']))
train['num_patients_by_hospital'] = train['hospital_id'].map(mapping)

In [9]:
# Rows whose hospital has a death rate above the mean of hospital_death_avg
# taken over all patient rows.
above_average = train['hospital_death_avg'] > train['hospital_death_avg'].mean()

# Still one row per patient at this point, so each hospital appears many times;
# duplicates are removed where the table is displayed.
death_perc_by_hospital = (
    train.loc[above_average,
              ['hospital_id', 'hospital_death_avg', 'num_patients_by_hospital']]
    # Discard the patient-level index directly instead of turning it into a
    # column and then dropping that column by name.
    .reset_index(drop=True)
    .sort_values('hospital_death_avg', ascending=False)
)

In [18]:
# Collapse the patient-level rows to one per hospital, then list the hospitals
# from fewest to most patients.
one_row_per_hospital = death_perc_by_hospital.drop_duplicates()
one_row_per_hospital.sort_values(by='num_patients_by_hospital')

Unnamed: 0,hospital_id,hospital_death_avg,num_patients_by_hospital
8406,130,0.5,2
16897,23,0.166667,6
22801,167,0.125,8
9666,74,0.090909,11
9883,68,0.166667,12
10166,78,0.095238,21
1317,150,0.178571,28
17991,102,0.137931,29
4106,29,0.210526,38
1543,36,0.125,48
