# Getting the Data

In [1]:
import wrangle

import pandas as pd
import numpy as np

In [2]:
# Load the full dataset through the project-local wrangle module; it is filtered
# for missing values in the "Looking at Missing Values" section.
all_data = wrangle.get_all_data()

In [3]:
# Frame used for the exploration and modelling cells below (see df.head() for its columns).
df = wrangle.get_training_data()

In [4]:
# Preview the first rows to check the columns and value types that were loaded.
df.head()

Unnamed: 0,bmi,age,gender,ethnicity,hospital_death
0,22.73,68.0,1,2,0
1,27.42,77.0,0,2,0
2,31.95,25.0,0,2,0
3,22.64,81.0,0,2,0
5,27.56,67.0,1,2,0


## Looking at Missing Values

In [49]:
def handle_missing_values(df, prop_required_column = .7, prop_required_row = .8):
    """Drop sparse columns, then sparse rows, and return the filtered frame.

    Columns are filtered first; the row threshold is then computed from the
    number of columns that survived that first pass. The input frame is left
    untouched: the earlier version used ``inplace=True``, which silently
    changed the caller's frame as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    prop_required_column : float, default 0.7
        Proportion of rows in which a column must be non-null to be kept.
    prop_required_row : float, default 0.8
        Proportion of the remaining columns in which a row must be non-null
        to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the columns and rows that meet the thresholds.
    """
    # Minimum number of non-null values a column needs, relative to the row count.
    min_non_null_per_column = int(round(prop_required_column * len(df.index), 0))
    dense_columns = df.dropna(axis=1, thresh=min_non_null_per_column)

    # Row threshold is relative to the columns that are left, not the original width.
    min_non_null_per_row = int(round(prop_required_row * len(dense_columns.columns), 0))
    return dense_columns.dropna(axis=0, thresh=min_non_null_per_row)

In [50]:
# Filter with the default thresholds (columns need 70% non-null rows, rows need 80%
# non-null columns) and rebind all_data to the result.
# NOTE(review): the unfiltered frame is not kept under its own name, so re-running
# this cell filters data that has already been filtered.
all_data = handle_missing_values(all_data)

In [51]:
# (rows, columns) remaining after the missing-value filter.
all_data.shape

(82769, 112)

In [53]:
# Null count per remaining column, largest first, to see what still needs imputing.
all_data.isna().sum().sort_values(ascending=False)

hospital_admit_source    19597
h1_temp_min              18479
h1_temp_max              18479
wbc_apache               14020
hematocrit_apache        12174
                         ...  
arf_apache                   0
aids                         0
intubated_apache             0
ventilated_apache            0
encounter_id                 0
Length: 112, dtype: int64

# Explore

## Broad Strokes of All the Data

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Sum of the hospital_death column (shown as 0/1 in the preview of df above).
total_dead = df["hospital_death"].sum()

In [8]:
# total_dead as a share of all rows in df, printed as a percentage.
death_rate = total_dead / len(df)
print("{:.2%}".format(death_rate))

8.26%


# Model

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# List the available columns before choosing features and the target.
df.columns

Index(['bmi', 'age', 'gender', 'ethnicity', 'hospital_death'], dtype='object')

In [11]:
# Take the modelling columns as an independent copy. Without .copy(), pandas treats
# new_df as derived from df, and the later in-place dropna / column assignments on
# it raise SettingWithCopyWarning.
new_df = df[['bmi',
             'age',
             'gender',
             'ethnicity',
             'hospital_death']].copy()

In [12]:
# Drop rows with any null and rebind the result. inplace=True on a frame that was
# selected from another frame can raise SettingWithCopyWarning and gives no
# performance benefit.
new_df = new_df.dropna()

In [13]:
# Check the frame after selecting the columns and dropping rows with nulls.
new_df.head()

Unnamed: 0,bmi,age,gender,ethnicity,hospital_death
0,22.73,68.0,1,2,0
1,27.42,77.0,0,2,0
2,31.95,25.0,0,2,0
3,22.64,81.0,0,2,0
5,27.56,67.0,1,2,0


In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
# Every fit_transform call on this instance re-fits it, so it only retains the
# classes of the most recent column it was applied to.
encoder = LabelEncoder()

In [16]:
# Use a dedicated encoder for gender so its class mapping stays available for
# inverse_transform; the shared `encoder` is re-fit on ethnicity afterwards and
# would otherwise lose it.
gender_encoder = LabelEncoder()
new_df["gender"] = gender_encoder.fit_transform(new_df["gender"])

In [17]:
# Replace the ethnicity column with integer codes; this re-fits `encoder` on ethnicity.
new_df["ethnicity"] = encoder.fit_transform(new_df["ethnicity"])

In [18]:
# Re-inspect after encoding. In the recorded output these rows look the same as
# before encoding, so gender/ethnicity appear to have been integer-coded already.
# TODO confirm the LabelEncoder step is needed.
new_df.head()

Unnamed: 0,bmi,age,gender,ethnicity,hospital_death
0,22.73,68.0,1,2,0
1,27.42,77.0,0,2,0
2,31.95,25.0,0,2,0
3,22.64,81.0,0,2,0
5,27.56,67.0,1,2,0


In [19]:
# Feature matrix: every column except the target.
X = new_df.drop('hospital_death', axis=1)

In [20]:
# Target vector.
y = new_df['hospital_death']

In [21]:
# 80/20 split, stratified on the target so both parts keep the same class balance;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=123,
)

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
# Shallow tree (depth capped at 3) with a fixed seed for repeatable fits.
tree = DecisionTreeClassifier(
    random_state=345,
    max_depth=3,
)

In [24]:
# Fit on the training split only; the test split is held back for scoring.
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=345, splitter='best')

In [25]:
# Mean accuracy on the training split.
tree.score(X_train, y_train)

0.9173740292577208

In [26]:
# Mean accuracy on the held-out split.
# NOTE(review): the recorded value (~0.917) is about 1 minus the 8.26% death rate
# printed earlier, i.e. roughly what always predicting 0 would score. Check recall /
# precision for the positive class before relying on this accuracy.
tree.score(X_test, y_test)

0.9174091018540814

In [27]:
# Class probabilities for every row of X (training and test rows together);
# the columns follow the order of tree.classes_.
tree.predict_proba(X)

array([[0.91065276, 0.08934724],
       [0.91065276, 0.08934724],
       [0.97178548, 0.02821452],
       ...,
       [0.93275997, 0.06724003],
       [0.95561594, 0.04438406],
       [0.87709991, 0.12290009]])