# Getting the Data

In [1]:
import wrangle

import pandas as pd
import numpy as np

In [2]:
df = wrangle.get_training_data()

In [3]:
new_df = df[['bmi',
             'age',
             'gender',
             'ethnicity',
             'hospital_death']]

In [4]:
new_df = wrangle.drop_nulls(new_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [5]:
print(new_df.shape)

(83056, 5)


In [30]:
new_df = wrangle.encode_cols(new_df)

## Looking at Missing Values

In [6]:
df.isna().sum()

encounter_id                      0
patient_id                        0
hospital_id                       0
hospital_death                    0
age                            4228
                               ... 
leukemia                        715
lymphoma                        715
solid_tumor_with_metastasis     715
apache_3j_bodysystem           1662
apache_2_bodysystem            1662
Length: 186, dtype: int64

# Explore

## Broad Strokes of All the Data

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
total_dead = df.hospital_death.sum()

In [9]:
print("{:.2%}".format(total_dead/len(df)))

8.63%


# Model

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
df.columns

Index(['encounter_id', 'patient_id', 'hospital_id', 'hospital_death', 'age',
       'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height',
       ...
       'aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure',
       'immunosuppression', 'leukemia', 'lymphoma',
       'solid_tumor_with_metastasis', 'apache_3j_bodysystem',
       'apache_2_bodysystem'],
      dtype='object', length=186)

In [12]:
new_df = df[['bmi',
             'age',
             'gender',
             'ethnicity',
             'hospital_death']]

In [13]:
new_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
new_df.head()

Unnamed: 0,bmi,age,gender,ethnicity,hospital_death
0,22.73,68.0,M,Caucasian,0
1,27.42,77.0,F,Caucasian,0
2,31.95,25.0,F,Caucasian,0
3,22.64,81.0,F,Caucasian,0
5,27.56,67.0,M,Caucasian,0


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
encoder = LabelEncoder()

In [17]:
new_df.gender = encoder.fit_transform(new_df.gender)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [18]:
new_df.ethnicity = encoder.fit_transform(new_df.ethnicity)

In [19]:
new_df.head()

Unnamed: 0,bmi,age,gender,ethnicity,hospital_death
0,22.73,68.0,1,2,0
1,27.42,77.0,0,2,0
2,31.95,25.0,0,2,0
3,22.64,81.0,0,2,0
5,27.56,67.0,1,2,0


In [20]:
X = new_df.drop(columns='hospital_death')

In [21]:
y = new_df.hospital_death

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2, random_state = 123)

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
tree = DecisionTreeClassifier(max_depth=3, random_state=345)

In [25]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=345, splitter='best')

In [26]:
tree.score(X_train, y_train)

0.9173740292577208

In [27]:
tree.score(X_test, y_test)

0.9174091018540814

In [28]:
tree.predict_proba(X)

array([[0.91065276, 0.08934724],
       [0.91065276, 0.08934724],
       [0.97178548, 0.02821452],
       ...,
       [0.93275997, 0.06724003],
       [0.95561594, 0.04438406],
       [0.87709991, 0.12290009]])