# EmOpti Workshop - Data Preparation for XGBoost

The `Python 3 (Data Science)` kernel works well with this notebook.

In [None]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# A single boto3 session supplies both the region name and the SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name

# SageMaker SDK session, its default S3 bucket, and the key prefix used in this notebook.
session = sagemaker.Session()
s3bucket = session.default_bucket()
s3prefix = "emopti"

# Path of the raw CSV, relative to the notebook's working directory.
local_data_path = './data/emopti_data.csv'

# Execution role as reported by the SageMaker SDK.
role = get_execution_role()

# Low-level SageMaker API client bound to the same region.
sm = boto_session.client(service_name="sagemaker", region_name=region)

## Data Preparation
Read the data into a pandas DataFrame and take a look at the first few rows.

In [None]:
import pandas as pd
import numpy as np

# Display settings: show every column, but cap the rows so output stays on one page.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 10

# Load the raw CSV and preview the first five rows.
df = pd.read_csv(local_data_path)
df.head()

#### The 'age' column has some extraneous characters, fix it so it only contains digits and force the column to type *int*

In [None]:
# Pull the first run of digits out of each 'age' entry. The raw string keeps `\d`
# from being treated as an invalid escape sequence, and casting to str first lets
# this cell be re-run after 'age' has already been converted to a number.
age_digits = df['age'].astype(str).str.extract(r'(\d+)', expand=False)
age_numeric = pd.to_numeric(age_digits, errors='coerce')

# Entries with no digits (or missing values) come out as NaN, which cannot be cast
# to int64. Fill them with the column mean and round so the cast always succeeds.
age_numeric = age_numeric.fillna(age_numeric.mean()).round()
df['age'] = age_numeric.astype('int64')

#### For numeric columns, fill in missing values with the mean of that column

In [None]:
# Numeric feature columns whose missing values are replaced by the column mean.
numeric_cols = [
    'age',
    'first_temperature',
    'first_heart_rate',
    'first_bp_systolic',
    'first_bp_diastolic',
    'first_pain_level',
    'first_respiratory_rate',
    'first_spo2',
    'first_esi',
]

for col in numeric_cols:
    # Assign the result back instead of using `inplace=True` on `df[col]`:
    # in-place fillna on a column selection is chained assignment, which warns in
    # pandas 2.x and leaves `df` unchanged once Copy-on-Write is enabled.
    df[col] = df[col].fillna(df[col].mean())

There should be only one target ('y') column. This exercise predicts 'calc_disp', so drop the 'admit_dest' column.

Also, the 'chief_complaint' column is free-form text, so we will also drop it for this exercise.


In [None]:
# Remove the second target column and the free-form text column.
unused_columns = ['admit_dest', 'chief_complaint']
df = df.drop(unused_columns, axis='columns')

#### For categorical values such as Gender, use one-hot-encoding

In [None]:
# One-hot encode 'gender'. Request an integer dtype explicitly: pandas 2.x defaults
# to bool indicator columns, which are written to CSV as True/False rather than
# the numeric 0/1 values a CSV-based training job needs.
df = pd.get_dummies(df, columns=['gender'], dtype=int)

#### Convert the 'calc_disp' column to binary

In [None]:
# Keep only rows whose disposition is ADMIT or DISCHARGE. Filtering on the allowed
# values (rather than excluding just 'OTHER') also removes missing or unexpected
# dispositions, so every remaining row can be encoded as 0/1.
# Run this before the label-encoding cell: once 'calc_disp' holds 0/1 no row matches.
valid_dispositions = ['ADMIT', 'DISCHARGE']
# .copy() gives an independent frame, so later column assignments on `df` do not
# trigger SettingWithCopyWarning.
df = df[df['calc_disp'].isin(valid_dispositions)].copy()

In [None]:
# Encode the target column: DISCHARGE becomes 1 and ADMIT becomes 0.
disposition_codes = {'DISCHARGE': 1, 'ADMIT': 0}
df['calc_disp'] = df['calc_disp'].replace(disposition_codes)

In [None]:
# The notebook-wide setting caps output at 10 rows, which would truncate a 20-row
# preview. Raise the limit for this one display only.
with pd.option_context('display.max_rows', 20):
    display(df.head(20))

#### Hold out 20% of the rows for batch inference on the trained model


In [None]:
from pathlib import Path

# Directory that receives every file written by this cell.
output_dir = Path('data/xgb')
# Pure-Python replacement for `!mkdir -p`: creates parent directories, tolerates an
# existing directory, and does not depend on a POSIX shell.
output_dir.mkdir(parents=True, exist_ok=True)

# 80% of the rows go to training; the fixed seed keeps the split reproducible.
df_train = df.sample(frac=0.8, random_state=12345)
# Every row not sampled for training is held out for batch inference.
df_test = df.drop(df_train.index)
# Dropping by index label only partitions the frame when the index is unique.
assert len(df_train) + len(df_test) == len(df), "train/test split lost or duplicated rows"

# Training data keeps the label column and is written with a header row.
# NOTE(review): SageMaker's built-in XGBoost CSV input expects no header and the
# label in the first column - confirm the training step accounts for this.
train_filename = 'train.csv'
df_train.to_csv(output_dir / train_filename, index=False)

# Save the held-out 'calc_disp' labels, one value per line with no header.
test_labels = 'test_labels.csv'
df_test['calc_disp'].to_csv(output_dir / test_labels, index=False, header=False)

# Save the held-out features without the label column.
df_test = df_test.drop(columns=['calc_disp'])
test_filename = 'test.csv'
df_test.to_csv(output_dir / test_filename, index=False)

