# Imports 

In [34]:
import pandas as pd
import datetime as dt
import numpy as np

# Load and Clean the Data

##### Dummy Variables

In [35]:
# Load the raw length-of-stay extract and give every column a camelCase name.
# rename() ignores keys that are not present as columns, so each key must match
# the CSV header exactly (note the padded ' TOTALCHG ').
df_los = (
    pd.read_csv('./data/LOS_data.csv')
    .rename(columns={
        # Encounter identifiers and timing
        'Patient ID': 'patientID',
        'DISCHQTR': 'dischargeQTR',
        'MnthName': 'month',
        'AdmitType': 'admitType',
        'Discharge Status': 'dischargeStatus',
        'ADMITWeekday': 'admittedDOW',
        'Hospital': 'hospital',
        # Demographics
        'AgeGroup': 'ageGroup',
        'SEX': 'sex',
        'RACE': 'race',
        'ETHNICIT': 'ethnicity',
        # Acuity measures
        'Case Mix Index': 'caseMixIndex',
        'Severity (APR DRG)': 'severity',
        'Risk of Mortality (APR DRG)': 'riskOfMortality',
        # Diagnoses and procedures
        'PrincipleDiagnosisCode': 'principleDiagnosisCode',
        'PrincipalDiagnosis': 'principalDiagnosis',
        'DiagPosCnt': 'diagPosCnt',
        'PrinicpleProcedureCode': 'principalProcedureCode',
        'PrincipalProcedure': 'principalProcedure',
        'ProcPosCnt': 'procPosCnt',
        # Product lines, payer and charges
        'Product Line': 'productLine',
        'AltProductLine1': 'altProductLine1',
        'AltProductLine1SUB': 'altProductLine1SUB',
        'PayCode1': 'payCode',
        ' TOTALCHG ': 'totalCharge',
    })
)

# Weekday abbreviation -> zero-based position in the week (Mon=0 ... Sun=6)
weekday_mapping = {
    day_name: day_index
    for day_index, day_name in enumerate(
        ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    )
}

def _strip_code_prefix(value):
    """Return the text after the first '-' of a '<code>-<description>' string.

    Only the first hyphen is treated as the separator, so hyphens inside the
    description itself are preserved. Non-string values (e.g. NaN) and strings
    without any hyphen are returned unchanged.
    """
    if isinstance(value, str):
        return value.split('-', 1)[-1]
    return value


# Get rid of the code at the front of APR DRG and principalDiagnosis
df_los['APR DRG'] = df_los['APR DRG'].apply(_strip_code_prefix)
df_los['principalDiagnosis'] = df_los['principalDiagnosis'].apply(_strip_code_prefix)

# Extract the last digit of dischargeQTR and convert to int
df_los['dischargeQTR'] = df_los['dischargeQTR'].astype(str).str[-1].astype(int)

# Parse the abbreviated month name ('%b') and keep only the month number
df_los['month'] = pd.to_datetime(df_los['month'], format='%b').dt.month

# Drop columns that are not carried forward.
# NOTE(review): 'APR_DRG' (underscore) is dropped here while 'APR DRG' (space)
# is the column cleaned above and kept - confirm both exist in the CSV and
# that this is the one meant to be dropped.
df_los = df_los.drop(columns=['In/Out', 'YEAR', 'LOSGroupName', 'APR_DRG'])

# Replace 'Medical Emergency' with 'Emergency' in admitType column
df_los['admitType'] = df_los['admitType'].replace({'Medical Emergency': 'Emergency'})

# Map weekday names to numerical values; names absent from weekday_mapping become NaN
df_los['admittedDOW'] = df_los['admittedDOW'].map(weekday_mapping)

# Convert LOS to int
df_los['LOS'] = df_los['LOS'].astype(int)

# Remove '$' and ',' from totalCharge and convert to float
df_los['totalCharge'] = df_los['totalCharge'].replace('[$,]', '', regex=True).astype(float)

# Strip the 'Years' label from ageGroup. '[Years ]' is a regex character class,
# so every 'Y', 'e', 'a', 'r', 's' and space character is removed wherever it occurs.
df_los['ageGroup'] = df_los['ageGroup'].replace('[Years ]', '', regex=True)

# Make Target Variable: the gap between GM-LOS and the actual LOS, scaled by 1000.
# The subtraction must happen before the scaling; without the parentheses only
# GM-LOS would be multiplied and the result would mix scaled and unscaled values.
# NOTE(review): 1000 is presumably a cost per day - confirm the intended rate.
df_los['LOSDiscrepancyCost'] = (df_los['GM-LOS'] - df_los['LOS']) * 1000

# Drop where the target variable is missing
df_los = df_los.dropna(subset=['LOSDiscrepancyCost'])

In [36]:
# Write the cleaned frame to CSV; index=False keeps the row index out of the file
df_los.to_csv('./data/los_dummy.csv', index=False)

##### Categorical Variables

In [37]:
# Columns that each get a categorical-dtype twin named '<column>_cat'
cols_to_categorize = [
    'month', 'admitType', 'dischargeStatus', 'admittedDOW', 'hospital',
    'ageGroup', 'sex', 'race', 'principalProcedure', 'productLine',
]

# Names of the twins, in the same order as cols_to_categorize
categorical_cols = [f'{name}_cat' for name in cols_to_categorize]

# The twins are added to df_los itself, alongside the original columns
for col, cat_name in zip(cols_to_categorize, categorical_cols):
    df_los[cat_name] = df_los[col].astype('category')

# df_los_cat is a separate copy that keeps only the categorical versions
df_los_cat = df_los.copy().drop(columns=cols_to_categorize)

In [42]:
# Serialize the categorical frame to disk with pickle.
# NOTE(review): presumably pickle is used instead of CSV so the 'category'
# dtypes survive reloading - confirm.
df_los_cat.to_pickle('./data/los_cat.pkl')