In [154]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.validation import check_is_fitted

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", 200)

In [43]:
# Stata source file, addressed relative to the notebook's working directory
data_path = "dataset/H_MHAS_c2.dta"
df = pd.read_stata(filepath_or_buffer=data_path)

In [44]:
# Lower-cased copy of the column labels (df itself is left untouched)
column_names = df.columns.str.lower().tolist()

## Feature Pre-Selection

In [45]:
# Outcome column of every wave: r<wave>hosp1y for waves 1 through 5
target_columns = [f"r{wave}hosp1y" for wave in range(1, 6)]
target_columns_count = len(target_columns)

In [46]:
# Columns whose names carry no wave prefix; the reshape step repeats them on every wave's rows
no_wave_columns = ["unhhidnp", "ragender"]
no_wave_columns_count = len(no_wave_columns)

In [118]:
def _wave_columns(stem, waves=(1, 2, 3, 4, 5)):
    """Return the column names ``r<wave><stem>`` for the requested waves, in wave order."""
    return [f"r{wave}{stem}" for wave in waves]


picked_columns = [
    # Age at interview (only the years variable, agey, is selected)
    *_wave_columns("agey"),

    # Self-report of health
    *_wave_columns("shlt"),

    # Doctor diagnosed health problems: ever have condition
    *_wave_columns("hibpe"),
    *_wave_columns("diabe"),
    *_wave_columns("cancre"),
    *_wave_columns("respe"),
    *_wave_columns("hrtatte"),
    # hearte is selected for waves 4 and 5 only
    *_wave_columns("hearte", waves=(4, 5)),
    *_wave_columns("stroke"),
    *_wave_columns("arthre"),
    # NOTE(review): prefix is "s", not "r". The prefix-stripping cell maps this name to
    # "arthre" (already supplied by r1..r5arthre) and the reshape loop only rebuilds
    # r<wave> names, so this column never reaches the long-format frame.
    # Looks unintended - confirm before removing.
    "s5arthre",

    # RwBMI is the respondent's self-reported body mass index
    *_wave_columns("bmi"),

    # Health behaviors: physical activity or exercise
    *_wave_columns("vigact"),

    # Health behaviors: smoking (cigarettes)
    *_wave_columns("smokev"),
    *_wave_columns("smoken"),
    *_wave_columns("smokef"),
    *_wave_columns("strtsmok"),
    *_wave_columns("quitsmok"),

    # Health behaviors: preventive care
    *_wave_columns("cholst"),
    # flusht is selected for waves 3 to 5 only
    *_wave_columns("flusht", waves=(3, 4, 5)),
    *_wave_columns("breast"),
    *_wave_columns("mammog"),
    *_wave_columns("papsm"),
    *_wave_columns("prost"),
]

In [119]:
# Column order: wave-free columns first, then the five targets, then the candidate predictors
screening = [*no_wave_columns, *target_columns, *picked_columns]
print(screening)
# Work in a subset of the dataset
df_sub = df[screening]
df_sub.head(5)

['unhhidnp', 'ragender', 'r1hosp1y', 'r2hosp1y', 'r3hosp1y', 'r4hosp1y', 'r5hosp1y', 'r1agey', 'r2agey', 'r3agey', 'r4agey', 'r5agey', 'r1shlt', 'r2shlt', 'r3shlt', 'r4shlt', 'r5shlt', 'r1hibpe', 'r2hibpe', 'r3hibpe', 'r4hibpe', 'r5hibpe', 'r1diabe', 'r2diabe', 'r3diabe', 'r4diabe', 'r5diabe', 'r1cancre', 'r2cancre', 'r3cancre', 'r4cancre', 'r5cancre', 'r1respe', 'r2respe', 'r3respe', 'r4respe', 'r5respe', 'r1hrtatte', 'r2hrtatte', 'r3hrtatte', 'r4hrtatte', 'r5hrtatte', 'r4hearte', 'r5hearte', 'r1stroke', 'r2stroke', 'r3stroke', 'r4stroke', 'r5stroke', 'r1arthre', 'r2arthre', 'r3arthre', 'r4arthre', 'r5arthre', 's5arthre', 'r1bmi', 'r2bmi', 'r3bmi', 'r4bmi', 'r5bmi', 'r1vigact', 'r2vigact', 'r3vigact', 'r4vigact', 'r5vigact', 'r1smokev', 'r2smokev', 'r3smokev', 'r4smokev', 'r5smokev', 'r1smoken', 'r2smoken', 'r3smoken', 'r4smoken', 'r5smoken', 'r1smokef', 'r2smokef', 'r3smokef', 'r4smokef', 'r5smokef', 'r1strtsmok', 'r2strtsmok', 'r3strtsmok', 'r4strtsmok', 'r5strtsmok', 'r1quitsmok', 

Unnamed: 0,unhhidnp,ragender,r1hosp1y,r2hosp1y,r3hosp1y,r4hosp1y,r5hosp1y,r1agey,r2agey,r3agey,r4agey,r5agey,r1shlt,r2shlt,r3shlt,r4shlt,r5shlt,r1hibpe,r2hibpe,r3hibpe,r4hibpe,r5hibpe,r1diabe,r2diabe,r3diabe,r4diabe,r5diabe,r1cancre,r2cancre,r3cancre,r4cancre,r5cancre,r1respe,r2respe,r3respe,r4respe,r5respe,r1hrtatte,r2hrtatte,r3hrtatte,r4hrtatte,r5hrtatte,r4hearte,r5hearte,r1stroke,r2stroke,r3stroke,r4stroke,r5stroke,r1arthre,r2arthre,r3arthre,r4arthre,r5arthre,s5arthre,r1bmi,r2bmi,r3bmi,r4bmi,r5bmi,r1vigact,r2vigact,r3vigact,r4vigact,r5vigact,r1smokev,r2smokev,r3smokev,r4smokev,r5smokev,r1smoken,r2smoken,r3smoken,r4smoken,r5smoken,r1smokef,r2smokef,r3smokef,r4smokef,r5smokef,r1strtsmok,r2strtsmok,r3strtsmok,r4strtsmok,r5strtsmok,r1quitsmok,r2quitsmok,r3quitsmok,r4quitsmok,r5quitsmok,r1cholst,r2cholst,r3cholst,r4cholst,r5cholst,r3flusht,r4flusht,r5flusht,r1breast,r2breast,r3breast,r4breast,r5breast,r1mammog,r2mammog,r3mammog,r4mammog,r5mammog,r1papsm,r2papsm,r3papsm,r4papsm,r5papsm,r1prost,r2prost,r3prost,r4prost,r5prost
0,110.0,1.Man,1.Yes,0.No,0.No,,,59.0,61.0,71.0,,,5.Poor,2.Very good,5.Poor,,,1.yes,1.yes,1.yes,,,0.no,0.no,0.no,,,0.no,0.no,0.no,,,0.no,0.no,0.no,,,0.no,0.no,0.no,,,,,0.no,0.no,0.no,,,0.no,0.no,0.no,,,,29.068796,,28.719721,,,1.Yes,1.Yes,0.No,,,1.Yes,1.Yes,1.Yes,,,0.No,0.No,0.No,,,0.0,0.0,0.0,,,14.0,14.0,14.0,,,15.0,15.0,25.0,,,1.Yes,1.Yes,1.Yes,,,0.No,,,,,,,,,,,,,,,,,,0.No,0.No,0.No,,
1,120.0,2.Woman,0.No,0.No,0.No,0.No,0.No,50.0,52.0,62.0,65.0,68.0,4.Fair,4.Fair,3.Good,3.Good,3.Good,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,,,20.239502,,20.0,,0.No,0.No,0.No,0.No,0.No,0.No,0.No,0.No,0.No,0.No,0.No,0.No,0.No,0.No,0.No,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,1.Yes,1.Yes,0.No,0.No,1.Yes,0.No,0.No,0.No,1.Yes,1.Yes,1.Yes,1.Yes,0.No,0.No,0.No,0.No,0.No,0.No,1.Yes,1.Yes,1.Yes,0.No,0.No,,,,,
2,210.0,1.Man,0.No,0.No,0.No,0.No,1.Yes,69.0,72.0,81.0,83.0,86.0,5.Poor,,4.Fair,4.Fair,3.Good,0.no,0.no,0.no,0.no,1.yes,0.no,0.no,0.no,0.no,1.yes,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,1.yes,1.yes,,,39.262234,,26.709404,,0.No,,1.Yes,1.Yes,0.No,1.Yes,0.No,1.Yes,0.No,0.No,1.Yes,0.No,0.No,0.No,0.No,1.0,0.0,0.0,0.0,0.0,14.0,14.0,14.0,14.0,14.0,,,60.0,60.0,60.0,0.No,,1.Yes,1.Yes,1.Yes,0.No,1.Yes,0.No,,,,,,,,,,,,,,,,,,0.No,0.No,0.No
3,220.0,2.Woman,0.No,0.No,0.No,1.Yes,,48.0,50.0,70.0,74.0,,3.Good,4.Fair,,,,0.no,0.no,0.no,0.no,,0.no,0.no,0.no,0.no,,0.no,0.no,0.no,0.no,,0.no,0.no,0.no,0.no,,0.no,0.no,0.no,0.no,,0.no,,0.no,0.no,0.no,0.no,,0.no,0.no,0.no,0.no,,,,36.444443,22.038568,,,0.No,0.No,,,,0.No,0.No,0.No,0.No,,0.No,0.No,0.No,0.No,,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.No,1.Yes,,1.Yes,,,0.No,,0.No,0.No,,,,0.No,0.No,,,,0.No,1.Yes,,,,,,,,
4,310.0,1.Man,0.No,0.No,0.No,0.No,0.No,50.0,53.0,62.0,65.0,68.0,3.Good,3.Good,4.Fair,3.Good,3.Good,0.no,0.no,0.no,0.no,0.no,0.no,0.no,1.yes,1.yes,1.yes,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,0.no,26.81222,22.437674,28.400547,29.411762,25.605536,0.No,1.Yes,0.No,1.Yes,0.No,1.Yes,0.No,1.Yes,1.Yes,1.Yes,0.No,0.No,0.No,0.No,0.No,0.0,0.0,0.0,0.0,0.0,18.0,18.0,18.0,18.0,18.0,3.0,3.0,36.0,36.0,36.0,0.No,1.Yes,1.Yes,1.Yes,1.Yes,1.Yes,1.Yes,1.Yes,,,,,,,,,,,,,,,,1.Yes,1.Yes,1.Yes,1.Yes,1.Yes


In [120]:
# Original (wave-prefixed) column names: everything in `screening` after the leading
# wave-free columns, i.e. the five targets followed by the picked predictors.
wave_vars_orig_name = screening[no_wave_columns_count:]
print(wave_vars_orig_name)

['r1hosp1y', 'r2hosp1y', 'r3hosp1y', 'r4hosp1y', 'r5hosp1y', 'r1agey', 'r2agey', 'r3agey', 'r4agey', 'r5agey', 'r1shlt', 'r2shlt', 'r3shlt', 'r4shlt', 'r5shlt', 'r1hibpe', 'r2hibpe', 'r3hibpe', 'r4hibpe', 'r5hibpe', 'r1diabe', 'r2diabe', 'r3diabe', 'r4diabe', 'r5diabe', 'r1cancre', 'r2cancre', 'r3cancre', 'r4cancre', 'r5cancre', 'r1respe', 'r2respe', 'r3respe', 'r4respe', 'r5respe', 'r1hrtatte', 'r2hrtatte', 'r3hrtatte', 'r4hrtatte', 'r5hrtatte', 'r4hearte', 'r5hearte', 'r1stroke', 'r2stroke', 'r3stroke', 'r4stroke', 'r5stroke', 'r1arthre', 'r2arthre', 'r3arthre', 'r4arthre', 'r5arthre', 's5arthre', 'r1bmi', 'r2bmi', 'r3bmi', 'r4bmi', 'r5bmi', 'r1vigact', 'r2vigact', 'r3vigact', 'r4vigact', 'r5vigact', 'r1smokev', 'r2smokev', 'r3smokev', 'r4smokev', 'r5smokev', 'r1smoken', 'r2smoken', 'r3smoken', 'r4smoken', 'r5smoken', 'r1smokef', 'r2smokef', 'r3smokef', 'r4smokef', 'r5smokef', 'r1strtsmok', 'r2strtsmok', 'r3strtsmok', 'r4strtsmok', 'r5strtsmok', 'r1quitsmok', 'r2quitsmok', 'r3quitsmo

In [121]:
# Remove the wave reference characters
from collections import OrderedDict

# Dropping the first two characters ("<letter><wave>") leaves the variable stem.
# The OrderedDict keys de-duplicate the stems while keeping first-seen order.
wave_vars_clean_name = list(OrderedDict((name[2:], None) for name in wave_vars_orig_name))
wave_vars_clean_name_count = len(wave_vars_clean_name)
print(wave_vars_clean_name_count, wave_vars_clean_name)

24 ['hosp1y', 'agey', 'shlt', 'hibpe', 'diabe', 'cancre', 'respe', 'hrtatte', 'hearte', 'stroke', 'arthre', 'bmi', 'vigact', 'smokev', 'smoken', 'smokef', 'strtsmok', 'quitsmok', 'cholst', 'flusht', 'breast', 'mammog', 'papsm', 'prost']


In [122]:
# Decouple the wave as a column from the variables:
# wide layout (one row per respondent, r<wave><var> columns) -> long layout
# (one row per respondent and wave, plain <var> columns plus a "wave" column).

from pprint import pprint  # not used in this cell; kept in case other cells rely on the name

# One frame per wave is collected here and stacked with a single concat after the loop.
# Concatenating inside the loop would re-copy the whole accumulated frame on every pass.
wave_frames = []

# Loop over the wave numbers
for wave in range(1, 6):
    # Candidate column names for this wave; not every variable exists in every wave
    _names = [f"r{wave}{var}" for var in wave_vars_clean_name]
    wave_column_names = [name for name in _names if name in df_sub.columns]  # Only keep valid column names
    # Map "r<wave><var>" back to "<var>" so all waves share the same column names
    column_mapping = {name: name[2:] for name in wave_column_names}

    # Selecting with a list and renaming both return new frames, so df_sub is not modified
    wave_df = df_sub[no_wave_columns + wave_column_names].rename(columns=column_mapping)

    # Insert the wave number as a new leading column
    wave_df.insert(0, "wave", wave)

    wave_frames.append(wave_df)

# Stack all waves. Columns missing from a wave are filled with NaN; the original row
# index of df_sub is kept, so index values repeat once per wave.
accumulated_df = pd.concat(wave_frames)


In [123]:
# (rows, columns) of the stacked per-wave frame
accumulated_df.shape

(134195, 27)

In [124]:
# Number of leading columns removed positionally by the next cell: "wave" sits at
# position 0 (inserted there by the reshape loop) and "unhhidnp" is the first entry
# of no_wave_columns.
# NOTE(review): once unhhidnp is gone, the later row-wise train/val/test split cannot
# keep all waves of one respondent inside a single partition - confirm this is acceptable.
drop_first_columns_count = 2 # wave and unhhidnp columns are not needed

In [125]:
# Prepare new clean dataset: discard the leading bookkeeping columns and every
# row whose target (hosp1y) is missing.
df_clean = (
    accumulated_df
    .iloc[:, drop_first_columns_count:]
    .dropna(subset=["hosp1y"])
    .copy()
)
print(df_clean.shape)
# Exchange the first two column names (ragender, hosp1y) so the target comes first
cols = df_clean.columns.tolist()
cols[:2] = [cols[1], cols[0]]
df_clean = df_clean.reindex(columns=cols)

(76353, 25)


## Data Preprocessing

In [126]:
# Preview the first rows of the long-format modelling frame (target in the first column)
df_clean.head()

Unnamed: 0,hosp1y,ragender,agey,shlt,hibpe,diabe,cancre,respe,hrtatte,stroke,arthre,bmi,vigact,smokev,smoken,smokef,strtsmok,quitsmok,cholst,breast,mammog,papsm,prost,flusht,hearte
0,1.Yes,1.Man,59.0,5.Poor,1.yes,0.no,0.no,0.no,0.no,0.no,0.no,29.068796,1.Yes,1.Yes,0.No,0.0,14.0,15.0,1.Yes,,,,0.No,,
1,0.No,2.Woman,50.0,4.Fair,0.no,0.no,0.no,0.no,0.no,0.no,0.no,,0.No,0.No,0.No,0.0,,,1.Yes,1.Yes,0.No,1.Yes,,,
2,0.No,1.Man,69.0,5.Poor,0.no,0.no,0.no,0.no,0.no,0.no,0.no,,0.No,1.Yes,1.Yes,1.0,14.0,,0.No,,,,,,
3,0.No,2.Woman,48.0,3.Good,0.no,0.no,0.no,0.no,0.no,0.no,0.no,,0.No,0.No,0.No,0.0,,,0.No,0.No,0.No,0.No,,,
4,0.No,1.Man,50.0,3.Good,0.no,0.no,0.no,0.no,0.no,0.no,0.no,26.81222,0.No,1.Yes,0.No,0.0,18.0,3.0,0.No,,,,1.Yes,,


In [127]:
# Missing-value count per column
df_clean.isna().sum()

hosp1y          0
ragender        0
agey           34
shlt         5727
hibpe         578
diabe         584
cancre        536
respe         518
hrtatte       516
stroke        493
arthre        529
bmi         13793
vigact       5847
smokev         44
smoken         54
smokef         72
strtsmok    42317
quitsmok    53011
cholst       3703
breast      35155
mammog      35167
papsm       35531
prost       47743
flusht      30310
hearte      44562
dtype: int64

In [128]:
# Drop rows where the target (hosp1y) or self-rated health (shlt) is missing.
# hosp1y was already filtered when df_clean was built, so in practice this removes
# the rows with a missing shlt.
df_clean = df_clean.dropna(subset=["hosp1y","shlt"]).copy()
df_clean.shape

(70626, 25)

In [129]:
# Column dtypes: used below to separate numerical from categorical features
df_clean.dtypes

hosp1y      category
ragender    category
agey         float64
shlt        category
hibpe       category
diabe       category
cancre      category
respe       category
hrtatte     category
stroke      category
arthre      category
bmi          float32
vigact      category
smokev      category
smoken      category
smokef       float64
strtsmok     float64
quitsmok     float64
cholst      category
breast      category
mammog      category
papsm       category
prost       category
flusht      category
hearte      category
dtype: object

In [130]:
# Prepare target value: 1 where the answer is "1.Yes", 0 otherwise.
# The guard makes the cell safe to re-run: once the column holds integers, comparing
# it with "1.Yes" again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df_clean["hosp1y"]):
    df_clean["hosp1y"] = (df_clean["hosp1y"] == "1.Yes").astype(int)

In [131]:
# Class balance of the binary target
df_clean.hosp1y.value_counts()

hosp1y
0    62564
1     8062
Name: count, dtype: int64

In [132]:
from sklearn.model_selection import train_test_split

def split_dataset(df, target, random_state=1, stratify=False):
    """Split ``df`` row-wise into train / validation / test parts (about 60 / 20 / 20 %).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and the ``target`` column.
    target : str
        Name of the label column.
    random_state : int, default 1
        Seed forwarded to both ``train_test_split`` calls.
    stratify : bool, default False
        When True, both splits are stratified on ``target`` so each part keeps
        the label proportions of ``df``. The default (False) performs plain
        random splits, exactly as before this option existed.

    Returns
    -------
    tuple
        ``(df_full_train, df_train, df_val, df_test, y_train, y_val, y_test)``.
        ``df_full_train`` (train + validation rows) keeps the target column; the
        other three frames have it removed and their labels are returned as
        arrays. Every returned frame has a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(target)

    # First cut: 20 % of all rows become the test set
    df_full_train, df_test = train_test_split(
        df,
        test_size=.2,
        random_state=random_state,
        stratify=df[target] if stratify else None,
    )
    # Second cut: .2/.8 of the remaining rows (= 20 % of the total) become validation
    df_train, df_val = train_test_split(
        df_full_train,
        test_size=.2/.8,
        random_state=random_state,
        stratify=df_full_train[target] if stratify else None,
    )

    # Reset index; reset_index returns new frames, so removing the target below
    # does not touch the caller's data
    df_full_train = df_full_train.reset_index(drop=True)
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    # pop() removes the target column from the frame and hands it back in one step
    y_train = df_train.pop(target).values
    y_val = df_val.pop(target).values
    y_test = df_test.pop(target).values

    return df_full_train, df_train, df_val, df_test, y_train, y_val, y_test

In [133]:
# Split the cleaned frame with "hosp1y" as the label; df_full_train keeps the label
# column, the other three frames do not (see split_dataset).
df_full_train, df_train, df_val, df_test, y_train, y_val, y_test = split_dataset(df_clean, "hosp1y")
# Row counts of the train / validation / test parts
len(df_train), len(df_val), len(df_test)

(42375, 14125, 14126)

## Preprocessing: inspect feature types

In [134]:
# (rows, columns) of train + validation combined, label column included
df_full_train.shape

(56500, 25)

In [135]:
# dtypes after the split; hosp1y is now an integer label
df_full_train.dtypes

hosp1y         int64
ragender    category
agey         float64
shlt        category
hibpe       category
diabe       category
cancre      category
respe       category
hrtatte     category
stroke      category
arthre      category
bmi          float32
vigact      category
smokev      category
smoken      category
smokef       float64
strtsmok     float64
quitsmok     float64
cholst      category
breast      category
mammog      category
papsm       category
prost       category
flusht      category
hearte      category
dtype: object

In [136]:
# List the distinct values of every column (NaN included) to spot odd codes
for col, column_values in df_full_train.items():
    print(col)
    print(list(column_values.unique()))

hosp1y
[0, 1]
ragender
['2.Woman', '1.Man']
agey
[56.0, 59.0, 69.0, 76.0, 68.0, 80.0, 62.0, 70.0, 53.0, 64.0, 77.0, 50.0, 54.0, 51.0, 78.0, 61.0, 79.0, 39.0, 55.0, 47.0, 45.0, 66.0, 60.0, 58.0, 63.0, 57.0, 65.0, 49.0, 90.0, 48.0, 83.0, 72.0, 75.0, 67.0, 92.0, 35.0, 43.0, 74.0, 73.0, 52.0, 82.0, 46.0, 84.0, 89.0, 71.0, 44.0, 34.0, 28.0, nan, 42.0, 38.0, 81.0, 86.0, 31.0, 87.0, 91.0, 85.0, 88.0, 40.0, 97.0, 96.0, 94.0, 29.0, 98.0, 112.0, 41.0, 32.0, 27.0, 37.0, 21.0, 93.0, 25.0, 107.0, 110.0, 102.0, 30.0, 36.0, 99.0, 95.0, 33.0, 19.0, 105.0, 26.0, 22.0, 24.0, 18.0, 101.0, 100.0, 106.0, 16.0, 20.0, 23.0]
shlt
['3.Good', '4.Fair', '5.Poor', '2.Very good', '1.Excellent']
hibpe
['1.yes', '0.no', nan]
diabe
['0.no', '1.yes', nan]
cancre
['0.no', nan, '1.yes']
respe
['0.no', '1.yes', nan]
hrtatte
['0.no', '1.yes', nan]
stroke
['0.no', nan, '1.yes']
arthre
['0.no', '1.yes', nan]
bmi
[20.342798, 26.395803, nan, 26.037493, 41.09139, 24.977043, 31.202566, 31.640625, 26.171875, 25.402817, 29.136318

In [137]:
# Numerical features: every non-categorical column except the target.
# The target is excluded by name rather than by slicing off the first entry, so the
# result does not depend on where hosp1y sits among the columns.
_non_categorical = df_full_train.dtypes[df_full_train.dtypes.ne("category")].index
numerical = [col for col in _non_categorical if col != "hosp1y"]
numerical

['agey', 'bmi', 'smokef', 'strtsmok', 'quitsmok']

In [138]:
# Categorical features: every column that is neither numerical nor the target.
# The target is excluded by name instead of by position (it used to be dropped with
# [1:], which silently removes a real feature if hosp1y is not the first column).
categorical = [
    col
    for col in df_full_train.columns
    if col not in numerical and col != "hosp1y"
]
categorical

['ragender',
 'shlt',
 'hibpe',
 'diabe',
 'cancre',
 'respe',
 'hrtatte',
 'stroke',
 'arthre',
 'vigact',
 'smokev',
 'smoken',
 'cholst',
 'breast',
 'mammog',
 'papsm',
 'prost',
 'flusht',
 'hearte']

In [139]:
# Look at the numerical features only (pandas truncates the display of the full frame)
df_full_train[numerical]

Unnamed: 0,agey,bmi,smokef,strtsmok,quitsmok
0,56.0,20.342798,0.0,,
1,59.0,26.395803,1.0,21.0,
2,56.0,,0.0,11.0,
3,69.0,,0.0,,
4,76.0,26.037493,0.0,,
...,...,...,...,...,...
56495,71.0,24.557755,0.0,,
56496,67.0,30.818542,10.0,50.0,
56497,75.0,22.206331,6.0,16.0,
56498,62.0,24.801588,0.0,,


In [140]:
# Correlation of each numerical feature with the 0/1 target
df_full_train[numerical].corrwith(df_full_train.hosp1y)

agey        0.072868
bmi        -0.000478
smokef     -0.029393
strtsmok   -0.001650
quitsmok    0.040857
dtype: float64

In [141]:
# Show the observed levels (and declared categories) of every categorical feature
for col in categorical:
    observed_levels = df_full_train[col].unique()
    print("\t", col, "->", observed_levels)

	 ragender -> ['2.Woman', '1.Man']
Categories (2, object): ['1.Man' < '2.Woman']
	 shlt -> ['3.Good', '4.Fair', '5.Poor', '2.Very good', '1.Excellent']
Categories (5, object): ['1.Excellent' < '2.Very good' < '3.Good' < '4.Fair' < '5.Poor']
	 hibpe -> ['1.yes', '0.no', NaN]
Categories (2, object): ['0.no' < '1.yes']
	 diabe -> ['0.no', '1.yes', NaN]
Categories (2, object): ['0.no' < '1.yes']
	 cancre -> ['0.no', NaN, '1.yes']
Categories (2, object): ['0.no' < '1.yes']
	 respe -> ['0.no', '1.yes', NaN]
Categories (2, object): ['0.no' < '1.yes']
	 hrtatte -> ['0.no', '1.yes', NaN]
Categories (2, object): ['0.no' < '1.yes']
	 stroke -> ['0.no', NaN, '1.yes']
Categories (2, object): ['0.no' < '1.yes']
	 arthre -> ['0.no', '1.yes', NaN]
Categories (2, object): ['0.no' < '1.yes']
	 vigact -> ['0.No', '1.Yes', NaN]
Categories (2, object): ['0.No' < '1.Yes']
	 smokev -> ['0.No', '1.Yes', NaN]
Categories (2, object): ['0.No' < '1.Yes']
	 smoken -> ['0.No', '1.Yes', NaN]
Categories (2, object): 

In [142]:
# Column groups handed to preprocess_data: which features are scaled as numbers
# and which are one-hot encoded
enc_mapper = dict(numerical=numerical, categorical=categorical)

In [143]:
# Sanity check of both column groups
enc_mapper["categorical"], enc_mapper["numerical"]

(['ragender',
  'shlt',
  'hibpe',
  'diabe',
  'cancre',
  'respe',
  'hrtatte',
  'stroke',
  'arthre',
  'vigact',
  'smokev',
  'smoken',
  'cholst',
  'breast',
  'mammog',
  'papsm',
  'prost',
  'flusht',
  'hearte'],
 ['agey', 'bmi', 'smokef', 'strtsmok', 'quitsmok'])

In [144]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder


def preprocess_data(df_train, df_val, df_test, enc_mapper):
    """Turn the three feature frames into dense, imputed, min-max scaled arrays.

    Every transformer is fitted on the training split only and then applied to
    all three splits:

    1. the numerical columns are taken as they are;
    2. the categorical columns are one-hot encoded and appended to the right
       of the numerical block;
    3. missing values are replaced by the per-column median of the training matrix;
    4. every column is rescaled with ``MinMaxScaler`` using the training min/max
       (validation / test values can therefore fall outside 0-1).

    Parameters
    ----------
    df_train, df_val, df_test : pandas.DataFrame
        Feature frames. They are only read, never modified.
    enc_mapper : dict
        ``{"numerical": [...], "categorical": [...]}`` lists of column names.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_val, X_test)``; columns are the numerical features first,
        followed by the one-hot columns.

    Notes
    -----
    The fitted encoder, imputer and scaler are local to this function and are
    not returned, so new data cannot be transformed consistently later on.
    NOTE(review): consider returning them or wrapping the steps in a Pipeline.
    """
    # Print shape of input data
    print("Input train data shape: ", df_train.shape)
    print("Input val data shape: ", df_val.shape)
    print("Input test data shape: ", df_test.shape, "\n")

    # Group cols by type
    categorical = enc_mapper["categorical"]
    numerical = enc_mapper["numerical"]

    # Fit the encoder on the training data only
    ohe = OneHotEncoder()
    ohe.fit(df_train[categorical])

    # numerical block | one-hot block, for train, val and test in that order.
    # toarray() yields a plain ndarray (todense() would give a numpy.matrix that
    # has to be converted back afterwards).
    matrices = [
        np.hstack((part[numerical].to_numpy(), ohe.transform(part[categorical]).toarray()))
        for part in (df_train, df_val, df_test)
    ]

    # Median imputation; the one-hot columns contain no NaN, so only columns with
    # missing entries (the numerical ones) are changed
    imputer = SimpleImputer(strategy='median')
    imputer.fit(matrices[0])
    matrices = [imputer.transform(matrix) for matrix in matrices]

    # Rescale using the training range
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(matrices[0])
    X_train, X_val, X_test = (scaler.transform(matrix) for matrix in matrices)

    return X_train, X_val, X_test

## Preprocessing: encode, impute and scale

In [145]:
# Build the model matrices; the transformers are fitted on df_train only (see preprocess_data)
X_train, X_val, X_test = preprocess_data(df_train, df_val, df_test, enc_mapper)

Input train data shape:  (42375, 24)
Input val data shape:  (14125, 24)
Input test data shape:  (14126, 24) 



## Logistic Regression

In [146]:
from sklearn.linear_model import LogisticRegression

In [151]:
%%time

# Reset the name before building a fresh model
log_reg = None
# Logistic regression with the liblinear solver. In scikit-learn C is the inverse
# regularisation strength, so C=0.0001 applies a very strong penalty.
log_reg = LogisticRegression(C=0.0001, solver='liblinear', max_iter=1000)
log_reg.fit(X_train, y_train)

CPU times: user 126 ms, sys: 196 µs, total: 126 ms
Wall time: 124 ms


In [152]:
# predict_proba returns one column per class; column 1 is the probability of class 1.
# Computed for the training matrix first, then for the validation matrix.
log_reg_train, log_reg_val = (
    log_reg.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

In [153]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the logistic regression on the training and validation splits
roc_auc_train = roc_auc_score(y_train, log_reg_train)
roc_auc_val = roc_auc_score(y_val, log_reg_val)

print(f"Train ROC AUC Score: {roc_auc_train:.4f}")
print(f"Validation ROC AUC Score: {roc_auc_val:.4f}")

Train ROC AUC Score: 0.6793
Validation ROC AUC Score: 0.6944


## Random Forest

In [156]:
%%time

# Random forest with 100 trees and no depth limit given, trained on all cores (n_jobs=-1).
# NOTE(review): the scores recorded further down (train AUC 1.0000 vs validation 0.6655)
# show the forest fitting the training rows perfectly while generalising worse than the
# logistic regression; constraining tree depth / leaf size (for example with the already
# imported RandomizedSearchCV) is the obvious next step.
rf = RandomForestClassifier(n_estimators = 100, random_state = 47, verbose = 1, n_jobs = -1)
rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s


CPU times: user 10.7 s, sys: 107 ms, total: 10.8 s
Wall time: 1.77 s


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.7s finished


In [157]:
# Probability of class 1 (second predict_proba column), training matrix first,
# validation matrix second
rf_pred_train, rf_pred_val = (
    rf.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


In [158]:
# ROC AUC of the random forest on the training and validation splits
# (reuses the roc_auc_train / roc_auc_val names of the logistic-regression cell)
roc_auc_train = roc_auc_score(y_train, rf_pred_train)
roc_auc_val = roc_auc_score(y_val, rf_pred_val)

print(f"Train ROC AUC Score: {roc_auc_train:.4f}")
print(f"Validation ROC AUC Score: {roc_auc_val:.4f}")

Train ROC AUC Score: 1.0000
Validation ROC AUC Score: 0.6655
