# Import libraries

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import scrapy
from scrapy import Selector
from typing import List
import re

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

# Helper functions

In [14]:
# Referenced Robert's lab python file
def parse_row(row:Selector) -> List[str]:
    '''
    Parses a html row into a list of individual elements
    '''
    cells = row.xpath('.//th | .//td')
    row_data = []
    
    for cell in cells:
        cell_text = cell.xpath('normalize-space(.)').get()
        cell_text = re.sub(r'<.*?>', ' ', cell_text)  # Remove remaining HTML tags
        # if there are br tags, there will be some binary characters
        cell_text = cell_text.replace('\xa0', '')  # Remove \xa0 characters
        row_data.append(cell_text)
    
    
    return row_data

# Referenced Robert's lab python file
def parse_table_as_df(table_sel:Selector,header:bool=True) -> pd.DataFrame:
    '''
    Parses a html table and returns it as a Pandas DataFrame
    '''
    # extract rows
    rows = table_sel.xpath('./tbody//tr')
    
    # parse header and the remaining rows
    columns = None
    start_row = 0
    if header:
        columns = parse_row(rows[0])
        start_row += 1
        
    table_data = [parse_row(row) for row in rows[start_row:]]
    
    # return data frame
    return pd.DataFrame(table_data,columns=columns)


def age_to_smoke_percent(abs_data, age):
    if pd.isna(age):
        return np.nan
    for i, row in abs_data.iterrows():
        age_group = row["Age Range"]

        if len(age_group) == 2:
            if int(age) >= int(age_group[0]) and int(age) <= int(age_group[1]):
                return row['2022 (%)']
        elif len(age_group) == 1:
            if int(age) >= int(age_group[0]):
                return row['2022 (%)']
    return np.nan

def cdc_age_sex_smoke(cdc_age_data, age, cdc_sex_data, sex):
    if pd.isna(age) or pd.isna(sex):
        return np.nan
    age_rate = np.nan
    for index, row in cdc_age_data.iterrows():
        age_range = row['Age Range']
        if len(age_range) == 2:
            if int(age) >= int(age_range[0]) and int(age) <= int(age_range[1]):
                age_rate = float(row['2022 (%)'])
        elif len(age_range) == 1:
            if int(age) >= int(age_range[0]):
                age_rate = float(row['2022 (%)'])

    sex_rate = np.nan
    if sex == 0.0:
        sex_rate = float(cdc_sex_data[cdc_sex_data['Sex'] == 'Female']['2022 (%)'].values[0])
    else:
        female_rate = float(cdc_sex_data[cdc_sex_data['Sex'] == 'Female']['2022 (%)'].values[0])
        male_rate = float(cdc_sex_data[cdc_sex_data['Sex'] == 'Male']['2022 (%)'].values[0])
        sex_rate = age_rate * (male_rate / female_rate)

    return sex_rate

def smoke_transform(x):
    if pd.isna(x):
        x = np.nan
    else:
        x = float(x)
        if x >= 12:
            x = 1
        else:
            x = 0
    return x

def impute_smoke_transform(row):
    if pd.isna(row["smoke"]):
        if row["abs_transformed"] == 1 or row["cdc_transformed"] == 1:
            return 1
        else:
            return 0
    return row["smoke"]

def evaluate_train_model(X, y, model):
    roc_auc = cross_val_score(model, X, y, scoring = "roc_auc", cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100))
    f1 = cross_val_score(model, X, y, scoring = "f1", cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100))
    accuracy = cross_val_score(model, X, y, scoring = "accuracy", cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100))
    to_return_dict = {"accuracy": np.mean(accuracy), "f1": np.mean(f1), "roc_auc": np.mean(roc_auc)}
    return to_return_dict


# 1. Load data

In [24]:
path_to_csv = "../data/heart_disease.csv"
df = pd.read_csv(path_to_csv)

# 2. Clean data

In [16]:
# Cleaning step 1
columns = ['age', 'sex', 'painloc', 'painexer', 'cp', 'trestbps', 'smoke', 'fbs', 'prop', 'nitr', 'pro', 'diuretic', 'thaldur', 'thalach', 'exang', 'oldpeak', 'slope', 'target']
df = df[columns]

# Cleaning step 2
def trestbpsClean(x):
    if x < 100:
        x = np.nan
    return x
def oldpeakClean(x):
    if x < 0 or x > 4:
        x = np.nan
    return x
def greaterthanoneClean(x):
    if x > 1:
        x = np.nan
    return x
df['trestbps'] = df['trestbps'].apply(trestbpsClean)
df['oldpeak'] = df['oldpeak'].apply(oldpeakClean)
for col in ['fbs', 'prop', 'nitr', 'pro', 'diuretic']:
    df[col] = df[col].apply(greaterthanoneClean)

for col in ['painloc', 'painexer', 'fbs', 'prop', 'nitr', 'pro', 'diuretic', 'exang', 'slope']:
    mode_object = df[col].mode()
    df[col].fillna(mode_object[0], inplace=True)

for col in ['trestbps', 'thaldur', 'thalach', 'oldpeak']:
    df[col].fillna(df[col].median(), inplace=True)
    
df["age"] = pd.to_numeric(df["age"], errors='coerce')

# Cleaning step 3
df['abs_smoke'] = np.NaN
df['cdc_smoke'] = np.NaN


In [17]:
# Source 2
url = "https://www.cdc.gov/tobacco/data_statistics/fact_sheets/adult_data/cig_smoking/index.htm"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# CDC DATA
html_sections = str(soup).split('<h4 class="card-header h4 bg-white rounded-0">')
html_sections = html_sections[1:]
cdc_data = {}
for section in html_sections:
    sup_index = section.index("<sup>")
    to_append = section[3:sup_index]
    soup_section = BeautifulSoup(section, 'html.parser')
    data_key = soup_section.find_all('li')
    data_list = []
    for item in data_key:
        if item is not None:
            data_text = item.get_text(strip=True)
            data_list.append(data_text)

    if to_append == "Sex" or to_append == "Age":
        cdc_data[to_append] = data_list

df_sex_list = [['Male'], ['Female']]

male_number = cdc_data["Sex"][0][cdc_data["Sex"][0].index("(") + 1:cdc_data["Sex"][0].index("%")]
female_number = cdc_data["Sex"][1][cdc_data["Sex"][1].index("(") + 1:cdc_data["Sex"][1].index("%")]

df_sex_list[0].append(male_number)
df_sex_list[1].append(female_number)

df_age_list = []
for age in cdc_data["Age"]:
    age_range = age.index("aged")
    years_range = age.index("years")
    final_ages = age[age_range + 5:years_range - 1]
    
    bracket_index = age.index("(")
    percentage_index = age.index("%")
    final_percent_age = age[bracket_index + 1:percentage_index]
    df_age_list.append([final_ages, final_percent_age])

cdc_sex_data = pd.DataFrame(df_sex_list)
cdc_sex_data = cdc_sex_data.set_axis(['Sex', '2022 (%)'], axis=1)

for row in df_age_list:
    if "–" in row[0]:
        dash_index = row[0].index("–")
        row[0] = [row[0][:dash_index], row[0][dash_index+1:]]
    else:
        row[0] = [row[0]]

cdc_age_data = pd.DataFrame(df_age_list)
cdc_age_data = cdc_age_data.set_axis(['Age Range', '2022 (%)'], axis=1)

In [18]:
# Source 1
url = "https://www.abs.gov.au/statistics/health/health-conditions-and-risks/smoking-and-vaping/latest-release"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
url_selector_table = Selector(text = response.content).xpath("//table")[1]

html_body = url_selector_table.xpath(".//tbody//tr")
html_header = parse_row(url_selector_table.xpath(".//thead//tr")[0])

row_list = []
for row in html_body:
    row_list.append(parse_row(row))

html_header[0] = "Age Range"

for row in row_list:
    if "–" in row[0]:
        dash_index = row[0].index("–")
        row[0] = [row[0][:dash_index], row[0][dash_index+1:]]
    else:
        years_index = row[0].index("years")
        row[0] = [row[0][:years_index-1]]

abs_data_df = pd.DataFrame(row_list, columns = html_header)
abs_data = abs_data_df[["Age Range", "2022 (%)"]]

In [19]:
# Create new columns, and impute missing values into Smoke

df['abs_smoke'] = df['age'].apply(lambda x: age_to_smoke_percent(abs_data, x))
df['cdc_smoke'] = df.apply(lambda x: cdc_age_sex_smoke(cdc_age_data, x['age'], cdc_sex_data, x['sex']), axis=1)

df['abs_smoke'] = pd.to_numeric(df['abs_smoke'])

# Transformation: If >= 12%, then classify as smoker, otherwise classify as non-smoker
df["abs_transformed"] = df["abs_smoke"].apply(lambda x: smoke_transform(x))
df["cdc_transformed"] = df["cdc_smoke"].apply(lambda x: smoke_transform(x))

df["smoke"] = df.apply(impute_smoke_transform, axis=1)

In [20]:
# More cleaning
for col in df.columns:
    for row in df[col]:
        if not pd.isna(row):
            row = float(row)

# FROM HW1
# Binary data (1 or 0)
binary_cols = []
for col in df.columns:
    if df[col].isin([0, 1, np.nan]).all():
        binary_cols.append(col)

# Categorical data
category_cols = ["cp", "slope"]

# Numberical data
number_cols = []
for col in df.columns:
    if col not in binary_cols and col not in category_cols:
        number_cols.append(col)
        
# Delete negative values
for col in number_cols:
    if (df[col] < 0).any():
            df = df[df[col] >= 0]

# Binary and categorical cleaning: impute using the mode
for col in binary_cols:
    mode_object = df[col].mode()
    df[col].fillna(mode_object[0], inplace=True)
    
for col in category_cols:
    mode_object = df[col].mode()
    df[col].fillna(mode_object[0], inplace=True)
    
# Numerical cleaning: impute using mean
for col in number_cols:
    df[col].fillna(df[col].mean(), inplace=True)

# 3. Split Data into Train and Test

In [21]:
columns = ['age', 'sex', 'painloc', 'painexer', 'cp', 'trestbps', 'smoke', 'fbs', 'prop', 'nitr', 'pro', 'diuretic', 'thaldur', 'thalach', 'exang', 'oldpeak', 'slope', 'target']
df = df[columns]

y = df['target']
X = df.drop(columns="target")

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 100, test_size = 0.1, stratify = y)

# 4. Train binary classification models on data

In [22]:
random_model = RandomForestClassifier(random_state = 100)
logistic_model = LogisticRegression(random_state = 100, max_iter = 1000)
svc_model = SVC(random_state = 100, probability = True)

model_dict = {"Random Forest": random_model, "Logistic Regression": logistic_model, "SVC": svc_model}

pipeline = Pipeline([('scaler', StandardScaler()), ('imputer', SimpleImputer(strategy='mean'))])

X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

model_results = {}

for model in model_dict:
    model_dict[model].fit(X_train, y_train)
    model_results[model] = evaluate_train_model(X_train, y_train, model_dict[model])

# 5. Select Final Model

In [23]:
print("Best Model: ", max(model_results, key = lambda x: model_results[x]['roc_auc']))
best_model = model_dict[max(model_results, key = lambda x: model_results[x]['roc_auc'])]
best_model.fit(X_train, y_train)
y_prediction = best_model.predict(X_test)
y_prediction_probability = best_model.predict_proba(X_test)
y_prediction_probability = y_prediction_probability[:, 1]

roc_test = roc_auc_score(y_test, y_prediction_probability)
print("ROC AUC Score Test: ", roc_test)
f1_test = f1_score(y_test, y_prediction)
print("F1 Score Test: ", f1_test)
accu_test = accuracy_score(y_test, y_prediction)
print("Accuracy Score Test: ", accu_test)
confusion_matrix_result = confusion_matrix(y_test, y_prediction)
print("Confusion Matrix: ", confusion_matrix_result)

Best Model:  Logistic Regression
ROC AUC Score Test:  0.8141463414634147
F1 Score Test:  0.7843137254901961
Accuracy Score Test:  0.7582417582417582
Confusion Matrix:  [[29 12]
 [10 40]]
