In [1]:
import boto3
from io import BytesIO
# import pandas as pd
import csv

import matplotlib.pyplot as plt
import seaborn as sns

import xml.etree.ElementTree as ET
import psycopg2

#required for navigating machine's directory
import glob
import os.path

import numpy as np

import warnings

from scrapy import Selector
import requests

import re
from typing import List

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType, LongType, StringType, DoubleType
from pyspark.sql.dataframe import DataFrame
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import VectorAssembler, Imputer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import pyspark.sql.functions as F
from pyspark.sql.functions import when, col
from itertools import combinations
import os

In [2]:
# Relative path of the folder that read_data() scans for *.csv files
DATA_FOLDER = "../data"

# Number of folds passed to the CrossValidator in pipeline()
NUMBER_OF_FOLDS = 3
# Seed used for both the cross-validator and the train/test randomSplit
SPLIT_SEED = 7576
# Weight of the training portion in randomSplit; the test portion gets 1 - TRAIN_TEST_SPLIT
TRAIN_TEST_SPLIT = 0.9

In [8]:
def main():
    """
    Run the whole workflow: load the CSV data, clean it, display it and train the model.

    NOTE: this relies on read_data, clean_data and pipeline, which are defined in
    cells further down the notebook; those cells must have been executed before
    this one is run.
    """
    # Create a Spark session
    spark = SparkSession.builder \
        .appName("Predict Heart Disease") \
        .getOrCreate()

    try:
        raw_data = read_data(spark)
        data = clean_data(raw_data)
        data.show()
        pipeline(data)
    finally:
        # Always release the Spark session, even when one of the stages above raises
        spark.stop()

main()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/21 17:27:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Exception ignored in: <function JavaWrapper.__del__ at 0x71c1f15657e0>          
Traceback (most recent call last):
  File "/tmp/demos/lib/python3.10/site-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'Imputer' object has no attribute '_java_obj'


+---+---+-------+--------+---+--------+---+----+----+---+--------+-------+-------+-----+-------+-----+------+-------+-------------------+
|age|sex|painloc|painexer| cp|trestbps|fbs|prop|nitr|pro|diuretic|thaldur|thalach|exang|oldpeak|slope|target|smoke_1|            smoke_2|
+---+---+-------+--------+---+--------+---+----+----+---+--------+-------+-------+-----+-------+-----+------+-------+-------------------+
| 63|  1|   null|    null|  1|     145|  1|   0|   0|  0|       0|   10.5|    150|    0|    2.3|    3|     0|  0.149|0.19325742574257426|
| 67|  1|   null|    null|  4|     160|  0|   1|   0|  0|       0|    9.5|    108|    1|    1.5|    2|     1|  0.087|0.10765346534653465|
| 67|  1|   null|    null|  4|     120|  0|   1|   0|  0|       0|    8.5|    129|    1|    2.6|    2|     1|  0.087|0.10765346534653465|
| 37|  1|   null|    null|  3|     130|  0|   1|   0|  0|       0|   13.0|    187|    0|    3.5|    3|     0|  0.109|0.16342574257425743|
| 41|  0|   null|    null|  2|    

## Get data from S3

In [None]:
# Let boto3 resolve credentials through its default provider chain
# (environment variables, shared credentials file, attached role, ...).
# Explicitly passing empty strings would override that chain with unusable
# credentials, and real keys must never be written into the notebook.
s3 = boto3.client('s3')


bucket_name = 'de300spring2024'
object_key = 'emily_kohlberg/hw/heart_disease.csv'

In [None]:
# Fetch the object from S3, read its whole body and decode it as UTF-8 text
csv_obj = s3.get_object(Bucket=bucket_name, Key=object_key)
body = csv_obj['Body']
csv_string = body.read().decode('utf-8')

In [None]:
# pandas is only needed by this cell and its import in the first cell is
# commented out, so import it here to avoid a NameError on `pd`.
import pandas as pd

raw_data = pd.read_csv(BytesIO(csv_string.encode()))
# Show a preview rather than the whole frame
raw_data.head()

## Get data from local CSV files

In [3]:
def read_data(spark: SparkSession) -> DataFrame:
    """
    Load every CSV file found in DATA_FOLDER into a single Spark DataFrame.

    The files have a header row, so column names are taken from it and Spark
    is asked to infer the column types.
    """
    csv_glob = os.path.join(DATA_FOLDER, "*.csv")
    reader = spark.read.format("csv").options(header="true", inferSchema="true")
    return reader.load(csv_glob)

## Cleaning

### Clean and Impute

In [4]:
def retain_cols(data: DataFrame) -> DataFrame:
    """Return `data` restricted to the columns used by the rest of the notebook."""
    return data.select([
        'age', 'sex', 'painloc', 'painexer', 'cp', 'trestbps', 'smoke',
        'fbs', 'prop', 'nitr', 'pro', 'diuretic', 'thaldur', 'thalach',
        'exang', 'oldpeak', 'slope', 'target',
    ])
    
def replace_out_of_range(data: DataFrame) -> DataFrame:
    """
    Bring out-of-range values back into the range accepted for each column.

    - painloc, painexer, fbs, prop, nitr, pro, diuretic, exang: clamped to [0, 1]
    - oldpeak: clamped to [0, 4]
    - trestbps: values below 100 are raised to 100
    - slope: values below 1 or above 3 are replaced with null
    Nulls are left as they are.
    """
    def clamped(name, low, high):
        # Column expression limiting `name` to the closed interval [low, high]
        value = col(name)
        return when(value < low, low).when(value > high, high).otherwise(value)

    zero_one_columns = ('painloc', 'painexer', 'fbs', 'prop',
                        'nitr', 'pro', 'diuretic', 'exang')
    for name in zero_one_columns:
        data = data.withColumn(name, clamped(name, 0, 1))

    data = data.withColumn('oldpeak', clamped('oldpeak', 0, 4))

    trestbps = col('trestbps')
    data = data.withColumn('trestbps', when(trestbps < 100, 100).otherwise(trestbps))

    slope = col('slope')
    data = data.withColumn('slope', when((slope < 1) | (slope > 3), None).otherwise(slope))
    return data
    
def replace_nulls_with_mean(data: DataFrame) -> DataFrame:
    """
    Replace nulls in each listed column with that column's mean.

    All means are computed in one aggregation, so a single Spark job is run
    instead of one per column. A column whose mean is null (it has no non-null
    values) is left untouched.

    NOTE(review): 'target' is part of the list, so a missing label would be
    filled with a fractional mean - confirm that this is intended.
    """
    columns_for_imputation = ['age', 'sex', 'painloc', 'painexer', 'cp', 'trestbps',
                     'fbs', 'prop', 'nitr', 'pro', 'diuretic', 'thaldur', 'thalach',
                     'exang', 'oldpeak', 'slope', 'target']

    # A global aggregation always yields exactly one row, holding one mean per column
    means = data.select(
        [F.mean(col(column)).alias(column) for column in columns_for_imputation]
    ).first()

    for column in columns_for_imputation:
        mean_value = means[column]
        if mean_value is not None:
            data = data.withColumn(column, when(col(column).isNull(), mean_value).otherwise(col(column)))
    return data

In [5]:
def smoke_1(data: DataFrame) -> DataFrame:
    """
    Fill null 'smoke_1' values with an age-based smoking rate scraped from the ABS page.

    The second table of the page is read; for every body row the first cell is
    used as the age-range label and the eleventh cell as the rate in percent.
    A row's rate applies to ages up to the upper bound of its range; the last
    row applies to every older age. The rate is stored as a fraction (percent / 100).

    Raises:
        requests.RequestException: if the page cannot be downloaded.
        IndexError: if the page does not contain the expected table.
    """
    url1 = 'https://www.abs.gov.au/statistics/health/health-conditions-and-risks/smoking-and-vaping/latest-release'
    # Fail fast on a hung connection or an HTTP error instead of parsing an error page
    response = requests.get(url1, timeout=30)
    response.raise_for_status()

    # get the HTML file as a string
    html_content = response.content

    # create a selector object
    full_sel = Selector(text=html_content)

    # select all tables in page -> returns a SelectorList object
    tables = full_sel.xpath('//table')
    smokers_by_age = tables[1]
    # get the rows
    rows = smokers_by_age.xpath('./tbody//tr')

    def parse_row_1(row:Selector) -> List[str]:
        '''
        Parses a html row into a list holding the text of its 1st and 11th cells
        '''
        cells = row.xpath('.//th | .//td')
        row_data = []

        for i, cell in enumerate(cells):
            if i == 0 or i == 10:
                cell_text = cell.xpath('normalize-space(.)').get()
                cell_text = re.sub(r'<.*?>', ' ', cell_text)  # Remove remaining HTML tags
                # if there are br tags, there will be some binary characters
                cell_text = cell_text.replace('\xa0', '')  # Remove \xa0 characters
                row_data.append(cell_text)

        return row_data

    table_data = [parse_row_1(row) for row in rows]

    def get_rate_1(age):
        '''
        Smoking rate (as a fraction) for `age`, or NaN when it cannot be determined
        '''
        try:
            age = int(age)
            # every row but the last has an upper age bound after the en dash
            for row in table_data[:-1]:
                cutoff = row[0].split('–')[1]
                if age <= int(cutoff):
                    return float(row[1]) / 100
            # the last row covers all remaining ages
            return float(table_data[-1][1]) / 100
        except (TypeError, ValueError, IndexError):
            # null/non-numeric age, unparsable cell or empty table
            return np.nan

    # Register the UDF
    get_rate_1_udf = F.udf(get_rate_1, DoubleType())

    data = data.withColumn('smoke_1', when(col('smoke_1').isNull(), get_rate_1_udf(col('age'))).otherwise(col('smoke_1')))

    return data

def smoke_2(data: DataFrame) -> DataFrame:
    """
    Fill null 'smoke_2' values with a smoking rate scraped from the CDC page.

    The first two <ul class="block-list"> elements of the page are read as the
    rates by gender and the rates by age group. For sex == 0 the age-group rate
    is used as is; for any other sex value it is scaled by the man/woman rate
    ratio. The rate is stored as a fraction (percent / 100).

    Raises:
        requests.RequestException: if the page cannot be downloaded.
        IndexError: if the page does not contain the expected lists.
    """
    url2 = 'https://www.cdc.gov/tobacco/data_statistics/fact_sheets/adult_data/cig_smoking/index.htm'
    # Fail fast on a hung connection or an HTTP error instead of parsing an error page
    response = requests.get(url2, timeout=30)
    response.raise_for_status()

    # Create a scrapy Selector from the response content
    selector = Selector(text=response.content)

    ul_sel_list = selector.xpath('//ul[@class="block-list"]')
    genders = ul_sel_list[0]
    ages = ul_sel_list[1]

    def clean_gender_percents(rows):
        '''
        Maps 'woman' / 'man' to the percentage found in the matching list item
        '''
        percents = {}
        for row in rows:
            gender = 'woman' if 'women' in row.split('(')[0] else 'man'
            percents[gender] = float(row.split('(')[1].split('%')[0])
        return percents

    def clean_age_percents(rows):
        '''
        Turns each list item into [upper age bound, percentage] without modifying `rows`
        '''
        cleaned = []
        last_index = len(rows) - 1
        for i, row in enumerate(rows):
            if i < last_index:
                age = int(row.split('–')[1].split(' ')[0])
            else:
                # the last item has no range; its age is the 8th space-separated token
                age = int(row.split(' ')[7])

            percent = float(row.split('(')[1].split('%')[0])
            cleaned.append([age, percent])
        return cleaned

    def parse_row_2(row:Selector) -> List[str]:
        '''
        Parses a html list into the texts of its items
        '''
        cells = row.xpath('./li')
        row_data = []

        for cell in cells:
            cell_text = cell.xpath('normalize-space(.)').get()
            cell_text = re.sub(r'<.*?>', ' ', cell_text)  # Remove remaining HTML tags
            # if there are br tags, there will be some binary characters
            cell_text = cell_text.replace('\xa0', '')  # Remove \xa0 characters
            row_data.append(cell_text)

        return row_data

    per_by_gender = clean_gender_percents(parse_row_2(genders))
    per_by_age = clean_age_percents(parse_row_2(ages))

    def get_rate_2(sex, age):
        '''
        Smoking rate (as a fraction) for `sex` and `age`, or NaN when it cannot be determined
        '''
        try:
            age = int(age)
            # the last age group covers every age above the previous bounds
            rate = per_by_age[-1][1]
            for cutoff, percent in per_by_age[:-1]:
                if age <= cutoff:
                    rate = percent
                    break
            if sex != 0:
                rate = rate * per_by_gender['man'] / per_by_gender['woman']
            return rate / 100
        except (TypeError, ValueError, IndexError, KeyError, ZeroDivisionError):
            # null/non-numeric age, empty age list or missing/zero gender rate
            return np.nan

    # Register the UDF
    get_rate_2_udf = F.udf(get_rate_2, DoubleType())

    data = data.withColumn('smoke_2', when(col('smoke_2').isNull(), get_rate_2_udf(col('sex'), col('age'))).otherwise(col('smoke_2')))

    return data 

def impute_smoke(data: DataFrame) -> DataFrame:
    """
    Replace 'smoke' with two imputed variants, 'smoke_1' and 'smoke_2'.

    Both new columns start as copies of 'smoke'; smoke_1() and smoke_2() then
    fill their nulls from their respective sources, and the original 'smoke'
    column is dropped.
    """
    smoke = F.col('smoke')
    seeded = data.withColumn('smoke_1', smoke).withColumn('smoke_2', smoke)
    imputed = smoke_2(smoke_1(seeded))
    return imputed.drop('smoke')

### Final Clean

In [6]:
def clean_data(data: DataFrame) -> DataFrame:
    """
    Apply the cleaning steps in order: column selection, range fixing, smoke imputation.

    replace_nulls_with_mean is currently left out of this sequence.
    """
    cleaning_steps = (retain_cols, replace_out_of_range, impute_smoke)
    for step in cleaning_steps:
        data = step(data)
    return data

## Prediction Model

In [7]:
def pipeline(data: DataFrame):
    """
    Train and evaluate a cross-validated random forest that predicts 'target'.

    Steps: drop rows without a label, mean-impute every numeric feature,
    assemble the imputed columns into a feature vector, tune maxDepth and
    numTrees with cross-validation on the training split, then print the
    area under the ROC curve on the test split together with the selected
    hyper-parameters.

    The label column is kept out of the features so that the model cannot
    see the value it is supposed to predict, and the evaluator reads the
    same label column the classifier is trained on.
    """
    label_col = "target"

    data = data.withColumn("age", data["age"].cast(IntegerType()))

    # Rows without a label can be used neither for training nor for evaluation
    data = data.filter(col(label_col).isNotNull())

    numeric_types = (DoubleType, FloatType, IntegerType, LongType)
    numeric_features = [f.name for f in data.schema.fields if isinstance(f.dataType, numeric_types)]
    string_features = [f.name for f in data.schema.fields if isinstance(f.dataType, StringType)]

    print(numeric_features)
    print(string_features)

    # Only numeric columns can be imputed/assembled, and the label must not be a feature
    features = [name for name in numeric_features if name != label_col]

    imputed_columns = [f"Imputed{v}" for v in features]

    # Fill missing feature values with the column mean
    imputer = Imputer(inputCols=features, outputCols=imputed_columns, strategy="mean")

    # Assemble feature columns into a single feature vector
    assembler = VectorAssembler(
        inputCols=imputed_columns, 
        outputCol="features"
        )

    # Define a Random Forest classifier
    classifier = RandomForestClassifier(labelCol=label_col, featuresCol="features")

    # Create the pipeline
    ml_pipeline = Pipeline(stages=[imputer, assembler, classifier])
    
    # Set up the parameter grid for maximum tree depth and number of trees
    paramGrid = ParamGridBuilder() \
        .addGrid(classifier.maxDepth, [2, 4, 6, 8, 10]) \
        .addGrid(classifier.numTrees, [150, 200, 250, 500]) \
        .build()

    # Set up the cross-validator
    evaluator = BinaryClassificationEvaluator(labelCol=label_col, rawPredictionCol="rawPrediction", metricName="areaUnderROC")
    crossval = CrossValidator(
        estimator=ml_pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=NUMBER_OF_FOLDS,
        seed=SPLIT_SEED)

    # Split the data into training and test sets
    train_data, test_data = data.randomSplit([TRAIN_TEST_SPLIT, 1-TRAIN_TEST_SPLIT], seed=SPLIT_SEED)

    # Train the cross-validated pipeline model
    cvModel = crossval.fit(train_data)

    # Make predictions on the test data
    predictions = cvModel.transform(test_data)

    # Evaluate the model
    auc = evaluator.evaluate(predictions)
    print(f"Area Under ROC Curve: {auc:.4f}")

    # Get the best RandomForest model (last stage of the best pipeline)
    best_model = cvModel.bestModel.stages[-1]

    # Retrieve the selected maximum tree depth
    selected_max_depth = best_model.getOrDefault(best_model.getParam("maxDepth"))

    # Print the selected maximum tree depth
    print(f"Selected Maximum Tree Depth: {selected_max_depth}")

    # Retrieve the selected number of trees
    selected_num_trees = best_model.getOrDefault(best_model.getParam("numTrees"))

    # Print the selected number of trees
    print(f"Selected Number of Trees: {selected_num_trees}")