In [1]:
# Install Java and Spark dependencies
!apt-get install openjdk-11-jdk-headless -qq > /dev/null

# Download Spark from the Apache archive
!wget https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop2.7.tgz

# Unpack the Spark tar file
!tar xf spark-3.2.1-bin-hadoop2.7.tgz

# Install PySpark and FindSpark
!pip install -q findspark pyspark

# Setting environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop2.7"

# Initialize FindSpark
import findspark
findspark.init()

# Start a Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("My Spark App").getOrCreate()

# Test to ensure Spark session is active
print(spark.sparkContext)

--2024-04-16 23:16:50--  https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop2.7.tgz
Resolving archive.apache.org (archive.apache.org)... 65.108.204.189, 2a01:4f9:1a:a084::2
Connecting to archive.apache.org (archive.apache.org)|65.108.204.189|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 272637746 (260M) [application/x-gzip]
Saving to: ‘spark-3.2.1-bin-hadoop2.7.tgz’


2024-04-16 23:17:15 (10.7 MB/s) - ‘spark-3.2.1-bin-hadoop2.7.tgz’ saved [272637746/272637746]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
<SparkContext master=local[*] appName=My Spark App>


In [2]:
# Load the dataset
df = spark.read.csv('/content/heart_prediction_cleaned.csv', header=True, inferSchema=True)
df.show(5)


+---+------+-------------+------------------+----------------+------------------+---------+---------+----------+---------+----------------+-----------+-----------------+------------+---------+---------------------+------------------+--------------+----------------+---------------+--------------+
|_c0|   Sex|GeneralHealth|PhysicalHealthDays|MentalHealthDays|PhysicalActivities|HadAngina|HadStroke|SleepHours|HadAsthma|HadKidneyDisease|HadDiabetes|DifficultyWalking|SmokerStatus|ChestScan|RaceEthnicityCategory|       AgeCategory|HeightInMeters|             BMI|AlcoholDrinkers|HadHeartAttack|
+---+------+-------------+------------------+----------------+------------------+---------+---------+----------+---------+----------------+-----------+-----------------+------------+---------+---------------------+------------------+--------------+----------------+---------------+--------------+
|  1|Female|    Very good|                 0|               0|                 0|        0|        0|        

In [3]:
# Install Spark and setup environment
!pip install pyspark
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("HeartAttackPrediction_cleaned").getOrCreate()

# Load data
df = spark.read.csv('/content/heart_prediction_cleaned.csv', header=True, inferSchema=True)
df.show(5)

# Check for null values and data types
df.printSchema()

# Preprocessing: Indexing, handling categorical columns, assembling features
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Indexing the target column
indexer = StringIndexer(inputCol="HadHeartAttack", outputCol="label")

# Assemble numeric features
numeric_features = [t[0] for t in df.dtypes if t[1] != 'string' and t[0] != 'HadHeartAttack']
assembler = VectorAssembler(inputCols=numeric_features, outputCol="features")

# Pipeline for preprocessing
pipeline = Pipeline(stages=[indexer, assembler])

data_transformed = pipeline.fit(df).transform(df)

# Split data into training and testing
train_data, test_data = data_transformed.randomSplit([0.7, 0.3])

# Random Forest model
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol='features', labelCol='label')

# Train the model
model = rf.fit(train_data)

# Predictions
predictions = model.transform(test_data)

# Evaluate the model
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})

print(f"Model Accuracy (Area Under ROC): {accuracy}")




+---+------+-------------+------------------+----------------+------------------+---------+---------+----------+---------+----------------+-----------+-----------------+------------+---------+---------------------+------------------+--------------+----------------+---------------+--------------+
|_c0|   Sex|GeneralHealth|PhysicalHealthDays|MentalHealthDays|PhysicalActivities|HadAngina|HadStroke|SleepHours|HadAsthma|HadKidneyDisease|HadDiabetes|DifficultyWalking|SmokerStatus|ChestScan|RaceEthnicityCategory|       AgeCategory|HeightInMeters|             BMI|AlcoholDrinkers|HadHeartAttack|
+---+------+-------------+------------------+----------------+------------------+---------+---------+----------+---------+----------------+-----------+-----------------+------------+---------+---------------------+------------------+--------------+----------------+---------------+--------------+
|  1|Female|    Very good|                 0|               0|                 0|        0|        0|        

In [4]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col

# Evaluate the accuracy of the model
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator_accuracy.evaluate(predictions)
print(f"Accuracy: {accuracy}")

# Convert DataFrame to RDD to use MulticlassMetrics for confusion matrix
predictionAndLabels = predictions.select("prediction", "label").rdd

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Confusion matrix
confusion_matrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:")
print(confusion_matrix)


Accuracy: 0.9491554875929659




Confusion Matrix:
[[8.1802e+04 5.0000e+01]
 [4.3390e+03 1.3100e+02]]
