In [6]:
# Install Java, Spark, and Findspark to initiate Spark in Python
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://apache.osuosl.org/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!wget https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

!tar xf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

# Start a Spark session
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("HeartAttackPrediction").getOrCreate()


--2024-04-16 23:29:47--  https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
Resolving archive.apache.org (archive.apache.org)... 65.108.204.189, 2a01:4f9:1a:a084::2
Connecting to archive.apache.org (archive.apache.org)|65.108.204.189|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 224374704 (214M) [application/x-gzip]
Saving to: ‘spark-3.1.1-bin-hadoop2.7.tgz’


2024-04-16 23:29:58 (19.8 MB/s) - ‘spark-3.1.1-bin-hadoop2.7.tgz’ saved [224374704/224374704]



In [5]:
!tar -xvzf spark-3.1.1-bin-hadoop2.7.tgz



tar (child): spark-3.1.1-bin-hadoop2.7.tgz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now


In [7]:
# Load the data from a CSV file
df = spark.read.csv('/content/heart_prediction_cleaned.csv', header=True, inferSchema=True)

# Display the first few rows of the dataset to understand its structure
df.show(5)

# Print the schema to check data types
df.printSchema()


+---+------+-------------+------------------+----------------+------------------+---------+---------+----------+---------+----------------+-----------+-----------------+------------+---------+---------------------+------------------+--------------+----------------+---------------+--------------+
|_c0|   Sex|GeneralHealth|PhysicalHealthDays|MentalHealthDays|PhysicalActivities|HadAngina|HadStroke|SleepHours|HadAsthma|HadKidneyDisease|HadDiabetes|DifficultyWalking|SmokerStatus|ChestScan|RaceEthnicityCategory|       AgeCategory|HeightInMeters|             BMI|AlcoholDrinkers|HadHeartAttack|
+---+------+-------------+------------------+----------------+------------------+---------+---------+----------+---------+----------------+-----------+-----------------+------------+---------+---------------------+------------------+--------------+----------------+---------------+--------------+
|  1|Female|    Very good|                 0|               0|                 0|        0|        0|        

In [8]:
# Import necessary libraries for data preprocessing
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Index the categorical target variable 'HadHeartAttack'
indexer = StringIndexer(inputCol="HadHeartAttack", outputCol="label")

# Assemble numeric features into a single vector column
numeric_features = [t[0] for t in df.dtypes if t[1] != 'string' and t[0] != 'HadHeartAttack']
assembler = VectorAssembler(inputCols=numeric_features, outputCol="features")

# Define a pipeline to apply transformations
pipeline = Pipeline(stages=[indexer, assembler])
data_transformed = pipeline.fit(df).transform(df)


In [9]:
# Randomly split data into training and testing sets
train_data, test_data = data_transformed.randomSplit([0.7, 0.3])


In [10]:
# Import Decision Tree Classifier
from pyspark.ml.classification import DecisionTreeClassifier

# Create a Decision Tree model
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label')

# Train the model
model = dt.fit(train_data)


In [11]:
# Predictions
predictions = model.transform(test_data)

# Import evaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the model using accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Model Accuracy: {accuracy}")

# Optionally, get detailed report (e.g., Confusion Matrix)
from pyspark.mllib.evaluation import MulticlassMetrics
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
print(f"Confusion Matrix:\n{metrics.confusionMatrix().toArray()}")


Model Accuracy: 0.9483962078505366
Confusion Matrix:
[[81231.   601.]
 [ 3857.   700.]]
