<a href="https://colab.research.google.com/github/chanpaulamol/logistic_regression_spark/blob/main/heart_disease_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Members: Lelyta, Noora, Chan
#### Project: Logistic Regression using Spark to predict heart disease
#### Course: Big Data

In [None]:
# Import the Drive helper from google.colab (available only inside a Colab runtime)
from google.colab import drive

# Mount Google Drive at /content/drive so the dataset CSV stored there can be read
drive.mount('/content/drive')

Mounted at /content/drive


### Install PySpark (uncomment the command in the next cell if it is not already installed)

In [None]:
# Uncomment the line below if pyspark is not already installed in this runtime.
# The version is pinned to 3.4.0, the release recorded in this notebook's saved output.
# %pip install pyspark==3.4.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=015520ca3736b8f39449499a142a63b8e229cec33f57c41c479edf1276036545
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


### Import Libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### Create Spark Session

In [None]:
# Reuse the active SparkSession if one exists; otherwise start a new one with default settings
spark = SparkSession.builder.getOrCreate()

### Load data

In [None]:
# Location of the heart-disease CSV on the mounted Google Drive
csv_path = '/content/drive/MyDrive/spark_dataset/heart_dataset.csv'

# Load the CSV: the first row supplies column names and column types are inferred
data = spark.read.csv(
    csv_path,
    header=True,
    inferSchema=True,
)

### Preview data

In [None]:
# Print the first rows of the loaded DataFrame as a quick check of columns and values
data.show()

+---+---+---------+---------+-----------+---------+-------+-----+--------------+------------+-----+------------+-----------+------+
|Age|Sex|ChestPain|RestingBP|Cholesterol|FastingBS|RestECG|MaxHR|ExerciseAngina|STDepression|Slope|MajorVessels|Thalassemia|Target|
+---+---+---------+---------+-----------+---------+-------+-----+--------------+------------+-----+------------+-----------+------+
| 52|  1|        0|      125|        212|        0|      1|  168|             0|         1.0|    2|           2|          3|     0|
| 53|  1|        0|      140|        203|        1|      0|  155|             1|         3.1|    0|           0|          3|     0|
| 70|  1|        0|      145|        174|        0|      1|  125|             1|         2.6|    0|           0|          3|     0|
| 61|  1|        0|      148|        203|        0|      1|  161|             0|         0.0|    2|           1|          3|     0|
| 62|  0|        0|      138|        294|        1|      1|  106|           

### Feature Extraction

In [None]:
# Columns used as model inputs: every column of the dataset except the 'Target' label
feature_columns = ['Age', 'Sex', 'ChestPain', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestECG',
                   'MaxHR', 'ExerciseAngina', 'STDepression', 'Slope', 'MajorVessels', 'Thalassemia']

# Create a VectorAssembler to combine the input features into a single vector column
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Assemble into a NEW DataFrame instead of overwriting `data`. Reassigning `data` made this
# cell non-idempotent: running it a second time failed because the 'features' output column
# already existed on the input.
assembled_data = assembler.transform(data)

# Create a StandardScaler to scale the features. The defaults are spelled out for clarity:
# withStd=True scales each feature to unit standard deviation, and withMean=False leaves
# the values uncentered.
scaler = StandardScaler(inputCol='features', outputCol='scaled_features',
                        withMean=False, withStd=True)

# Fit the scaler and apply it to the assembled data.
# NOTE(review): the scaler statistics are computed from all rows, i.e. before the
# train/test split performed later on `selected_data`, so test rows influence the scaling.
# Consider splitting first and fitting the scaler on the training rows only.
scaled_data = scaler.fit(assembled_data).transform(assembled_data)

# Keep only the scaled feature vector and the label column for modelling
selected_data = scaled_data.select('scaled_features', 'Target')

# Show the resulting DataFrame
selected_data.show()

+--------------------+------+
|     scaled_features|Target|
+--------------------+------+
|[5.73173902764403...|     0|
|[5.84196477817564...|     0|
|[7.71580253721311...|     0|
|[6.72377078242857...|     0|
|[6.83399653296019...|     0|
|(13,[0,3,4,7,9,10...|     1|
|[6.39309353083372...|     0|
|[6.06241627923887...|     0|
|[5.07038452445433...|     0|
|[5.95219052870726...|     0|
|[7.82602828774473...|     1|
|[4.73970727285948...|     0|
|[3.74767551807494...|     1|
|[5.62151327711241...|     0|
|[5.73173902764403...|     0|
|[3.74767551807494...|     1|
|[5.62151327711241...|     1|
|[5.95219052870726...|     0|
|[5.5112875265808,...|     1|
|[6.39309353083372...|     1|
+--------------------+------+
only showing top 20 rows



## Split data

In [None]:
# Randomly partition the rows with weights 0.7 (training) and 0.3 (testing); seed=123 fixes the sampling seed
splits = selected_data.randomSplit([0.7, 0.3], seed=123)
train_data, test_data = splits

# Preview each partition under its own heading
for heading, subset in (("Training Data:", train_data), ("Testing Data:", test_data)):
    print(heading)
    subset.show()

Training Data:
+--------------------+------+
|     scaled_features|Target|
+--------------------+------+
|(13,[0,1,3,4,7,10...|     0|
|(13,[0,1,3,4,7,10...|     0|
|(13,[0,1,3,4,7,10...|     1|
|(13,[0,1,3,4,7,10...|     1|
|(13,[0,1,3,4,7,10...|     1|
|(13,[0,1,3,4,7,10...|     1|
|(13,[0,1,3,4,7,10...|     1|
|(13,[0,1,3,4,7,10...|     1|
|(13,[0,1,3,4,7,10...|     1|
|(13,[0,2,3,4,7,10...|     1|
|(13,[0,2,3,4,7,10...|     1|
|(13,[0,2,3,4,7,10...|     1|
|(13,[0,2,3,4,7,10...|     1|
|(13,[0,2,3,4,7,10...|     1|
|(13,[0,2,3,4,7,10...|     1|
|(13,[0,3,4,6,7,10...|     1|
|(13,[0,3,4,6,7,10...|     1|
|(13,[0,3,4,6,7,10...|     1|
|(13,[0,3,4,6,7,10...|     1|
|(13,[0,3,4,6,7,10...|     1|
+--------------------+------+
only showing top 20 rows

Testing Data:
+--------------------+------+
|     scaled_features|Target|
+--------------------+------+
|(13,[0,1,3,4,7,10...|     0|
|(13,[0,1,3,4,7,10...|     1|
|(13,[0,1,3,4,7,10...|     1|
|(13,[0,2,3,4,7,10...|     1|
|(13,[0,2,3,4,7

## Train Model

In [None]:
# Configure the logistic-regression estimator: inputs are the scaled feature vectors,
# and the label to predict is the 'Target' column
model = LogisticRegression(
    featuresCol='scaled_features',
    labelCol='Target',
)

# Fit the estimator on the training partition to obtain the trained model
lr_model = model.fit(train_data)

# Score the held-out partition; the result gains the rawPrediction, probability
# and prediction columns
predictions = lr_model.transform(test_data)

# Display the first five scored rows
predictions.show(n=5)

+--------------------+------+--------------------+--------------------+----------+
|     scaled_features|Target|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(13,[0,1,3,4,7,10...|     0|[-0.8471861079950...|[0.30002346852686...|       1.0|
|(13,[0,1,3,4,7,10...|     1|[-1.9392014945617...|[0.12573560692610...|       1.0|
|(13,[0,1,3,4,7,10...|     1|[-1.1538191161639...|[0.23979219741835...|       1.0|
|(13,[0,2,3,4,7,10...|     1|[-4.8386138646432...|[0.00785581934885...|       1.0|
|(13,[0,2,3,4,7,10...|     1|[-4.8386138646432...|[0.00785581934885...|       1.0|
+--------------------+------+--------------------+--------------------+----------+
only showing top 5 rows



## Model Evaluation

In [None]:
# A single evaluator is reused for every metric; only its metric name changes between runs
evaluator = MulticlassClassificationEvaluator(labelCol='Target', predictionCol='prediction')

# Accuracy on the test-set predictions
accuracy = evaluator.setMetricName('accuracy').evaluate(predictions)
print("Accuracy: {:.2f}".format(accuracy))

# Precision and recall are requested as the evaluator's 'weightedPrecision' and
# 'weightedRecall' metrics, and the F1 score as its 'f1' metric
precision = evaluator.setMetricName('weightedPrecision').evaluate(predictions)
recall = evaluator.setMetricName('weightedRecall').evaluate(predictions)
f1_score = evaluator.setMetricName('f1').evaluate(predictions)

# Report the remaining metrics, each rounded to two decimals
for template, score in (("Precision: {:.2f}", precision),
                        ("Recall: {:.2f}", recall),
                        ("F1-Score: {:.2f}", f1_score)):
    print(template.format(score))

Accuracy: 0.86
Precision: 0.87
Recall: 0.86
F1-Score: 0.86
