# Step 7: Data Mining

# Logistic Regression

In [None]:

# Must be included at the beginning of each new notebook. Remember to change the app name.
import os

import findspark

# Prefer an existing SPARK_HOME so the notebook is not tied to one machine's
# directory layout; fall back to the original install location when the
# variable is unset or empty.
SPARK_HOME = os.environ.get("SPARK_HOME") or '/home/ubuntu/spark-3.2.1-bin-hadoop2.7'
findspark.init(SPARK_HOME)
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('DBAS-Step7-DM-LogisticRegression').getOrCreate()

# Enable Arrow-based data transfer between Spark and pandas
# (this setting speeds up Spark <-> pandas conversions; it is not what turns
# on the pandas-on-Spark API itself).
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [None]:
## Load data from csv file
# header=True takes the column names from the first row; inferSchema=True asks
# Spark to infer column types from the data.
spk_df = spark.read.csv("Data/4DT/heart_failure_dataset_4DT.csv", header=True, inferSchema=True)
# printSchema is a method: without the call parentheses the statement only
# evaluates the bound method object and the schema is never printed.
spk_df.printSchema()

# Convert Spark DataFrame to pandas-on-Spark DataFrame using to_pandas_on_spark()
spkpd_df = spk_df.to_pandas_on_spark()
spkpd_df.info()


In [None]:
### --------  06-DMA
# Load relevant algorithms for the Logistic Regression model

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression


In [None]:
# Columns that are combined into the single vector column read by the model.
feature_cols = [
    "creatinine_phosphokinase",
    "ejection_fraction",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
]

# Use VectorAssembler to transform features into a single feature vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Keep the assembled frame under its own name instead of overwriting spk_df:
# re-running this cell against an already-assembled spk_df fails because the
# "features" output column would already exist.
assembled_df = assembler.transform(spk_df)

# Split the dataset into training and testing sets.
# A fixed seed keeps the split (and therefore the fitted model and the
# reported metrics) repeatable across runs on the same input.
SPLIT_SEED = 42
train_data, test_data = assembled_df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Create and train a logistic regression model
lr = LogisticRegression(featuresCol="features", labelCol="DEATH_EVENT")
model = lr.fit(train_data)

In [None]:
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
)

# Predict using model
predictions = model.transform(test_data)

# BinaryClassificationEvaluator's default metric is the area under the ROC
# curve, not accuracy, so request it explicitly and report it under its own name.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="DEATH_EVENT",
                                          metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"Area under ROC: {auc}")

# Evaluate accuracy: the fraction of test rows whose predicted label matches
# DEATH_EVENT, computed from the model's "prediction" column.
accuracy_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                       labelCol="DEATH_EVENT",
                                                       metricName="accuracy")
accuracy = accuracy_evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Get the coefficients from the model
coefficients = model.coefficients.toArray()

# Take the feature names from the assembler that built the "features" vector,
# so names and coefficients stay in the same order even if the input column
# list is edited later (a second hard-coded copy of the list could drift).
feature_names = assembler.getInputCols()

# Create a DataFrame to display features and their corresponding coefficients
feature_importance = pd.DataFrame({"Feature": feature_names,
                                   "Coefficient": coefficients})

# Sort the features based on the absolute value of coefficients
# NOTE(review): no feature scaling is applied before fitting in this notebook,
# so coefficient magnitudes reflect each feature's units; treat this ranking
# as indicative rather than as a scale-free importance measure.
feature_importance = feature_importance.sort_values(by="Coefficient", key=abs, ascending=False)
print(feature_importance)

# Visualize the importance of features
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance["Feature"], feature_importance["Coefficient"], color='skyblue')
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance based on Coefficients')
ax.invert_yaxis()  # To display the most important feature at the top
plt.show()

In [None]:
# Stop Spark session
# Shuts down the session created at the top of the notebook; cells that use
# `spark` after this point need the session to be created again first.
spark.stop()