# Step 7: Data Mining

# Random Forest Model

In [None]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
# NOTE: findspark.init() is deliberately called before `import pyspark`; it is given the
# local Spark installation path (presumably so that pyspark can be located) - keep this order.
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
# Build the session for this notebook under its own application name.
spark = SparkSession.builder.appName('DBAS-Step7-DM-RandomForest').getOrCreate()

# Enable Arrow-based data transfer for conversions between Spark and pandas.
# NOTE(review): this setting is the Arrow optimization flag; it does not by itself
# switch on the pandas-on-Spark API used in the next cell.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [None]:
## Read the heart-failure CSV: first row is the header, column types are inferred
spk_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Data/4DT/heart_failure_dataset_4DT.csv")
)

# Expose the same data through the pandas-on-Spark API and print its column summary
spkpd_df = spk_df.to_pandas_on_spark()
spkpd_df.info()


In [None]:
### --------  06-DMA

# Load relevant algorithms for Random Forest model

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import matplotlib.pyplot as plt

In [None]:
# Build the MLlib input: every column except the DEATH_EVENT label is a feature,
# and all of them are packed into a single "features" vector column
feature_cols = [name for name in spk_df.columns if name != 'DEATH_EVENT']
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
df_assembled = assembler.transform(spk_df)

# 70/30 train/test split; the fixed seed keeps the split repeatable
train_data, test_data = df_assembled.randomSplit([0.7, 0.3], seed=42)

# Fit a 100-tree Random Forest on the training portion
clf = RandomForestClassifier(
    featuresCol="features",
    labelCol="DEATH_EVENT",
    numTrees=100,
    seed=42,
)
model = clf.fit(train_data)


In [None]:
# Score the held-out test set with the trained model
predictions = model.transform(test_data)

# Accuracy = share of test rows whose "prediction" equals the DEATH_EVENT label
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="DEATH_EVENT",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy of Random Forest: {accuracy*100:.2f}%")

In [None]:
# Get the feature importances of the trained Random Forest as a plain array.
# NOTE(review): pairing this array with feature_cols assumes every feature column is a
# scalar, so that the assembled vector has exactly one slot per column - confirm.
feature_importances = model.featureImportances.toArray()

# Sort (feature, importance) pairs from most to least important and print them
important_features = sorted(zip(feature_cols, feature_importances), key=lambda x: x[1], reverse=True)
# The model is a RandomForestClassifier, so the report is labelled "Random Forest"
print("Feature importance using Random Forest:")
for feature, importance in important_features:
    print(f"{feature}: {importance:.4f}")

# Plot the sorted importances as a horizontal bar chart
features, importances = zip(*important_features)
plt.figure(figsize=(10, 6))
plt.barh(features, importances, align='center', color='skyblue')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.title('Feature Importance using Random Forest')
plt.gca().invert_yaxis()  # Display the most important feature at the top
plt.show()


In [None]:
# Stop the Spark session now that training, evaluation and plotting are finished
spark.stop()