In [None]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import os

import findspark
# Use the SPARK_HOME environment variable when it is set so the notebook is not
# tied to one machine's directory layout; otherwise fall back to the original
# install location.
findspark.init(os.environ.get("SPARK_HOME") or '/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('DBAS-Step4-Data Transformation').getOrCreate()

# Turn on the Arrow setting used when converting between Spark and pandas data
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

print(pyspark.__version__)


In [None]:
## Read the source CSV: first row is the header, column types are inferred
spk_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Data/3DP/heart_failure_dataset_3DP.csv")
)

# Expose the Spark DataFrame through the pandas-on-Spark API and print its summary
spkpd_df = spk_df.to_pandas_on_spark()
spkpd_df.info()




# ### ---------- 04-DT



#  Balancing 

In [None]:

### ---------- 04-DT
# Add any transformation steps

from pyspark.sql import functions as F
import matplotlib.pyplot as plt


# Pie plot of the class distribution before balancing.
# The row order returned by groupBy().count() is not guaranteed, so each count
# is looked up by its actual DEATH_EVENT value instead of by list position.
death_event_counts = spk_df.groupBy("DEATH_EVENT").count().collect()
counts_by_class = {
    int(row["DEATH_EVENT"]): row["count"]
    for row in death_event_counts
    if row["DEATH_EVENT"] is not None
}

# NOTE(review): assumes DEATH_EVENT == 1 means the patient died and 0 means the
# patient survived -- confirm against the dataset description.
class_names = {1: "Deceased", 0: "Alive"}
sizes = [counts_by_class.get(value, 0) for value in class_names]
labels = [f"{status} ({size})" for status, size in zip(class_names.values(), sizes)]

# Use matplotlib
colors = ['yellowgreen', 'lightcoral']
explode = (0.1, 0)  # explode 1st slice ("Deceased") for emphasis

plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)

plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Distribution of DEATH_EVENT before balancing')
plt.show()


In [None]:
# Balance the classes by oversampling: split the rows by DEATH_EVENT value
df_high = spk_df.where(F.col("DEATH_EVENT") == 0)
df_low = spk_df.where(F.col("DEATH_EVENT") == 1)

# Sample the DEATH_EVENT == 1 rows with replacement, using the ratio of the two
# class sizes as the sampling fraction, then append them to the other class
n_high = df_high.count()
n_low = df_low.count()
oversample_fraction = float(n_high) / n_low
df_low_boost = df_low.sample(withReplacement=True, fraction=oversample_fraction, seed=42)
spk_df_boosted = df_high.union(df_low_boost)


In [None]:
# Pie plot of the class distribution after balancing

spk_df = spk_df_boosted

# The row order returned by groupBy().count() is not guaranteed, so each count
# is looked up by its actual DEATH_EVENT value instead of by list position.
death_event_counts = spk_df.groupBy("DEATH_EVENT").count().collect()
counts_by_class = {
    int(row["DEATH_EVENT"]): row["count"]
    for row in death_event_counts
    if row["DEATH_EVENT"] is not None
}

# NOTE(review): assumes DEATH_EVENT == 1 means the patient died and 0 means the
# patient survived -- confirm against the dataset description.
class_names = {1: "Deceased", 0: "Alive"}
sizes = [counts_by_class.get(value, 0) for value in class_names]
labels = [f"{status} ({size})" for status, size in zip(class_names.values(), sizes)]

# Use matplotlib for the pie plot
colors = ['yellowgreen','lightcoral']
explode = (0.1, 0)  # explode 1st slice ("Deceased") for emphasis

plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)

plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Distribution of DEATH_EVENT After balancing')
plt.show()


# Reducing Data 

In [None]:
#### -- Reducing data

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.linalg import DenseVector
from pyspark.sql.functions import col
import pandas as pd

# Cast the 'DEATH_EVENT' label column to integer
spk_df = spk_df.withColumn("DEATH_EVENT", col("DEATH_EVENT").cast("int"))

# Prepare data for MLlib: every column except the label and 'Age_Level' is a feature.
# The loop variable is not named `col` so it cannot be confused with the
# pyspark.sql.functions.col imported above.
feature_cols = [name for name in spk_df.columns if name not in ['DEATH_EVENT', 'Age_Level']]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_assembled = assembler.transform(spk_df)

# Fit a RandomForest model.
# A fixed seed keeps the importances -- and therefore the columns selected for
# removal from them -- the same on every run.
clf = RandomForestClassifier(numTrees=100, labelCol="DEATH_EVENT", featuresCol="features", seed=42)
model = clf.fit(df_assembled)

# Get feature importances (one value per entry of feature_cols, in the same order)
feature_importances = model.featureImportances.toArray()

# Create a DataFrame for visualization
importance_df = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': feature_importances
})

# Sort the features based on importance, most important first
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importances
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance from Random Forest')
plt.gca().invert_yaxis()  # Highest importance at the top
plt.show()


In [None]:

# Select the last tail_k features of importance_df
tail_k = 5
tail_features = importance_df['Feature'].tail(tail_k).tolist()
print("tail_features:")
print(" ",tail_features)

# Also drop these 4 features,
# which are highly related to survival prediction but are not our objectives.

ignore_list = ['time','follow_up_month','age','Age_Level']
print("ignore_list:")
print(" ", ignore_list)

# A feature can appear in both lists; dict.fromkeys removes the repeats while
# keeping first-seen order, so the printed list names each column once.
drop_list = list(dict.fromkeys(tail_features + ignore_list))
print("drop_list:")
print(" ",drop_list)



In [None]:
# Remove every column named in drop_list and make the reduced frame the
# working DataFrame for the following cells
df_reduced = spk_df.drop(*drop_list)
spk_df = df_reduced

# Preview the remaining rows, then print the remaining schema
df_reduced.show()
spk_df.printSchema()

In [None]:
# Convert the current Spark DataFrame to a pandas-on-Spark DataFrame using
# to_pandas_on_spark(), then print its info() summary
spkpd_df = spk_df.to_pandas_on_spark()
spkpd_df.info()

In [None]:
# Save to CSV (coalesce(1) reduces the DataFrame to a single partition before writing).
# mode("overwrite") replaces any earlier output in Data/4DT, so re-running the
# notebook does not fail because the path already exists.
spk_df.coalesce(1).write.mode("overwrite").csv("Data/4DT", header=True)

In [None]:
# Stop the Spark session created at the top of the notebook
spark.stop()