# Step 7: Data Mining

# Non-Linear SVM Model

In [1]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import os

import findspark

# Locate the Spark installation. SPARK_HOME takes precedence so the notebook
# is not tied to one machine's directory layout; the original hard-coded path
# is kept as the fallback, so nothing changes where SPARK_HOME is unset/empty.
findspark.init(os.environ.get("SPARK_HOME") or '/home/ubuntu/spark-3.2.1-bin-hadoop2.7')

# These imports must come after findspark.init(), which puts pyspark on sys.path.
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('DBAS-Step7-DM-Non-Linear-SVM').getOrCreate()

# Enable Arrow-based columnar data transfer between Spark and pandas
# (speeds up conversions such as toPandas()).
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/10 10:01:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
## Read the prepared heart-failure dataset (CSV with a header row; column types inferred)
spk_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("Data/4DT/heart_failure_dataset_4DT.csv")
)

# Wrap the Spark DataFrame in the pandas-on-Spark API and print its schema summary
spkpd_df = spk_df.to_pandas_on_spark()
spkpd_df.info()


23/10/10 10:01:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/10 10:01:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 383 entries, 0 to 382
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   creatinine_phosphokinase  383 non-null    int32  
 1   ejection_fraction         383 non-null    int32  
 2   platelets                 383 non-null    float64
 3   serum_creatinine          383 non-null    float64
 4   serum_sodium              383 non-null    int32  
 5   DEATH_EVENT               383 non-null    int32  
dtypes: float64(2), int32(4)

In [3]:
### --------  06-DMA

# Load relevant algorithms for non-Linear SVM model

import matplotlib.pyplot as plt
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql import SparkSession
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Aliased: the bare name StandardScaler already refers to the pyspark.ml class.
from sklearn.preprocessing import StandardScaler as SkStandardScaler
from sklearn.svm import SVC


In [None]:
### -------- 07-DM-Linear SVM baseline (Spark ML)
# Using the prepared dataset
#
# Note: Spark ML does not provide a kernel (RBF) SVM. LinearSVC is used here
# as a linear baseline; the RBF-kernel model is trained with scikit-learn in
# a later cell.

label_col = "DEATH_EVENT"
feature_cols = [c for c in spk_df.columns if c != label_col]

# Stage 1: gather the feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Stage 2: standardise features (zero mean, unit variance)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)

# Stage 3: linear SVM classifier
svm = LinearSVC(featuresCol="scaledFeatures", labelCol=label_col, maxIter=10, regParam=0.1)

# Split the *raw* data first. spk_df itself is never overwritten, so this cell
# can be re-run safely (re-assembling an already-transformed frame would fail
# because the "features" column would already exist).
train, test = spk_df.randomSplit([0.8, 0.2], seed=42)

# Fit all stages on the training split only, so the scaler's mean/std are not
# influenced by the test rows (avoids train/test leakage).
pipeline = Pipeline(stages=[assembler, scaler, svm])
model = pipeline.fit(train)

# Predict on the held-out split and evaluate.
# The metric is area under the ROC curve, not accuracy, so report it as such.
predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol=label_col,
                                          metricName="areaUnderROC")
auc_roc = evaluator.evaluate(predictions)
print(f"Test AUC-ROC of linear SVM (LinearSVC): {auc_roc:.4f}")


In [None]:
### -------- 07-DM-non Linear SVM

# Using the prepared dataset: bring it into a local pandas DataFrame for
# scikit-learn. spkpd_df still holds only the original CSV columns.
df = spkpd_df.to_pandas()
X = df.drop("DEATH_EVENT", axis=1)
y = df["DEATH_EVENT"]

# Split the data into training and test sets
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: fit on the training split only, then apply the same
# transform to the test split, so test-set statistics do not leak into training.
# SkStandardScaler is scikit-learn's scaler (the bare name StandardScaler is
# the pyspark.ml class in this notebook).
scaler = SkStandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# ---Train using a non-linear SVM with RBF kernel
clf = SVC(kernel='rbf', random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate accuracy
y_pred = clf.predict(X_test)
accuracy1 = accuracy_score(y_test, y_pred)
print(f"Test Accuracy of non-linear SVM: {accuracy1*100:.2f}%")

In [None]:
# Evaluate feature importance using permutation_importance:
# each feature is shuffled n_repeats times on the test set and the resulting
# drop in the classifier's score is recorded.
result = permutation_importance(clf, X_test, y_test, n_repeats=30, random_state=42)

# Order features from least to most important (by mean importance)
sorted_idx = result.importances_mean.argsort()

# Visualize the distribution of importances per feature
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(result.importances[sorted_idx].T, vert=False, labels=X.columns[sorted_idx])
ax.set_title("Permutation Importances (test set)")
ax.set_xlabel("Decrease in score when feature is permuted")
fig.tight_layout()
plt.show()


In [None]:
# Stop the Spark session created at the top of the notebook.
# Cells that use `spark` will not work after this runs.
spark.stop()