In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.sql import SparkSession

In [None]:
# Obtain the Spark session used by every later cell (an already-running
# session is reused rather than replaced).
spark = (
    SparkSession.builder
    .appName("EnsembleModel")
    .getOrCreate()
)

# Load the modelling dataset: the first CSV row supplies the column names and
# column types are inferred from the values.
df = spark.read.csv(
    "../../../data/model_data.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Split the data into training (80%) and test (20%) sets with a fixed seed.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=123)

# Assemble the feature vector from every column except the label.
# The label is excluded by name rather than by position ("drop the last
# column"), so "isFraud" can never leak into the features and no real feature
# is dropped if the CSV column order changes.
# NOTE(review): VectorAssembler needs numeric/boolean/vector inputs -- confirm
# the CSV holds no string columns.
feature_cols = [col_name for col_name in df.columns if col_name != "isFraud"]
vectorAssembler = VectorAssembler(inputCols=feature_cols, outputCol="rawFeatures")

In [None]:
# Configure a MinMaxScaler that reads the assembled `rawFeatures` vector and
# writes the rescaled vector to `features`.
scaler = MinMaxScaler(
    inputCol="rawFeatures",
    outputCol="features",
)

In [None]:
# Build one pipeline that assembles and scales the features, then fits three
# classifiers on the scaled `features` column.
#
# Every Spark classifier writes `rawPrediction`, `probability` and `prediction`
# by default, and a pipeline stage fails when one of its output columns already
# exists. The random forest and the logistic regression therefore write to
# prefixed columns; the gradient-boosted trees keep the default names, so the
# plain `prediction` column downstream holds the GBT output.
# NOTE(review): the three models are fitted side by side -- nothing here
# combines their votes into a single ensemble prediction.
rf = RandomForestClassifier(
    labelCol="isFraud",
    featuresCol="features",
    numTrees=10,
    rawPredictionCol="rf_rawPrediction",
    probabilityCol="rf_probability",
    predictionCol="rf_prediction",
)
gbt = GBTClassifier(labelCol="isFraud", featuresCol="features", maxIter=10)
lr = LogisticRegression(
    labelCol="isFraud",
    featuresCol="features",
    rawPredictionCol="lr_rawPrediction",
    probabilityCol="lr_probability",
    predictionCol="lr_prediction",
)

# The scaler must sit between the assembler (which writes `rawFeatures`) and
# the classifiers (which read `features`); without it `features` never exists.
pipeline = Pipeline(stages=[vectorAssembler, scaler, rf, gbt, lr])

In [None]:
# Fit every pipeline stage, in order, on the training split.
model = pipeline.fit(dataset=train_data)

In [None]:
# Run the fitted pipeline over the held-out split to obtain its output columns.
predictions = model.transform(dataset=test_data)

In [None]:
# Score the hard class predictions in `prediction` against the `isFraud` label.
# Accuracy and per-label precision/recall are MulticlassClassificationEvaluator
# metrics; BinaryClassificationEvaluator only offers areaUnderROC/areaUnderPR.
# The label column is named explicitly because the evaluator default is "label".
evaluator = MulticlassClassificationEvaluator(
    labelCol="isFraud",
    predictionCol="prediction",
    # Per-label metrics default to label 0.0; report them for the fraud class.
    # Assumes isFraud is coded 0/1 with 1 = fraud -- TODO confirm against the data.
    metricLabel=1.0,
)
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel"})

# print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)