In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.sql import SparkSession

In [None]:
# Start the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("EnsembleModel")
    .getOrCreate()
)

# Load the modelling CSV: the first row is treated as the header and column
# types are inferred from the data.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../../../data/model_data.csv")
)
data.show(5)

In [None]:
# Assemble every column except the label ("isFraud") into a single vector
# column named "rawFeatures".
feature_cols = [name for name in data.columns if name != "isFraud"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="rawFeatures")

# Transform the DataFrame loaded above. The previous code passed `df`, a name
# that no cell defines, so this cell raised NameError on a fresh kernel.
# NOTE(review): `data` is rebound here, so re-running this cell without
# re-running the load cell is expected to fail ("rawFeatures" already present).
data = assembler.transform(data)
data.show(5)

In [None]:
# Rescale the assembled feature vector with a MinMaxScaler (default output
# range), writing the result to "scaledFeatures".
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed in a later cell, so test rows influence the
# fitted min/max. Consider fitting on the training split only.
scaler = MinMaxScaler(inputCol="rawFeatures", outputCol="scaledFeatures")
scaler_model = scaler.fit(data)
data = scaler_model.transform(data)
data.show(5)

In [None]:
# Randomly partition the rows into training and test sets with weights
# 0.7 / 0.3; the seed is fixed so the split does not change between runs
# on the same input.
train, test = data.randomSplit(weights=[0.7, 0.3], seed=1)

In [None]:
# Build the three classifiers used by the pipeline. All of them read the same
# label column ("isFraud") and the same feature column ("scaledFeatures").
#
# Each classifier writes "rawPrediction", "probability" and "prediction" by
# default. Chaining them unchanged makes Pipeline.fit fail on the second stage
# with "Column ... already exists", so the first two stages get their own
# prefixed output columns. The last stage (lr) keeps the default names so that
# later cells reading "prediction" / "rawPrediction" keep working.
#
# NOTE(review): the stages are trained independently on "scaledFeatures"; the
# default "prediction" column is therefore the logistic regression's output,
# not a combined vote. The rf_* / gbt_* columns are available if a real
# ensemble (e.g. averaging probabilities) is wanted.
# NOTE(review): the setters on GBTClassifier assume Spark 3.x — confirm the
# cluster version.
rf = (
    RandomForestClassifier(labelCol="isFraud", featuresCol="scaledFeatures", numTrees=10)
    .setRawPredictionCol("rf_rawPrediction")
    .setProbabilityCol("rf_probability")
    .setPredictionCol("rf_prediction")
)
gbt = (
    GBTClassifier(labelCol="isFraud", featuresCol="scaledFeatures", maxIter=10)
    .setRawPredictionCol("gbt_rawPrediction")
    .setProbabilityCol("gbt_probability")
    .setPredictionCol("gbt_prediction")
)
lr = LogisticRegression(labelCol="isFraud", featuresCol="scaledFeatures")

pipeline = Pipeline(stages=[rf, gbt, lr])

In [None]:
# Fit every stage of the pipeline on the training split; the result is the
# fitted PipelineModel used for prediction below.
model = pipeline.fit(dataset=train)

In [None]:
# Run the fitted pipeline over the held-out test split and preview the first
# five scored rows.
predictions = model.transform(dataset=test)
predictions.show(n=5)

In [None]:
# Evaluate the predictions on the test split.
#
# The label column is "isFraud", so it is passed explicitly; both evaluators
# otherwise look for a column named "label".
evaluator = BinaryClassificationEvaluator(labelCol="isFraud")

# BinaryClassificationEvaluator only supports "areaUnderROC" and "areaUnderPR".
area_under_roc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})

# Accuracy, precision and recall are provided by MulticlassClassificationEvaluator,
# which compares the "prediction" column against the label.
class_evaluator = MulticlassClassificationEvaluator(labelCol="isFraud")
accuracy = class_evaluator.evaluate(predictions, {class_evaluator.metricName: "accuracy"})

# Per-label metrics default to label 0.0; report them for label 1.0 instead.
# NOTE(review): assumes isFraud is encoded 0/1 with 1 meaning fraud — confirm.
precision = class_evaluator.evaluate(
    predictions,
    {class_evaluator.metricName: "precisionByLabel", class_evaluator.metricLabel: 1.0},
)
recall = class_evaluator.evaluate(
    predictions,
    {class_evaluator.metricName: "recallByLabel", class_evaluator.metricLabel: 1.0},
)

# print the evaluation metrics
print("Area under ROC:", area_under_roc)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)