In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression
from pyspark.sql.functions import col, round, when

In [None]:
# Start (or attach to an existing) Spark session named "EnsembleModel".
spark = (
    SparkSession.builder
    .appName("EnsembleModel")
    .getOrCreate()
)

# Load the modelling dataset: the first row is treated as the header and
# column types are inferred from the data.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../../../data/model_data.csv")
)
data.show(5)

In [None]:
# Pack every column except the "isFraud" label into one vector column
# called "rawFeatures".
feature_cols = list(filter(lambda name: name != "isFraud", data.columns))
assembler = (
    VectorAssembler()
    .setInputCols(feature_cols)
    .setOutputCol("rawFeatures")
)
data = assembler.transform(data)
data.show(5)

In [None]:
# Rescale the assembled features into the [0, 1] range with MinMaxScaler.
# The "rawFeatures" vector is first passed through a second assembler that
# emits it under the name "features", which is the scaler's input column.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.
assembler = VectorAssembler(outputCol="features", inputCols=["rawFeatures"])
data = assembler.transform(data)

scaler = (
    MinMaxScaler(min=0.0, max=1.0)
    .setInputCol("features")
    .setOutputCol("scaledFeatures")
)
scalerModel = scaler.fit(data)

data = scalerModel.transform(data)
data.show(5)

In [None]:
# Add a per-row "weight" column: rows with isFraud == 0 receive the small
# weight, every other row receives the large one.
# Per the original note, the two values were produced by compute_class_weight
# and are the same ones used in other models.
data = data.withColumn(
    "weight",
    when(col("isFraud") == 0, 0.5006457829418431)
    .otherwise(387.6269799825936),
)
data.show(5)

In [None]:
# Randomly partition the rows into a training set (weight 0.8) and a test set
# (weight 0.2); the fixed seed keeps the split repeatable.
train, test = data.randomSplit(weights=[0.8, 0.2], seed=1)

In [None]:
# Three classifiers, all trained on "scaledFeatures" against the "isFraud"
# label and all using the per-row "weight" column.
# Each one writes its prediction to its own, model-prefixed column.
rf = RandomForestClassifier(
    featuresCol="scaledFeatures",
    labelCol="isFraud",
    weightCol="weight",
    numTrees=64,
    seed=1,
    predictionCol="rf_prediction",
    probabilityCol="rf_probability",
    rawPredictionCol="rf_rawPrediction",
)
gbt = GBTClassifier(
    featuresCol="scaledFeatures",
    labelCol="isFraud",
    weightCol="weight",
    maxIter=20,
    seed=1,
    predictionCol="gbt_prediction",
)
lr = LogisticRegression(
    featuresCol="scaledFeatures",
    labelCol="isFraud",
    weightCol="weight",
    predictionCol="lr_prediction",
    probabilityCol="lr_probability",
    rawPredictionCol="lr_rawPrediction",
)

# Order in which the models are trained and exported.
models = [rf, gbt, lr]

In [None]:
# train eval and export models
def train_eval_export(
  model,
  eval_data=None,
  export_dir="/content/drive/MyDrive/Grad_School/Big_Data_Management/PBDM_Project/",
):
  """Fit ``model`` on ``train``, print accuracy/precision/recall, and save it.

  Parameters
  ----------
  model
      Unfitted PySpark ML classifier. Its label and prediction column names
      are read via ``getLabelCol()`` / ``getPredictionCol()``, so models with
      a custom ``predictionCol`` (e.g. ``"rf_prediction"``) are scored on the
      column they actually produce.
  eval_data : DataFrame, optional
      Rows to score. Defaults to the notebook-level held-out ``test`` split;
      pass ``train`` explicitly to get training-set metrics instead.
  export_dir : str, optional
      Prefix the saved model path is built from
      (``export_dir + <ClassName> + "model.parquet"``).

  Notes
  -----
  Class ``1`` is treated as the positive class. A metric whose denominator is
  zero (e.g. precision when nothing is predicted positive) is reported as 0.0
  rather than raising ``ZeroDivisionError``. An existing model at the export
  path is overwritten so the notebook can be re-run.
  """
  model_name = model.__class__.__name__
  label_col = model.getLabelCol()
  prediction_col = model.getPredictionCol()
  if eval_data is None:
    # `test` is the notebook-level hold-out split.
    eval_data = test

  def _ratio(numerator, denominator):
    # Guarded division: undefined metrics are reported as 0.0.
    return numerator / denominator if denominator else 0.0

  trained_model = model.fit(train)
  predictions = trained_model.transform(eval_data)

  # Build the confusion matrix in a single Spark job instead of one
  # filter().count() job per cell of the matrix.
  total = 0
  confusion = {}
  for row in predictions.groupBy(label_col, prediction_col).count().collect():
    total += row["count"]
    label, predicted = row[label_col], row[prediction_col]
    if label is None or predicted is None:
      # Rows without a label/prediction can never be counted as correct.
      continue
    key = (int(label), int(predicted))
    confusion[key] = confusion.get(key, 0) + row["count"]

  tp = confusion.get((1, 1), 0)
  fn = confusion.get((1, 0), 0)
  fp = confusion.get((0, 1), 0)
  correct_count = sum(n for (label, predicted), n in confusion.items() if label == predicted)

  # The denominator is the number of rows actually scored, not the size of
  # the full dataset.
  accuracy = _ratio(correct_count, total)
  recall = _ratio(tp, tp + fn)
  precision = _ratio(tp, tp + fp)

  print(f"{model_name} accuracy: {accuracy}")
  print(f"{model_name} precision: {precision}")
  print(f"{model_name} recall: {recall}")

  # export model (overwrite so that re-running the notebook does not fail on
  # an already existing path)
  trained_model.write().overwrite().save(export_dir + model_name + "model.parquet")

# row count of the full `data` DataFrame (all rows, i.e. not only one of the
# train/test splits), kept as a notebook-level value
total_count = data.count()

# run the train / evaluate / export routine once for every configured model
for model in models:
  train_eval_export(model)