In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression
from pyspark.sql.functions import col, round, when

In [None]:
# start (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("WeightModels")
    .getOrCreate()
)

# load the modelling dataset; the first row is the header and column types are inferred
data = spark.read.options(header=True, inferSchema=True).csv("../../../data/model_data.csv")
data.show(5)

In [None]:
# every column except the isFraud label is used as a model input
feature_cols = [name for name in data.columns if name != "isFraud"]

# preprocessing: pack the inputs into a single "features" vector, then
# min-max scale that vector into "scaledFeatures"
preprocessing_pipeline = Pipeline(
    stages=[
        VectorAssembler(inputCols=feature_cols, outputCol="features"),
        MinMaxScaler(inputCol="features", outputCol="scaledFeatures"),
    ]
)

# fit the pipeline on the loaded dataset and apply it to the same rows
preprocessing_model = preprocessing_pipeline.fit(data)

# # save the fitted preprocessing pipeline to disk
# preprocessing_model.save("/content/drive/MyDrive/Grad_School/Big_Data_Management/PBDM_Project/preprocessing_pipeline.parquet")

preprocessed_data = preprocessing_model.transform(data)

preprocessed_data.show(5)

In [None]:
# attach a per-row class weight (values come from the sklearn compute class weight
# results in the other model files): rows with isFraud == 0 get the small weight,
# every other row gets the large one
preprocessed_data = preprocessed_data.withColumn(
    "weight",
    when(col("isFraud") == 0, 0.5006457829418431).otherwise(387.6269799825936),
)
preprocessed_data.show(5)

In [None]:
# split into training and test (80/20, fixed seed for reproducibility)
# The split must be taken from preprocessed_data, not the raw `data` frame:
# the classifiers read the "scaledFeatures" and "weight" columns, which only
# exist after the preprocessing pipeline and the weight column have been applied.
# NOTE(review): the scaler was fitted on the full dataset before this split, so
# test-row ranges influence the scaling; consider fitting it on train only.
train, test = preprocessed_data.randomSplit([0.8, 0.2], seed=1)

In [None]:
# instantiate the three classifiers; each one reads the scaled feature vector,
# the isFraud label and the per-row class weight
rf = RandomForestClassifier(
    numTrees=100,
    seed=1,
    labelCol="isFraud",
    featuresCol="scaledFeatures",
    weightCol="weight",
)
gbt = GBTClassifier(
    maxIter=20,
    seed=1,
    labelCol="isFraud",
    featuresCol="scaledFeatures",
    weightCol="weight",
)
lr = LogisticRegression(
    labelCol="isFraud",
    featuresCol="scaledFeatures",
    weightCol="weight",
)


In [None]:
# train models: fit the random forest classifier on the training split
rf_model = rf.fit(train)

In [None]:
# fit the gradient-boosted trees classifier on the training split
gbt_model = gbt.fit(train)

In [None]:
# fit the logistic regression classifier on the training split
lr_model = lr.fit(train)

In [None]:
# evaluate models
def eval_model(model, dataset=None):
  """Print and return accuracy, precision and recall for a fitted classifier.

  Args:
    model: fitted model exposing ``transform``; the transformed output must
      carry the ``isFraud`` label column and a ``prediction`` column.
    dataset: DataFrame to score. Defaults to the held-out ``test`` split so
      the model is not graded on the rows it was fitted on.

  Returns:
    dict with the keys ``accuracy``, ``precision`` and ``recall`` (floats).
    A metric whose denominator is zero is reported as 0.0 instead of raising
    ZeroDivisionError.
  """
  if dataset is None:
    dataset = test
  predictions = model.transform(dataset)

  # One aggregation yields every (label, prediction) count, so the model is
  # applied in a single Spark job instead of once per metric.
  confusion = {}
  for row in predictions.groupBy("isFraud", "prediction").count().collect():
    confusion[(row["isFraud"], row["prediction"])] = row["count"]

  # The label is integral and the prediction is a double; 1 and 1.0 compare and
  # hash equal in Python, so integer lookup keys match either representation.
  tp = confusion.get((1, 1), 0)
  fn = confusion.get((1, 0), 0)
  fp = confusion.get((0, 1), 0)

  # The accuracy denominator is the number of rows actually scored, not the
  # size of the full dataset.
  scored_count = sum(confusion.values())
  correct_count = sum(
      n for (label, predicted), n in confusion.items() if label == predicted
  )

  accuracy = correct_count / scored_count if scored_count else 0.0
  recall = tp / (tp + fn) if (tp + fn) else 0.0
  precision = tp / (tp + fp) if (tp + fp) else 0.0

  print(model.__class__.__name__+ " accuracy: {:.4f}".format(accuracy))
  print(model.__class__.__name__ + " precision: {:.4f}".format(precision))
  print(model.__class__.__name__ + " recall: {:.4f}".format(recall))

  return {"accuracy": accuracy, "precision": precision, "recall": recall}
  
# total number of rows in the loaded dataset (all rows, before any train/test split)
total_count = data.count()

# the fitted classifiers to evaluate: random forest, gradient-boosted trees, logistic regression
models = [rf_model, gbt_model, lr_model]

# print the metrics for each fitted model in turn
for model in models:
  eval_model(model)

In [None]:
# for use in google colab
# export models: each fitted model is saved under the Drive project folder,
# in a path named after the model's class
for model in models:
  model.save("/content/drive/MyDrive/Grad_School/Big_Data_Management/PBDM_Project/{}.parquet".format(model.__class__.__name__))