In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression
from pyspark.sql.functions import col, round, when

In [None]:
# Obtain the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("OversampledModels")
    .getOrCreate()
)

# Load the modelling dataset: the first row holds the column names and
# column types are inferred from the values
data = spark.read.csv(
    "../../../data/model_data.csv",
    header=True,
    inferSchema=True,
)
data.show(5)

In [None]:
# define the feature columns: every column except the label
# (loop variable is not called `col`, to avoid confusion with the imported
# pyspark.sql.functions.col used below)
feature_cols = [name for name in data.columns if name != "isFraud"]

# split the data into train and test sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1)

# calculate class frequencies on the training split only: that is the data
# being rebalanced, and the held-out rows should not influence the ratio.
# A single collect() returns both class counts in one Spark job.
class_frequencies = train_data.groupBy("isFraud").count()
counts_by_class = {row["isFraud"]: row["count"] for row in class_frequencies.collect()}
fraud_count = counts_by_class.get(1, 0)
non_fraud_count = counts_by_class.get(0, 0)

# without any minority rows there is nothing to oversample and the ratio
# below would divide by zero, so fail with a clear message
if fraud_count == 0:
    raise ValueError("training split contains no rows with isFraud == 1; cannot oversample")

# calculate oversampling ratio (expected copies of each minority row)
oversampling_ratio = non_fraud_count / fraud_count

# oversample the minority class in the training set
# (sampling with replacement allows a fraction greater than 1)
oversampled_minority = train_data.filter(col("isFraud") == 1).sample(
    withReplacement=True, fraction=oversampling_ratio, seed=1
)

# combine oversampled minority class with majority class in the training set
balanced_train_data = train_data.filter(col("isFraud") == 0).union(oversampled_minority)

# Preprocessing: gather the feature columns into a single vector column,
# then rescale that vector with a min-max scaler
preprocessing_pipeline = Pipeline(
    stages=[
        VectorAssembler(outputCol="features", inputCols=feature_cols),
        MinMaxScaler(outputCol="scaledFeatures", inputCol="features"),
    ]
)

# Learn the preprocessing parameters from the balanced training set
preprocessing_model = preprocessing_pipeline.fit(balanced_train_data)

# (optional) the fitted pipeline can be persisted with preprocessing_model.save(<path>)

# Push both splits through the same fitted preprocessing model
preprocessed_test_data = preprocessing_model.transform(test_data)
preprocessed_train_data = preprocessing_model.transform(balanced_train_data)

# Candidate classifiers; all read the scaled feature vector and the isFraud label
lr = LogisticRegression(
    labelCol="isFraud",
    featuresCol="scaledFeatures",
)
gbt = GBTClassifier(
    labelCol="isFraud",
    featuresCol="scaledFeatures",
    maxIter=20,
    seed=1,
)
rf = RandomForestClassifier(
    labelCol="isFraud",
    featuresCol="scaledFeatures",
    numTrees=100,
    seed=1,
)

In [None]:
# fit the models
# random forest, trained on the balanced and preprocessed training set
rf_model = rf.fit(preprocessed_train_data)

In [None]:
# logistic regression, trained on the balanced and preprocessed training set
lr_model = lr.fit(preprocessed_train_data)

In [None]:
# gradient-boosted trees, trained on the balanced and preprocessed training set
gbt_model = gbt.fit(preprocessed_train_data)

In [None]:
def eval_model(model):
  """Print accuracy, precision and recall of a fitted model on the test set.

  The model scores the notebook-level ``preprocessed_test_data``. The label
  column is ``isFraud`` and class 1 is treated as the positive class.

  Args:
    model: fitted model exposing ``transform(df)``; the result must contain a
      ``prediction`` column next to ``isFraud``.

  Returns:
    None. Each metric is printed on its own line, prefixed with the model's
    class name. A metric whose denominator is zero is reported as 0.0
    instead of raising ZeroDivisionError.
  """
  # Make predictions on the preprocessed test data
  predictions = model.transform(preprocessed_test_data)

  # One grouped count returns every (label, prediction) cell of the confusion
  # matrix in a single Spark job, instead of re-scoring the data per metric.
  cell_counts = {
      (row["isFraud"], row["prediction"]): row["count"]
      for row in predictions.groupBy("isFraud", "prediction").count().collect()
  }

  # Accuracy must be divided by the number of rows that were actually scored,
  # so the total is derived from the predictions themselves.
  scored_rows = sum(cell_counts.values())
  correct_count = sum(n for (label, pred), n in cell_counts.items() if label == pred)
  accuracy = correct_count / scored_rows if scored_rows else 0.0

  # Integer keys also match float predictions (1 == 1.0 and both hash alike).
  tp = cell_counts.get((1, 1), 0)
  fn = cell_counts.get((1, 0), 0)
  fp = cell_counts.get((0, 1), 0)

  # compute recall and precision, guarding against empty denominators
  recall = tp / (tp + fn) if (tp + fn) else 0.0
  precision = tp / (tp + fp) if (tp + fp) else 0.0

  model_name = model.__class__.__name__
  print(model_name + " accuracy: {:.4f}".format(accuracy))
  print(model_name + " precision: {:.4f}".format(precision))
  print(model_name + " recall: {:.4f}".format(recall))
    
models = [rf_model, gbt_model, lr_model]

# number of rows for computing metrics: the size of the held-out test set
# that the models are scored on (not the size of the full dataset)
total_count = preprocessed_test_data.count()

# report the metrics of every fitted model
for model in models:
  eval_model(model)