In [1]:
import os
import sys
from pathlib import Path
curr_path = str(Path(os.getcwd()).parent)
sys.path.append(curr_path)
from scripts.sa2_age_allocation import *
from scripts.constants import *
from scripts.load import *
from scripts.transform import *
from scripts.read import *
from scripts.misc_changes import *
from scripts.external_etl import *
from scripts.join import *
from scripts.plotting import *
import warnings
warnings.filterwarnings("ignore")
from pyspark.sql.functions import *
from pyspark.sql.column import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.stat import Correlation
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression
from pyspark.ml.regression import DecisionTreeRegressor, LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import geopandas as gpd
import pandas as pd 
import numpy as np
import math
import re
import random
import json

# start a spark session
spark = create_spark()
PREFIX = "."
PREDICTION = "prediction"

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/01 15:11:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/01 15:11:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Fraud Probability

In [2]:
raw_transactions = spark.read.parquet("../data/raw/raw_transactions/")
merchant_fraud = read_raw_merchant_fraud(spark, PREFIX)
tbl_merchants = encode_revenue_level(read_mapped_industry_data(spark, PREFIX))
tbl_merchants = tbl_merchants.withColumn(TAKE_RATE, col(TAKE_RATE).cast(FloatType()))

                                                                                

In [3]:
def extract_date_features(df: DataFrame) -> DataFrame:
    """
    Extract the datetime columns into more specific formats
    - Parameters:
        - df (DataFrame): Dataframe with datetime columns

    Returns:
        DataFrame: Dataframe with updated column names
    """
    df = df.withColumns({
        ORDER_DAY_OF_MONTH: dayofmonth(col(ORDER_DATETIME)),
        ORDER_MONTH: month(col(ORDER_DATETIME)),
        ORDER_YEAR: year(col(ORDER_DATETIME))
    })
    
    return df

def rename_fraud_prob_column(df:DataFrame, new_column: str) -> DataFrame:
    """
    Rename fraud probability column name
    Args:
        df (DataFrame): Dataframe to be updated
        new_column (str): New column name

    Returns:
        DataFrame: Updated dataframe
    """
    return df.withColumnRenamed(FRAUD_PROBABILITY, new_column)

# perform log-transformation for fraud probabilities
merchant_fraud = merchant_fraud.withColumn(FRAUD_PROBABILITY, log(col(FRAUD_PROBABILITY)))

# rename fraud probability columns to relevant dataset name
merchant_fraud = rename_fraud_prob_column(merchant_fraud, MERCHANT_FRAUD_PROB)

# extract the date features
merchant_fraud = extract_date_features(merchant_fraud)

# transform the transaction columns to preserve only 2 dp
# raw_transactions = raw_transactions.groupBy([MERCHANT_ABN, ORDER_DATETIME]).agg(avg(DOLLAR_VALUE).alias(DOLLAR_VALUE))
raw_transactions = round_dollar_values(raw_transactions)
raw_transactions = extract_date_features(raw_transactions)

In [4]:
# specify columns to drop and that are categorical
drop_cols = (NAME, ORDER_ID, USER_ID, ORDER_DATETIME)
cat_cols = (MERCHANT_ABN, ORDER_YEAR, ORDER_MONTH, ORDER_DAY_OF_MONTH, INDUSTRY_TAGS)
MERCHANT_JOIN_COLS = [MERCHANT_ABN, ORDER_YEAR, ORDER_MONTH, ORDER_DAY_OF_MONTH]
LABEL = "label"
LOWER_BOUND = 50.0
UPPER_BOUND = 80.0
FEATURES = "features"

In [5]:
# join all the merchant-related columns with the transactions
merchant_transactions = raw_transactions.join(merchant_fraud, on=MERCHANT_JOIN_COLS, how=OUTER_JOIN)\
    .join(tbl_merchants, on=[MERCHANT_ABN], how=INNER_JOIN)

# drop uninformative columns
merchant_transactions = merchant_transactions.drop(*drop_cols)

In [6]:
# cast the merchant ABN, order datetime, and user ID as strings for string indexing 
merchant_transactions = merchant_transactions.withColumns({
    MERCHANT_ABN: col(MERCHANT_ABN).cast(StringType()),
    ORDER_DAY_OF_MONTH: col(ORDER_DAY_OF_MONTH).cast(StringType()),
    ORDER_MONTH: col(ORDER_MONTH).cast(StringType()),
    ORDER_YEAR: col(ORDER_YEAR).cast(StringType())
})

In [7]:
# merchant_transactions.show(5) 

In [8]:
def predict_merchant_fraud_probability(merchant_transactions: DataFrame):
    """
    Predict the remaining unknown merchant fraud probabilities based on transactions and dollar value
    Args:
        merchant_transactions (DataFrame): Dataframe containing aggregated 
    Returns:
        rf: Initialised Random Forest Regressor model
        rf_model: Fitted Random Forest Regressor model
        pred_df: Dataframe of predicted models
    """
    INDEXED_COL = "_indexed"
    DIFFERENCE = "difference"
    PREDICTION = "prediction"

    cat_cols = (ORDER_YEAR, ORDER_MONTH, ORDER_DAY_OF_MONTH, INDUSTRY_TAGS)
    input_assembler_cols = [DOLLAR_VALUE, TAKE_RATE, REVENUE_LEVEL, 
                            "order_day_of_month_indexed", "order_month_indexed", "order_year_indexed", "industry_tags_indexed"]

    merchant_abns = merchant_transactions.select(MERCHANT_ABN)
    merchant_abns = [row[MERCHANT_ABN] for row in merchant_abns.distinct().collect()]

    print("PERFORM STRING INDEXING")
    for column in cat_cols:
        col_indexer = StringIndexer(inputCol=column, outputCol=column+INDEXED_COL)
        merchant_transactions = col_indexer.fit(merchant_transactions).transform(merchant_transactions)

    print("PERFORM VECTOR ASSEMBLING")
    assembler = VectorAssembler(inputCols=input_assembler_cols, outputCol="features")
    merchant_fraud_transactions = assembler.transform(merchant_transactions)
 
    print("SPLIT DATA INTO KNOWN AND UNKNOWN FRAUD PROBABILITIES")
    train_test_merchants = merchant_fraud_transactions.where(col(MERCHANT_FRAUD_PROB).isNotNull())
    to_predict_merchants = merchant_fraud_transactions.where(col(MERCHANT_FRAUD_PROB).isNull())

    print("SPLIT KNOWN PROBABILITIES INTO TRAIN-TEST SET")
    train_merchants, test_merchants = train_test_merchants.randomSplit([0.9, 0.1], seed=42)
    # print(train_merchants.count())
    # print(test_merchants.count())

    print("INITIALISE RFR MODEL")    
    rf = RandomForestRegressor(featuresCol=FEATURES, labelCol=MERCHANT_FRAUD_PROB)
    print("FIT RFR MODEL WITH TRAIN SET")
    rf_model = rf.fit(train_merchants.select(FEATURES, MERCHANT_FRAUD_PROB))
    print("TRANSFORM AND PREDICT RFR MODEL WITH TEST SET")
    predictions = rf_model.transform(test_merchants.select(FEATURES, MERCHANT_FRAUD_PROB))

    print("TRANSFORM THE PREDICTIONS TO EXP OF PREDICTIONS")
    pred_df = predictions.withColumns({
        PREDICTION: exp(col(PREDICTION)),
        MERCHANT_FRAUD_PROB: exp(col(MERCHANT_FRAUD_PROB))})
    pred_df = pred_df.withColumn(DIFFERENCE, col(PREDICTION) - col(MERCHANT_FRAUD_PROB))

    print("RETURN ALL VALUES NEEDED")
    return rf, rf_model, pred_df, to_predict_merchants

In [9]:
rf, rf_model, diff, predicting_merchants = predict_merchant_fraud_probability(merchant_transactions)

                                                                                

PERFORM STRING INDEXING


                                                                                

PERFORM VECTOR ASSEMBLING
SPLIT DATA INTO KNOWN AND UNKNOWN FRAUD PROBABILITIES
SPLIT KNOWN PROBABILITIES INTO TRAIN-TEST SET
INITIALISE RFR MODEL
FIT RFR MODEL WITH TRAIN SET


                                                                                

TRANSFORM AND PREDICT RFR MODEL WITH TEST SET
TRANSFORM THE PREDICTIONS TO EXP OF PREDICTIONS
RETURN ALL VALUES NEEDED


In [10]:
# Evaluate model based on mean absolute error
mae_evaluator = RegressionEvaluator(labelCol=MERCHANT_FRAUD_PROB, predictionCol=PREDICTION, metricName="mae")
print(mae_evaluator.evaluate(diff))

# Evaluate model based on r2
r2_evaluator = RegressionEvaluator(labelCol=MERCHANT_FRAUD_PROB, predictionCol=PREDICTION, metricName="r2")
print(r2_evaluator.evaluate(diff))

                                                                                

0.9187580445408727




0.7008950042824906


                                                                                

In [11]:
# predict the unknown merchant fraud probabilities
pred_df = rf_model.transform(predicting_merchants)

# reformat the dataframe to retain relevant columns as well as renaming columns
pred_df = pred_df.withColumn("prediction", exp(col("prediction")))
pred_df = pred_df.drop(MERCHANT_FRAUD_PROB)
pred_df = pred_df.withColumnRenamed(existing=PREDICTION, new=MERCHANT_FRAUD_PROB)
# pred_df.show(5)

In [12]:
# drop indexed features
drop_cols = [col for col in pred_df.columns if "_indexed" in col]

new_predictions = pred_df.drop(*drop_cols)
new_predictions = new_predictions.drop(FEATURES)
# new_predictions.show(5)

In [13]:
# join the known and predicted fraud probabilities into one dataframe
fraud_transactions = merchant_transactions.where(col(MERCHANT_FRAUD_PROB).isNotNull())
fraud_transactions = fraud_transactions.withColumn(MERCHANT_FRAUD_PROB, exp(col(MERCHANT_FRAUD_PROB)))
all_transactions = fraud_transactions.union(new_predictions)
# all_transactions.show(5)

In [14]:
LOWER_BOUND = 50.0
UPPER_BOUND = 80.0
# filter the dataframe to only retain less fraudulent transactions
all_transactions = all_transactions.where(col(MERCHANT_FRAUD_PROB) < LOWER_BOUND)
# all_transactions.show(5)

In [14]:
all_transactions

                                                                                

merchant_abn,order_year,order_month,order_day_of_month,dollar_value,merchant_fraud_probability,revenue_level,take_rate,industry_tags,MappedIndustry
11590404675,2021,12,21,21317.87,29.607818240092094,1.0,4.19,antique repairs r...,R
11590404675,2021,12,21,30489.85,29.607818240092094,1.0,4.19,antique repairs r...,R
15043504837,2021,10,8,31747.91,25.05439199147392,1.0,4.62,jewelry watch clo...,A
15043504837,2021,10,8,11966.83,25.05439199147392,1.0,4.62,jewelry watch clo...,A
15043504837,2021,12,14,20937.32,26.12523097610844,1.0,4.62,jewelry watch clo...,A
15043504837,2021,12,14,25626.72,26.12523097610844,1.0,4.62,jewelry watch clo...,A
18158387243,2021,11,29,471.43,28.956947892226463,2.0,2.03,health beauty spas,N
18158387243,2021,11,29,617.53,28.956947892226463,2.0,2.03,health beauty spas,N
18158387243,2021,11,29,404.69,28.956947892226463,2.0,2.03,health beauty spas,N
18158387243,2021,11,29,259.78,28.956947892226463,2.0,2.03,health beauty spas,N


In [None]:
all_transactions.printSchema()

                                                                                

<bound method DataFrame.printSchema of +------------+----------+-----------+------------------+------------+--------------------------+-------------+---------+--------------------+--------------+
|merchant_abn|order_year|order_month|order_day_of_month|dollar_value|merchant_fraud_probability|revenue_level|take_rate|       industry_tags|MappedIndustry|
+------------+----------+-----------+------------------+------------+--------------------------+-------------+---------+--------------------+--------------+
| 11590404675|      2021|         12|                21|    21317.87|        29.607818240092094|          1.0|     4.19|antique repairs r...|             R|
| 11590404675|      2021|         12|                21|    30489.85|        29.607818240092094|          1.0|     4.19|antique repairs r...|             R|
| 15043504837|      2021|         10|                 8|    31747.91|         25.05439199147392|          1.0|     4.62|jewelry watch clo...|             A|
| 15043504837|     

In [29]:
all_transactions.write.option("header", True).csv("../data/curated/pred_merchant_fraud.csv")

                                                                                

In [30]:
pred_frauds = spark.read.csv("../data/curated/pred_merchant_fraud.csv", header=True)
pred_frauds

merchant_abn,order_year,order_month,order_day_of_month,dollar_value,merchant_fraud_probability,revenue_level,take_rate,industry_tags,MappedIndustry
10364012396,2021,4,30,231.39,1.0,3.63,music musical ins...,M,55.420211045275806
10364012396,2021,7,1,707.32,1.0,3.63,music musical ins...,M,63.2409153487022
10364012396,2021,7,6,77.76,1.0,3.63,music musical ins...,M,64.7870681748278
10364012396,2021,9,29,293.59,1.0,3.63,music musical ins...,M,54.22022426312685
10364012396,2021,10,4,672.0,1.0,3.63,music musical ins...,M,54.88327634191102
10364012396,2022,4,27,2447.39,1.0,3.63,music musical ins...,M,49.64697734426108
10364012396,2022,4,30,60.36,1.0,3.63,music musical ins...,M,55.47985544440434
10364012396,2022,7,10,313.14,1.0,3.63,music musical ins...,M,55.217432638573854
10364012396,2021,8,30,839.06,1.0,3.63,music musical ins...,M,53.63847687842438
10364012396,2021,10,18,466.6,1.0,3.63,music musical ins...,M,54.7575881875096
