In [1]:
# Import necessary libraries and functions.
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml.feature as ft
import pyspark.ml.classification as cl
import pyspark.ml.evaluation as ev
from pyspark.ml import Pipeline
from pyspark.ml.stat import ChiSquareTest
import pyspark.ml.tuning as tune
from pyspark.sql.functions import array, col, count, countDistinct, explode, isnan, lit, regexp_replace, split, when

In [2]:
# Create and show a dataframe.
spark = SparkSession \
    .builder \
    .appName("Python") \
    .getOrCreate()
fraud_df = spark.read.csv('hdfs:///user/jkren001/bs140513_032310.csv', header=True, inferSchema=True)
fraud_df.show()

+----+-------------+---+------+----------+-------------+-----------+-------------------+------+-----+
|step|     customer|age|gender|zipcodeOri|     merchant|zipMerchant|           category|amount|fraud|
+----+-------------+---+------+----------+-------------+-----------+-------------------+------+-----+
|   0|'C1093826151'|'4'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'|  4.55|    0|
|   0| 'C352968107'|'2'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 39.68|    0|
|   0|'C2054744914'|'4'|   'F'|   '28007'|'M1823072687'|    '28007'|'es_transportation'| 26.89|    0|
|   0|'C1760612790'|'3'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 17.25|    0|
|   0| 'C757503768'|'5'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 35.72|    0|
|   0|'C1315400589'|'3'|   'F'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 25.81|    0|
|   0| 'C765155274'|'1'|   'F'|   '28007'| 'M348934600'|    '28007'|'es_transporta

### Exploratory data analysis and data cleaning

In [3]:
# Print the schema.
fraud_df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- customer: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- zipcodeOri: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- zipMerchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- fraud: integer (nullable = true)



In [4]:
# Count the number of rows in the dataframe.
fraud_df.count()

594643

In [5]:
# Check for the number of missing values in each column. No missing values. 
fraud_df.select([count(when(isnan(c), c)).alias(c) for c in fraud_df.columns]).show()

+----+--------+---+------+----------+--------+-----------+--------+------+-----+
|step|customer|age|gender|zipcodeOri|merchant|zipMerchant|category|amount|fraud|
+----+--------+---+------+----------+--------+-----------+--------+------+-----+
|   0|       0|  0|     0|         0|       0|          0|       0|     0|    0|
+----+--------+---+------+----------+--------+-----------+--------+------+-----+



In [6]:
# Data cleaning. Rename some feature variables and their categories for clarity and remove quotation marks.
fraud_df = fraud_df.withColumn('age', \
    when(fraud_df.age == "'0'", "<=18") \
    .when(fraud_df.age == "'1'", "19-25") \
    .when(fraud_df.age == "'2'", "26-35") \
    .when(fraud_df.age == "'3'", "36-45") \
    .when(fraud_df.age == "'4'", "46-55") \
    .when(fraud_df.age == "'5'", "56-65") \
    .when(fraud_df.age == "'6'", ">65") \
    .when(fraud_df.age == "'U'", "Unknown")
    .otherwise(fraud_df.age))
fraud_df = fraud_df.withColumn('gender', \
    when(fraud_df.gender == "'M'", "Male") \
    .when(fraud_df.gender == "'F'", "Female") \
    .when(fraud_df.gender == "'E'", "Enterprise") \
    .when(fraud_df.gender == "'U'", "Unknown")
    .otherwise(fraud_df.gender))
fraud_df = fraud_df.withColumn('customer', regexp_replace('customer', "'", ""))
fraud_df = fraud_df.withColumn('zipcodeOri', regexp_replace('zipcodeOri', "'", ""))
fraud_df = fraud_df.withColumn('merchant', regexp_replace('merchant', "'", ""))
fraud_df = fraud_df.withColumn('zipMerchant', regexp_replace('zipMerchant', "'", ""))
fraud_df = fraud_df.withColumn('category', split(fraud_df.category, '\_')[1])
fraud_df = fraud_df.withColumn('category', split(fraud_df.category, "'")[0])
fraud_df = fraud_df.withColumn('category', when(fraud_df.category == "hyper", "hypermarkets") \
                               .when(fraud_df.category == "tech", "technology")
                               .otherwise(fraud_df.category))
fraud_df = fraud_df.withColumnRenamed('zipcodeOri', 'zipcodeOrigin')
fraud_df = fraud_df.withColumnRenamed('category', 'merchandise category')
fraud_df = fraud_df.withColumnRenamed('amount', 'transaction amount')
fraud_df.show()

+----+-----------+-----+------+-------------+-----------+-----------+--------------------+------------------+-----+
|step|   customer|  age|gender|zipcodeOrigin|   merchant|zipMerchant|merchandise category|transaction amount|fraud|
+----+-----------+-----+------+-------------+-----------+-----------+--------------------+------------------+-----+
|   0|C1093826151|46-55|  Male|        28007| M348934600|      28007|      transportation|              4.55|    0|
|   0| C352968107|26-35|  Male|        28007| M348934600|      28007|      transportation|             39.68|    0|
|   0|C2054744914|46-55|Female|        28007|M1823072687|      28007|      transportation|             26.89|    0|
|   0|C1760612790|36-45|  Male|        28007| M348934600|      28007|      transportation|             17.25|    0|
|   0| C757503768|56-65|  Male|        28007| M348934600|      28007|      transportation|             35.72|    0|
|   0|C1315400589|36-45|Female|        28007| M348934600|      28007|   

In [7]:
# Count the number of distinct categories for step feature.
fraud_df.select(countDistinct('step')).show()

+--------------------+
|count(DISTINCT step)|
+--------------------+
|                 180|
+--------------------+



In [8]:
# Count the number of distinct categories for customer feature.
fraud_df.select(countDistinct('customer')).show()

+------------------------+
|count(DISTINCT customer)|
+------------------------+
|                    4112|
+------------------------+



In [9]:
# Show distinct categories for age feature and count their number.
fraud_df.select('age').distinct().show()
fraud_df.select(countDistinct('age')).show()

+-------+
|    age|
+-------+
|  19-25|
|  26-35|
|  46-55|
|Unknown|
|    >65|
|  36-45|
|  56-65|
|   <=18|
+-------+

+-------------------+
|count(DISTINCT age)|
+-------------------+
|                  8|
+-------------------+



In [10]:
# Show distinct categories for gender feature and count their number.
fraud_df.select('gender').distinct().show()
fraud_df.select(countDistinct('gender')).show()

+----------+
|    gender|
+----------+
|    Female|
|   Unknown|
|      Male|
|Enterprise|
+----------+

+----------------------+
|count(DISTINCT gender)|
+----------------------+
|                     4|
+----------------------+



In [11]:
# zipcodeOrigin feature has only one distinct category.
fraud_df.select('zipcodeOrigin').distinct().show()

+-------------+
|zipcodeOrigin|
+-------------+
|        28007|
+-------------+



In [12]:
# Count the number of distinct categories for merchant feature.
fraud_df.select(countDistinct('merchant')).show()

+------------------------+
|count(DISTINCT merchant)|
+------------------------+
|                      50|
+------------------------+



In [13]:
# zipMerchant feature has only one distinct category.
fraud_df.select('zipMerchant').distinct().show()

+-----------+
|zipMerchant|
+-----------+
|      28007|
+-----------+



In [14]:
# Remove features zipcodeOrigin and zipMerchant as each of them has only one distinct category. 
fraud_df = fraud_df.drop('zipcodeOrigin', 'zipMerchant')
fraud_df.show()

+----+-----------+-----+------+-----------+--------------------+------------------+-----+
|step|   customer|  age|gender|   merchant|merchandise category|transaction amount|fraud|
+----+-----------+-----+------+-----------+--------------------+------------------+-----+
|   0|C1093826151|46-55|  Male| M348934600|      transportation|              4.55|    0|
|   0| C352968107|26-35|  Male| M348934600|      transportation|             39.68|    0|
|   0|C2054744914|46-55|Female|M1823072687|      transportation|             26.89|    0|
|   0|C1760612790|36-45|  Male| M348934600|      transportation|             17.25|    0|
|   0| C757503768|56-65|  Male| M348934600|      transportation|             35.72|    0|
|   0|C1315400589|36-45|Female| M348934600|      transportation|             25.81|    0|
|   0| C765155274|19-25|Female| M348934600|      transportation|               9.1|    0|
|   0| C202531238|46-55|Female| M348934600|      transportation|             21.17|    0|
|   0| C10

In [15]:
# Show distinct categories for merchandise category feature and count their number.
fraud_df.select('merchandise category').distinct().show(truncate=False)
fraud_df.select(countDistinct('merchandise category')).show()

+--------------------+
|merchandise category|
+--------------------+
|wellnessandbeauty   |
|travel              |
|leisure             |
|sportsandtoys       |
|hypermarkets        |
|technology          |
|barsandrestaurants  |
|food                |
|otherservices       |
|health              |
|hotelservices       |
|home                |
|contents            |
|transportation      |
|fashion             |
+--------------------+

+------------------------------------+
|count(DISTINCT merchandise category)|
+------------------------------------+
|                                  15|
+------------------------------------+



In [16]:
# Show the descriptive statistics and skewness for transaction amount feature. It is substantially positively skewed, 
# with the maximum value markedly exceeding the mean value.
fraud_df.describe(['transaction amount']).show()
fraud_df.agg({'transaction amount': 'skewness'}).show()

+-------+------------------+
|summary|transaction amount|
+-------+------------------+
|  count|            594643|
|   mean|37.890135308075436|
| stddev|111.40283093084095|
|    min|               0.0|
|    max|           8329.96|
+-------+------------------+

+----------------------------+
|skewness(transaction amount)|
+----------------------------+
|          32.365756507288985|
+----------------------------+



In [17]:
# Show distinct categories for fraud (target variable).
fraud_df.select('fraud').distinct().show()

+-----+
|fraud|
+-----+
|    1|
|    0|
+-----+



In [18]:
# Group by fraud categories and the categories' frequencies and count the share of fraud transactions.
fraud_df.groupby('fraud').count().show()
print('The share of fraud transactions in the dataset is: {0:.1f}%'.format(fraud_df.filter(col("fraud") == 1).count()/fraud_df.count()*100))

+-----+------+
|fraud| count|
+-----+------+
|    1|  7200|
|    0|587443|
+-----+------+

The share of fraud transactions in the dataset is: 1.2%


...meaning that the dataset is highly imbalanced.

In [19]:
# Compare the average amounts spent in fraud and non-fraud transactions for each category.
fraud_df.filter(fraud_df.fraud == 1).groupby('merchandise category').mean().orderBy('avg(transaction amount)', ascending=False).select('merchandise category','avg(transaction amount)').show() 
fraud_df.filter(fraud_df.fraud == 0).groupby('merchandise category').mean().orderBy('avg(transaction amount)', ascending=False).select('merchandise category','avg(transaction amount)').show()

+--------------------+-----------------------+
|merchandise category|avg(transaction amount)|
+--------------------+-----------------------+
|              travel|     2660.8028719723175|
|                home|     457.48483443708596|
|       hotelservices|      421.8233394160586|
|          technology|      415.2741139240506|
|              health|      407.0313384433963|
|       sportsandtoys|     345.36681130171496|
|       otherservices|     316.46960526315786|
|             leisure|      300.2868776371309|
|             fashion|     247.00818965517246|
|   wellnessandbeauty|     229.42253481894176|
|        hypermarkets|     169.25542857142855|
|  barsandrestaurants|     164.09266666666662|
+--------------------+-----------------------+

+--------------------+-----------------------+
|merchandise category|avg(transaction amount)|
+--------------------+-----------------------+
|              travel|      669.0255333333333|
|                home|     113.33840855106888|
|       hote

Travel is the top category in terms of the amounts spent both in a fraud and a non-fraud transaction on average. In addition, one can also see that the amount spent in a fraud transaction is on average around four times bigger than that in a non-fraud transaction for all categories.

In [20]:
# Find the categories where fraud happens most frequently.
fraud_df.groupby('merchandise category').mean().orderBy('avg(fraud)', ascending=False).select('merchandise category', 'avg(fraud)').show()

+--------------------+--------------------+
|merchandise category|          avg(fraud)|
+--------------------+--------------------+
|             leisure|  0.9498997995991983|
|              travel|  0.7939560439560439|
|       sportsandtoys| 0.49525237381309345|
|       hotelservices| 0.31422018348623854|
|       otherservices|                0.25|
|                home| 0.15206445115810674|
|              health| 0.10512613896981343|
|          technology| 0.06666666666666667|
|   wellnessandbeauty| 0.04759379557205356|
|        hypermarkets|0.045916693998032145|
|  barsandrestaurants|0.018829436686019142|
|             fashion|0.017973349860551595|
|                food|                 0.0|
|            contents|                 0.0|
|      transportation|                 0.0|
+--------------------+--------------------+



Leisure and travel are the categories where fraud happens most frequently: 95.0% and 79.4% of total category transactions, respectively.

In [21]:
# Compare the shares of men and women participating in fraud transactions.
fraud_df.groupby('gender').mean().orderBy('avg(fraud)', ascending=False).select('gender','avg(fraud)').show()

+----------+--------------------+
|    gender|          avg(fraud)|
+----------+--------------------+
|    Female|0.014659621339331104|
|      Male| 0.00907278722730406|
|Enterprise|0.005942275042444821|
|   Unknown|                 0.0|
+----------+--------------------+



On average, 1.5% of women and 0.9% of men participate in fraud transactions...

In [22]:
# Compare the average transaction amounts for men and women in fraud transactions.
fraud_df.filter(fraud_df.fraud == 1).groupby('gender').mean().orderBy('avg(transaction amount)', ascending=False).select('gender','avg(transaction amount)').show()

+----------+-----------------------+
|    gender|avg(transaction amount)|
+----------+-----------------------+
|      Male|      540.3700780287468|
|    Female|      526.1781883144178|
|Enterprise|      473.4585714285714|
+----------+-----------------------+



... but men spend slightly more in them.

In [23]:
# Compare the shares of age categories participating in fraud transactions.
fraud_df.groupby('age').mean().orderBy('avg(fraud)', ascending=False).select('age','avg(fraud)').show()

+-------+--------------------+
|    age|          avg(fraud)|
+-------+--------------------+
|   <=18| 0.01957585644371941|
|  46-55| 0.01293281357486815|
|  26-35| 0.01251401420105707|
|  36-45|0.011928145666107075|
|  19-25| 0.01185253995286508|
|  56-65|0.010951119057501357|
|    >65| 0.00974826324045716|
|Unknown|0.005942275042444821|
+-------+--------------------+



2% of the youngest (18-year-olds and younger) participate in fraud transactions - the highest share among age categories... 

In [24]:
# Compare transaction amounts among the age categories participating in fraud transactions. 
fraud_df.filter(fraud_df.fraud == 1).groupby('age').mean().orderBy('avg(transaction amount)', ascending=False).select('age','avg(transaction amount)').show()

+-------+-----------------------+
|    age|avg(transaction amount)|
+-------+-----------------------+
|   <=18|      657.2781249999999|
|  26-35|      552.2124317406141|
|    >65|       545.402681992337|
|  36-45|      532.4287578347573|
|  46-55|      522.4009148936171|
|  19-25|     499.75769230769237|
|  56-65|      489.4180174927113|
|Unknown|      473.4585714285714|
+-------+-----------------------+



...and spend more in a fraud transaction on average than other age categories.

In [25]:
# Index categorical features' values with StringIndexer, then remove original columns.
indexer = ft.StringIndexer(inputCols=['step', 'customer', 'age', 'gender', 'merchant', 'merchandise category'],
 outputCols=['step_indexed', 'customer_indexed', 'age_indexed', 'gender_indexed', 'merchant_indexed', 'category_indexed'])
fraud_df_indexed = indexer.fit(fraud_df).transform(fraud_df)
fraud_df_indexed = fraud_df_indexed.drop('step', 'customer', 'age', 'gender', 'merchant', 'merchandise category')
fraud_df_indexed.show()

+------------------+-----+------------+-----------+----------------+----------------+----------------+--------------+
|transaction amount|fraud|step_indexed|age_indexed|category_indexed|merchant_indexed|customer_indexed|gender_indexed|
+------------------+-----+------------+-----------+----------------+----------------+----------------+--------------+
|              4.55|    0|       178.0|        2.0|             0.0|             1.0|          1795.0|           1.0|
|             39.68|    0|       178.0|        0.0|             0.0|             1.0|          1620.0|           1.0|
|             26.89|    0|       178.0|        2.0|             0.0|             0.0|          3796.0|           0.0|
|             17.25|    0|       178.0|        1.0|             0.0|             1.0|          1273.0|           1.0|
|             35.72|    0|       178.0|        3.0|             0.0|             1.0|          2814.0|           1.0|
|             25.81|    0|       178.0|        1.0|     

### Features’ association with fraud

I set the null hypothesis that categorical features are not associated with fraud and will use Chi-square test to determine that.

In [26]:
# Launch VectorAssembler for calculating p-values of categorical features (statistical significance of association 
# between them and fraud). First, exclude the transaction amount feature as it is numerical.
fraud_df_categorical = fraud_df_indexed.drop('transaction amount')
targetv = ['fraud']

assembler_chisq = ft.VectorAssembler(
    inputCols=[x for x in fraud_df_categorical.columns if x not in targetv],
    outputCol='features')

# Convert to vector column.
df_vector = assembler_chisq.transform(fraud_df_categorical).select('fraud', 'features')

In [27]:
# Run Chi-square test.  
chi = ChiSquareTest.test(df_vector, 'features', 'fraud').head()
print("p-values:" + str(chi.pValues))

p-values:[0.9999999356451084,2.0021179258922217e-07,0.0,0.0,0.0,0.0]


Only the step feature has p-value very close to 1, meaning that at almost 100% confidence level it is not associated with fraud. For other features, I reject the null hypothesis of no association with fraud.

In [28]:
# Calculate point biserial correlation (equivalent to Pearson correlation) between transaction amount and fraud.
print('The point biserial correlation coefficient between transaction amount and fraud is: {0:.2f}%'.format(fraud_df_indexed.corr('transaction amount', 'fraud')))

The point biserial correlation coefficient between transaction amount and fraud is: 0.49%


...indicating moderate positive correlation. Therefore, I only remove the step variable from the dataframe.

In [29]:
# Remove step feature from the dataframe.
fraud_df_indexed = fraud_df_indexed.drop('step_indexed')
fraud_df_indexed.show()

+------------------+-----+-----------+----------------+----------------+----------------+--------------+
|transaction amount|fraud|age_indexed|category_indexed|merchant_indexed|customer_indexed|gender_indexed|
+------------------+-----+-----------+----------------+----------------+----------------+--------------+
|              4.55|    0|        2.0|             0.0|             1.0|          1795.0|           1.0|
|             39.68|    0|        0.0|             0.0|             1.0|          1620.0|           1.0|
|             26.89|    0|        2.0|             0.0|             0.0|          3796.0|           0.0|
|             17.25|    0|        1.0|             0.0|             1.0|          1273.0|           1.0|
|             35.72|    0|        3.0|             0.0|             1.0|          2814.0|           1.0|
|             25.81|    0|        1.0|             0.0|             1.0|           623.0|           0.0|
|               9.1|    0|        4.0|             0.0|

### Modelling

In [30]:
# Launch VectorAssembler for modelling (this time I include the amount feature).
targetv = ['fraud']

assembler_indexed = ft.VectorAssembler(
    inputCols=[x for x in fraud_df_indexed.columns if x not in targetv],
    outputCol='features')

In [31]:
# Split the dataframe into random train and test sets at 70%/30% with a fixed seed of 50.
fraud_train, fraud_test = fraud_df_indexed.randomSplit([0.7, 0.3], seed=50)

In [32]:
# The issue with the imbalanced dataset needs to be resolved, so try both oversampling (populating fraud samples) 
# and undersampling (reducing non-fraud samples) in our train set and then compare the classifiers' performances. 
# First, calculate the ratio of non-fraud rows to fraud rows in our train set. 
major_fraud_train = fraud_train.filter(col("fraud") == 0)
minor_fraud_train = fraud_train.filter(col("fraud") == 1)
ratio = int(major_fraud_train.count()/minor_fraud_train.count())
print("ratio: {}".format(ratio))

ratio: 81


In [33]:
# Now prepare the oversampled dataframe by populating fraud samples.
a = range(ratio)
oversamp_fraud_train = minor_fraud_train.withColumn('dummy', explode(array([lit(x) for x in a]))).drop('dummy')
# Combine both oversampled fraud rows and unchanged non-fraud rows. 
combined_oversamp_fraud_train = major_fraud_train.unionAll(oversamp_fraud_train)
combined_oversamp_fraud_train.show()

+------------------+-----+-----------+----------------+----------------+----------------+--------------+
|transaction amount|fraud|age_indexed|category_indexed|merchant_indexed|customer_indexed|gender_indexed|
+------------------+-----+-----------+----------------+----------------+----------------+--------------+
|               0.0|    0|        0.0|             0.0|             0.0|           727.0|           0.0|
|               0.0|    0|        0.0|             0.0|             0.0|          1101.0|           0.0|
|               0.0|    0|        0.0|             0.0|             0.0|          2113.0|           1.0|
|               0.0|    0|        0.0|             0.0|             0.0|          2818.0|           0.0|
|               0.0|    0|        0.0|             0.0|             0.0|          2982.0|           1.0|
|               0.0|    0|        0.0|             0.0|             1.0|           321.0|           0.0|
|               0.0|    0|        0.0|             0.0|

In [34]:
# Now prepare the undersampled dataframe by reducing non-fraud samples.
undersamp_fraud_train = major_fraud_train.sample(False, 1/ratio, seed=50)
# Combine both undersampled non-fraud rows and unchanged fraud rows.
combined_undersamp_fraud_train = undersamp_fraud_train.unionAll(minor_fraud_train)
combined_undersamp_fraud_train.show()

+------------------+-----+-----------+----------------+----------------+----------------+--------------+
|transaction amount|fraud|age_indexed|category_indexed|merchant_indexed|customer_indexed|gender_indexed|
+------------------+-----+-----------+----------------+----------------+----------------+--------------+
|               0.0|    0|        1.0|             0.0|             1.0|          1494.0|           0.0|
|              0.05|    0|        0.0|             0.0|             0.0|           342.0|           0.0|
|              0.05|    0|        2.0|             0.0|             0.0|          3026.0|           1.0|
|              0.07|    0|        0.0|             0.0|             1.0|           766.0|           0.0|
|              0.14|    0|        0.0|             0.0|             0.0|          2413.0|           1.0|
|              0.16|    0|        4.0|             1.0|             2.0|           659.0|           1.0|
|              0.19|    0|        4.0|             0.0|

In [35]:
# Create a random forest classifier, with maxBins at 4200 as the number of distinct categories 
# for customer_indexed feature is 4112, and a pipeline. Specify the label column, set seed at 50, 
# leave other parameters at default values. Then fit the classifier on the oversampled and undersampled train dataframes. 
classifierRF = cl.RandomForestClassifier(labelCol='fraud', maxBins=4200, seed=50)
pipelineRF = Pipeline(stages=[assembler_indexed, classifierRF])
model_oversampRF = pipelineRF.fit(combined_oversamp_fraud_train)
model_undersampRF = pipelineRF.fit(combined_undersamp_fraud_train)

In [36]:
# Create a logistic regression classifier and a pipeline. Specify the label column, leave other parameters at default values.
# Then fit the classifier on the oversampled and undersampled train dataframes.
classifierLR = cl.LogisticRegression(labelCol='fraud')
pipelineLR = Pipeline(stages=[assembler_indexed, classifierLR])
model_oversampLR = pipelineLR.fit(combined_oversamp_fraud_train)
model_undersampLR = pipelineLR.fit(combined_undersamp_fraud_train)

In [37]:
# Create classification evaluators: binary for area under PR metric and multiclass for accuracy metric.
binary_evaluator = ev.BinaryClassificationEvaluator(labelCol='fraud')
multi_evaluator = ev.MulticlassClassificationEvaluator(labelCol='fraud')

In [38]:
# Test and evaluate the classifiers fit on the oversampled train dataframes.
test_model_oversampRF = model_oversampRF.transform(fraud_test)
test_model_oversampLR = model_oversampLR.transform(fraud_test)
print('Area under PR, random forest: {0:.3f}'.format(binary_evaluator.evaluate(test_model_oversampRF, {binary_evaluator.metricName: 'areaUnderPR'})))
print('Accuracy, random forest: {0:.3f}'.format(multi_evaluator.evaluate(test_model_oversampRF, {multi_evaluator.metricName: 'accuracy'})))
print('Area under PR, logistic regression: {0:.3f}'.format(binary_evaluator.evaluate(test_model_oversampLR, {binary_evaluator.metricName: 'areaUnderPR'})))
print('Accuracy, logistic regression: {0:.3f}'.format(multi_evaluator.evaluate(test_model_oversampLR, {multi_evaluator.metricName: 'accuracy'})))

Area under PR, random forest: 0.778
Accuracy, random forest: 0.985
Area under PR, logistic regression: 0.687
Accuracy, logistic regression: 0.959


The random forest classifier outperformed the logistic regression classifier on the oversampled train dataframe, in particular in the area under PR metric. The logistic regression classifier gave unreasonable preference to higher labels of ordinally encoded variables, leading to bias and poorer model performance.

In [39]:
# Test and evaluate the classifiers fit on the undersampled train dataframes.
test_model_undersampRF = model_undersampRF.transform(fraud_test)
test_model_undersampLR = model_undersampLR.transform(fraud_test)
print('Area under PR, random forest: {0:.3f}'.format(binary_evaluator.evaluate(test_model_undersampRF, {binary_evaluator.metricName: 'areaUnderPR'})))
print('Accuracy, random forest: {0:.3f}'.format(multi_evaluator.evaluate(test_model_undersampRF, {multi_evaluator.metricName: 'accuracy'})))
print('Area under PR, logistic regression: {0:.3f}'.format(binary_evaluator.evaluate(test_model_undersampLR, {binary_evaluator.metricName: 'areaUnderPR'})))
print('Accuracy, logistic regression: {0:.3f}'.format(multi_evaluator.evaluate(test_model_undersampLR, {multi_evaluator.metricName: 'accuracy'})))

Area under PR, random forest: 0.670
Accuracy, random forest: 0.951
Area under PR, logistic regression: 0.682
Accuracy, logistic regression: 0.957


However, both classifiers registered lower scores in both metrics when fit on the undersampled train dataframe. Undersampling probably resulted in the loss of valuable information about the majority class, with the chosen sample not accurately representing the population.

In [40]:
# As the logistic regression classifier gave undue preference to higher labels of ordinally encoded variables, 
# encode the categorical features with one-hot encoder and evalute models again.

encoder = ft.OneHotEncoder(inputCols=['customer_indexed', 'age_indexed', 'gender_indexed', 'merchant_indexed', 'category_indexed'],
 outputCols=['customer_vec', 'age_vec', 'gender_vec', 'merchant_vec', 'category_vec'])
encoded = encoder.fit(fraud_df_indexed).transform(fraud_df_indexed)
assembler_encoded = ft.VectorAssembler(inputCols = ['customer_vec', 'age_vec', 'gender_vec', 'merchant_vec', 'category_vec'], 
                                        outputCol = 'features')

# Remove indexed features from our dataframe.
fraud_df_encoded = encoded.drop('age_indexed', 'category_indexed', 'merchant_indexed', 'customer_indexed', 'gender_indexed')

# Split the dataframe into random train and test sets at 70%/30% with a fixed seed.
fraud_train_e, fraud_test_e = fraud_df_encoded.randomSplit([0.7, 0.3], seed=50)

# Again, the issue with the imbalanced dataset has to be resolved, so try both oversampling (populating fraud samples) 
# and undersampling (reducing non-fraud samples) in the train set and then compare the classifiers' performances. 
# First, calculate the ratio of non-fraud rows to fraud rows in the train set. 
major_fraud_train_e = fraud_train_e.filter(col("fraud") == 0)
minor_fraud_train_e = fraud_train_e.filter(col("fraud") == 1)
ratio_e = int(major_fraud_train_e.count()/minor_fraud_train_e.count())

# Prepare the oversampled dataframe by populating fraud samples.
a_e = range(ratio_e)
oversamp_fraud_train_e = minor_fraud_train_e.withColumn('dummy', explode(array([lit(x) for x in a_e]))).drop('dummy')

# Combine both oversampled fraud rows and unchanged non-fraud rows. 
combined_oversamp_fraud_train_e = major_fraud_train_e.unionAll(oversamp_fraud_train_e)

# Now prepare the undersampled dataframe by reducing non-fraud samples.
undersamp_fraud_train_e = major_fraud_train_e.sample(False, 1/ratio_e, seed=50)

# Combine both undersampled non-fraud rows and unchanged fraud rows.
combined_undersamp_fraud_train_e = undersamp_fraud_train_e.unionAll(minor_fraud_train_e)

In [41]:
# Create a pipeline, then fit the random forest classifier on the oversampled and undersampled train dataframes.
pipelineRF_e = Pipeline(stages=[assembler_encoded, classifierRF])
model_oversampRF_e = pipelineRF_e.fit(combined_oversamp_fraud_train_e)
model_undersampRF_e = pipelineRF_e.fit(combined_undersamp_fraud_train_e)

In [42]:
# Create a pipeline, then fit the logistic regression classifier on the oversampled and undersampled train dataframes.
pipelineLR_e = Pipeline(stages=[assembler_encoded, classifierLR])
model_oversampLR_e = pipelineLR_e.fit(combined_oversamp_fraud_train_e)
model_undersampLR_e = pipelineLR_e.fit(combined_undersamp_fraud_train_e)

In [43]:
# Test and evaluate the classifiers fit on the oversampled train dataframes.
test_model_oversampRF_e = model_oversampRF_e.transform(fraud_test_e)
test_model_oversampLR_e = model_oversampLR_e.transform(fraud_test_e)
print('Area under PR, random forest: {0:.3f}'.format(binary_evaluator.evaluate(test_model_oversampRF_e, {binary_evaluator.metricName: 'areaUnderPR'})))
print('Accuracy, random forest: {0:.3f}'.format(multi_evaluator.evaluate(test_model_oversampRF_e, {multi_evaluator.metricName: 'accuracy'})))
print('Area under PR, logistic regression: {0:.3f}'.format(binary_evaluator.evaluate(test_model_oversampLR_e, {binary_evaluator.metricName: 'areaUnderPR'})))
print('Accuracy, logistic regression: {0:.3f}'.format(multi_evaluator.evaluate(test_model_oversampLR_e, {multi_evaluator.metricName: 'accuracy'})))

Area under PR, random forest: 0.428
Accuracy, random forest: 0.882
Area under PR, logistic regression: 0.765
Accuracy, logistic regression: 0.986


The logistic regression classifier outperformed the random forest classifier, especially in the area under PR metric, on the oversampled train dataframe after one-hot encoding. While the logistic regression classifier’s performance improved in both metrics, the one of the random forest classifier worsened as one-hot encoding led to a marked increase in dimensionality because of a large number of categories for categorical features.

In [44]:
# Test and evaluate the classifiers fit on the undersampled train dataframes.
test_model_undersampRF_e = model_undersampRF_e.transform(fraud_test_e)
test_model_undersampLR_e = model_undersampLR_e.transform(fraud_test_e)
print('Area under PR, random forest: {0:.3f}'.format(binary_evaluator.evaluate(test_model_undersampRF_e, {binary_evaluator.metricName: 'areaUnderPR'})))
print('Accuracy, random forest: {0:.3f}'.format(multi_evaluator.evaluate(test_model_undersampRF_e, {multi_evaluator.metricName: 'accuracy'})))
print('Area under PR, logistic regression: {0:.3f}'.format(binary_evaluator.evaluate(test_model_undersampLR_e, {binary_evaluator.metricName: 'areaUnderPR'})))
print('Accuracy, logistic regression: {0:.3f}'.format(multi_evaluator.evaluate(test_model_undersampLR_e, {multi_evaluator.metricName: 'accuracy'})))

Area under PR, random forest: 0.397
Accuracy, random forest: 0.936
Area under PR, logistic regression: 0.583
Accuracy, logistic regression: 0.953


After one-hot encoding, the logistic regression classifier outperformed the random forest classifier in both metrics on the undersampled train dataframe too. But both classifiers again recorded lower scores in the area under PR metric than the ones fit on the oversampled train dataframes. The reason probably was the same as in the case with numerical indexing of categorical features: loss of valuable majority class data.

### Grid search

For the logistic regression classifier trained on the oversampled train dataframe after one-hot encoding, I use grid search to choose the best two parameters. I try the maximum number of iterations at 50 and 100 and the regularisation parameter at 0 and 0.1.

In [45]:
# Use grid search for choosing the best parameters for a logistic regression classifier.
gridLR = tune.ParamGridBuilder() \
 .addGrid(classifierLR.maxIter, 
 [50, 100]) \
 .addGrid(classifierLR.regParam, 
 [0, 0.01]) \
 .build()

In [46]:
# Launch the crossvalidator.
cvLR = tune.CrossValidator(estimator=pipelineLR_e, estimatorParamMaps=gridLR, evaluator=binary_evaluator)
# Test and evaluate the logistic regression classifier fit on the oversampled dataframe again.
cvModelLR = cvLR.fit(combined_oversamp_fraud_train_e)
results_cvLR = cvModelLR.transform(fraud_test_e)
print('Best area under PR, logistic regression: {0:.3f}'.format(binary_evaluator.evaluate(results_cvLR, {binary_evaluator.metricName: 'areaUnderPR'})))

Best area under PR, logistic regression: 0.766


One can observe a small improvement in the logistic regression classifier's performance in the area under PR metric: to 0.766 from 0.765.

In [47]:
# Print the best parameters. 
print('Best Param (regParam), logistic regression: ', cvModelLR.bestModel.stages[-1]._java_obj.parent().getRegParam())
print('Best Param (MaxIter), logistic regression: ', cvModelLR.bestModel.stages[-1]._java_obj.parent().getMaxIter())

Best Param (regParam), logistic regression:  0.0
Best Param (MaxIter), logistic regression:  50


The best maximum number of iterations is 50 rather than the default 100.

### Resources used

* Lopez-Rojas, E.A. Synthetic data from a financial payment system [Online]. Available from: https://www.kaggle.com/datasets/ealaxi/banksim1 [11 September 2022].
* Avci, T. Fraud Detection on Bank Payments [Online]. Available from: https://www.kaggle.com/code/turkayavci/fraud-detection-on-bank-payments [11 September 2022].
* Drabas, T., Lee, D. (2017). Learning PySpark. Birmingham - Mumbai: Packt Publishing.
* DeJesus, J. (2019). Point Biserial Correlation with Python [Online]. Available from: https://towardsdatascience.com/point-biserial-correlation-with-python-f7cd591bd3b1 [11 September 2022].
* Wan, J. (2020). Oversampling and Undersampling with PySpark [Online]. Available from: https://medium.com/@junwan01/oversampling-and-undersampling-with-pyspark-5dbc25cdf253 [11 September 2022].
* GeeksforGeeks (2022). ML | One Hot Encoding to treat Categorical data parameters [Online]. Available from: https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/ [11 September 2022].
* SparkByExamples.Com. (2022). PySpark When Otherwise | SQL Case When Usage [Online]. Available from: https://sparkbyexamples.com/pyspark/pyspark-when-otherwise/ [11 September 2022].
* SparkByExamples.Com. (2022). PySpark Replace Column Values in DataFrame [Online]. Available from: https://sparkbyexamples.com/pyspark/pyspark-replace-column-values/ [11 September 2022].
* Stack Overflow. How to find count of Null and Nan values for each column in a PySpark dataframe efficiently? [Online]. Available from: https://stackoverflow.com/questions/44627386/how-to-find-count-of-null-and-nan-values-for-each-column-in-a-pyspark-dataframe [11 September 2022].
* Stack Overflow. How to delete specific characters from a string in a PySpark dataframe? [Online]. Available from: https://stackoverflow.com/questions/66141218/how-to-delete-specific-characters-from-a-string-in-a-pyspark-dataframe [11 September 2022].
* Stack Overflow. pyspark p values and chisquaretest correlations [Online]. Available from: https://stackoverflow.com/questions/59055704/pyspark-p-values-and-chisquaretest-correlations [11 September 2022].
* Stack Overflow. How to use multiple columns in filter and lambda functions pyspark [Online]. Available from: https://stackoverflow.com/questions/60400741/how-to-use-multiple-columns-in-filter-and-lambda-functions-pyspark [11 September 2022].
* Stack Overflow. How do you perform one hot encoding with PySpark [Online]. Available from: https://stackoverflow.com/questions/55922787/how-do-you-perform-one-hot-encoding-with-pyspark [11 September 2022].
* Stack Overflow. How to extract model hyper-parameters from spark.ml in PySpark? [Online]. Available from: https://stackoverflow.com/questions/36697304/how-to-extract-model-hyper-parameters-from-spark-ml-in-pyspark [11 September 2022].