In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import RobustScaler
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import *
from time import time

from pyspark.ml import Pipeline
from pyspark.sql.functions import rand
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
class FraudDetection():
    """Owns the Spark session and the DataFrames used by the notebook.

    ``spark``, ``data`` and ``rep_data`` all start as ``None`` and are filled
    in by ``create_spark_context``, ``read_file`` and ``data_duplicator``
    respectively.  Every method accepts two flags: ``rpt`` prints a short
    report, and ``ret`` makes the method return the object it just stored
    (without it the method returns ``None``).
    """

    def __init__(self):
        self.spark = None
        self.data = None
        self.rep_data = None  # ``data`` unioned with itself (see data_duplicator)

    def create_spark_context(self, ram, rpt=False, ret=False):
        """Create (or reuse) the Spark session and store it on ``self.spark``.

        Args:
            ram: Executor memory; formatted as ``"<ram>g"`` for
                ``spark.executor.memory``.
            rpt: When true, print every key/value of the Spark configuration.
            ret: When true, return the session.
        """
        session_builder = (
            SparkSession.builder
            .appName("Fraud Detector")
            .master("spark://spark-master:7077")
            .config("spark.executor.memory", "{}g".format(ram))
        )
        self.spark = session_builder.getOrCreate()
        if rpt:
            print(self.spark.sparkContext.getConf().getAll())
        if ret:
            return self.spark

    def read_file(self, path, rpt=False, ret=False):
        """Load the CSV at ``path`` (header row, inferred schema) into ``self.data``.

        Args:
            path: Location handed to ``spark.read.csv``.
            rpt: When true, print the partition count of the loaded frame.
            ret: When true, return the loaded frame.
        """
        frame = self.spark.read.csv(path, header=True, inferSchema=True)
        self.data = frame
        if rpt:
            print('number of partitions: {}'.format(frame.rdd.getNumPartitions()))
        if ret:
            return frame

    def data_duplicator(self, number, rpt=False, ret=False):
        """Store in ``self.rep_data`` the union of ``number`` copies of ``self.data``.

        A ``number`` of 1 or less leaves a single copy, because the loop
        below performs ``number - 1`` unions.

        Args:
            number: How many copies of ``self.data`` to stack.
            rpt: When true, print the row count and column count of the result.
            ret: When true, return the stacked frame.
        """
        self.rep_data = self.data
        for _ in range(number - 1):
            self.rep_data = self.data.union(self.rep_data)
        if rpt:
            rows = self.rep_data.count()
            width = len(self.rep_data.columns)
            print("Created df with: {}, {}".format(rows, width))
        if ret:
            return self.rep_data

        
class Preprocess():
    """Feature-preparation helpers around a Spark DataFrame.

    ``self.data`` is the working frame; ``scale_column``/``robust_scale`` and
    ``assemble_features`` replace it with the transformed frame.
    ``outlier_removal`` does not touch ``self.data``: it builds a separate
    frame, ``self.sub_sample``, holding the ``Class == 1`` rows plus a limited
    number of ``Class == 0`` rows, and trims that.
    """

    def __init__(self, data):
        self.spark = None       # never read in this class; kept so the attribute still exists
        self.sub_sample = None  # populated by outlier_removal()
        self.data = data

    def scale_column(self, feature):
        """Replace column ``feature`` with its RobustScaler-scaled value.

        The column is scaled by its 0.25-0.75 quantile range without
        centering, and ends up as a plain double column under the same name.

        Args:
            feature: Name of the column to scale.

        Returns:
            The updated ``self.data``.
        """
        vector_col = 'f' + feature
        # Cast to double, not integer: an integer cast truncates fractional
        # values (e.g. 4.55 -> 4) before the scaler ever sees them.
        self.data = self.data.withColumn(feature, self.data[feature].cast(DoubleType()))
        # RobustScaler works on vector columns, so wrap the scalar first.
        assembler = VectorAssembler().setInputCols([feature]).setOutputCol(vector_col)
        self.data = assembler.transform(self.data).drop(feature)
        scaler = RobustScaler(inputCol=vector_col, outputCol=feature,
                              withScaling=True, withCentering=False,
                              lower=0.25, upper=0.75)
        scaler_model = scaler.fit(self.data)
        self.data = scaler_model.transform(self.data).drop(vector_col)
        # The scaler output is a one-element vector; unwrap it to a double.
        unlist = udf(lambda x: float(list(x)[0]), DoubleType())
        self.data = self.data.withColumn(feature, unlist(feature))
        return self.data

    def robust_scale(self, scale_columns):
        """Apply ``scale_column`` to every name in ``scale_columns``.

        Returns:
            The updated ``self.data`` (unchanged when the list is empty).
        """
        for column in scale_columns:
            self.data = self.scale_column(column)
        return self.data

    def calculate_iqr_bound(self, feature, q1, q3, k, rpt=False):
        """Compute outlier cut-offs for ``feature`` from the fraud rows of the sub-sample.

        The ``q1`` and ``q3`` quantiles are taken over the ``Class == 1`` rows
        of ``self.sub_sample`` (exact quantiles, relative error 0) and widened
        by ``k`` times their distance on each side.

        Args:
            feature: Column to analyse.
            q1: Lower quantile probability.
            q3: Upper quantile probability.
            k: Multiplier applied to the inter-quantile range.
            rpt: When true, print the quantiles, the range and the cut-offs.

        Returns:
            ``[lower_cut_off, upper_cut_off]``.

        Raises:
            ValueError: If ``self.sub_sample`` has not been built yet, or if
                it contains no fraud rows to take quantiles from.
        """
        if self.sub_sample is None:
            raise ValueError('sub_sample is not set; call outlier_removal() first')
        # Reference the column on the frame being filtered rather than on
        # self.data, so the condition does not depend on cross-frame lineage.
        frauds = self.sub_sample.filter(self.sub_sample['Class'] == 1)
        quantiles = frauds.approxQuantile(feature, [q1, q3], 0)
        if not quantiles:
            # approxQuantile yields an empty list when there are no rows.
            raise ValueError(
                'no Class==1 rows left in sub_sample to compute quantiles for {}'.format(feature))
        lower, upper = quantiles
        if rpt: print(f'Feature: {feature}, Lower bound: {lower}, Upper bound: {upper}')
        iqr = upper - lower
        if rpt: print(f'Feature: {feature}, IQR: {iqr}')
        bound = [lower - (iqr * k), upper + (iqr * k)]
        if rpt: print(f'Feature: {feature}, Cut-off Lower bound: {bound[0]}, Cut-off Upper bound: {bound[1]}')
        return bound

    def outlier_removal(self, features, q1=0.25, q3=0.75, k=1.5, rpt=False, n_normal=492):
        """Build a sub-sample and drop rows outside the fraud-based IQR cut-offs.

        ``self.sub_sample`` becomes all ``Class == 1`` rows of ``self.data``
        unioned with at most ``n_normal`` ``Class == 0`` rows.  For each
        feature in turn, rows of either class whose value lies outside the
        cut-offs from ``calculate_iqr_bound`` are removed.

        Args:
            features: Column names to filter on, processed in order.
            q1: Lower quantile probability.
            q3: Upper quantile probability.
            k: IQR multiplier for the cut-offs.
            rpt: When true, print the bounds and the row counts before and
                after each feature.
            n_normal: Maximum number of ``Class == 0`` rows to include
                (default 492, the value previously hard-coded here).

        Returns:
            The filtered ``self.sub_sample``.
        """
        frauds = self.data.filter(self.data.Class == 1)
        # NOTE(review): limit() without an ordering does not guarantee which
        # rows are kept — confirm whether a deterministic sample is needed.
        normals = self.data.filter(self.data.Class == 0).limit(n_normal)
        self.sub_sample = frauds.union(normals)
        for feature in features:
            # count() launches a Spark job, so only pay for it when reporting.
            if rpt:
                before_removal_count = self.sub_sample.count()
            bound = self.calculate_iqr_bound(feature, q1, q3, k, rpt=rpt)
            self.sub_sample = self.sub_sample.filter(
                (col(feature) >= bound[0]) & (col(feature) <= bound[1]))
            if rpt:
                after_removal_count = self.sub_sample.count()
                print(f'before removal count: {before_removal_count}, after removal count: {after_removal_count}')
        return self.sub_sample

    def assemble_features(self, input_cols=None, output_col='features'):
        """Assemble input columns into one vector column on ``self.data``.

        Args:
            input_cols: Columns to assemble; defaults to ``V1`` .. ``V28``.
            output_col: Name of the resulting vector column.

        Returns:
            The updated ``self.data``.
        """
        if input_cols is None:
            input_cols = ['V{}'.format(i) for i in range(1, 29)]
        assembler = VectorAssembler(inputCols=list(input_cols), outputCol=output_col)
        self.data = assembler.transform(self.data)
        return self.data

    
class Evaluator():
    """Prints classification metrics for a DataFrame of predictions.

    Args:
        label_col: Column holding the true label (default ``"class"``).
        prediction_col: Column holding the model output (default
            ``"prediction"``).
    """

    def __init__(self, label_col="class", prediction_col="prediction"):
        self.label_col = label_col
        self.prediction_col = prediction_col

    def _evaluate(self, data, metric_name, **extra):
        """Run a MulticlassClassificationEvaluator for ``metric_name`` and return its value."""
        # labelCol must be the ground truth and predictionCol the model
        # output.  With the two swapped, per-label recall is really precision.
        evaluator = MulticlassClassificationEvaluator(
            labelCol=self.label_col,
            predictionCol=self.prediction_col,
            metricName=metric_name,
            **extra)
        return evaluator.evaluate(data)

    def accuracy(self, data):
        """Print the accuracy of the predictions in ``data``."""
        print('accuracy: {}'.format(self._evaluate(data, "accuracy")))

    def recall(self, data, metric_label=0.0):
        """Print the recall for one label of the predictions in ``data``.

        Args:
            data: DataFrame containing the label and prediction columns.
            metric_label: Label whose recall is reported.  The default 0.0
                matches the evaluator's own default; pass 1.0 for the
                recall of label 1.
        """
        value = self._evaluate(data, "recallByLabel", metricLabel=metric_label)
        print('recall: {}'.format(value))

In [19]:
# Run configuration for this session.
ram = 16                      # executor memory; create_spark_context formats it as "<ram>g"
duplicate = 3                 # number of copies of the data stacked by data_duplicator
splitation = [0.7, 0.1, 0.2]  # not used in the cells shown here; presumably split fractions — confirm

# Start Spark, load the CSV and build the duplicated frame, printing a
# short report for the last two steps.
detector = FraudDetection()
detector.create_spark_context(ram=ram)
detector.read_file("/opt/workspace/bank_sim.csv", rpt=True)
detector.data_duplicator(duplicate, rpt=True)

number of partitions: 8
Created df with: 1783929, 10


In [20]:
# Preview five rows of the frame loaded by read_file (not the duplicated one).
detector.data.limit(5).show()

+----+-------------+---+------+----------+-------------+-----------+-------------------+------+-----+
|step|     customer|age|gender|zipcodeOri|     merchant|zipMerchant|           category|amount|fraud|
+----+-------------+---+------+----------+-------------+-----------+-------------------+------+-----+
|   0|'C1093826151'|'4'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'|  4.55|    0|
|   0| 'C352968107'|'2'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 39.68|    0|
|   0|'C2054744914'|'4'|   'F'|   '28007'|'M1823072687'|    '28007'|'es_transportation'| 26.89|    0|
|   0|'C1760612790'|'3'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 17.25|    0|
|   0| 'C757503768'|'5'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 35.72|    0|
+----+-------------+---+------+----------+-------------+-----------+-------------------+------+-----+



In [21]:
# Short alias for the loaded frame; this is detector.data, not detector.rep_data.
df = detector.data

In [22]:
# Total amount per customer, first ten groups.  take(10) fetches only ten
# rows instead of collecting every customer to the driver and slicing.
df.groupBy(df.customer).sum('amount').take(10)

[Row(customer="'C1316262830'", sum(amount)=6024.37),
 Row(customer="'C1089903335'", sum(amount)=5546.180000000001),
 Row(customer="'C483009385'", sum(amount)=5346.830000000001),
 Row(customer="'C1983734850'", sum(amount)=6931.089999999999),
 Row(customer="'C1815981756'", sum(amount)=4952.500000000001),
 Row(customer="'C1028143403'", sum(amount)=5960.92),
 Row(customer="'C328208842'", sum(amount)=6255.51),
 Row(customer="'C1475317372'", sum(amount)=8966.54),
 Row(customer="'C1873446390'", sum(amount)=3674.49),
 Row(customer="'C135716487'", sum(amount)=4507.99)]

In [23]:
# Mean amount per customer, first ten groups.  take(10) fetches only ten
# rows instead of collecting every customer to the driver and slicing.
df.groupBy(df.customer).mean('amount').take(10)

[Row(customer="'C1316262830'", avg(amount)=34.424971428571425),
 Row(customer="'C1089903335'", avg(amount)=31.51238636363637),
 Row(customer="'C483009385'", avg(amount)=29.704611111111117),
 Row(customer="'C1983734850'", avg(amount)=69.31089999999999),
 Row(customer="'C1815981756'", avg(amount)=30.19817073170732),
 Row(customer="'C1028143403'", avg(amount)=32.752307692307696),
 Row(customer="'C328208842'", avg(amount)=35.143314606741576),
 Row(customer="'C1475317372'", avg(amount)=81.51400000000001),
 Row(customer="'C1873446390'", avg(amount)=40.827666666666666),
 Row(customer="'C135716487'", avg(amount)=27.827098765432098)]

In [14]:
# Number of rows per origin zip code, collected to the driver as a list of Rows.
df.groupBy(df.zipcodeOri).count().collect()

[Row(zipcodeOri="'28007'", count=594643)]

In [15]:
# Number of rows per merchant, collected to the driver as a list of Rows.
df.groupBy(df.merchant).count().collect()

[Row(merchant="'M857378720'", count=122),
 Row(merchant="'M97925176'", count=599),
 Row(merchant="'M1294758098'", count=191),
 Row(merchant="'M1788569036'", count=181),
 Row(merchant="'M348934600'", count=205426),
 Row(merchant="'M1823072687'", count=299693),
 Row(merchant="'M1416436880'", count=220),
 Row(merchant="'M1535107174'", count=1868),
 Row(merchant="'M50039827'", count=916),
 Row(merchant="'M117188757'", count=21),
 Row(merchant="'M1600850729'", count=2624),
 Row(merchant="'M1748431652'", count=274),
 Row(merchant="'M1726401631'", count=3),
 Row(merchant="'M1888755466'", count=912),
 Row(merchant="'M349281107'", count=2881),
 Row(merchant="'M933210764'", count=69),
 Row(merchant="'M480139044'", count=3508),
 Row(merchant="'M980657600'", count=1769),
 Row(merchant="'M495352832'", count=69),
 Row(merchant="'M209847108'", count=3814),
 Row(merchant="'M2080407379'", count=48),
 Row(merchant="'M1873032707'", count=250),
 Row(merchant="'M348875670'", count=107),
 Row(merchant="'M85