In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import RobustScaler
from pyspark.ml.functions import vector_to_array
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import *
# Kept after the wildcard import above so that `time` cannot be shadowed by it.
from time import time

In [2]:
class FraudDetection():
    """Owns the Spark session plus the loaded and the replicated DataFrame."""

    def __init__(self):
        # Everything starts unset; the methods below fill these in.
        self.spark = None
        self.data = None
        self.rep_data = None  # repeated data

    def create_spark_context(self, ram, rpt=False, ret=False):
        """Create (or reuse) the Spark session and keep it on ``self.spark``.

        Args:
            ram: number formatted into ``spark.executor.memory`` as ``"<ram>g"``.
            rpt: when true, print every entry of the Spark configuration.
            ret: when true, return the session; otherwise nothing is returned.
        """
        builder = SparkSession.builder
        builder = builder.appName("Fraud Detector")
        builder = builder.master("spark://spark-master:7077")
        builder = builder.config("spark.executor.memory", "{}g".format(ram))
        self.spark = builder.getOrCreate()

        if rpt:
            print(self.spark.sparkContext.getConf().getAll())
        if ret:
            return self.spark

    def read_file(self, path, rpt=False, ret=False):
        """Load a CSV (header row, inferred schema) into ``self.data``.

        Args:
            path: location handed to ``spark.read.csv``.
            rpt: when true, print the partition count of the loaded data.
            ret: when true, return the loaded DataFrame.
        """
        reader = self.spark.read
        self.data = reader.csv(path, header=True, inferSchema=True)

        if rpt:
            partitions = self.data.rdd.getNumPartitions()
            print('number of partitions: {}'.format(partitions))
        if ret:
            return self.data

    def data_duplicator(self, number, rpt=False, ret=False):
        """Store ``number`` stacked copies of ``self.data`` in ``self.rep_data``.

        The first copy is ``self.data`` itself; ``number - 1`` unions are then
        applied, so any ``number`` of 1 or less leaves a single copy.

        Args:
            number: total number of copies wanted.
            rpt: when true, print the row count and column count of the result.
            ret: when true, return the replicated DataFrame.
        """
        stacked = self.data
        for _ in range(number - 1):
            stacked = self.data.union(stacked)
        self.rep_data = stacked

        if rpt:
            rows = self.rep_data.count()
            width = len(self.rep_data.columns)
            print("Created df with: {}, {}".format(rows, width))
        if ret:
            return self.rep_data
    

In [43]:
class Preprocess():
    """Robust scaling and IQR-based outlier removal on a Spark DataFrame.

    The outlier-removal methods expect a ``Class`` column in which 1 marks the
    fraud rows and 0 the non-fraud rows.
    """

    def __init__(self, data):
        """Keep a reference to the DataFrame that the other methods transform.

        Args:
            data: Spark DataFrame to preprocess.
        """
        self.spark = None  # not assigned or read by any method of this class
        self.sub_sample = None  # built by outlier_removal()
        self.data = data

    def scale_column(self, feature):
        """Replace ``feature`` in ``self.data`` with its robust-scaled value.

        The column is cast to double, wrapped in a one-element vector (the
        scaler works on vector columns), scaled by the 0.25-0.75 quantile range
        without centering, and unwrapped back into a plain double column. The
        scaled column ends up as the last column of ``self.data``.

        Args:
            feature: name of the numeric column to scale.

        Returns:
            The updated ``self.data``.
        """
        vec_col = '__{}_vec'.format(feature)
        scaled_col = '__{}_scaled'.format(feature)

        # Cast to double rather than integer: an integer cast would silently
        # truncate any fractional part of the values before they are scaled.
        as_double = self.data.withColumn(
            feature, self.data[feature].cast(DoubleType()))

        assembler = VectorAssembler(inputCols=[feature], outputCol=vec_col)
        assembled = assembler.transform(as_double).drop(feature)

        scaler = RobustScaler(inputCol=vec_col, outputCol=scaled_col,
                              withScaling=True, withCentering=False,
                              lower=0.25, upper=0.75)
        scaled = scaler.fit(assembled).transform(assembled)

        # vector_to_array is a built-in column function, so unwrapping the
        # single element does not need a per-row Python UDF.
        self.data = (scaled
                     .withColumn(feature, vector_to_array(scaled[scaled_col])[0])
                     .drop(vec_col, scaled_col))
        return self.data

    def robust_scale(self, scale_columns):
        """Apply :meth:`scale_column` to every column name in ``scale_columns``.

        Returns:
            The updated ``self.data``.
        """
        for column_name in scale_columns:
            self.scale_column(column_name)
        return self.data

    def calculate_iqr_bound(self, feature, q1, q3, k, rpt=False):
        """Compute IQR cut-offs for ``feature`` from the fraud rows of the sub-sample.

        Args:
            feature: column whose quantiles are taken.
            q1: probability of the lower quantile (e.g. 0.25).
            q3: probability of the upper quantile (e.g. 0.75).
            k: multiple of the inter-quantile range added beyond each quantile.
            rpt: when true, print the quantiles, the range and the cut-offs.

        Returns:
            ``[lower_cutoff, upper_cutoff]`` as a two-element list.

        Raises:
            ValueError: if ``self.sub_sample`` has not been built yet.
        """
        if self.sub_sample is None:
            raise ValueError(
                'sub_sample is not set; call outlier_removal() first')

        # Filter with the sub-sample's own column instead of a column object
        # borrowed from self.data, which is a different DataFrame.
        fraud_rows = self.sub_sample.filter(self.sub_sample['Class'] == 1)
        # A relative error of 0 is passed through unchanged from the original call.
        q_low, q_high = fraud_rows.approxQuantile(feature, [q1, q3], 0)
        # The printed "Lower bound"/"Upper bound" are the raw q1/q3 quantiles.
        if rpt: print(f'Feature: {feature}, Lower bound: {q_low}, Upper bound: {q_high}')

        iqr = q_high - q_low
        if rpt: print(f'Feature: {feature}, IQR: {iqr}')

        bound = [q_low - (iqr * k), q_high + (iqr * k)]
        if rpt: print(f'Feature: {feature}, Cut-off Lower bound: {bound[0]}, Cut-off Upper bound: {bound[1]}')
        return bound

    def outlier_removal(self, features, q1=0.25, q3=0.75, k=1.5, rpt=False,
                        non_fraud_count=None):
        """Build a fraud/non-fraud sub-sample and drop IQR outliers from it.

        The sub-sample is all fraud rows plus ``non_fraud_count`` non-fraud
        rows. For each feature in turn the cut-offs are derived from the fraud
        rows of the current sub-sample and every row outside them is removed,
        fraud or not. The result is stored in ``self.sub_sample``.

        Args:
            features: column names to process, in order.
            q1: probability of the lower quantile.
            q3: probability of the upper quantile.
            k: IQR multiple used for the cut-offs.
            rpt: when true, print the bounds and before/after row counts.
            non_fraud_count: how many non-fraud rows to take. ``None`` (the
                default) takes as many as there are fraud rows, so the
                sub-sample stays balanced whatever the size of the input.

        Returns:
            The filtered sub-sample (also available as ``self.sub_sample``).
        """
        class_col = self.data['Class']
        frauds = self.data.filter(class_col == 1)
        if non_fraud_count is None:
            # Replaces a hard-coded 492, which only matches inputs that hold
            # exactly 492 fraud rows.
            non_fraud_count = frauds.count()
        non_frauds = self.data.filter(class_col == 0).limit(non_fraud_count)
        self.sub_sample = frauds.union(non_frauds)

        # Row counts are Spark jobs: only run them when reporting, and reuse
        # the previous "after" count as the next "before" count.
        row_count = None
        for feature in features:
            if rpt and row_count is None:
                row_count = self.sub_sample.count()
            lower, upper = self.calculate_iqr_bound(feature, q1, q3, k, rpt=rpt)
            values = self.sub_sample[feature]
            self.sub_sample = self.sub_sample.filter(
                (values >= lower) & (values <= upper))
            if rpt:
                before_removal_count = row_count
                row_count = self.sub_sample.count()
                print(f'before removal count: {before_removal_count}, after removal count: {row_count}')
        return self.sub_sample


In [44]:
def main(ram, duplicate, path="/opt/workspace/creditcard.csv"):
    """Run the load -> duplicate -> scale -> outlier-removal pipeline and time it.

    Args:
        ram: executor memory passed to ``create_spark_context`` (formatted as
            ``"<ram>g"`` there).
        duplicate: total number of stacked copies of the dataset to process.
        path: CSV file to load. Defaults to the location that used to be
            hard-coded, so existing calls behave as before.

    Returns:
        The ``FraudDetection`` instance, which holds the Spark session and the
        loaded and duplicated data. The ``Preprocess`` object is not returned.
    """
    start = time()

    detector = FraudDetection()
    detector.create_spark_context(ram=ram)
    detector.read_file(path, rpt=True)
    detector.data_duplicator(duplicate, rpt=True)

    preprocessor = Preprocess(detector.rep_data)
    preprocessor.robust_scale(['Time', 'Amount'])
    preprocessor.outlier_removal(['V14', 'V12', 'V10'], rpt=True)

    print('Elapsed time is: {}'.format(time()-start))
    return detector

In [45]:
# Run the whole pipeline with ram=8 (executor memory "8g"). duplicate=1 keeps a
# single copy of the dataset: data_duplicator performs number-1 unions, i.e. none.
detector = main(ram=8, duplicate=1)

number of partitions: 5
Created df with: 284807, 31
Feature: V14, Lower bound: -9.79801207658904, Upper bound: -4.28799577702192
Feature: V14, IQR: 5.510016299567121
Feature: V14, Cut-off Lower bound: -18.063036525939722, Cut-off Upper bound: 3.9770286723287613
before removal count: 984, after removal count: 981
Feature: V12, Lower bound: -8.67967880327782, Upper bound: -2.89990738849473
Feature: V12, IQR: 5.77977141478309
Feature: V12, Cut-off Lower bound: -17.349335925452454, Cut-off Upper bound: 5.7697497336799035
before removal count: 981, after removal count: 977
Feature: V10, Lower bound: -7.50211219093686, Upper bound: -2.5166280017922
Feature: V10, IQR: 4.98548418914466
Feature: V10, Cut-off Lower bound: -14.980338474653852, Cut-off Upper bound: 4.961598281924791
before removal count: 977, after removal count: 947
Elapsed time is: 12.2393798828125
