In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import RobustScaler
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import *
from time import time

In [None]:
class FraudDetection():
    """Holds the Spark session and the dataset used by the notebook.

    Typical call order is ``create_spark_context`` -> ``read_file`` ->
    ``data_duplicator``. Every method takes two optional flags:

    * ``rpt`` -- when true, print a short report about what was done.
    * ``ret`` -- when true, return the produced object; otherwise the
      method returns ``None`` and the result is only stored on ``self``.
    """

    def __init__(self):
        self.spark = None     # SparkSession, set by create_spark_context
        self.data = None      # DataFrame loaded by read_file
        self.rep_data = None  # repeated data, set by data_duplicator

    def create_spark_context(self, ram, rpt=False, ret=False,
                             master="spark://spark-master:7077",
                             app_name="Fraud Detector"):
        """Create (or reuse) a SparkSession and store it on ``self.spark``.

        Args:
            ram: Executor memory in gigabytes; written to
                ``spark.executor.memory`` as ``"<ram>g"``.
            rpt: Print the full Spark configuration after creation.
            ret: Return the session.
            master: Spark master URL. Defaults to the cluster address the
                notebook was written against.
            app_name: Application name registered with Spark.

        Returns:
            The SparkSession when ``ret`` is true, otherwise ``None``.
        """
        # NOTE(review): getOrCreate() hands back an already-running session
        # if there is one, so the settings above may not all take effect on
        # a reused session -- confirm against the Spark docs if that matters.
        self.spark = (
            SparkSession.builder
            .appName(app_name)
            .master(master)
            .config("spark.executor.memory", "{}g".format(ram))
            .getOrCreate()
        )
        if rpt:
            print(self.spark.sparkContext.getConf().getAll())
        if ret:
            return self.spark

    def read_file(self, path, rpt=False, ret=False):
        """Read a CSV file (with header, inferred schema) into ``self.data``.

        Args:
            path: Location of the CSV file.
            rpt: Print the number of partitions of the loaded DataFrame.
            ret: Return the loaded DataFrame.

        Returns:
            The DataFrame when ``ret`` is true, otherwise ``None``.

        Raises:
            RuntimeError: If no Spark session has been created yet.
        """
        if self.spark is None:
            raise RuntimeError(
                "No Spark session available: call create_spark_context() "
                "before read_file()."
            )
        self.data = self.spark.read.csv(path, header=True, inferSchema=True)
        if rpt:
            print('number of partitions: {}'.format(self.data.rdd.getNumPartitions()))
        if ret:
            return self.data

    def data_duplicator(self, number, rpt=False, ret=False):
        """Build ``self.rep_data`` as ``number`` copies of ``self.data``.

        The copies are stacked with ``union``. ``number`` values of 1 or
        less leave the data un-duplicated (``rep_data`` is ``data`` itself).

        Args:
            number: Total number of copies wanted in the result.
            rpt: Print the row count and column count of the result.
            ret: Return the resulting DataFrame.

        Returns:
            The repeated DataFrame when ``ret`` is true, otherwise ``None``.

        Raises:
            RuntimeError: If no data has been loaded yet.
        """
        if self.data is None:
            raise RuntimeError(
                "No data loaded: call read_file() before data_duplicator()."
            )
        repeated = self.data
        # One union per *extra* copy; the first copy is the data itself.
        for _ in range(number - 1):
            repeated = self.data.union(repeated)
        self.rep_data = repeated
        if rpt:
            print("Created df with: {}, {}".format(self.rep_data.count(), len(self.rep_data.columns)))
        if ret:
            return self.rep_data
    

In [None]:
class Preprocess():
    """Feature scaling and outlier diagnostics for a Spark DataFrame.

    The DataFrame is kept in ``self.data`` and is replaced by each scaling
    step. The outlier helpers look only at rows where ``Class == 1``
    (presumably the fraud label -- confirm against the dataset).
    """

    def __init__(self, data):
        self.spark = None  # kept for interface compatibility; unused here
        self.data = data

    def scale_column(self, feature):
        """Replace ``feature`` with its RobustScaler-scaled value.

        The column is scaled by its 0.25-0.75 quantile range without
        centering, and the result is stored back under the same name as a
        plain double column. The scaled column ends up as the last column
        of the DataFrame.

        Args:
            feature: Name of the numeric column to scale.

        Returns:
            The updated DataFrame (also stored in ``self.data``).
        """
        # Local import: the file's import cell does not provide this name.
        from pyspark.ml.functions import vector_to_array

        # Namespaced temp columns so they cannot clash with real columns.
        raw_vec = "__{}_raw_vec".format(feature)
        scaled_vec = "__{}_scaled_vec".format(feature)

        # Cast to double, not integer: an integer cast would truncate
        # fractional values (e.g. amounts with cents) before scaling.
        numeric = self.data.withColumn(feature, self.data[feature].cast(DoubleType()))

        # RobustScaler works on vector columns, so wrap the single feature.
        assembler = VectorAssembler(inputCols=[feature], outputCol=raw_vec)
        assembled = assembler.transform(numeric).drop(feature)

        scaler = RobustScaler(inputCol=raw_vec, outputCol=scaled_vec,
                              withScaling=True, withCentering=False,
                              lower=0.25, upper=0.75)
        scaled = scaler.fit(assembled).transform(assembled)

        # Unwrap the one-element vector with a built-in function instead of
        # a Python UDF.
        self.data = (
            scaled
            .withColumn(feature, vector_to_array(scaled[scaled_vec])[0])
            .drop(raw_vec, scaled_vec)
        )
        return self.data

    def robust_scale(self, scale_columns):
        """Apply ``scale_column`` to every column name in ``scale_columns``.

        Returns:
            The updated DataFrame (also stored in ``self.data``).
        """
        for column in scale_columns:
            self.scale_column(column)
        return self.data

    def calculate_iqr_bound(self, column, q1, q3, k):
        """Compute IQR-based bounds for ``column`` over rows with Class == 1.

        Args:
            column: Name of the column to inspect.
            q1: Lower quantile probability (e.g. 0.25).
            q3: Upper quantile probability (e.g. 0.75).
            k: Multiplier applied to the inter-quantile range.

        Returns:
            ``[lower, upper]`` where ``lower = Q1 - k * IQR`` and
            ``upper = Q3 + k * IQR``.

        Raises:
            ValueError: If the quantiles could not be computed (for example
                when there are no rows with ``Class == 1``).
        """
        # Relative error 0 is passed through unchanged from the original
        # call; see the approxQuantile docs for its cost/accuracy trade-off.
        quantiles = self.data.filter(self.data.Class == 1).approxQuantile(column, [q1, q3], 0)
        if len(quantiles) < 2:
            raise ValueError(
                "Cannot compute quantiles for column {!r}: "
                "no usable rows with Class == 1".format(column)
            )
        lower_q, upper_q = quantiles[0], quantiles[1]
        iqr = upper_q - lower_q
        return [lower_q - (iqr * k), upper_q + (iqr * k)]

    # TODO: complete outlier removal
    def outlier_removal(self, columns, q1=0.25, q3=0.75, k=1.5):
        """Print outlier diagnostics for each column; removes nothing yet.

        For every column this prints, in order: the number of rows with
        ``Class == 1``, the min/max of the column over all rows, the IQR
        bounds, and the number of ``Class == 1`` rows outside those bounds.

        Args:
            columns: Column names to inspect.
            q1: Lower quantile probability.
            q3: Upper quantile probability.
            k: IQR multiplier for the bounds.
        """
        # Explicit namespace so min/max are unambiguously Spark aggregates
        # and do not depend on a star import shadowing the builtins.
        from pyspark.sql import functions as F

        # self.data is not modified in this method, so filter once.
        positives = self.data.filter(self.data.Class == 1)
        for column in columns:
            print(positives.count())
            bound = self.calculate_iqr_bound(column, q1, q3, k)
            value = F.col(column)
            print(self.data.agg(F.min(value), F.max(value)).collect())
            print(bound)
            print(positives.filter((value < bound[0]) | (value > bound[1])).count())

In [None]:
def main(ram, duplicate, path="/opt/workspace/creditcard.csv"):
    """Run the pipeline end to end and print the elapsed wall-clock time.

    Steps: create the Spark session, load the CSV, duplicate the data,
    robust-scale the ``Time`` and ``Amount`` columns, then print outlier
    diagnostics for ``V14``, ``V12`` and ``V10``.

    Args:
        ram: Executor memory in gigabytes, passed to the Spark session.
        duplicate: Total number of copies of the dataset to stack.
        path: Location of the input CSV file.
    """
    start = time()

    detector = FraudDetection()
    detector.create_spark_context(ram=ram)
    detector.read_file(path, rpt=True)
    detector.data_duplicator(duplicate, rpt=True)

    preprocessor = Preprocess(detector.rep_data)
    preprocessor.robust_scale(['Time', 'Amount'])
    preprocessor.outlier_removal(['V14', 'V12', 'V10'])

    print('Elapsed time is: {}'.format(time() - start))

In [None]:
# Run the pipeline once: executor memory is set to "8g", and duplicate=1
# means the dataset is used as loaded, with no extra copies unioned on.
main(ram=8, duplicate=1)