In [14]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import RobustScaler
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import udf
from time import time

In [2]:
class FraudDetection():
    """Spark-backed loader for the fraud-detection dataset.

    Holds the Spark session, the DataFrame read from CSV, and an enlarged
    copy built by unioning that DataFrame with itself.

    Every method takes two optional flags: ``rpt`` prints a short report and
    ``ret`` returns the object the method just produced. With ``ret=False``
    the method returns ``None`` and the result is only stored on the instance.
    """

    def __init__(self):
        self.spark = None     # SparkSession, set by create_spark_context()
        self.data = None      # DataFrame, set by read_file()
        self.rep_data = None  # repeated data, set by data_duplicator()

    def create_spark_context(self, ram, rpt=False, ret=False,
                             master="spark://spark-master:7077"):
        """Create (or reuse) the Spark session and store it in ``self.spark``.

        Args:
            ram: Executor memory; formatted as ``"<ram>g"`` for
                ``spark.executor.memory``.
            rpt: If true, print all settings of the Spark configuration.
            ret: If true, return the session.
            master: Spark master URL. Defaults to the cluster address that
                used to be hard-coded here.

        Returns:
            The SparkSession when ``ret`` is true, otherwise ``None``.
        """
        # NOTE(review): getOrCreate() reuses an already-running session.
        # Executor memory is presumably fixed once the SparkContext has
        # started, so a different `ram` on a later call may have no effect
        # until the session is stopped -- confirm before comparing runs.
        self.spark = (
            SparkSession.builder
            .appName("Fraud Detector")
            .master(master)
            .config("spark.executor.memory", "{}g".format(ram))
            .getOrCreate()
        )
        if rpt:
            print(self.spark.sparkContext.getConf().getAll())
        if ret:
            return self.spark

    def read_file(self, path, rpt=False, ret=False):
        """Read a CSV file (header row, inferred schema) into ``self.data``.

        Args:
            path: Location of the CSV file.
            rpt: If true, print the number of partitions of the result.
            ret: If true, return the DataFrame.

        Returns:
            The DataFrame when ``ret`` is true, otherwise ``None``.

        Raises:
            RuntimeError: If no Spark session has been created yet.
        """
        if self.spark is None:
            raise RuntimeError(
                "No Spark session: call create_spark_context() first")
        self.data = self.spark.read.csv(path, header=True, inferSchema=True)
        if rpt:
            print('number of partitions: {}'.format(self.data.rdd.getNumPartitions()))
        if ret:
            return self.data

    def data_duplicator(self, number, rpt=False, ret=False):
        """Build ``self.rep_data`` as ``number`` stacked copies of ``self.data``.

        Args:
            number: Total number of copies in the result; ``1`` yields the
                original DataFrame unchanged.
            rpt: If true, print the row count and column count of the result
                (the row count calls ``count()`` on the result).
            ret: If true, return the enlarged DataFrame.

        Returns:
            The enlarged DataFrame when ``ret`` is true, otherwise ``None``.

        Raises:
            RuntimeError: If no data has been loaded yet.
            ValueError: If ``number`` is smaller than 1.
        """
        if self.data is None:
            raise RuntimeError("No data loaded: call read_file() first")
        if number < 1:
            # Previously this silently produced a single copy.
            raise ValueError("number must be >= 1, got {}".format(number))

        repeated = self.data
        for _ in range(number - 1):
            repeated = self.data.union(repeated)
        self.rep_data = repeated

        if rpt:
            print("Created df with: {}, {}".format(self.rep_data.count(), len(self.rep_data.columns)))
        if ret:
            return self.rep_data
    

In [3]:
class Preprocess():
    """Column-wise robust scaling for Spark DataFrames."""

    def __init__(self):
        # Not read by any method of this class; kept so existing code that
        # sets or inspects these attributes keeps working.
        self.spark = None
        self.data = None

    def scale_column(self, data, feature):
        """Replace one numeric column with its RobustScaler-scaled values.

        The column is cast to double, wrapped into a one-element vector
        (RobustScaler works on vector columns), scaled with the 0.25/0.75
        quantile bounds and no centering, and unwrapped back to a plain
        double column under the original name.

        Args:
            data: Spark DataFrame containing ``feature``.
            feature: Name of the column to scale.

        Returns:
            A new DataFrame in which ``feature`` holds the scaled doubles.
            The column is dropped and re-added, so it ends up last.
        """
        # Imported here so the method works without touching the notebook's
        # import cell.
        from pyspark.ml.functions import vector_to_array

        vector_col = 'f' + feature  # temporary single-element vector column

        # Cast to double, not integer: an integer cast truncates fractional
        # values (e.g. cents in a monetary amount) before the scaler sees them.
        data = data.withColumn(feature, data[feature].cast(DoubleType()))

        assembler = VectorAssembler().setInputCols([feature]).setOutputCol(vector_col)
        data = assembler.transform(data).drop(feature)

        scaler = RobustScaler(inputCol=vector_col, outputCol=feature,
                              withScaling=True, withCentering=False,
                              lower=0.25, upper=0.75)
        data = scaler.fit(data).transform(data).drop(vector_col)

        # Unwrap the one-element vector with the built-in column function
        # rather than a Python UDF.
        return data.withColumn(feature, vector_to_array(data[feature])[0])

    def scale(self, data, scale_columns):
        """Apply :meth:`scale_column` to each listed column in order.

        Args:
            data: Spark DataFrame to transform.
            scale_columns: Iterable of column names to scale.

        Returns:
            The DataFrame after all listed columns have been scaled; ``data``
            itself when ``scale_columns`` is empty.
        """
        for column in scale_columns:
            data = self.scale_column(data, column)
        return data

In [12]:
def main(ram, duplicate, path="/opt/workspace/creditcard.csv"):
    """Run the load -> duplicate -> scale pipeline and print its wall time.

    Args:
        ram: Executor memory passed to ``create_spark_context``.
        duplicate: Number of stacked copies of the dataset to build.
        path: CSV file to read. Defaults to the location that used to be
            hard-coded here.

    Returns:
        The DataFrame with the ``Time`` and ``Amount`` columns scaled.
        Previously this result was discarded.
    """
    start = time()

    detector = FraudDetection()
    detector.create_spark_context(ram=ram)
    detector.read_file(path, True)
    detector.data_duplicator(duplicate, True)

    preprocessor = Preprocess()
    scaled = preprocessor.scale(detector.rep_data, ['Time', 'Amount'])

    # NOTE(review): Spark transformations are lazy. The elapsed time covers
    # the jobs triggered while loading, counting and fitting the scalers,
    # but presumably not the materialisation of the final scaled columns --
    # run an action on the returned DataFrame if that should be measured.
    print('Elapsed time is: {}'.format(time()-start))
    return scaled

In [15]:
# Run the pipeline with "16g" of executor memory on the dataset stacked 10 times.
main(ram=16, duplicate=10)

number of partitions: 8
Created df with: 2848070, 31
Elapsed time is: 8.54422664642334
