In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import RobustScaler
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import *
from time import time

from pyspark.ml import Pipeline
from pyspark.sql.functions import rand
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Keras / Deep Learning
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Dense, Dropout, Activation, LSTM, GRU
from tensorflow.python.keras import optimizers, regularizers
from tensorflow.python.keras.optimizers import Adam

# Elephas for Deep Learning on Spark
from elephas.ml_model import ElephasEstimator



In [2]:
class FraudDetection():
    """Owns the Spark session and the raw DataFrame used by the notebook.

    Attributes:
        spark: session created by ``create_spark_context`` (None until then).
        data: DataFrame loaded by ``read_file`` (None until then).
        rep_data: ``data`` unioned with itself by ``data_duplicator``
            (None until then).
    """
    def __init__(self):
        self.spark = None
        self.data = None
        self.rep_data = None # repeated data

    def create_spark_context(self, ram, rpt=False, ret=False,
                             master="spark://spark-master:7077",
                             app_name="Fraud Detector"):
        """Create (or reuse) the Spark session and store it on ``self.spark``.

        Args:
            ram: executor memory in gigabytes; written to
                ``spark.executor.memory`` as ``"<ram>g"``.
            rpt: when true, print the full Spark configuration.
            ret: when true, return the session; otherwise return None.
            master: Spark master URL. Defaults to the cluster address the
                notebook was written against; pass e.g. ``"local[*]"`` to run
                without that cluster.
            app_name: application name registered with Spark.
        """
        builder = (
            SparkSession.builder
            .appName(app_name)
            .master(master)
            .config("spark.executor.memory", "{}g".format(ram))
        )
        self.spark = builder.getOrCreate()
        if rpt:
            print(self.spark.sparkContext.getConf().getAll())
        if ret:
            return self.spark

    def read_file(self, path, rpt=False, ret=False):
        """Load a CSV (header row, inferred schema) into ``self.data``.

        Args:
            path: location of the CSV file.
            rpt: when true, print the number of partitions of the result.
            ret: when true, return the DataFrame; otherwise return None.

        Raises:
            RuntimeError: if no Spark session has been created yet.
        """
        if self.spark is None:
            raise RuntimeError("call create_spark_context() before read_file()")
        self.data = self.spark.read.csv(path, header=True, inferSchema=True)
        if rpt:
            print('number of partitions: {}'.format(self.data.rdd.getNumPartitions()))
        if ret:
            return self.data

    def data_duplicator(self, number, rpt=False, ret=False):
        """Stack ``number`` copies of ``self.data`` into ``self.rep_data``.

        Args:
            number: total number of copies wanted. Values of 1 or less leave
                ``rep_data`` equal to ``data`` (no union is performed).
            rpt: when true, print the row and column counts of the result.
            ret: when true, return the DataFrame; otherwise return None.

        Raises:
            RuntimeError: if no data has been loaded yet.
        """
        if self.data is None:
            raise RuntimeError("call read_file() before data_duplicator()")
        stacked = self.data
        for _ in range(number - 1):
            stacked = self.data.union(stacked)
        self.rep_data = stacked
        if rpt:
            print("Created df with: {}, {}".format(self.rep_data.count(), len(self.rep_data.columns)))
        if ret:
            return self.rep_data

        
class Preprocess():
    """Feature preparation steps applied to the fraud DataFrame.

    Attributes:
        data: the working DataFrame; replaced by ``scale_column`` /
            ``robust_scale`` / ``assemble_features``.
        sub_sample: DataFrame built by ``outlier_removal`` (None until then).
            ``outlier_removal`` does not modify ``data``.
        spark: unused placeholder, kept for compatibility.
    """
    def __init__(self, data):
        self.spark = None
        self.sub_sample = None
        self.data = data

    def scale_column(self, feature):
        """Replace column ``feature`` in ``self.data`` with its robust-scaled value.

        The column is wrapped in a one-element vector, scaled by the
        0.25-0.75 quantile range without centering, and unwrapped back to a
        double column of the same name.

        Returns:
            The updated ``self.data``.
        """
        vector_col = 'f' + feature
        # Cast to double rather than integer: an integer cast would drop any
        # fractional part of the value before it is scaled.
        frame = self.data.withColumn(feature, self.data[feature].cast(DoubleType()))
        assembler = VectorAssembler(inputCols=[feature], outputCol=vector_col)
        frame = assembler.transform(frame).drop(feature)
        scaler = RobustScaler(inputCol=vector_col, outputCol=feature,
                              withScaling=True, withCentering=False,
                              lower=0.25, upper=0.75)
        frame = scaler.fit(frame).transform(frame).drop(vector_col)
        # The scaler emits a one-element vector; unwrap it to a plain double.
        unlist = udf(lambda vec: float(vec[0]), DoubleType())
        self.data = frame.withColumn(feature, unlist(feature))
        return self.data

    def robust_scale(self, scale_columns):
        """Apply ``scale_column`` to every name in ``scale_columns``.

        Returns:
            The updated ``self.data``.
        """
        for column in scale_columns:
            self.scale_column(column)
        return self.data

    def calculate_iqr_bound(self, feature, q1, q3, k, rpt=False):
        """Compute outlier cut-offs for ``feature`` from the Class==1 rows of ``sub_sample``.

        Args:
            feature: column name.
            q1, q3: lower and upper quantile probabilities.
            k: multiple of the inter-quantile range added beyond each quantile.
            rpt: when true, print the intermediate values.

        Returns:
            ``[lower_cutoff, upper_cutoff]``.

        Raises:
            ValueError: if no quantiles could be computed (no Class==1 rows).
        """
        sample = self.sub_sample
        # Build the predicate from the frame being filtered, not from self.data.
        fraud_rows = sample.filter(sample["Class"] == 1)
        # relativeError=0 requests exact quantiles.
        quantiles = fraud_rows.approxQuantile(feature, [q1, q3], 0)
        if len(quantiles) < 2:
            raise ValueError(
                "cannot compute quantiles for '{}': no Class==1 rows in sub-sample".format(feature))
        lower_q, upper_q = quantiles[0], quantiles[1]
        if rpt: print(f'Feature: {feature}, Lower bound: {lower_q}, Upper bound: {upper_q}')
        iqr = upper_q - lower_q
        if rpt: print(f'Feature: {feature}, IQR: {iqr}')
        bound = [lower_q - (iqr * k), upper_q + (iqr * k)]
        if rpt: print(f'Feature: {feature}, Cut-off Lower bound: {bound[0]}, Cut-off Upper bound: {bound[1]}')
        return bound

    def outlier_removal(self, features, q1=0.25, q3=0.75, k=1.5, rpt=False,
                        non_fraud_limit=None):
        """Build a fraud / non-fraud sub-sample and drop IQR outliers from it.

        The sub-sample is every Class==1 row plus up to ``non_fraud_limit``
        Class==0 rows. For each feature in turn, rows whose value lies outside
        the cut-offs from ``calculate_iqr_bound`` are removed. The result is
        stored in ``self.sub_sample``; ``self.data`` is left untouched.

        Args:
            features: column names to filter on, applied sequentially.
            q1, q3, k: forwarded to ``calculate_iqr_bound``.
            rpt: when true, print bounds and before/after row counts.
            non_fraud_limit: number of Class==0 rows to include. Defaults to
                the number of Class==1 rows so the sub-sample is balanced.

        Returns:
            The filtered sub-sample DataFrame.
        """
        data = self.data
        frauds = data.filter(data["Class"] == 1)
        if non_fraud_limit is None:
            non_fraud_limit = frauds.count()
        non_frauds = data.filter(data["Class"] == 0).limit(non_fraud_limit)
        self.sub_sample = frauds.union(non_frauds)
        for feature in features:
            # Each count() is a Spark job; only pay for it when reporting.
            before_removal_count = self.sub_sample.count() if rpt else None
            lower, upper = self.calculate_iqr_bound(feature, q1, q3, k, rpt=rpt)
            # Column.between is inclusive on both ends (same as >= and <=).
            self.sub_sample = self.sub_sample.filter(self.sub_sample[feature].between(lower, upper))
            if rpt:
                after_removal_count = self.sub_sample.count()
                print(f'before removal count: {before_removal_count}, after removal count: {after_removal_count}')
        return self.sub_sample

    def assemble_features(self, input_cols=None, output_col='features'):
        """Assemble feature columns of ``self.data`` into one vector column.

        Args:
            input_cols: columns to assemble. Defaults to ``V1`` .. ``V28``;
                columns processed by ``scale_column`` are not part of that
                default and must be passed explicitly to be included.
            output_col: name of the vector column to create.

        Returns:
            The updated ``self.data``.
        """
        if input_cols is None:
            input_cols = ['V{}'.format(i) for i in range(1, 29)]
        assembler = VectorAssembler(inputCols=list(input_cols), outputCol=output_col)
        self.data = assembler.transform(self.data)
        return self.data

    
class Evaluator():
    """Prints (and returns) Spark ML evaluation metrics for a predictions DataFrame.

    Args:
        label: name of the ground-truth column.
        prediction: name of the predicted-class column.
    """
    def __init__(self, label="class", prediction="prediction"):
        self.label = label
        self.prediction = prediction

    def _multiclass_metric(self, data, metric_name, **params):
        """Evaluate one MulticlassClassificationEvaluator metric on ``data``."""
        evaluator = MulticlassClassificationEvaluator(
            labelCol=self.label, predictionCol=self.prediction,
            metricName=metric_name, **params)
        return evaluator.evaluate(data)

    def accuracy(self, data):
        """Print and return the accuracy of ``data``."""
        value = self._multiclass_metric(data, "accuracy")
        print('accuracy: {}'.format(value))
        return value

    def recall(self, data, metric_label=1.0):
        """Print and return the recall for one class.

        Args:
            data: predictions DataFrame.
            metric_label: the class whose recall is reported. Defaults to 1.0
                (the fraud class in this notebook); the Spark evaluator's own
                default is 0.0, which would report recall of the other class.
        """
        value = self._multiclass_metric(data, "recallByLabel", metricLabel=metric_label)
        print('recall: {}'.format(value))
        return value

    def auc_roc(self, data):
        """Print and return the area under the ROC curve.

        NOTE(review): the predicted-class column is passed as the raw score,
        so the curve is built from hard predictions rather than probabilities
        - confirm this is intended.
        """
        roc_evaluator = BinaryClassificationEvaluator(
            rawPredictionCol=self.prediction, labelCol=self.label,
            metricName="areaUnderROC")
        auc_roc = roc_evaluator.evaluate(data)
        print(f'auc_roc: {auc_roc}')
        return auc_roc

In [22]:
class Keras_Predictor():
    """Keras network trained through an Elephas estimator inside a Spark ML Pipeline.

    All constructor arguments are optional; the defaults reproduce the
    configuration originally hard-coded in this class.

    Args:
        input_dim: length of the input feature vector.
        hidden_units: width of each of the two hidden Dense layers.
        dropout_rate: dropout rate applied after each hidden layer.
        l2_penalty: L2 activity-regularisation factor on the hidden layers.
        learning_rate: Adam learning rate sent to the estimator.
        epochs: number of training epochs.
        batch_size: training batch size.
        num_workers: number of Elephas workers.
        validation_split: fraction of training data held out for validation.

    Attributes:
        model: fitted pipeline model, or None until ``train`` has run.
        dl_pipeline: the unfitted one-stage Pipeline.
    """
    def __init__(self, input_dim=28, hidden_units=256, dropout_rate=0.3,
                 l2_penalty=0.01, learning_rate=0.01, epochs=25, batch_size=64,
                 num_workers=1, validation_split=0.10):
        self.model = None

        # NOTE(review): the output is two sigmoid units trained with
        # binary_crossentropy while the estimator is told the labels are
        # categorical with 2 classes; softmax + categorical_crossentropy is
        # the more usual pairing. Left unchanged - confirm intent.
        network = Sequential()
        network.add(Dense(hidden_units, input_shape=(input_dim,),
                          activity_regularizer=regularizers.l2(l2_penalty)))
        network.add(Activation('relu'))
        network.add(Dropout(rate=dropout_rate))
        network.add(Dense(hidden_units, activity_regularizer=regularizers.l2(l2_penalty)))
        network.add(Activation('relu'))
        network.add(Dropout(rate=dropout_rate))
        network.add(Dense(2))
        network.add(Activation('sigmoid'))
        network.compile(loss='binary_crossentropy', optimizer='adam')

        # Set and Serialize Optimizer
        optimizer_conf = optimizers.Adam(lr=learning_rate)
        opt_conf = optimizers.serialize(optimizer_conf)

        # Initialize SparkML Estimator and Get Settings
        estimator = ElephasEstimator()
        estimator.setFeaturesCol("features")
        estimator.setLabelCol("Class")
        # Only the architecture (YAML) is handed over; loss, optimizer and
        # metrics are configured on the estimator below.
        estimator.set_keras_model_config(network.to_yaml())
        estimator.set_categorical_labels(True)
        estimator.set_nb_classes(2)
        estimator.set_num_workers(num_workers)
        estimator.set_epochs(epochs)
        estimator.set_batch_size(batch_size)
        estimator.set_verbosity(1)
        estimator.set_validation_split(validation_split)
        estimator.set_optimizer_config(opt_conf)
        estimator.set_mode("synchronous")
        estimator.set_loss("binary_crossentropy")
        estimator.set_metrics(['acc'])

        # Create Deep Learning Pipeline
        self.dl_pipeline = Pipeline(stages=[estimator])

    def train(self, data):
        """Fit the pipeline on ``data``, store the result in ``self.model`` and print the elapsed time."""
        start = time()
        self.model = self.dl_pipeline.fit(data)
        print('Elapsed time is: {}'.format(time()-start))

    def pred(self, data):
        """Return the ``Class`` and ``prediction`` columns of the fitted model applied to ``data``.

        Raises:
            RuntimeError: if ``train`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("call train() before pred()")
        return self.model.transform(data).select('Class', "prediction")
    

In [23]:
# Run configuration.
ram = 16                      # executor memory in GB (becomes spark.executor.memory)
duplicate = 1                 # total copies of the dataset to stack; 1 = no duplication
splitation = [0.7, 0.1, 0.2]  # train / validation / test weights
split_seed = 42               # fixed seed so the random split is the same on every run

# Start Spark and load the data (the True flags print partition / shape info).
detector = FraudDetection()
detector.create_spark_context(ram=ram)
detector.read_file("/opt/workspace/creditcard.csv", True)
detector.data_duplicator(duplicate, True)

# Preprocess and split, timing the whole stage.
start = time()
preprocessor = Preprocess(detector.rep_data)
preprocessor.robust_scale(['Time', 'Amount'])
# This only builds preprocessor.sub_sample; preprocessor.data, which is what
# gets assembled and split below, is not filtered by it.
preprocessor.outlier_removal(['V14', 'V12', 'V10'], rpt=False)
detector.data = preprocessor.assemble_features()
train, validation, test = detector.data.randomSplit(splitation, seed=split_seed)
print('Elapsed time is: {}'.format(time()-start))

number of partitions: 8
Created df with: 284807, 31
Elapsed time is: 4.193568229675293


In [24]:
# Train the Keras/Elephas model on the training split and score the test split.
# NOTE(review): the saved output of this cell is
# "TypeError: 'int' object is not iterable"; no traceback was kept, so which
# of the calls below raised it is unknown - re-run to confirm.
k = Keras_Predictor()
k.train(train)
predictions = k.pred(test)

# Report accuracy and ROC AUC, using "Class" as ground truth.
e = Evaluator(label="Class", prediction="prediction")
e.accuracy(predictions)
e.auc_roc(predictions)

TypeError: 'int' object is not iterable

In [None]:
# With 1 Worker
# Baseline: random forest trained and evaluated on the same train/test split.
from pyspark.ml.classification import RandomForestClassifier

start = time()
# Fixed seed so repeated runs build the same forest.
rf = RandomForestClassifier(labelCol="Class", featuresCol="features", numTrees=10, seed=42)
pipeline = Pipeline(stages=[rf])

# Train model (the pipeline has a single stage: the random forest).
model = pipeline.fit(train)

# Make predictions.
predictions = model.transform(test)

# Select example rows to display.
predictions.select("prediction", "Class", "features").show(5)
predictions = predictions.withColumn("Class", predictions["Class"].cast(DoubleType()))

# "Class" is the ground truth; the classifier's rawPrediction column carries
# the per-class scores that the ROC curve should be built from.
roc_evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="Class", metricName="areaUnderROC")
print('ROC: {}'.format(roc_evaluator.evaluate(predictions)))

accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="accuracy")
print('accuracy: {}'.format(accuracy_evaluator.evaluate(predictions)))
print('Elapsed time is: {}'.format(time()-start))