In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import RobustScaler
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import udf, col, round
from pyspark.sql.functions import *
from time import time

from pyspark.ml import Pipeline
from pyspark.sql.functions import rand
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
class FraudDetection():
    def __init__(self):
        self.spark = None
        self.data = None
        self.rep_data = None # repeated data
        
    def create_spark_context(self, ram, rpt=False, ret=False):
        self.spark = SparkSession.\
            builder.\
            appName("Fraud Detector").\
            master("spark://spark-master:7077").\
            config("spark.executor.memory", "{}g".format(ram)).\
            getOrCreate()
        if rpt: print(self.spark.sparkContext.getConf().getAll())
        if ret: return self.spark
    
    def read_file(self, path, rpt=False, ret=False):
        self.data = self.spark.read.csv(path, header=True, inferSchema=True)
        if rpt: print('number of partitions: {}'.format(self.data.rdd.getNumPartitions()))
        if ret: return self.data
    
    def data_duplicator(self, number, rpt=False, ret=False):
        self.rep_data = self.data
        for i in range(number-1):
            self.rep_data = self.data.union(self.rep_data)
        if rpt: print("Created df with: {}, {}".format(self.rep_data .count(), len(self.rep_data .columns)))
        if ret: return self.rep_data

        
class Preprocess():
    def __init__(self, data):
        self.spark = None
        self.sub_sample = None
        self.data = data
        
    def scale_column(self, feature):
        self.data = self.data.withColumn(feature, self.data[feature].cast(IntegerType()))
        assembler = VectorAssembler().setInputCols([feature]).setOutputCol('f'+feature)
        self.data = assembler.transform(self.data)
        self.data = self.data.drop(feature)
        scaler = RobustScaler(inputCol="f"+feature, outputCol=feature,
                          withScaling=True, withCentering=False,
                          lower=0.25, upper=0.75)
        scalerModel = scaler.fit(self.data)
        self.data = scalerModel.transform(self.data)
        self.data = self.data.drop('f'+feature)
        unlist = udf(lambda x: float(list(x)[0]), DoubleType())
        self.data = self.data.withColumn(feature, unlist(feature))
        return self.data
    
    def robust_scale(self, scale_columns):
        for column in scale_columns:
            self.data = self.scale_column(column)
        return self.data
    
    def calculate_iqr_bound(self, feature, q1, q3, k, rpt=False):
        bound = self.sub_sample.filter(self.data.Class==1).approxQuantile(feature, [q1, q3], 0)
        if rpt: print(f'Feature: {feature}, Lower bound: {bound[0]}, Upper bound: {bound[1]}')
        iqr = bound[1] - bound[0]
        if rpt: print(f'Feature: {feature}, IQR: {iqr}')
        bound[0] = bound[0] - (iqr * k)
        bound[1] = bound[1] + (iqr * k)
        if rpt: print(f'Feature: {feature}, Cut-off Lower bound: {bound[0]}, Cut-off Upper bound: {bound[1]}')
        return bound
    
    def outlier_removal(self, features, q1=0.25, q3=0.75, k=1.5, rpt=False):
        frauds = self.data.filter(self.data.Class==1)
        self.sub_sample = frauds.union(self.data.filter(self.data.Class==0).limit(492))
        for feature in features:
            before_removal_count = self.sub_sample.count()
            bound = self.calculate_iqr_bound(feature, q1, q3, k, rpt=rpt)
            self.sub_sample = self.sub_sample.filter((col(feature) >= bound[0]) & (col(feature) <= bound[1]))
            after_removal_count = self.sub_sample.count()
            if rpt: print(f'before removal count: {before_removal_count}, after removal count: {after_removal_count}')
    
    def assemble_features(self):
        assembler = VectorAssembler(inputCols=['V{}'.format(i) for i in range(1,29)], outputCol='features')
        self.data = assembler.transform(self.data)
        return self.data

    
class Evaluator():
    def __init__(self):
        pass
    
    def accuracy(self, data):
        accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="prediction", predictionCol="class", metricName="accuracy")
        print('accuracy: {}'.format(accuracy_evaluator.evaluate(data)))
        
    def recall(self, data):
        recall_evaluator = MulticlassClassificationEvaluator(labelCol="prediction", predictionCol="class", metricName="recallByLabel")
        print('recall: {}'.format(recall_evaluator.evaluate(data)))
    
    def recall(self, data):
        recall_evaluator = MulticlassClassificationEvaluator(labelCol="prediction", predictionCol="class", metricName="recallByLabel")
        print('recall: {}'.format(recall_evaluator.evaluate(data))) 

In [3]:
ram=16
duplicate=1
splitation=[0.7, 0.1, 0.2]
detector = FraudDetection()
detector.create_spark_context(ram=ram)
detector.read_file("/opt/workspace/bank_sim.csv", True)
detector.data_duplicator(duplicate, True)

number of partitions: 8
Created df with: 594643, 10


In [4]:
df = detector.data

In [5]:
df.limit(5).show()

+----+-------------+---+------+----------+-------------+-----------+-------------------+------+-----+
|step|     customer|age|gender|zipcodeOri|     merchant|zipMerchant|           category|amount|fraud|
+----+-------------+---+------+----------+-------------+-----------+-------------------+------+-----+
|   0|'C1093826151'|'4'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'|  4.55|    0|
|   0| 'C352968107'|'2'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 39.68|    0|
|   0|'C2054744914'|'4'|   'F'|   '28007'|'M1823072687'|    '28007'|'es_transportation'| 26.89|    0|
|   0|'C1760612790'|'3'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 17.25|    0|
|   0| 'C757503768'|'5'|   'M'|   '28007'| 'M348934600'|    '28007'|'es_transportation'| 35.72|    0|
+----+-------------+---+------+----------+-------------+-----------+-------------------+------+-----+



In [6]:
def index_column(df, column):
    indexer = StringIndexer(inputCol=column, outputCol=column+"Index")
    df = indexer.fit(df).transform(df)
    df = df.withColumn(column, df[column+"Index"].cast(IntegerType()))
    df = df.drop(column+"Index")
    return df

In [7]:
df = index_column(df, 'merchant')
df = index_column(df, 'category')
df = index_column(df, 'customer')
df = index_column(df, 'age')
df = index_column(df, 'gender')

In [8]:
def merchant_fraud_probablity(merchant):
    merchant_df = df.filter(df.merchant==merchant)
    return merchant_df.filter(merchant_df.fraud==1).count()/merchant_df.count()

In [9]:
merchants = df.toPandas()['merchant'].unique()
merchants_fraud_probablity = {}
for merchant in merchants:
    merchants_fraud_probablity[merchant] = merchant_fraud_probablity(int(merchant))

In [10]:
def merchant_probablity(merchant):
    return merchants_fraud_probablity[merchant]

In [11]:
df = df.rdd.map(lambda x: x + (merchant_probablity(x["merchant"]),)).toDF(df.columns + ["merchanttProbablity"])

In [12]:
df.show()

+----+--------+---+------+----------+--------+-----------+--------+------+-----+-------------------+
|step|customer|age|gender|zipcodeOri|merchant|zipMerchant|category|amount|fraud|merchanttProbablity|
+----+--------+---+------+----------+--------+-----------+--------+------+-----+-------------------+
|   0|    1795|  2|     1|   '28007'|       1|    '28007'|       0|  4.55|    0|                0.0|
|   0|    1620|  0|     1|   '28007'|       1|    '28007'|       0| 39.68|    0|                0.0|
|   0|    3796|  2|     0|   '28007'|       0|    '28007'|       0| 26.89|    0|                0.0|
|   0|    1273|  1|     1|   '28007'|       1|    '28007'|       0| 17.25|    0|                0.0|
|   0|    2814|  3|     1|   '28007'|       1|    '28007'|       0| 35.72|    0|                0.0|
|   0|     623|  1|     0|   '28007'|       1|    '28007'|       0| 25.81|    0|                0.0|
|   0|     586|  4|     0|   '28007'|       1|    '28007'|       0|   9.1|    0|           

In [13]:
inputCols = [
 'customer',
 'age',
 'gender',
 'merchant',
 'category',
 'amount',
#  'merchanttProbablity'
]

In [14]:
assembler = VectorAssembler(inputCols=inputCols, outputCol='features')

In [15]:
df = assembler.transform(df)

In [16]:
df.show()

+----+--------+---+------+----------+--------+-----------+--------+------+-----+-------------------+--------------------+
|step|customer|age|gender|zipcodeOri|merchant|zipMerchant|category|amount|fraud|merchanttProbablity|            features|
+----+--------+---+------+----------+--------+-----------+--------+------+-----+-------------------+--------------------+
|   0|    1795|  2|     1|   '28007'|       1|    '28007'|       0|  4.55|    0|                0.0|[1795.0,2.0,1.0,1...|
|   0|    1620|  0|     1|   '28007'|       1|    '28007'|       0| 39.68|    0|                0.0|[1620.0,0.0,1.0,1...|
|   0|    3796|  2|     0|   '28007'|       0|    '28007'|       0| 26.89|    0|                0.0|[3796.0,2.0,0.0,0...|
|   0|    1273|  1|     1|   '28007'|       1|    '28007'|       0| 17.25|    0|                0.0|[1273.0,1.0,1.0,1...|
|   0|    2814|  3|     1|   '28007'|       1|    '28007'|       0| 35.72|    0|                0.0|[2814.0,3.0,1.0,1...|
|   0|     623|  1|     

In [17]:
train, validation, test = df.randomSplit([0.7, 0.1, 0.2])

In [18]:
# With 1 Worker

start = time()
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="fraud", featuresCol="features", numTrees=10)
pipeline = Pipeline(stages=[rf])

# Train model.  This also runs the indexers.
model = pipeline.fit(train)

# Make predictions.
predictions = model.transform(test)

# Select example rows to display.
predictions = predictions.withColumn("fraud", predictions["fraud"].cast(DoubleType()))

roc_evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="fraud", metricName="areaUnderROC")
print('ROC: {}'.format(roc_evaluator.evaluate(predictions)))

accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="fraud", predictionCol="prediction", metricName="accuracy")
print('accuracy: {}'.format(accuracy_evaluator.evaluate(predictions)))
print('Elapsed time is: {}'.format(time()-start))

+----------+-----+--------------------+
|prediction|fraud|            features|
+----------+-----+--------------------+
|       1.0|    1|[1.0,3.0,0.0,9.0,...|
|       0.0|    0|[4.0,4.0,0.0,1.0,...|
|       0.0|    0|[5.0,0.0,0.0,1.0,...|
|       0.0|    0|[10.0,5.0,1.0,17....|
|       0.0|    0|[24.0,1.0,1.0,1.0...|
+----------+-----+--------------------+
only showing top 5 rows

ROC: 0.7182501879635009
accuracy: 0.9921012658227848
Elapsed time is: 18.287577867507935
