In [1]:
import findspark
# Initialise findspark before any pyspark import below; it is expected to make
# the local Spark installation's pyspark package importable from this kernel.
findspark.init()

In [4]:
import os
import time
from pathlib import Path

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Reuse an existing Spark session if one is running, otherwise create one
# registered under the application name "FraudDetection".
spark = (
    SparkSession.builder
    .appName("FraudDetection")
    .getOrCreate()
)

In [8]:
# Location of the raw transactions CSV. The default is the original local
# path; set the FRAUD_CSV_PATH environment variable to run the notebook on a
# machine where the file lives elsewhere.
FRAUD_CSV_PATH = os.environ.get(
    "FRAUD_CSV_PATH",
    r"F:\Data_Engineering\Apache_Spark\data\fraud_detection.csv",
)

# header=True: the first line holds the column names.
# inferSchema=True: let Spark derive column types from the data.
df = spark.read.csv(FRAUD_CSV_PATH, header=True, inferSchema=True)

In [9]:
# List the column names of the loaded DataFrame.
df.columns

['step',
 'type',
 'amount',
 'nameOrig',
 'oldbalanceOrg',
 'newbalanceOrig',
 'nameDest',
 'oldbalanceDest',
 'newbalanceDest',
 'isFraud',
 'isFlaggedFraud']

In [10]:
# Remove the isFraud and isFlaggedFraud columns; only the remaining
# transaction columns are used in the rest of the notebook.
df = df.drop("isFraud", "isFlaggedFraud")

In [11]:
# Preview the first five rows after dropping the two columns.
df.show(5)

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|           0.0|           0.0|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+
only showing top 5 rows



# Input Data Streaming

In [13]:
# Count rows per `step` value and preview five of the groups
# (no ordering is requested, so the groups shown are arbitrary).
df.groupBy("step").count().show(5)

+----+-----+
|step|count|
+----+-----+
|  12|36153|
|   1| 2708|
|  13|37515|
|   6| 1660|
|  16|42471|
+----+-----+
only showing top 5 rows



In [None]:
%%time
steps = df.select("step").distinct().collect()

for step in steps[:]:
    _df = df.where(f"step = {step[0]}")
    # adding coalesce(1) to save dataframe to one file
    _df.coalesce(1).write.mode("append").option("header", "true").csv("data/fraud")


In [19]:
# `!cd` runs in a throwaway subshell and never changes the notebook's working
# directory, so inspect the exported part files directly instead.
sorted(p.name for p in Path("data/fraud").glob("part-*.csv"))[:5]

In [21]:
# Part-file names embed an identifier generated at write time, so a hard-coded
# name stops working as soon as the export cell is re-run. Discover a part
# file instead; any one will do because they all share the same columns.
part_files = sorted(Path("data/fraud").glob("part-*.csv"))
if not part_files:
    raise FileNotFoundError(
        "No part-*.csv files found in data/fraud; run the export cell first."
    )

part = spark.read.csv(part_files[0].as_posix(), header=True, inferSchema=True)

In [23]:
# Check which `step` values the loaded part file contains and how many rows each has.
part.groupBy("step").count().show()

+----+-----+
|step|count|
+----+-----+
|  41|36348|
+----+-----+



# Streaming Processing

In [24]:
# Keep the schema inferred from one part file; it is passed to the streaming
# reader below as its explicit schema.
dataSchema = part.schema

In [25]:
# Display the captured schema (column names, types, nullability).
dataSchema

StructType([StructField('step', IntegerType(), True), StructField('type', StringType(), True), StructField('amount', DoubleType(), True), StructField('nameOrig', StringType(), True), StructField('oldbalanceOrg', DoubleType(), True), StructField('newbalanceOrig', DoubleType(), True), StructField('nameDest', StringType(), True), StructField('oldbalanceDest', DoubleType(), True), StructField('newbalanceDest', DoubleType(), True)])

In [26]:
# Treat data/fraud as a stream of CSV files using the schema captured above.
streaming = (
    spark.readStream.schema(dataSchema)
    # The files were written with a header line; without this option each
    # header would be ingested as a data row (e.g. nameDest == 'nameDest').
    .option("header", "true")
    # Consume one file per trigger so the data arrives in increments.
    .option("maxFilesPerTrigger", 1)
    .csv("data/fraud")
)

Transform: the `nameDest` column is the ID of the transaction's recipient.

In [27]:
# Running number of transactions per recipient, largest counts first.
per_recipient = streaming.groupBy("nameDest").count()
dest_count = per_recipient.orderBy(F.col("count").desc())

In [28]:
# Start the streaming aggregation. The complete result table is rewritten on
# every trigger into an in-memory table named "dest_counts" that can be
# queried with spark.sql.
activityQuery = (
    dest_count.writeStream.queryName("dest_counts")
    .format("memory")
    .outputMode("complete")
    .start()
)
# only needed in production
# activityQuery.awaitTermination()

# Poll the in-memory table 50 times, half a second apart, and print the top
# recipients whenever at least one of them has two or more transactions.
for _ in range(50):
    _df = spark.sql(
        "SELECT * FROM dest_counts WHERE nameDest != 'nameDest' AND count>=2"
    )
    # head(1) fetches at most one row, which is enough to test for emptiness;
    # count() would evaluate the whole result just to compare it with zero.
    if _df.head(1):
        _df.show(10)
    time.sleep(0.5)

+-----------+-----+
|   nameDest|count|
+-----------+-----+
|C2131465140|    9|
| C499714286|    9|
|C1760966565|    7|
|C1239707538|    7|
|C1968101532|    7|
|C1139127799|    7|
| C709613653|    7|
|C1193495878|    7|
|C1907159141|    7|
|C1390358265|    7|
+-----------+-----+
only showing top 10 rows

+-----------+-----+
|   nameDest|count|
+-----------+-----+
|C2131465140|    9|
| C499714286|    9|
|C1760966565|    7|
|C1239707538|    7|
|C1968101532|    7|
|C1139127799|    7|
| C709613653|    7|
|C1193495878|    7|
|C1907159141|    7|
|C1390358265|    7|
+-----------+-----+
only showing top 10 rows

+-----------+-----+
|   nameDest|count|
+-----------+-----+
|C2131465140|    9|
| C499714286|    9|
|C1760966565|    7|
|C1239707538|    7|
|C1968101532|    7|
|C1139127799|    7|
| C709613653|    7|
|C1193495878|    7|
|C1907159141|    7|
|C1390358265|    7|
+-----------+-----+
only showing top 10 rows

+-----------+-----+
|   nameDest|count|
+-----------+-----+
|C1590550415|   34|
| 

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "D:\spark\python\lib\py4j-0.10.9.7-src.zip\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "D:\spark\python\lib\py4j-0.10.9.7-src.zip\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "c:\Users\hanhn\AppData\Local\Programs\Python\Python39\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [29]:
# Check whether this notebook's streaming query is still running. Asking the
# query handle directly avoids an IndexError when no stream is active and does
# not depend on the order of spark.streams.active.
activityQuery.isActive

True

In [30]:
# Inspect the query's current status (message, isDataAvailable, isTriggerActive).
activityQuery.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [31]:
# Stop the streaming query started above.
activityQuery.stop()