In [1]:
# Download the hadoop-aws 3.3.1 jar from Maven Central into Spark's jars directory.
# NOTE(review): presumably required for the s3a:// filesystem configured further
# down in this notebook -- confirm the version matches the installed Hadoop.
!sudo wget -O /usr/local/spark/jars/hadoop-aws-3.3.1.jar https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.1/hadoop-aws-3.3.1.jar

--2023-01-04 23:35:12--  https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.1/hadoop-aws-3.3.1.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 870644 (850K) [application/java-archive]
Saving to: ‘/usr/local/spark/jars/hadoop-aws-3.3.1.jar’


2023-01-04 23:35:13 (6.44 MB/s) - ‘/usr/local/spark/jars/hadoop-aws-3.3.1.jar’ saved [870644/870644]



In [2]:
# Download the AWS Java SDK bundle 1.11.901 jar from Maven Central into Spark's jars directory.
# NOTE(review): presumably the SDK that the hadoop-aws jar depends on -- confirm the pairing.
!sudo wget -O /usr/local/spark/jars/aws-java-sdk-bundle-1.11.901.jar https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.901/aws-java-sdk-bundle-1.11.901.jar

--2023-01-04 23:35:19--  https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.901/aws-java-sdk-bundle-1.11.901.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 192321266 (183M) [application/java-archive]
Saving to: ‘/usr/local/spark/jars/aws-java-sdk-bundle-1.11.901.jar’


2023-01-04 23:35:31 (15.7 MB/s) - ‘/usr/local/spark/jars/aws-java-sdk-bundle-1.11.901.jar’ saved [192321266/192321266]



In [37]:
from pyspark.conf import SparkConf
from pyspark.ml.feature import Imputer, StringIndexer, IndexToString
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import when, count, col, lit, udf, isnan
from pyspark.sql.types import *
from pyspark import SparkContext
import os
import requests

In [154]:
# Object-store connection settings. Values are taken from the environment when
# present so real credentials never need to be committed with the notebook; the
# fallbacks are the previously hard-coded local development values.
AWS_S3_CUSTOM_ENDPOINT = os.environ.get("AWS_S3_CUSTOM_ENDPOINT", "http://storage:9000")
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "minioadmin")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "minioadmin")

# Hive database and source table holding the rows to score.
FRAUD_DETECTION_DB = "ml_fraud_detection_db"
FRAUD_DETECTION_SRC_TBL = f"{FRAUD_DETECTION_DB}.tb_fraud_score"

# Model-server prediction endpoint and the column that identifies a transaction.
PREDICT_URL = "http://model_server:8501/v1/models/fraud-model:predict"
ID_COLUMN = "TransactionID"

# Destination prefix for the CSV written at the end of the notebook.
OUTPUT_DIR = "s3a://ml-data/fraud_detection_output/"

In [5]:
# Session-level settings, applied in the order listed.
spark_settings = {
    "spark.driver.memory": "10g",
    "spark.executor.memory": "4g",
    "spark.executor.cores": "1",
    "spark.dynamicAllocation.enabled": "true",
    "spark.hadoop.parquet.enable.summary-metadata": "false",
    "spark.hadoop.mapreduce.fileoutputcommitter.marksuccessfuljobs": "false",
    "hive.metastore.uris": "thrift://hive-metastore:9083",
}

conf = SparkConf()
conf.setMaster("local[6]")
for setting_key, setting_value in spark_settings.items():
    conf.set(setting_key, setting_value)
conf

<pyspark.conf.SparkConf at 0x7f4ddf6f1e50>

In [6]:
# S3A connector settings pointing Spark at the custom object-store endpoint.
s3a_settings = {
    "spark.hadoop.fs.s3a.access.key": AWS_ACCESS_KEY_ID,
    "spark.hadoop.fs.s3a.secret.key": AWS_SECRET_ACCESS_KEY,
    "spark.hadoop.fs.s3a.endpoint": AWS_S3_CUSTOM_ENDPOINT,
    "spark.hadoop.fs.s3a.path.style.access": True,
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
}

for s3a_key, s3a_value in s3a_settings.items():
    conf.set(s3a_key, s3a_value)
conf

<pyspark.conf.SparkConf at 0x7f4ddf6f1e50>

In [7]:
# Create (or reuse) the Hive-enabled session from the configuration built above.
spark = (
    SparkSession.builder
    .appName("ScoreProcessor")
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [80]:
# Feature columns read from the source table; the query wraps each one in a
# single-element ARRAY under its own name.
source_feature_names = [
    "c1", "c13", "c5",
    "card1", "card2", "card3", "card4", "card5", "card6",
    "d1", "d10", "d11", "d15", "d3", "d4", "d5",
    "dist1",
    "m2", "m3", "m4", "m5", "m6", "m7", "m8", "m9",
    "p_emaildomain", "productcd", "r_emaildomain", "transactionamt",
    "v10", "v12", "v15", "v19", "v2", "v22", "v23", "v25", "v29",
    "v3", "v35", "v4", "v6", "v8",
]
array_projections = ", ".join(
    f"ARRAY({feature_name}) as {feature_name}" for feature_name in source_feature_names
)
source_df = spark.sql(
    f"select {ID_COLUMN}, {array_projections} from {FRAUD_DETECTION_SRC_TBL}"
)

In [81]:
# Expose the source rows to Spark SQL under the name "score_source_tbl".
source_df.createOrReplaceTempView("score_source_tbl")

## First Transformation

In [85]:
# Every column of the source frame except the identifier column is a feature.
feature_columns = list(filter(lambda column_name: column_name != ID_COLUMN, source_df.columns))

In [88]:
# Comma-separated feature names, ready to be interpolated into a SQL struct(...) call.
feature_columns_str = ",".join(feature_columns)

In [89]:
# Pack all feature columns of a row into a one-element "instances" array and
# attach the constant signature name next to it.
first_transformation_df = spark.sql(
    f"select {ID_COLUMN}, "
    f"ARRAY(struct({feature_columns_str})) as instances, "
    "'json_serving' as signature_name "
    "from score_source_tbl"
)

In [90]:
# Expose the first transformation to Spark SQL under the name "first_transform_tbl".
first_transformation_df.createOrReplaceTempView("first_transform_tbl")

In [91]:
# Preview the rows produced by the first transformation.
spark.sql("select * from first_transform_tbl").show()

+-------------+--------------------+--------------+
|TransactionID|           instances|signature_name|
+-------------+--------------------+--------------+
|      3663549|[{[6.0], [115.0],...|  json_serving|
|      3663550|[{[3.0], [12.0], ...|  json_serving|
|      3663551|[{[2.0], [22.0], ...|  json_serving|
|      3663552|[{[5.0], [7.0], [...|  json_serving|
|      3663553|[{[6.0], [14.0], ...|  json_serving|
|      3663554|[{[5.0], [10.0], ...|  json_serving|
|      3663555|[{[1.0], [2.0], [...|  json_serving|
|      3663556|[{[3.0], [11.0], ...|  json_serving|
|      3663557|[{[152.0], [407.0...|  json_serving|
|      3663558|[{[2.0], [8.0], [...|  json_serving|
|      3663559|[{[2.0], [3.0], [...|  json_serving|
|      3663560|[{[4.0], [17.0], ...|  json_serving|
|      3663561|[{[3.0], [1.0], [...|  json_serving|
|      3663562|[{[3.0], [9.0], [...|  json_serving|
|      3663563|[{[1.0], [3.0], [...|  json_serving|
|      3663564|[{[6.0], [13.0], ...|  json_serving|
|      36635

## Second Transformation

In [92]:
# Serialize each row's (instances, signature_name) pair to a JSON string and
# carry the prediction endpoint alongside it as a literal column.
second_transformation_df = spark.sql(
    f"select {ID_COLUMN}, "
    "to_json(struct(instances, signature_name)) as data, "
    f"'{PREDICT_URL}' as url "
    "from first_transform_tbl"
)

In [93]:
# Expose the second transformation to Spark SQL under the name "second_transform_tbl".
second_transformation_df.createOrReplaceTempView("second_transform_tbl")

In [94]:
# Show a single untruncated row so the full JSON payload and URL can be inspected.
spark.sql("select * from second_transform_tbl").show(truncate=False, n=1)

+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------+
|TransactionID|data                                                                                                                                                                                                                                                                   

## Predict UDF

In [139]:
def predict_function(data, url, timeout=30):
    """Score one serialized request against the model server.

    Parameters
    ----------
    data : str
        Request body, posted as-is to ``url``.
    url : str
        Prediction endpoint to POST to.
    timeout : float, optional
        Seconds to wait for the server (default 30), so that one stalled
        request cannot hang the caller indefinitely.

    Returns
    -------
    float
        ``predictions[0][0]`` from the JSON response, rounded to two decimals.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    KeyError, IndexError
        If the JSON response does not contain ``predictions[0][0]``.
    """
    # NOTE(review): a new connection is opened on every call; consider a shared
    # session if this is invoked once per row on a large table.
    response = requests.post(url, data=data, timeout=timeout)
    # Surface HTTP failures directly instead of a misleading KeyError below.
    response.raise_for_status()
    value = response.json()['predictions'][0][0]
    # Coerce to float: this function is registered as a FloatType UDF, and a JSON
    # integer such as 0 would otherwise be returned as a Python int.
    return round(float(value), 2)

In [143]:
# Smoke test: post one hand-written request body to the prediction endpoint and
# display the value returned by predict_function.
req_data = '{"instances":[{"c1":[6.0],"c13":[115.0],"c5":[3.0],"card1":[10409.0],"card2":[111.0],"card3":[150.0],"card4":["visa"],"card5":[226.0],"card6":["debit"],"d1":[419.0],"d10":[418.0],"d11":[203.0],"d15":[409.0],"d3":[27.0],"d4":[398.0],"d5":[27.0],"dist1":[1.0],"m2":["T"],"m3":["F"],"m4":["M1"],"m5":["T"],"m6":["F"],"m7":["T"],"m8":["T"],"m9":["T"],"p_emaildomain":["gmail.com"],"productcd":["W"],"r_emaildomain":["scranton.edu"],"transactionamt":[31.95],"v10":[1.0],"v12":[0.0],"v15":[0.0],"v19":[0.0],"v2":[1.0],"v22":[0.0],"v23":[1.0],"v25":[0.0],"v29":[0.0],"v3":[1.0],"v35":[1.0],"v4":[1.0],"v6":[1.0],"v8":[1.0]}],"signature_name":"json_serving"}'
predict_function(req_data, PREDICT_URL)

0.0

In [144]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, FloatType

# Column-API wrapper around predict_function. It takes both arguments the
# function requires (data, url) and declares the float return type; the previous
# one-argument lambda would have raised TypeError because `url` was never passed.
predict_udf = udf(predict_function, FloatType())
# SQL-callable registration, used as predict(data, url) in queries.
spark.udf.register("predict", predict_function, FloatType())

<function __main__.predict_function(data, url)>

In [145]:
# Apply the registered predict() UDF to every prepared request row.
predict_df = spark.sql(
    f"select {ID_COLUMN}, predict(data, url) as isfraud "
    "from second_transform_tbl"
)

In [146]:
# Mark the scored frame for caching and expose it to Spark SQL as "predict_tbl".
# NOTE(review): presumably cached so the queries below do not re-invoke the
# prediction UDF for every row -- confirm.
predict_df.cache()
predict_df.createOrReplaceTempView("predict_tbl")

In [126]:
# Report how many rows are in the scored table.
spark.sql("select count(*) as total_records from predict_tbl").show(truncate=False)

+-------------+
|total_records|
+-------------+
|506691       |
+-------------+



In [147]:
# List the transactions whose score is above 0.85.
spark.sql(f"select {ID_COLUMN}, isfraud from predict_tbl where isfraud>0.85").show(truncate=False)

+-------------+-------+
|TransactionID|isfraud|
+-------------+-------+
+-------------+-------+



In [159]:
# Score above which a transaction is labelled as fraud; the same cut-off is used
# by the inspection query on predict_tbl earlier in the notebook.
FRAUD_SCORE_THRESHOLD = 0.85

# Derive the label from the model score instead of writing a constant '0' for
# every row. The column stays a '0'/'1' string so the output schema is unchanged;
# rows with a null score fall through to '0'.
final_df = spark.sql(
    f"select {ID_COLUMN}, "
    f"case when isfraud > {FRAUD_SCORE_THRESHOLD} then '1' else '0' end as isFraud "
    "from predict_tbl"
)

In [160]:
# Write the result as one CSV part file with a header row, replacing any
# previous output under OUTPUT_DIR.
single_partition_df = final_df.coalesce(1)
csv_writer = single_partition_df.write.mode("overwrite").option("header", True)
csv_writer.csv(OUTPUT_DIR)