# Initializing Spark

In [1]:
# Use findspark to pick up the local Spark installation (SPARK_HOME / HADOOP_HOME)
# before pyspark itself is imported.
import findspark

findspark.init()

# Entry point for the DataFrame and SQL APIs
from pyspark.sql import SparkSession

# Build the session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("Simple data mining with Synthetic Financial Dataset")
    .getOrCreate()
)
    

In [2]:
# Print the SparkSession's default repr (class and memory address) to confirm the session exists
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001D9A8962F28>


# Loading Dataset 

In [5]:
# Load the transaction log CSV into a DataFrame: the first row supplies the column
# names (header=True) and Spark infers each column's type (inferSchema=True).
# The Windows path is a raw string so its backslashes are taken literally instead of
# being parsed as invalid escape sequences such as "\K", "\S" or "\T".
df = spark.read.csv(r"D:\Kuliah\Smt6\Big Data\Tugas\PS_20174392719_1491204439457_log.csv", header=True, inferSchema=True)

In [6]:
# Preview the first rows of the dataset

df.show()

+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|    181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|    181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT| 11668.14|C2048537720|      41554.0|      29885.86|M123070170

In [7]:
# Count the number of rows in the dataset

df.count()

6362620

In [8]:
# Show the dataset schema (column names, inferred types, and nullability)

df.schema

StructType(List(StructField(step,IntegerType,true),StructField(type,StringType,true),StructField(amount,DoubleType,true),StructField(nameOrig,StringType,true),StructField(oldbalanceOrg,DoubleType,true),StructField(newbalanceOrig,DoubleType,true),StructField(nameDest,StringType,true),StructField(oldbalanceDest,DoubleType,true),StructField(newbalanceDest,DoubleType,true),StructField(isFraud,IntegerType,true),StructField(isFlaggedFraud,IntegerType,true)))

In [9]:
# Register the DataFrame as a SQL temporary view named "finance" so it can be queried through spark.sql

df.createOrReplaceTempView("finance")

In [10]:
# Sanity check: count the rows through the SQL view; the result should equal df.count()

Test = spark.sql("SELECT COUNT(*) AS NumberOfRows FROM finance")
Test.show()

+------------+
|NumberOfRows|
+------------+
|     6362620|
+------------+



# Clustering

In [29]:
# 1. Amount of money involved per transaction, used to detect whether a transaction is fraudulent
# NOTE(review): DISTINCT collapses transactions that share the same (type, amount, isFraud),
# so each row is a unique combination rather than one row per transaction — confirm this is intended.

Q1 = spark.sql("SELECT DISTINCT type as TransactionType, amount, isFraud FROM finance")
Q1.show()

+---------------+-------+-------+
|TransactionType| amount|isFraud|
+---------------+-------+-------+
|        CASH_IN| 142.71|      0|
|        CASH_IN| 623.03|      0|
|        CASH_IN| 695.79|      0|
|        CASH_IN| 935.54|      0|
|        CASH_IN|1366.77|      0|
|        CASH_IN| 1376.7|      0|
|        CASH_IN|1478.27|      0|
|        CASH_IN|2022.21|      0|
|        CASH_IN| 2351.5|      0|
|        CASH_IN|2913.35|      0|
|        CASH_IN|3447.53|      0|
|        CASH_IN|3995.33|      0|
|        CASH_IN|4752.94|      0|
|        CASH_IN|5241.73|      0|
|        CASH_IN|5627.86|      0|
|        CASH_IN|6297.93|      0|
|        CASH_IN|6730.26|      0|
|        CASH_IN|7193.08|      0|
|        CASH_IN|7493.19|      0|
|        CASH_IN|8513.77|      0|
+---------------+-------+-------+
only showing top 20 rows



In [30]:
# Pack the model input into a single vector column
from pyspark.ml.feature import VectorAssembler

# Only "amount" is used as a feature; it is wrapped into the "features" vector column
assembler = VectorAssembler(inputCols=["amount"], outputCol="features")

# Append the vector column to the query result and preview it
data = assembler.transform(Q1)
data.show()

+---------------+-------+-------+---------+
|TransactionType| amount|isFraud| features|
+---------------+-------+-------+---------+
|        CASH_IN| 142.71|      0| [142.71]|
|        CASH_IN| 623.03|      0| [623.03]|
|        CASH_IN| 695.79|      0| [695.79]|
|        CASH_IN| 935.54|      0| [935.54]|
|        CASH_IN|1366.77|      0|[1366.77]|
|        CASH_IN| 1376.7|      0| [1376.7]|
|        CASH_IN|1478.27|      0|[1478.27]|
|        CASH_IN|2022.21|      0|[2022.21]|
|        CASH_IN| 2351.5|      0| [2351.5]|
|        CASH_IN|2913.35|      0|[2913.35]|
|        CASH_IN|3447.53|      0|[3447.53]|
|        CASH_IN|3995.33|      0|[3995.33]|
|        CASH_IN|4752.94|      0|[4752.94]|
|        CASH_IN|5241.73|      0|[5241.73]|
|        CASH_IN|5627.86|      0|[5627.86]|
|        CASH_IN|6297.93|      0|[6297.93]|
|        CASH_IN|6730.26|      0|[6730.26]|
|        CASH_IN|7193.08|      0|[7193.08]|
|        CASH_IN|7493.19|      0|[7493.19]|
|        CASH_IN|8513.77|      0

In [31]:
# Fit a k-means model on the assembled feature vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Three clusters; the fixed seed keeps the initialisation repeatable
kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(data)

In [32]:
# Assign every row to a cluster; transform() appends the 'prediction' column
predictions = model.transform(data)
predictions.show()

+---------------+-------+-------+---------+----------+
|TransactionType| amount|isFraud| features|prediction|
+---------------+-------+-------+---------+----------+
|        CASH_IN| 142.71|      0| [142.71]|         0|
|        CASH_IN| 623.03|      0| [623.03]|         0|
|        CASH_IN| 695.79|      0| [695.79]|         0|
|        CASH_IN| 935.54|      0| [935.54]|         0|
|        CASH_IN|1366.77|      0|[1366.77]|         0|
|        CASH_IN| 1376.7|      0| [1376.7]|         0|
|        CASH_IN|1478.27|      0|[1478.27]|         0|
|        CASH_IN|2022.21|      0|[2022.21]|         0|
|        CASH_IN| 2351.5|      0| [2351.5]|         0|
|        CASH_IN|2913.35|      0|[2913.35]|         0|
|        CASH_IN|3447.53|      0|[3447.53]|         0|
|        CASH_IN|3995.33|      0|[3995.33]|         0|
|        CASH_IN|4752.94|      0|[4752.94]|         0|
|        CASH_IN|5241.73|      0|[5241.73]|         0|
|        CASH_IN|5627.86|      0|[5627.86]|         0|
|        C

In [33]:
# Score the clustering with the evaluator's default metric (reported below as the Silhouette score)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)

# sep="" keeps the label and the value adjacent, exactly as string concatenation would
print("Silhouette with squared euclidean distance = ", silhouette, sep="")

Silhouette with squared euclidean distance = 0.979909302618393


In [34]:
# Print the center of each cluster found by the fitted k-means model
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    # Each center has a single component because "amount" is the only feature
    print(center)

Cluster Centers: 
[155198.2371543]
[22703689.4269483]
[2797190.98391877]


In [35]:
import pixiedust

In [None]:
# Render the clustered rows as a chart.
# NOTE(review): display() is presumably the one provided by the pixiedust import — confirm;
# this cell has no execution count, so it was not run in the saved notebook.
display(predictions)

![cluster](image/chart.PNG)