# 0.0 IMPORTS AND DATA LOADING

In [44]:
import pandas as pd
import findspark
findspark.init()
from pyspark.sql import SparkSession 
from pyspark.sql.functions import isnan, when, count, col, sum
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.stat import Correlation

In [4]:
spark = SparkSession.builder.getOrCreate()

df = spark.read.csv('Fraud.csv',inferSchema=True,header=True)

df = df.drop(df.isFlaggedFraud)

# 1.0 DATA DESCRIPTION

In [7]:
df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)



In [8]:
df.show(5)

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|           0.0|           0.0|      0|
+----+--------+--------+-----------+-------------+--------------+-------

In [9]:
print(df.count())
print(len(df.columns))

6362620
10


In [10]:
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+
|step|type|amount|nameOrig|oldbalanceOrg|newbalanceOrig|nameDest|oldbalanceDest|newbalanceDest|isFraud|
+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+
|   0|   0|     0|       0|            0|             0|       0|             0|             0|      0|
+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+



In [11]:
df.summary().toPandas()

Unnamed: 0,summary,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,count,6362620.0,6362620,6362620.0,6362620,6362620.0,6362620.0,6362620,6362620.0,6362620.0,6362620.0
1,mean,243.39724563151657,,179861.90354913048,,833883.1040744851,855113.6685785787,,1100701.6665196505,1224996.3982019336,0.0012908204481801
2,stddev,142.3319710491305,,603858.2314629437,,2888242.67303754,2924048.502954241,,3399180.1129944543,3674128.9421196654,0.0359047968016043
3,min,1.0,CASH_IN,0.0,C1000000639,0.0,0.0,C1000004082,0.0,0.0,0.0
4,25%,156.0,,13390.35,,0.0,0.0,,0.0,0.0,0.0
5,50%,239.0,,74864.38,,14211.23,0.0,,132623.66,214605.81,0.0
6,75%,335.0,,208722.05,,107294.0,144232.62,,942942.08,1111684.56,0.0
7,max,743.0,TRANSFER,92445516.64,C999999784,59585040.37,49585040.37,M999999784,356015889.35,356179278.92,1.0


# 2.0 FEATURE ENGINEERING

In [12]:
df = df.withColumn('changeOrig', (df.newbalanceOrig - df.oldbalanceOrg))

df = df.withColumn('changeDest', (df.newbalanceDest - df.oldbalanceDest))

In [14]:
df.limit(5).toPandas()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,changeOrig,changeDest
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,-9839.64,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,-1864.28,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,-181.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,-181.0,-21182.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,-11668.14,0.0


# 3.0 FILTERING DATA

In [21]:
df = df.drop(*['step','nameOrig','nameDest'])

In [22]:
df.limit(5).toPandas()

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,changeOrig,changeDest
0,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,-9839.64,0.0
1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,-1864.28,0.0
2,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,-181.0,0.0
3,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,-181.0,-21182.0
4,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,-11668.14,0.0


# 4.0 EXPLORATORY DATA ANALYSIS

In [15]:
df.groupBy('isFraud').count().show()

+-------+-------+
|isFraud|  count|
+-------+-------+
|      1|   8213|
|      0|6354407|
+-------+-------+



In [16]:
df.groupBy('type').sum('isFraud').show()

+--------+------------+
|    type|sum(isFraud)|
+--------+------------+
|TRANSFER|        4097|
| CASH_IN|           0|
|CASH_OUT|        4116|
| PAYMENT|           0|
|   DEBIT|           0|
+--------+------------+



In [17]:
df.groupBy('type').count().show()

+--------+-------+
|    type|  count|
+--------+-------+
|TRANSFER| 532909|
| CASH_IN|1399284|
|CASH_OUT|2237500|
| PAYMENT|2151495|
|   DEBIT|  41432|
+--------+-------+



# 5.0 DATA PREPARATION

In [45]:
typeIndexer = StringIndexer(inputCol='type', outputCol='typeIndex')

In [46]:
df5 = typeIndexer.fit(df).transform(df)

In [51]:
df5 = df5.drop('type')

In [58]:
train, test = df5.randomSplit([0.7,0.3],seed=42)

ones = train.filter(col("isFraud") == 1)
zeros = train.filter(col("isFraud") == 0)

zeros = zeros.limit(ones.count())

train = ones.union(zeros)

In [61]:
train.columns

['amount',
 'oldbalanceOrg',
 'newbalanceOrig',
 'oldbalanceDest',
 'newbalanceDest',
 'isFraud',
 'changeOrig',
 'changeDest',
 'typeIndex']

In [62]:
assembler = VectorAssembler(inputCols=['amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest',
                                       'changeOrig','changeDest','typeIndex'],
                           outputCol='features')

In [65]:
train = assembler.transform(train)
train = train.select(col('features'),col('isFraud'))

IllegalArgumentException: Output column features already exists.

In [70]:
test = assembler.transform(test)
test = test.select(col('features'),col('isFraud'))