In [1]:
import pandas as pd
import findspark
findspark.init()
from pyspark.sql import SparkSession 
from pyspark.sql.functions import isnan, when, count, col

In [2]:
# Obtain the SparkSession used by every later cell: getOrCreate() returns the
# already-active session if there is one, otherwise it starts a new session.
# No app name or config is set here, so the builder's defaults apply.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Load the transactions CSV. The first line is treated as the header row and
# column types are inferred from the data rather than read as plain strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('Fraud.csv')
)

In [4]:
# Remove the isFlaggedFraud column from the working frame.
# The column is referenced by name so this cell can be re-run safely:
# DataFrame.drop() ignores names that are not in the schema, whereas the
# attribute lookup df.isFlaggedFraud raises once the column is already gone.
df = df.drop('isFlaggedFraud')

In [5]:
# Print the column names, inferred types and nullability of the loaded frame
# to confirm the schema inference produced the expected numeric/string types.
df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)



In [6]:
# Preview the first five rows as a text table.
df.show(n=5)

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|           0.0|           0.0|      0|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+

In [7]:
# Report the frame's dimensions: row count on the first line, column count on
# the second. df.count() triggers a full Spark job over the data.
print(df.count(), len(df.columns), sep='\n')

6362620
10


In [8]:
# Count missing values per column and show them as a single-row table.
#
# A value is "missing" when it is NULL, or, for floating-point columns only,
# when it is NaN. isnan() is restricted to float/double columns because it is
# only meaningful there; applying it to string columns relies on an implicit
# string-to-double cast (and would count a literal "NaN" string as missing).
# NOTE(review): that implicit cast is expected to fail under ANSI SQL mode;
# confirm against the Spark version in use.
float_cols = {name for name, dtype in df.dtypes if dtype in ('float', 'double')}

missing_counts = []
for c in df.columns:
    is_missing = col(c).isNull()
    if c in float_cols:
        is_missing = is_missing | isnan(c)
    # when() yields a non-null value only for missing rows, so count() tallies them.
    missing_counts.append(count(when(is_missing, c)).alias(c))

df.select(missing_counts).show()

+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+
|step|type|amount|nameOrig|oldbalanceOrg|newbalanceOrig|nameDest|oldbalanceDest|newbalanceDest|isFraud|
+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+
|   0|   0|     0|       0|            0|             0|       0|             0|             0|      0|
+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+



In [9]:
# Compute summary statistics (count, mean, stddev, min, quartiles, max) for
# every column and convert the small result to pandas so the notebook renders
# it as a table. String columns only get count/min/max; the rest are empty.
df.summary().toPandas()

Unnamed: 0,summary,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,count,6362620.0,6362620,6362620.0,6362620,6362620.0,6362620.0,6362620,6362620.0,6362620.0,6362620.0
1,mean,243.39724563151657,,179861.90354913048,,833883.1040744851,855113.6685785787,,1100701.6665196505,1224996.3982019336,0.0012908204481801
2,stddev,142.3319710491305,,603858.2314629437,,2888242.67303754,2924048.502954241,,3399180.1129944543,3674128.9421196654,0.0359047968016043
3,min,1.0,CASH_IN,0.0,C1000000639,0.0,0.0,C1000004082,0.0,0.0,0.0
4,25%,156.0,,13390.35,,0.0,0.0,,0.0,0.0,0.0
5,50%,239.0,,74864.38,,14211.23,0.0,,132623.66,214605.81,0.0
6,75%,335.0,,208722.05,,107294.0,144232.62,,942942.08,1111684.56,0.0
7,max,743.0,TRANSFER,92445516.64,C999999784,59585040.37,49585040.37,M999999784,356015889.35,356179278.92,1.0


In [10]:
# Derived feature: balance change on the originating account
# (balance after the transaction minus balance before it).
df = df.withColumn(
    'changeOrig',
    col('newbalanceOrig') - col('oldbalanceOrg'),
)

In [11]:
# Derived feature: balance change on the destination account
# (balance after the transaction minus balance before it).
df = df.withColumn(
    'changeDest',
    col('newbalanceDest') - col('oldbalanceDest'),
)