In [1]:
# Import Modules
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import ttest_ind
# Render floats in pandas output with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [2]:
# Load 'paysim.csv' (path is relative to the working directory) into a
# DataFrame, then print its column dtypes, row count and memory usage.
df = pd.read_csv(filepath_or_buffer='paysim.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
step              int64
type              object
amount            float64
nameOrig          object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          object
oldbalanceDest    float64
newbalanceDest    float64
isFraud           int64
isFlaggedFraud    int64
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [3]:
# Pearson's chi-squared test of independence between the categorical variable
# "type" and the binary label "isFraud".
# H0 : "type" and "isFraud" are independent (the fraud rate is the same for every type)
# Ha : "type" and "isFraud" are dependent (the fraud rate differs for at least one type)

# Contingency table of observed counts: one row per type, one column per isFraud value.
type_split = pd.crosstab(index=df['type'], columns=df['isFraud'])
type_split

isFraud,0,1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
CASH_IN,1399284,0
CASH_OUT,2233384,4116
DEBIT,41432,0
PAYMENT,2151495,0
TRANSFER,528812,4097


In [13]:
# Chi-squared test on the observed type-by-isFraud counts. The result holds,
# in order: test statistic, p-value, degrees of freedom, expected counts.
# A p-value reported as 0.0 rejects the null hypothesis of independence at any
# conventional significance level, i.e. the test result shows dependence.
stats.chi2_contingency(observed=type_split)

(22082.53571319108, 0.0, 4, array([[1.39747778e+06, 1.80622440e+03],
        [2.23461179e+06, 2.88821075e+03],
        [4.13785187e+04, 5.34812728e+01],
        [2.14871781e+06, 2.77719374e+03],
        [5.32221110e+05, 6.87889834e+02]]))

In [5]:
# Partition the transactions on the "isFraud" label:
# fraud_yes keeps rows where isFraud == 1, fraud_no keeps rows where isFraud == 0.
fraud_yes = df.loc[df['isFraud'].eq(1)]
fraud_no = df.loc[df['isFraud'].eq(0)]

In [6]:
fraud_no.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6354407.0,6354407.0,6354407.0,6354407.0,6354407.0,6354407.0,6354407.0,6354407.0
mean,243.24,178197.04,832828.71,855970.23,1101420.87,1224925.68,0.0,0.0
std,142.14,596236.98,2887144.03,2924986.96,3399201.79,3673815.71,0.0,0.0
min,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13368.4,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74684.72,14069.0,0.0,133311.8,214881.7,0.0,0.0
75%,334.0,208364.76,106969.5,144730.74,944144.58,1111975.34,0.0,0.0
max,718.0,92445516.64,43818855.3,43686616.33,356015889.35,356179278.92,0.0,0.0


In [7]:
fraud_yes.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,8213.0,8213.0,8213.0,8213.0,8213.0,8213.0,8213.0,8213.0
mean,368.41,1467967.3,1649667.61,192392.63,544249.62,1279707.62,1.0,0.0
std,216.39,2404252.95,3547719.44,1965666.46,3336420.95,3908816.53,0.0,0.04
min,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,181.0,127091.33,125822.44,0.0,0.0,0.0,1.0,0.0
50%,367.0,441423.44,438983.45,0.0,0.0,4676.42,1.0,0.0
75%,558.0,1517771.48,1517771.48,0.0,147828.66,1058725.22,1.0,0.0
max,743.0,10000000.0,59585040.37,49585040.37,236230516.82,236726494.66,1.0,1.0


In [8]:
# Manually calculated two-sample t-statistic for the difference in mean "amount"
# between fraud and non-fraud transactions; serves as a sanity check on the
# library t-test used to obtain the p-value.
# H0 : mean Amount(isFraud == 1) = mean Amount(isFraud == 0)
# Ha : mean Amount(isFraud == 1) ≠ mean Amount(isFraud == 0)
#
# This is the pooled-variance (Student) form, which assumes both groups share a
# single variance; it matches scipy.stats.ttest_ind with its default equal_var=True.
# Naming: suffix 1 = non-fraud sample, suffix 0 = fraud sample
# (note this is the opposite of the isFraud coding).

nonfraud_amount = fraud_no.amount
fraud_amount = fraud_yes.amount

# Sample sizes.
n1 = len(nonfraud_amount)
n0 = len(fraud_amount)
print("The number of nonfraud amounts is " + str(n1))
print("The number of fraud amounts is " + str(n0))

# Sample standard deviations (ddof=1 gives the unbiased n-1 denominator).
s1 = np.std(nonfraud_amount, ddof=1)
s0 = np.std(fraud_amount, ddof=1)
print("The std amount nonfraud is " + str(s1))
print("The std amount with fraud is " + str(s0))

# Sample means.
x1 = np.mean(nonfraud_amount)
x0 = np.mean(fraud_amount)
print("The mean amount nonfraud is " + str(x1))
print("The mean amount with fraud is " + str(x0))

# Pooled standard deviation: the two sample variances weighted by their degrees
# of freedom. This is NOT the standard deviation of the difference in means;
# that is se_diff below.
sp = np.sqrt(((n0 - 1) * s0 ** 2 + (n1 - 1) * s1 ** 2) / (n0 + n1 - 2))
print("The pooled standard deviation is " + str(sp))

# Reuse the means computed above instead of scanning both samples again.
mean_diff = x0 - x1
print("The mean fraud minus nonfraud amount is " + str(mean_diff))

# Standard error of the difference in means under the pooled-variance assumption.
se_diff = sp * np.sqrt((1 / n0) + (1 / n1))
t_stat = mean_diff / se_diff
print("The calculated t-stat is " + str(t_stat))


The number of nonfraud amounts is 6354407
The number of fraud amounts is 8213
The std amount nonfraud is 596236.9813471739
The std amount with fraud is 2404252.9472401612
The mean amount nonfraud is 178197.04172739814
The mean amount with fraud is 1467967.299140387
The calculated stdev_diff is 602079.9804398556
The mean fraud minus nonfraud amount is 1289770.257412989
The calculated t-stat is 194.01200466038233


In [9]:
# Two-sample t-test on "amount", fraud vs. non-fraud. equal_var=True is scipy's
# default, spelled out here: it selects the pooled-variance (Student) test, so
# the statistic is directly comparable with a manually pooled t-statistic.
# A p-value reported as 0.0 rejects the null hypothesis at any conventional level,
# i.e. the test result shows the mean Amount is not the same for the "isFraud" categories.
# NOTE(review): if the two groups' variances differ substantially, consider
# equal_var=False (Welch's t-test), which drops the equal-variance assumption.
ttest_ind(fraud_amount, nonfraud_amount, equal_var=True)

Ttest_indResult(statistic=194.01200466037974, pvalue=0.0)