The idea of this kernel is to test the transaction `Amount` data using statistical approaches: the three-sigma rule and Benford's law.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
%matplotlib inline

## Explore data ##

In [None]:
# Load the transactions CSV; the first row of the file holds the column names.
data = pd.read_csv(
    '../input/creditcard.csv',
    header=0,
)

In [None]:
# Split the transactions by label: Class == 1 is fraud, Class == 0 is legal.
data_f = data.loc[data['Class'] == 1]
data_l = data.loc[data['Class'] == 0]

# Side-by-side histograms of the transaction amounts of both groups.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(11, 4))

ax1.hist(data_f['Amount'])
ax1.set_title('Fraud')

ax2.hist(data_l['Amount'])
ax2.set_title('Not Fraud')

fig.tight_layout()
plt.show()

## Three-Sigma Rule ##

In [None]:
# Three-sigma rule: flag every transaction whose Amount lies at least three
# standard deviations away from the mean of all amounts, then score the flagged
# set against the true Class labels.
amounts = data.Amount

avg = np.average(amounts)
stdev = np.std(amounts)  # numpy default (ddof=0), i.e. the population standard deviation
print('Average for all Transactions =', avg)
print('Standart devition for all Transactions =', stdev)
data_3sigma = data[(data.Amount >= avg + 3 * stdev) | (data.Amount <= avg - 3 * stdev)]

# Amount histograms of the flagged transactions, split by their true label.
fig2 = plt.figure(num=None, figsize=(11, 4))

ax1 = fig2.add_subplot(121)
ax1.hist(data_3sigma[data_3sigma.Class == 0].Amount)
ax1.title.set_text('Not Fraud')

ax2 = fig2.add_subplot(122)
ax2.hist(data_3sigma[data_3sigma.Class == 1].Amount)
ax2.title.set_text('Fraud')

plt.tight_layout()
plt.show()

# Confusion counts, treating "flagged by the rule" as a fraud prediction.
tp = data_3sigma[data_3sigma.Class == 1].Amount.count()  # flagged and fraudulent
fp = data_3sigma[data_3sigma.Class == 0].Amount.count()  # flagged but legal
fn = data[data.Class == 1].Amount.count() - tp           # fraudulent but not flagged

print('True Positive hits:', tp)
print('False Posititve hits:', fp)
print('False Negative:', fn)

# Precision, recall and F1 are undefined when their denominator is zero (nothing
# flagged, no fraud present, or no fraud caught); report 0.0 in that case instead
# of producing NaN or raising ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = 2 * recall * precision / (recall + precision) if (recall + precision) > 0 else 0.0

print('F1 score:', f1)

It looks like the three-sigma rule does not work well for this data.
## Benford's Law (first digit) ##

In [None]:
# Observed first-digit frequencies (in percent) of all amounts >= 1.
# The leading digit is the amount floor-divided by 10**(number of integer digits - 1).
a = amounts[amounts >= 1].apply(lambda x: x // 10**(len(str(math.floor(x))) - 1))
num_counts = a.value_counts()
num_total = a.count()
num_percent = 100 * num_counts / num_total

# Expected frequency (in percent) of every leading digit d from 1 to 9 under
# Benford's law: P(d) = log10(1 + 1/d). The base must be 10 so that the nine
# probabilities sum to log10(10) = 1.
exp = pd.Series({d: 100 * math.log10(1 + 1 / d) for d in range(1, 10)})

# Compare on the digits 1..9; a digit that never occurs has an observed
# frequency of 0 rather than a missing value.
df = pd.DataFrame({
    'Expected': exp,
    'Real': num_percent.reindex(exp.index, fill_value=0),
})

print('Expected frequencies :\n', exp.sort_index(), '\n')
print('Real frequencies:\n',num_percent.sort_index())

df.plot.bar(figsize = (13,7))

There are suspicious deviations for digit '1' and digit '9'. The small number of fraudulent transactions could not cause deviations of this size on its own, but let's check the first-digit counts of the fraud data.

In [None]:
# Count how often each leading digit occurs among the fraudulent amounts of at
# least 1, and list the counts in digit order.
a = (
    data_f.Amount
    .loc[lambda amt: amt >= 1]
    .apply(lambda amt: amt // 10 ** (len(str(math.floor(amt))) - 1))
    .value_counts()
)
print(a.sort_index())

## Benford's Law (first two digits) ##

In [None]:
# Observed first-two-digit frequencies (in percent) of all amounts >= 10.
# The leading pair is the amount floor-divided by 10**(number of integer digits - 2).
a = amounts[amounts >= 10].apply(lambda x: x // 10**(len(str(math.floor(x))) - 2))
num_counts = a.value_counts()
num_total = a.count()
num_percent = 100 * num_counts / num_total

# Expected frequency (in percent) of every leading pair d from 10 to 99 under
# Benford's law: P(d) = log10(1 + 1/d). The base must be 10 so that the ninety
# probabilities sum to log10(100 / 10) = 1.
exp = pd.Series({d: 100 * math.log10(1 + 1 / d) for d in range(10, 100)})

# Compare on the pairs 10..99; a pair that never occurs has an observed
# frequency of 0 rather than a missing value, so later test statistics computed
# from `df` stay finite.
df = pd.DataFrame({
    'Expected': exp,
    'Real': num_percent.reindex(exp.index, fill_value=0),
})

df.plot.bar(figsize = (14,9))

In [None]:
# Count how often each leading two-digit pair occurs among the fraudulent
# amounts of at least 10, and show the ten most frequent pairs.
a = (
    data_f.Amount
    .loc[lambda amt: amt >= 10]
    .apply(lambda amt: amt // 10 ** (len(str(math.floor(amt))) - 2))
    .value_counts()
)
print(a.sort_values(ascending=False).head(10))

As we can see, the distributions do not even look similar, but to be sure we can apply some statistical tests.

## Z-Statistics ##

In [None]:
# Z-statistic with continuity correction for every row of `df`:
#   Z = (|p_exp - p_obs| - 1 / (2 * n)) / sqrt(p_exp * (1 - p_exp) / n)
# `df` stores percentages, so both columns are divided by 100; n is `num_total`.
for index, expected_pct, real_pct in zip(df.index, df.Expected, df.Real):
    p_exp = expected_pct / 100
    deviation = math.fabs(p_exp - real_pct / 100)
    std_error = math.sqrt((p_exp * (1 - p_exp)) / num_total)
    print('Digit', index,
          'Z-Statistics', (deviation - (1 / (2 * num_total))) / std_error)

At a significance level of **5 percent**, the cutoff score is **1.96**. Most of the digit combinations exceed this cutoff score.

## Chi-Square Test ##
At a significance level of **5 percent** and 89 degrees of freedom (90 possible first-two-digit combinations minus one), the cutoff score is **112.02**.

In [None]:
# Pearson chi-square statistic: the sum over all rows of `df` of
# (observed - expected)**2 / expected, with the percentages stored in `df`
# converted to absolute counts through `num_total`.
chsq = 0
for expected_pct, real_pct in zip(df.Expected, df.Real):
    expected_count = expected_pct * num_total / 100
    observed_count = real_pct * num_total / 100
    chsq += (observed_count - expected_count) ** 2 / expected_count
print('Chi-Square', chsq)

This means that if the calculated chi-square value exceeds 112.02 then the null hypothesis of conformity of the first-two digits must be rejected and we would conclude that the data does not conform to Benford’s Law.