  # Challenge

  ## Identifying Outliers using Standard Deviation

In [6]:
# initial imports
import pandas as pd
import numpy as np
import random
from sqlalchemy import create_engine
import psycopg2
%matplotlib inline

In [2]:
# create a connection to the database
engine = create_engine("postgresql://postgres:postgres@localhost:5432/fraud_detection")



In [3]:
# code a function to identify outliers based on standard deviation
# loading data for three random card holders from the database
query = """

SELECT transaction.date, credit_card.id_card_holder, card_holder.name, credit_card.card, transaction.amount, merchant.merchant_name, merchant_category.merchant_category_name

FROM card_holder
LEFT JOIN credit_card
ON credit_card.id_card_holder = card_holder.id

LEFT JOIN transaction
ON transaction.card = credit_card.card

LEFT JOIN merchant 
ON merchant.id_merchant = transaction.id_merchant

LEFT JOIN merchant_category
ON merchant_category.id_merchant_category = merchant.id_merchant_category


"""
fraud_detection_df = pd.read_sql_query(query, engine)

fraud_detection_df.set_index(["id_card_holder"],inplace=True)

fraud_detection_df.head()

random_card_holders = fraud_detection_df.sample(n=3,replace=True)
random_card_holders

Unnamed: 0_level_0,date,name,card,amount,merchant_name,merchant_category_name
id_card_holder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12,2018-02-26 07:25:47,Megan Price,376027549341849,7.23,Robles Inc,bar
18,2018-08-17 17:32:11,Malik Carlson,344119623920892,11.24,Robles Inc,bar
7,2018-07-08 14:11:48,Sean Taylor,4539990688484983,17.16,Wilson and Sons,restaurant


In [7]:
# find anomalous transactions for 3 random card holders
rand_card_holders = fraud_detection_df.loc[[7,12,18]]
rand_card_holders.head()

Unnamed: 0_level_0,date,name,card,amount,merchant_name,merchant_category_name
id_card_holder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7,2018-01-04 01:35:21,Sean Taylor,3516952396080247,20.33,Greer Inc,bar
7,2018-01-04 03:05:18,Sean Taylor,3516952396080247,1685.0,"Kelly, Dyer and Schmitt",food truck
7,2018-01-15 06:23:30,Sean Taylor,4539990688484983,15.87,White-Hall,bar
7,2018-01-21 05:38:08,Sean Taylor,3516952396080247,16.29,Carter-Blackwell,pub
7,2018-01-23 06:23:05,Sean Taylor,3516952396080247,2.67,Fisher-Bolton,restaurant


  ## Identifying Outliers Using Interquartile Range

In [17]:
first_card_holder = rand_card_holders.loc[7]
second_card_holder = rand_card_holders.loc[12]
third_card_holder = rand_card_holders.loc[18]

first_stats = first_card_holder.describe()
second_stats = second_card_holder.describe()
third_stats = third_card_holder.describe()

print(first_stats)
print(second_stats)
print(third_stats)

print("-----------OUTLIERS----------------")
print("Amounts higher than 75% percentile")
print("-----------------------------------")

first_card_holder_max = first_card_holder.max()
print(first_card_holder_max)

second_card_holder_max = second_card_holder.max()
print(second_card_holder_max)

third_card_holder_max = third_card_holder.max()
print(third_card_holder_max)

outlier_first = first_card_holder["amount"].mean() < first_card_holder["amount"].mean() + 3 * first_card_holder["amount"].std()

print("--------------------------------------------")
print("Amounts that are higher than 3 times St. Dev.")
print("--------------------------------------------")
print(f"Outlier cardholder 12:{outlier_first}")
print("---------------------------------------------")

outlier_second = second_card_holder["amount"].mean() < second_card_holder["amount"].mean() + 3 * second_card_holder["amount"].std()
print(f"Outlier cardholder 15:{outlier_second}")
print("------------------------------------------------")

outlier_third = third_card_holder["amount"].mean() < third_card_holder["amount"].mean() + 3 * third_card_holder["amount"].std()
print(f"Outlier cardholder 16:{outlier_third}")

            amount
count   139.000000
mean     82.225612
std     314.551436
min       0.730000
25%       3.655000
50%      10.450000
75%      16.025000
max    2249.000000
            amount
count   213.000000
mean     74.007371
std     283.654229
min       0.700000
25%       4.120000
50%      10.280000
75%      15.670000
max    1802.000000
            amount
count   133.000000
mean     90.982030
std     324.714216
min       0.670000
25%       3.460000
50%      10.370000
75%      16.160000
max    1839.000000
-----------OUTLIERS----------------
Amounts higher than 75% percentile
-----------------------------------
date                      2018-12-30 22:16:50
name                              Sean Taylor
card                         4539990688484983
amount                                   2249
merchant_name                   Young-Navarro
merchant_category_name             restaurant
dtype: object
date                           2018-12-27 03:58:44
name                                   

In [28]:
# code a function to identify outliers based on interquartile range
import scipy
from scipy.stats import iqr

print("---------------IQR-----------------")

iqr_first = iqr(first_card_holder["amount"])
print(iqr_first)
iqr_second = iqr(second_card_holder["amount"])
print(iqr_second)
iqr_third = iqr(third_card_holder["amount"])
print(iqr_third)

print("----------------Q3--------------------")

q3_first = np.percentile(first_card_holder["amount"], 75)
print(q3_first)
q3_second = np.percentile(second_card_holder["amount"], 75)
print(q3_second)
q3_third = np.percentile(third_card_holder["amount"], 75)
print(q3_third)

---------------IQR-----------------
12.369999999999997
11.55
12.7
----------------Q3--------------------
16.025
15.67
16.16


In [38]:
# find anomalous transactions for 3 random card holders

outlier_first = []
outlier_second = []
outlier_third = []

for amount in first_card_holder["amount"]: 
    if amount > (q3_first + 1.5 * iqr_first):
        outlier_first.append(amount)

print('Cardholder 12 outlier in the dataset is', amount) 

for amount in second_card_holder["amount"]: 
    if amount > (q3_second + 1.5 * iqr_second):
         outlier_second.append(amount)

print('Cardholder 15 outlier in the dataset is', amount) 

for amount in third_card_holder["amount"]: 
    if amount > (q3_third + 1.5 * iqr_third):
         outlier_third.append(amount)
         
print('Cardholder 16 outlier in the dataset is', amount) 

Cardholder 12 outlier in the dataset is 3.85
Cardholder 15 outlier in the dataset is 17.44
Cardholder 16 outlier in the dataset is 12.25
