  # Challenge

  ## Identifying Outliers using Standard Deviation

In [60]:
# initial imports
import pandas as pd
import numpy as np
import random
from sqlalchemy import create_engine
import psycopg2
%matplotlib inline

In [3]:
# create a connection to the database
engine = create_engine("postgresql://postgres:postgres@localhost:5432/fraud_detection")



In [4]:
# code a function to identify outliers based on standard deviation
# loading data for three random card holders from the database
query = """

SELECT transaction.date, credit_card.id_card_holder, card_holder.name, credit_card.card, transaction.amount, merchant.merchant_name, merchant_category.merchant_category_name

FROM card_holder
LEFT JOIN credit_card
ON credit_card.id_card_holder = card_holder.id

LEFT JOIN transaction
ON transaction.card = credit_card.card

LEFT JOIN merchant 
ON merchant.id_merchant = transaction.id_merchant

LEFT JOIN merchant_category
ON merchant_category.id_merchant_category = merchant.id_merchant_category


"""
fraud_detection_df = pd.read_sql_query(query, engine)

fraud_detection_df.set_index(["id_card_holder"],inplace=True)

fraud_detection_df.head()

random_card_holders = fraud_detection_df.sample(n=3,replace=True)
random_card_holders

Unnamed: 0_level_0,date,name,card,amount,merchant_name,merchant_category_name
id_card_holder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12,2018-10-30 10:17:41,Megan Price,376027549341849,1.58,Russell-Thomas,restaurant
4,2018-07-06 16:16:33,Danielle Green,4263694062533017,8.74,Robles Inc,bar
15,2018-12-10 04:30:42,Kyle Tucker,6500236164848279,1.78,"Bond, Lewis and Rangel",restaurant


In [10]:
# find anomalous transactions for 3 random card holders
rand_card_holders = fraud_detection_df.loc[[4,12,15]]
rand_card_holders.head()

Unnamed: 0_level_0,date,name,card,amount,merchant_name,merchant_category_name
id_card_holder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,2018-01-01 23:13:30,Danielle Green,4263694062533017,19.03,Miller-Blevins,pub
4,2018-01-02 01:13:21,Danielle Green,4263694062533017,11.24,"Williams, Wright and Wagner",coffee shop
4,2018-01-08 19:31:23,Danielle Green,584226564303,17.43,Santos-Fitzgerald,pub
4,2018-01-13 18:34:36,Danielle Green,4263694062533017,2.22,"Curry, Scott and Richardson",bar
4,2018-01-14 03:20:21,Danielle Green,584226564303,14.69,"Scott, Hess and Finley",bar


  ## Identifying Outliers Using Interquartile Range

In [194]:
first_card_holder = rand_card_holders.loc[4]
second_card_holder = rand_card_holders.loc[12]
third_card_holder = rand_card_holders.loc[15]

first_stats = first_card_holder.describe()
second_stats = second_card_holder.describe()
third_stats = third_card_holder.describe()

print(first_stats)
print(second_stats)
print(third_stats)

print("-----------OUTLIERS----------------")

treshold_first = (first_card_holder["amount"].mean()) + 3 * first_card_holder["amount"].std()
outlier_first = first_card_holder["amount"] > treshold_first
outlier_first.append(outlier_first)

treshold_second = (second_card_holder["amount"].mean()) + 3 * second_card_holder["amount"].std()
outlier_second= second_card_holder["amount"] > treshold_second

treshold_third = (third_card_holder["amount"].mean()) + 3 * third_card_holder["amount"].std()
outlier_third = third_card_holder["amount"] > treshold_third

print("--------------------------------------------")

print("Amounts that are higher than mean plus 3 times standard dev.")
print("--------------------------------------------")
print(f"Outlier cardholder 4: {outlier_first}")
print(f"Outlier cardholder 12: {outlier_second}")
print(f"Outlier cardholder 15: {outlier_third}")

print("---------------------------------------------")

           amount
count  148.000000
mean     9.261824
std      5.751302
min      0.700000
25%      3.640000
50%     10.205000
75%     12.172500
max     21.500000
            amount
count   213.000000
mean     74.007371
std     283.654229
min       0.700000
25%       4.120000
50%      10.280000
75%      15.670000
max    1802.000000
           amount
count  138.000000
mean     9.567609
std      5.849357
min      0.760000
25%      3.652500
50%     10.345000
75%     15.040000
max     20.810000
-----------OUTLIERS----------------
--------------------------------------------
Amounts that are higher than mean plus 3 times standard dev.
--------------------------------------------
Outlier cardholder 4: id_card_holder
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    False
4    F

In [61]:
# code a function to identify outliers based on interquartile range

import scipy
from scipy.stats import iqr

print("---------------IQR-----------------")

iqr_first = iqr(first_card_holder["amount"])
print(iqr_first)
iqr_second = iqr(second_card_holder["amount"])
print(iqr_second)
iqr_third = iqr(third_card_holder["amount"])
print(iqr_third)

print("----------------Q3--------------------")

q3_first = np.percentile(first_card_holder["amount"], 75)
print(q3_first)
q3_second = np.percentile(second_card_holder["amount"], 75)
print(q3_second)
q3_third = np.percentile(third_card_holder["amount"], 75)
print(q3_third)

---------------IQR-----------------
8.532499999999999
11.55
11.3875
----------------Q3--------------------
12.1725
15.67
15.04


In [141]:
# find anomalous transactions for 3 random card holders

outlier_first = []
outlier_second = []
outlier_third = []


for amount in first_card_holder["amount"]: 
    if amount > (q3_first + 1.5 * iqr_first):
        outlier_first.append(amount)

print('Cardholder 4 outlier in the dataset is', outlier_first) 

for amount in second_card_holder["amount"]: 
    if amount > (q3_second + 1.5 * iqr_second):
         outlier_second.append(amount)

print('Cardholder 12 outlier in the dataset is', outlier_second) 

for amount in third_card_holder["amount"]: 
    if amount > (q3_third + 1.5 * iqr_third):
           outlier_third.append(amount)
         
print('Cardholder 15 outlier in the dataset is', outlier_third) 

Cardholder 4 outlier in the dataset is []
Cardholder 12 outlier in the dataset is [1031.0, 1678.0, 1530.0, 852.0, 1102.0, 1592.0, 1108.0, 1075.0, 233.0, 1123.0, 1802.0, 748.0]
Cardholder 15 outlier in the dataset is []
