  # Challenge

  ## Identifying Outliers using Standard Deviation

In [2]:
# initial imports
import pandas as pd
import numpy as np
import random
from sqlalchemy import create_engine
import psycopg2
%matplotlib inline

In [3]:
# create a connection to the database
engine = create_engine("postgresql://postgres:postgres@localhost:5432/fraud_detection")

In [4]:
# code a function to identify outliers based on standard deviation
# loading data for three random card holders from the database
query = """

SELECT transaction.date, credit_card.id_card_holder, card_holder.name, credit_card.card, transaction.amount, merchant.merchant_name, merchant_category.merchant_category_name

FROM card_holder
LEFT JOIN credit_card
ON credit_card.id_card_holder = card_holder.id

LEFT JOIN transaction
ON transaction.card = credit_card.card

LEFT JOIN merchant 
ON merchant.id_merchant = transaction.id_merchant

LEFT JOIN merchant_category
ON merchant_category.id_merchant_category = merchant.id_merchant_category


"""
fraud_detection_df = pd.read_sql_query(query, engine)

fraud_detection_df.set_index(["id_card_holder"],inplace=True)

fraud_detection_df.head()

random_card_holders = fraud_detection_df.sample(n=3,replace=True)
random_card_holders

Unnamed: 0_level_0,date,name,card,amount,merchant_name,merchant_category_name
id_card_holder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12,2018-03-02 17:30:33,Megan Price,5297187379298983,10.19,Ramirez-Carr,coffee shop
19,2018-12-06 15:28:17,Peter Mckay,3561072557118696,18.42,"Bryant, Thomas and Collins",pub
17,2018-06-25 05:30:04,Michael Carroll,6011987562414062,14.54,"Bryant, Thomas and Collins",pub


In [6]:
# find anomalous transactions for 3 random card holders
rand_card_holders = fraud_detection_df.loc[[12,17,19]]
rand_card_holders.head()

Unnamed: 0_level_0,date,name,card,amount,merchant_name,merchant_category_name
id_card_holder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12,2018-01-02 05:45:43,Megan Price,5297187379298983,16.69,"Vega, Jones and Castro",food truck
12,2018-01-02 13:17:15,Megan Price,501879657465,2.64,Marshall-Rojas,coffee shop
12,2018-01-02 23:27:46,Megan Price,501879657465,1031.0,Baxter-Smith,restaurant
12,2018-01-06 04:43:33,Megan Price,5297187379298983,5.31,Kennedy-Chen,bar
12,2018-01-06 23:33:29,Megan Price,376027549341849,7.6,Wood-Ramirez,bar


  ## Identifying Outliers Using Interquartile Range

In [10]:
first_card_holder = rand_card_holders.loc[12]
second_card_holder = rand_card_holders.loc[17]
third_card_holder = rand_card_holders.loc[19]

first_stats = first_card_holder.describe()
second_stats = second_card_holder.describe()
third_stats = third_card_holder.describe()

print(first_stats)
print(second_stats)
print(third_stats)

print("-----------OUTLIERS----------------")

treshold_first = (first_card_holder["amount"].mean()) + 3 * first_card_holder["amount"].std()
outlier_first = first_card_holder["amount"] > treshold_first

treshold_second = (second_card_holder["amount"].mean()) + 3 * second_card_holder["amount"].std()
outlier_second= second_card_holder["amount"] > treshold_second

treshold_third = (third_card_holder["amount"].mean()) + 3 * third_card_holder["amount"].std()
outlier_third = third_card_holder["amount"] > treshold_third

print("--------------------------------------------")

print("Amounts that are higher than mean plus 3 times standard dev.")

print("--------------------------------------------")

for amount_first in first_card_holder["amount"]:
    if amount_first > treshold_first:
        print(f"Outlier Transactions for Cardholder 12: {amount_first}")
    

for amount_second in second_card_holder["amount"]:
    if amount_second > treshold_second:
       print(f"Outlier Transactions for Cardholder 17: {amount_second}")
       

for amount_third in third_card_holder["amount"]:
    if amount_third > treshold_third:
      print(f"Outlier Transactions for Cardholder 19: {amount_third}")

            amount
count   213.000000
mean     74.007371
std     283.654229
min       0.700000
25%       4.120000
50%      10.280000
75%      15.670000
max    1802.000000
          amount
count  61.000000
mean   10.063115
std     5.634782
min     0.720000
25%     5.190000
50%    10.290000
75%    14.540000
max    22.490000
           amount
count  205.000000
mean     8.920732
std      5.600654
min      0.610000
25%      3.530000
50%     10.150000
75%     12.220000
max     22.290000
-----------OUTLIERS----------------
--------------------------------------------
Amounts that are higher than mean plus 3 times standard dev.
--------------------------------------------
Outlier Transactions for Cardholder 12: 1031.0
Outlier Transactions for Cardholder 12: 1678.0
Outlier Transactions for Cardholder 12: 1530.0
Outlier Transactions for Cardholder 12: 1102.0
Outlier Transactions for Cardholder 12: 1592.0
Outlier Transactions for Cardholder 12: 1108.0
Outlier Transactions for Cardholder 12: 1075.

In [11]:
# code a function to identify outliers based on interquartile range
import scipy
from scipy.stats import iqr

print("---------------IQR-----------------")

iqr_first = iqr(first_card_holder["amount"])
print(iqr_first)
iqr_second = iqr(second_card_holder["amount"])
print(iqr_second)
iqr_third = iqr(third_card_holder["amount"])
print(iqr_third)

print("----------------Q3--------------------")

q3_first = np.percentile(first_card_holder["amount"], 75)
print(q3_first)
q3_second = np.percentile(second_card_holder["amount"], 75)
print(q3_second)
q3_third = np.percentile(third_card_holder["amount"], 75)
print(q3_third)

---------------IQR-----------------
11.55
9.349999999999998
8.690000000000001
----------------Q3--------------------
15.67
14.54
12.22


In [13]:
# find anomalous transactions for 3 random card holders
outlier_first = []
outlier_second = []
outlier_third = []

for amount in first_card_holder["amount"]: 
    if amount > (q3_first + 1.5 * iqr_first):
        outlier_first.append(amount)

print('Cardholder 12 outlier in the dataset is', outlier_first) 

for amount in second_card_holder["amount"]: 
    if amount > (q3_second + 1.5 * iqr_second):
         outlier_second.append(amount)

print('Cardholder 17 outlier in the dataset is', outlier_second) 

for amount in third_card_holder["amount"]: 
    if amount > (q3_third + 1.5 * iqr_third):
           outlier_third.append(amount)
         
print('Cardholder 19 outlier in the dataset is', outlier_third) 

Cardholder 12 outlier in the dataset is [1031.0, 1678.0, 1530.0, 852.0, 1102.0, 1592.0, 1108.0, 1075.0, 233.0, 1123.0, 1802.0, 748.0]
Cardholder 17 outlier in the dataset is []
Cardholder 19 outlier in the dataset is []
