  # Challenge

  ## Identifying Outliers using Standard Deviation

In [238]:
# initial imports
import os
import random

import numpy as np
import pandas as pd
from sqlalchemy import create_engine



In [239]:
# create a connection to the database
# The connection URL can be overridden through the TRANSACTION_DB_URL environment
# variable so that real credentials never need to be saved in the notebook; the
# fallback value is the original local development database.
db_url = os.environ.get(
    "TRANSACTION_DB_URL",
    "postgresql://postgres:postgres@localhost:5432/transaction_db",
)
engine = create_engine(db_url)

# generate query string
# Joins every credit card transaction to its card, merchant and merchant category,
# ordered by card holder, card number, transaction date and transaction amount.
query = """
    select  a.transactionid, a.transactiondate, a.transactionamount, 
            a.creditcardnumber, a.merchantid, b.cardholderid, c.merchantname, d.merchantcategoryname
    FROM cctransaction as a 
    INNER join creditcard as b 
    ON a.creditcardnumber = b.creditcardnumber
    INNER join merchant as c
    ON a.merchantid = c.merchantid
    INNER join merchantcategory as d
    ON c.merchantcategoryid = d.merchantcategoryid
    ORDER BY b.cardholderid, a.creditcardnumber, a.transactiondate, a.transactionamount;
"""

# Read the SQL query into a DataFrame
txn_all = pd.read_sql(query, engine)

# Show only the first rows of the DataFrame instead of rendering the whole table
txn_all.head(10)

Unnamed: 0,transactionid,transactiondate,transactionamount,creditcardnumber,merchantid,cardholderid,merchantname,merchantcategoryname
0,3490,2018-01-02 16:14:55,3.12,3517111172421930,21,1,Robertson-Smith,pub
1,1436,2018-01-10 13:41:23,11.50,3517111172421930,49,1,"Davis, Lowe and Baxter",food truck
2,1560,2018-01-14 13:30:29,10.94,3517111172421930,19,1,Santos-Fitzgerald,pub
3,2978,2018-01-28 14:38:33,19.93,3517111172421930,71,1,Greene LLC,restaurant
4,1914,2018-01-29 06:32:49,10.24,3517111172421930,49,1,"Davis, Lowe and Baxter",food truck
5,3205,2018-01-30 16:34:45,16.91,3517111172421930,112,1,Greer Inc,bar
6,1545,2018-01-31 09:42:00,10.49,3517111172421930,77,1,"Brown, Ballard and Glass",restaurant
7,1516,2018-02-01 20:44:12,10.24,3517111172421930,51,1,Fisher-Bolton,restaurant
8,1836,2018-02-14 11:14:37,11.38,3517111172421930,18,1,Romero-Jordan,food truck
9,2785,2018-02-19 12:45:15,15.59,3517111172421930,38,1,Brown LLC,bar


In [240]:
# code a function to identify outliers based on standard deviation
def find_outliers_std(df_in, col_name, num_std=3):
    """Return the rows of ``df_in`` whose ``col_name`` value is a standard-deviation outlier.

    A row is an outlier when its value lies strictly below
    ``mean - num_std * std`` or strictly above ``mean + num_std * std``,
    where ``mean`` and ``std`` are computed over ``df_in[col_name]`` with
    pandas' ``Series.mean`` / ``Series.std``.

    The computed statistics and limits are printed as a side effect.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Data to inspect.
    col_name : str
        Name of the numeric column to test.
    num_std : int or float, default 3
        Number of standard deviations from the mean at which the limits are set.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df_in`` (all columns, original index) outside the limits.
    """
    values = df_in[col_name]

    # Set upper and lower limit to `num_std` standard deviations around the mean
    data_std = values.std()
    data_mean = values.mean()
    outliers_cut_off = data_std * num_std

    print("Standard Deviation: ", data_std, "Mean: ", data_mean, "Outliers Cutoff: ", outliers_cut_off)

    lower_limit = data_mean - outliers_cut_off
    upper_limit = data_mean + outliers_cut_off

    print("Lower Limit: ", lower_limit, "Upper Limit: ", upper_limit)

    # Strict comparisons: values exactly on a limit are not reported
    is_outlier = (values < lower_limit) | (values > upper_limit)

    return df_in.loc[is_outlier]

In [241]:
# find anomalous transactions for 3 random card holders
# NOTE: random.sample is not seeded here, so a different set of card holders
# may be drawn on every run.

cchidlist = txn_all.cardholderid.unique().tolist()
sample_cardholder = random.sample(cchidlist, 3)

# Collect the per-card-holder results and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0.
outlier_frames = []

for cchid in sample_cardholder:
    print("Cardholder ID: ", cchid)

    txn_data_for_cchid = txn_all[txn_all["cardholderid"] == cchid]
    df_out = find_outliers_std(txn_data_for_cchid, 'transactionamount')

    outlier_frames.append(df_out)

outliers_std = pd.concat(outlier_frames)

outliers_std

Cardholder ID:  6
Standard Deviation:  391.2879258989864 Mean:  115.3125 Outliers Cutoff:  1173.863777696959
Lower Limit:  -1058.551277696959 Upper Limit:  1289.176277696959
Cardholder ID:  3
Standard Deviation:  329.35605432335615 Mean:  139.17224137931035 Outliers Cutoff:  988.0681629700684
Lower Limit:  -848.8959215907581 Upper Limit:  1127.2404043493789
Cardholder ID:  18
Standard Deviation:  324.71421622143686 Mean:  90.982030075188 Outliers Cutoff:  974.1426486643106
Lower Limit:  -883.1606185891226 Upper Limit:  1065.1246787394987


Unnamed: 0,transactionid,transactiondate,transactionamount,creditcardnumber,merchantid,cardholderid,merchantname,merchantcategoryname
591,2710,2018-04-21 19:41:51,2108.0,3581345943543942,130,6,"Brown, Estrada and Powers",coffee shop
603,3225,2018-07-03 14:56:36,1398.0,3581345943543942,8,6,Russell-Thomas,restaurant
610,1459,2018-08-05 01:06:38,1379.0,3581345943543942,145,6,Hood-Phillips,bar
615,3125,2018-09-02 06:17:00,2001.0,3581345943543942,18,6,Romero-Jordan,food truck
616,2984,2018-09-11 15:16:47,1856.0,3581345943543942,138,6,Mccullough-Murphy,food truck
255,1334,2018-07-11 16:55:22,1159.0,30078299053512,107,3,Rowe-Abbott,pub
256,1349,2018-07-14 06:09:18,1160.0,30078299053512,136,3,Martinez-Robinson,bar
2357,3098,2018-02-19 22:48:25,1839.0,344119623920892,95,18,Baxter-Smith,restaurant
2367,1359,2018-04-03 03:23:37,1077.0,344119623920892,100,18,Townsend-Anderson,restaurant
2372,3139,2018-06-03 20:02:28,1814.0,344119623920892,123,18,"Boone, Davis and Townsend",pub


  ## Identifying Outliers Using Interquartile Range

In [242]:
# code a function to identify outliers based on interquartile range

def find_outliers_iqr(df_in, col_name, iqr_multiplier=1.5):
    """Return the rows of ``df_in`` whose ``col_name`` value is an interquartile-range outlier.

    A row is an outlier when its value lies strictly below
    ``Q1 - iqr_multiplier * IQR`` or strictly above ``Q3 + iqr_multiplier * IQR``,
    where ``Q1`` / ``Q3`` are the 0.25 / 0.75 quantiles of ``df_in[col_name]``
    and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Data to inspect.
    col_name : str
        Name of the numeric column to test.
    iqr_multiplier : int or float, default 1.5
        How many interquartile ranges beyond the quartiles the fences are placed.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df_in`` (all columns, original index) outside the fences.
    """
    values = df_in[col_name]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # Interquartile range

    fence_low = q1 - iqr_multiplier * iqr
    fence_high = q3 + iqr_multiplier * iqr

    # Strict comparisons: values exactly on a fence are not reported
    is_outlier = (values < fence_low) | (values > fence_high)

    return df_in.loc[is_outlier]

In [244]:
# find anomalous transactions for 3 random card holders
# NOTE: random.sample is not seeded here, so a different set of card holders
# may be drawn on every run.

cchidlist = txn_all.cardholderid.unique().tolist()
sample_cardholder = random.sample(cchidlist, 3)

# Collect the per-card-holder results and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0.
outlier_frames = []

for cchid in sample_cardholder:
    print("Cardholder ID: ", cchid)

    txn_data_for_cchid = txn_all[txn_all["cardholderid"] == cchid]
    df_out = find_outliers_iqr(txn_data_for_cchid, 'transactionamount')

    outlier_frames.append(df_out)

outliers_iqr = pd.concat(outlier_frames)

outliers_iqr

Cardholder ID:  12
Cardholder ID:  2
Cardholder ID:  22


Unnamed: 0,transactionid,transactiondate,transactionamount,creditcardnumber,merchantid,cardholderid,merchantname,merchantcategoryname
1527,99,2018-01-02 23:27:46,1031.0,501879657465,95,12,Baxter-Smith,restaurant
1531,2667,2018-01-23 06:29:37,1678.0,501879657465,92,12,Garcia-White,pub
1541,2610,2018-03-12 00:44:01,1530.0,501879657465,20,12,Kim-Lopez,coffee shop
1543,236,2018-03-20 10:19:25,852.0,501879657465,35,12,Jarvis-Turner,pub
1553,1622,2018-06-21 13:16:25,1102.0,501879657465,128,12,"Pitts, Salinas and Garcia",coffee shop
1558,3318,2018-06-27 01:27:09,1592.0,501879657465,136,12,Martinez-Robinson,bar
1559,1129,2018-06-28 21:13:52,1108.0,501879657465,35,12,Jarvis-Turner,pub
1577,1856,2018-09-23 19:20:23,1075.0,501879657465,13,12,Giles and Sons,pub
1586,2240,2018-11-23 09:08:05,233.0,501879657465,47,12,Martin Inc,restaurant
1587,1204,2018-11-25 20:44:07,1123.0,501879657465,59,12,Williams Group,bar
