In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Report the versions of the core libraries, one `alias==version` line each.
for _alias, _module in (('pd', pd), ('np', np), ('sns', sns)):
    print(f'{_alias}=={_module.__version__}')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


pd==2.2.0
np==1.26.4
sns==0.13.2


In [3]:
# Read in the train and test transaction tables, indexed by TransactionID.
# The dataset directory is defined once so it can be re-pointed in a single place
# instead of editing every read_csv call.
DATA_DIR = '/Users/oskarwallberg/desktop/kaggle-datasets/ieee-fraud-detection'

train_transaction = pd.read_csv(f'{DATA_DIR}/train_transaction.csv', index_col='TransactionID')
test_transaction = pd.read_csv(f'{DATA_DIR}/test_transaction.csv', index_col='TransactionID')

# Ensure congruent columns: the test frame takes over the train column names
# (minus the target 'isFraud') positionally.
# NOTE(review): this assumes both files list their columns in the same order - confirm
# against the raw CSV headers, otherwise columns would be silently mislabelled.
test_transaction.columns = train_transaction.columns.drop(labels='isFraud')

train_transaction.shape, test_transaction.shape

((590540, 393), (506691, 392))

In [4]:
# Split the training rows by label: isFraud == 1 (fraud) vs. isFraud == 0 (valid).
is_fraud = train_transaction['isFraud'] == 1
is_valid = train_transaction['isFraud'] == 0
fraud_transaction = train_transaction.loc[is_fraud]
valid_transaction = train_transaction.loc[is_valid]

# Sanity check: the two subsets together must account for every training row.
assert len(fraud_transaction) + len(valid_transaction) == len(train_transaction)
fraud_transaction.shape, valid_transaction.shape

((20663, 393), (569877, 393))

In [149]:
# Share of fraudulent transactions among all training rows.
FRAUD_RATE = len(fraud_transaction) / len(train_transaction)
FRAUD_RATE

0.03499000914417313

In [5]:
# Preprocessing steps
#   1. Encoding features as numerical
#   2. Filling NaNs
#   3. Feature engineering (FE): transforming, enriching, and creating features
#   4. Selecting features
#   5. SMOTE and other over-/undersampling techniques
#   6. Outlier detection and filtering
# Done!

In [6]:
# Numerical encoding: take stock of the object-dtype columns first.
object_columns = train_transaction.select_dtypes(include=object).columns

# M1-M3 and M5-M9: encode as 0/1 for False/True.
binary_object_columns = [f'M{i}' for i in (1, 2, 3, 5, 6, 7, 8, 9)]
# Remaining object columns: explore (EDA) and encode each as best suited.
diverse_object_columns = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M4']

# Print the number of distinct non-null values for every object column.
for col, n_unique in train_transaction[object_columns].nunique().items():
    print(col, n_unique)

ProductCD 5
card4 4
card6 4
P_emaildomain 59
R_emaildomain 60
M1 2
M2 2
M3 2
M4 3
M5 2
M6 2
M7 2
M8 2
M9 2


In [153]:
# Diverse object columns EDA
def make_pivot(df:pd.DataFrame, index:str) -> pd.DataFrame:
    """Tabulate valid/fraud counts and rates for each value of one column.

    Missing values in ``df[index]`` are grouped under the string label ``'NaN'``.
    The input frame is NOT modified; the label is applied to a temporary
    two-column frame only.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column named by ``index`` and an ``isFraud`` column
        whose values are 0 (valid) and 1 (fraud).
    index : str
        Name of the column whose values become the rows of the result.

    Returns
    -------
    pd.DataFrame
        One row per value of ``df[index]`` with the columns ``valid`` and
        ``fraud`` (row counts), ``count`` (their sum), and ``valid_norm`` /
        ``fraud_norm`` (counts divided by ``count``), sorted by ``fraud_norm``
        in descending order.
    """
    # Work on a small temporary frame so the caller's data keeps its real NaNs.
    labelled = pd.DataFrame({index: df[index].fillna('NaN'), 'isFraud': df['isFraud']})
    df_pivot = labelled.groupby([index, 'isFraud']).size().unstack('isFraud', fill_value=0)
    # Guarantee both class columns, in (valid, fraud) order, even when one class
    # does not occur in the data at all.
    df_pivot = df_pivot.reindex(columns=[0, 1], fill_value=0)
    df_pivot.columns = ['valid', 'fraud']
    df_pivot['count'] = df_pivot['valid'] + df_pivot['fraud']
    df_pivot[['valid_norm', 'fraud_norm']] = df_pivot[['valid', 'fraud']].div(other=df_pivot['count'], axis=0)
    df_pivot = df_pivot.sort_values(by='fraud_norm', ascending=False)
    return df_pivot
    


In [154]:
# Encoding ProductCD (product code feature)
pivot_prodcd = make_pivot(train_transaction, 'ProductCD')
pivot_prodcd

# NOTE Product codes ordered by fraud rate, highest first: C -> S -> H -> R -> W
# code : fraud rate : number of transactions
# C : 11.7% : 68519
# S :  5.9% : 11628
# H :  4.8% : 33024
# R :  3.8% : 37699
# W :  2.0% : 439670

# Could encode as an ordinal ranking with (C, S, H, R, W, NaN) -> (5, 4, 3, 2, 1, 0)

Unnamed: 0_level_0,valid,fraud,count,valid_norm,fraud_norm
ProductCD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C,60511,8008,68519,0.883127,0.116873
S,10942,686,11628,0.941004,0.058996
H,31450,1574,33024,0.952338,0.047662
R,36273,1426,37699,0.962174,0.037826
W,430701,8969,439670,0.979601,0.020399


In [155]:
# Encoding card4 (payment card type)
pivot_card4 = make_pivot(train_transaction, 'card4')
pivot_card4

# NOTE Card types ordered by fraud rate, highest first:
# discover -> visa -> mastercard -> american express
# discover          0.077282
# visa              0.034756
# mastercard        0.034331
# american express  0.028698

# Could encode as (discover, visa, mastercard, american express, NaN) -> (4, 3, 2, 1, 0)

Unnamed: 0_level_0,valid,fraud,count,valid_norm,fraud_norm
card4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
discover,6137,514,6651,0.922718,0.077282
visa,371394,13373,384767,0.965244,0.034756
mastercard,182721,6496,189217,0.965669,0.034331
american express,8089,239,8328,0.971302,0.028698
,1536,41,1577,0.974001,0.025999


In [156]:
# Encoding card6 (card category: credit, debit, charge card, debit or credit)
pivot_card6 = make_pivot(train_transaction, 'card6')
pivot_card6

# NOTE Create two flag columns: credit and debit
# all 'debit or credit' rows will become 'debit'
# 'charge card' is encoded as (credit, debit) = (0, 0)


Unnamed: 0_level_0,valid,fraud,count,valid_norm,fraud_norm
card6,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
credit,139036.0,9950.0,148986.0,0.933215,0.066785
,1532.0,39.0,1571.0,0.975175,0.024825
debit,429264.0,10674.0,439938.0,0.975737,0.024263
charge card,15.0,0.0,15.0,1.0,0.0
debit or credit,30.0,0.0,30.0,1.0,0.0


In [160]:
# Encoding P_emaildomain (presumably the purchaser email domain)
pivot_pdomain = make_pivot(train_transaction, 'P_emaildomain')
# Append a row for 'scranton.edu' that carries only a fraud_norm value, set to the
# overall FRAUD_RATE; the remaining columns of that row are left empty (NaN).
scranton_row = pd.DataFrame({'fraud_norm': [FRAUD_RATE]}, index=['scranton.edu'])
pivot_pdomain = pd.concat([pivot_pdomain, scranton_row])
pivot_pdomain.head(60)
# Ideas for how to encode:
#   1. Split domain strings into domain name, ending, country code and potentially other parts as separate features
#       - if it ends with com / net
#       - if it ends with a 2-letter word (country code)
#       - the first item is the domain host
#   2. Order by fraud rate and partition into groups using some threshold values
#   3. Map each full domain string to its fraud rate - the domains happen to match the test data quite well

# Sketch of splitting domain strings...
# train_transaction.loc[train_transaction['P_emaildomain'].str.contains('.', regex=False), 'P_emaildomain'].str.split('.')
# df['domain_type'] = df['email'].str.extract(r'\.(com|org)$')


Unnamed: 0,valid,fraud,count,valid_norm,fraud_norm
protonmail.com,45.0,31.0,76.0,0.592105,0.407895
mail.com,453.0,106.0,559.0,0.810376,0.189624
outlook.es,381.0,57.0,438.0,0.869863,0.130137
aim.com,275.0,40.0,315.0,0.873016,0.126984
outlook.com,4614.0,482.0,5096.0,0.905416,0.094584
hotmail.es,285.0,20.0,305.0,0.934426,0.065574
live.com.mx,708.0,41.0,749.0,0.94526,0.05474
hotmail.com,42854.0,2396.0,45250.0,0.94705,0.05295
gmail.com,218412.0,9943.0,228355.0,0.956458,0.043542
yahoo.fr,138.0,5.0,143.0,0.965035,0.034965


In [None]:
# train P                    train R                   test P                     test R                   
# aim.com                315 aim.com                36 aim.com                153 aim.com                41
# anonymous.com        36998 anonymous.com       20529 anonymous.com        34064 anonymous.com       19115
# aol.com              28289 aol.com              3701 aol.com              24048 aol.com              3538
# att.net               4033 att.net               430 att.net               3614 att.net               440
# bellsouth.net         1909 bellsouth.net         422 bellsouth.net         1528 bellsouth.net         373
# cableone.net           159 cableone.net           27 cableone.net           152 cableone.net           19
# centurylink.net        205 centurylink.net        12 centurylink.net        181 centurylink.net        16
# cfl.rr.com             172 cfl.rr.com             37 cfl.rr.com             146 cfl.rr.com             20
# charter.net            816 charter.net           127 charter.net            627 charter.net           136
# comcast.net           7888 comcast.net          1812 comcast.net           6586 comcast.net          1701
# cox.net               1393 cox.net               459 cox.net               1264 cox.net               395
# earthlink.net          514 earthlink.net          79 earthlink.net          465 earthlink.net          91
# embarqmail.com         260 embarqmail.com         68 embarqmail.com         204 embarqmail.com         72
# frontier.com           280 frontier.com           52 frontier.com           314 frontier.com           58
# frontiernet.net        195 frontiernet.net        14 frontiernet.net        202 frontiernet.net        24
# gmail                  496 gmail                  95 gmail                  497 gmail                 101
# gmail.com           228355 gmail.com           57147 gmail.com           207448 gmail.com           61738
# gmx.de                 149 gmx.de                147 gmx.de                 149 gmx.de                150
# hotmail.co.uk          112 hotmail.co.uk         105 hotmail.co.uk          222 hotmail.co.uk         212
# hotmail.com          45250 hotmail.com         27509 hotmail.com          40399 hotmail.com         25657
# hotmail.de              43 hotmail.de             42 hotmail.de              87 hotmail.de             88
# hotmail.es             305 hotmail.es            292 hotmail.es             322 hotmail.es            303
# hotmail.fr             295 hotmail.fr            293 hotmail.fr             379 hotmail.fr            374
# icloud.com            6267 icloud.com           1398 icloud.com            6049 icloud.com           1422
# juno.com               322 juno.com               53 juno.com               252 juno.com               58
# live.com              3041 live.com              762 live.com              2679 live.com              682
# live.com.mx            749 live.com.mx           754 live.com.mx            721 live.com.mx           710
# live.fr                 56 live.fr                55 live.fr                 50 live.fr                50
# mac.com                436 mac.com               218 mac.com                426 mac.com               212
# mail.com               559 mail.com              122 mail.com               597 mail.com              219
# me.com                1522 me.com                556 me.com                1191 me.com                539
# msn.com               4092 msn.com               852 msn.com               3388 msn.com               846
# netzero.com            230 netzero.com            14 netzero.com            157 netzero.com            10
# netzero.net            196 netzero.net             9 netzero.net            123 netzero.net            10
# optonline.net         1011 optonline.net         187 optonline.net          926 optonline.net         163
# outlook.com           5096 outlook.com          2507 outlook.com           4838 outlook.com          2504
# outlook.es             438 outlook.es            433 outlook.es             425 outlook.es            420
# prodigy.net.mx         207 prodigy.net.mx        207 prodigy.net.mx          96 prodigy.net.mx         96
# protonmail.com          76 protonmail.com         41 protonmail.com          83 protonmail.com         34
# ptd.net                 68 ptd.net                27 ptd.net                 72 ptd.net                43
# q.com                  189 q.com                  25 q.com                  173 q.com                  20
# roadrunner.com         305 roadrunner.com         53 roadrunner.com         278 roadrunner.com         48
# rocketmail.com         664 rocketmail.com         69 rocketmail.com         441 rocketmail.com         57
# sbcglobal.net         2970 sbcglobal.net         552 sbcglobal.net         2797 sbcglobal.net         611
# sc.rr.com              164 sc.rr.com               8 sc.rr.com              113 sc.rr.com               6
#   * NOTE MISSING *         scranton.edu           63 scranton.edu             2 scranton.edu            6
# servicios-ta.com        35 servicios-ta.com       35 servicios-ta.com        45 servicios-ta.com       45
# suddenlink.net         175 suddenlink.net         25 suddenlink.net         148 suddenlink.net         30
# twc.com                230 twc.com                29 twc.com                209 twc.com                32
# verizon.net           2705 verizon.net           620 verizon.net           2306 verizon.net           582
# web.de                 240 web.de                237 web.de                 278 web.de                277
# windstream.net         305 windstream.net         47 windstream.net         247 windstream.net         57
# yahoo.co.jp             32 yahoo.co.jp            33 yahoo.co.jp             69 yahoo.co.jp            71
# yahoo.co.uk             49 yahoo.co.uk            39 yahoo.co.uk             54 yahoo.co.uk            43
# yahoo.com           100934 yahoo.com           11842 yahoo.com            81850 yahoo.com            9563
# yahoo.com.mx          1543 yahoo.com.mx         1508 yahoo.com.mx          1284 yahoo.com.mx         1235
# yahoo.de                74 yahoo.de               75 yahoo.de                63 yahoo.de               64
# yahoo.es               134 yahoo.es               57 yahoo.es               138 yahoo.es               67
# yahoo.fr               143 yahoo.fr              137 yahoo.fr               201 yahoo.fr              178
# ymail.com             2396 ymail.com             207 ymail.com             1679 ymail.com             198

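# Encoding plan: map each domain name -> its fraud rate in the training data
# impute the overall mean fraud rate for scranton.edu (absent from train P_emaildomain)
# if the domain is unknown -> 0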

In [74]:
# Encode R_emaildomain (receiver email domain)
pivot_rdomain = make_pivot(train_transaction, 'R_emaildomain')
pivot_rdomain

# The same encoding method applies here as for P_emaildomain

Unnamed: 0_level_0,valid,fraud,count,valid_norm,fraud_norm
R_emaildomain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
protonmail.com,2.0,39.0,41.0,0.04878,0.95122
mail.com,76.0,46.0,122.0,0.622951,0.377049
netzero.net,7.0,2.0,9.0,0.777778,0.222222
outlook.com,2093.0,414.0,2507.0,0.834862,0.165138
outlook.es,376.0,57.0,433.0,0.86836,0.13164
icloud.com,1218.0,180.0,1398.0,0.871245,0.128755
gmail.com,50336.0,6811.0,57147.0,0.880816,0.119184
hotmail.com,25369.0,2140.0,27509.0,0.922207,0.077793
earthlink.net,73.0,6.0,79.0,0.924051,0.075949
hotmail.es,272.0,20.0,292.0,0.931507,0.068493


In [145]:
# Row count per R_emaildomain value in the test set, ordered by domain name.
r_domain_counts_test = test_transaction['R_emaildomain'].value_counts()
r_domain_counts_test.sort_index()

R_emaildomain
aim.com                41
anonymous.com       19115
aol.com              3538
att.net               440
bellsouth.net         373
cableone.net           19
centurylink.net        16
cfl.rr.com             20
charter.net           136
comcast.net          1701
cox.net               395
earthlink.net          91
embarqmail.com         72
frontier.com           58
frontiernet.net        24
gmail                 101
gmail.com           61738
gmx.de                150
hotmail.co.uk         212
hotmail.com         25657
hotmail.de             88
hotmail.es            303
hotmail.fr            374
icloud.com           1422
juno.com               58
live.com              682
live.com.mx           710
live.fr                50
mac.com               212
mail.com              219
me.com                539
msn.com               846
netzero.com            10
netzero.net            10
optonline.net         163
outlook.com          2504
outlook.es            420
prodigy.net.mx         9

In [76]:
# Encode M4 (match 4)
pivot_m4 = make_pivot(train_transaction, 'M4')
pivot_m4

# Ordinal-style encoding (M2, M0, M1, NaN) -> (...rates...)
# i.e. map each value to its fraud rate

Unnamed: 0_level_0,valid,fraud,count,valid_norm,fraud_norm
M4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M2,53056,6809,59865,0.886261,0.113739
M0,189207,7198,196405,0.963351,0.036649
M1,51397,1429,52826,0.972949,0.027051


In [168]:
# Preprocessing works on a separate copy of the training frame.
train_transaction_pp = train_transaction.copy(deep=True)
train_transaction_pp.shape

(590540, 393)

In [169]:
# Encode binary features T/F/NaN as 1/0/-1
# The values are read from the raw frame rather than from train_transaction_pp itself,
# so the cell is idempotent: re-encoding already-encoded values would turn every
# entry into -1 (none of 1/0/-1 equals 'T' or 'F').
binary_raw = train_transaction[binary_object_columns]
# Vectorised form of "1 if 'T' else 0 if 'F' else -1":
#   'T' -> 1 - 0 = 1,  'F' -> 0 - 0 = 0,  anything else (incl. NaN) -> 0 - 1 = -1
binary_encoded = binary_raw.eq('T').astype(int) - (~binary_raw.isin(['T', 'F'])).astype(int)
train_transaction_pp[binary_object_columns] = binary_encoded
train_transaction_pp[binary_object_columns]

Unnamed: 0_level_0,M1,M2,M3,M5,M6,M7,M8,M9
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2987000,1,1,1,0,1,-1,-1,-1
2987001,-1,-1,-1,1,1,-1,-1,-1
2987002,1,1,1,0,0,0,0,0
2987003,-1,-1,-1,1,0,-1,-1,-1
2987004,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...
3577535,1,1,1,1,0,0,0,1
3577536,1,0,0,0,1,0,0,0
3577537,1,0,0,-1,1,-1,-1,-1
3577538,1,1,1,0,1,-1,-1,-1


In [170]:
# NOTE For next time
# Make empty dataframe and add each encoded feature to it
# Check dataframe for correct encoding
# Add dataframe to training df and drop the object columns
# -------------------------------------------------------#

# Map from pivot index to series values - e.g. from product codes to fraud rates.
# The values are read from the raw frame rather than from train_transaction_pp itself,
# so re-running the cell does not map already-encoded rates to NaN. Missing values
# are labelled 'NaN' explicitly to match the 'NaN' group that make_pivot builds,
# instead of relying on the raw frame having been modified beforehand.
fraud_rate_lookups = {
    'ProductCD': pivot_prodcd['fraud_norm'],
    'card4': pivot_card4['fraud_norm'],
    'card6': pivot_card6['fraud_norm'],
    'M4': pivot_m4['fraud_norm'],
}
for column, lookup in fraud_rate_lookups.items():
    train_transaction_pp[column] = train_transaction[column].fillna('NaN').map(lookup)

# encode emaildomain features

p_domain_lists = train_transaction['P_emaildomain'].fillna('NaN').str.split('.')

# NOTE EXCLUDE NOTE  p_country_code = p_domain_lists.transform(lambda l: l[-1] if len(l[-1])==2 else np.nan) # NOTE Majority is NaN! Only 4% has country codes...
# p_email_host = p_domain_lists.transform(lambda l: l[0] if l[0]!='NaN' else np.nan)
# p_email_tld = p_domain_lists.transform(lambda l: 'com' if 'com' in l else 'net' if 'net' in l else np.nan) # TLD stands for 'top level domain'

# train_transaction_pp['P_email_host'] = p_email_host
# train_transaction_pp['P_email_tld'] = p_email_tld

# map domain names to fraud rate - pivot_pdomain already carries the imputed row for the one missing domain
p_domain_fraud_rate = train_transaction['P_emaildomain'].fillna('NaN').map(pivot_pdomain['fraud_norm'])
train_transaction_pp['P_domain_fraud_rate'] = p_domain_fraud_rate

# partition fraud rates into degree of risks
# <1%           low risk 0
# >=1% & <5% medium risk 1
# >=5%         high risk 2
# Missing email domains are rated by the fraud rate of their own 'NaN' group.
# A rate that is NaN (domain not present in the pivot index) stays NaN.
# Left-closed bins [-inf, 0.01), [0.01, 0.05), [0.05, inf); labels=False returns the
# bin number (0/1/2). np.nan is used throughout because the np.NaN alias no longer
# exists in NumPy 2.0.
p_domain_risk_group = pd.cut(
    p_domain_fraud_rate,
    bins=[-np.inf, 0.01, 0.05, np.inf],
    right=False,
    labels=False,
)
train_transaction_pp['P_domain_risk'] = p_domain_risk_group


# train_transaction_pp[diverse_object_columns] = train_transaction_pp[diverse_object_columns].fillna(-1)

In [172]:
# Inspect the diverse object columns of the preprocessed frame.
train_transaction_pp.loc[:, diverse_object_columns]

Unnamed: 0_level_0,ProductCD,card4,card6,P_emaildomain,R_emaildomain,M4
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2987000,0.020399,0.077282,0.066785,,,0.113739
2987001,0.020399,0.034331,0.066785,gmail.com,,0.036649
2987002,0.020399,0.034756,0.024263,outlook.com,,0.036649
2987003,0.020399,0.034331,0.024263,yahoo.com,,0.036649
2987004,0.047662,0.034331,0.066785,gmail.com,,
...,...,...,...,...,...,...
3577535,0.020399,0.034756,0.024263,,,0.036649
3577536,0.020399,0.034331,0.024263,gmail.com,,0.036649
3577537,0.020399,0.034331,0.024263,gmail.com,,
3577538,0.020399,0.034331,0.024263,aol.com,,0.036649
