In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Log the versions of the core libraries so results can be tied to an environment.
for _alias, _module in (('pd', pd), ('np', np), ('sns', sns)):
    print(f'{_alias}=={_module.__version__}')

In [None]:
# Read in the train and test transaction tables.
# The dataset folder can be overridden with the IEEE_FRAUD_DATA_DIR environment variable;
# the default is the original local path, so existing setups keep working unchanged.
DATA_DIR = os.environ.get(
    'IEEE_FRAUD_DATA_DIR',
    '/Users/oskarwallberg/desktop/kaggle-datasets/ieee-fraud-detection',
)
train_transaction = pd.read_csv(os.path.join(DATA_DIR, 'train_transaction.csv'), index_col='TransactionID')
test_transaction = pd.read_csv(os.path.join(DATA_DIR, 'test_transaction.csv'), index_col='TransactionID')

# Ensure congruent columns: the test frame gets the train feature names (all but the target).
# NOTE: the relabelling is positional - it assumes both files list their features in the
# same order, and raises if the number of columns differs.
test_transaction.columns = train_transaction.columns.drop(labels='isFraud')

train_transaction.shape, test_transaction.shape

In [None]:
# Split the training rows by label; the two parts must add up to the full training set.
is_fraud_row = train_transaction['isFraud'].eq(1)
is_valid_row = train_transaction['isFraud'].eq(0)
fraud_transaction = train_transaction.loc[is_fraud_row]
valid_transaction = train_transaction.loc[is_valid_row]
assert len(fraud_transaction) + len(valid_transaction) == len(train_transaction)
fraud_transaction.shape, valid_transaction.shape

In [None]:
# Overall share of fraudulent rows in the training data.
FRAUD_RATE = len(fraud_transaction) / len(train_transaction)
FRAUD_RATE

In [None]:
# Preprocessing steps
#   1. Encoding features as numerical
#   2. Filling NaNs
#   3. NOTE SKIP! FE, transforming, enriching, creating features
#   4. Selecting features (FS)
#   5. SMOTE and other over-/undersampling techniques
#   6. Outlier detection and filtering
# Done!

### Exploring Encoding Possibilities
---

In [None]:
# Numerical encoding: list the string-typed (object) columns and their cardinality
object_columns = train_transaction.select_dtypes(include=object).columns
# M1-M3 and M5-M9 hold T/F flags (M4 is handled with the "diverse" columns below)
binary_object_columns = [f'M{i}' for i in (1, 2, 3, 5, 6, 7, 8, 9)]
# Columns that need individual EDA before choosing an encoding
diverse_object_columns = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M4']

for col, n_unique in train_transaction[object_columns].nunique().items():
    print(col, n_unique)

In [None]:
# Diverse object columns EDA
def make_pivot(df:pd.DataFrame, index:str) -> pd.DataFrame:
    """Summarise valid/fraud counts and rates for every category of one column.

    Side effect: missing values in ``df[index]`` are replaced by the string
    ``'NaN'`` *in place*, so that missing values form their own category.
    This mutation is kept on purpose because callers may rely on the filled
    column afterwards.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column ``index`` and a label column ``isFraud``
        coded as 0 (valid) / 1 (fraud).
    index : str
        Name of the categorical column to group by.

    Returns
    -------
    pd.DataFrame
        One row per category with the columns ``valid``, ``fraud``, ``count``,
        ``valid_norm`` and ``fraud_norm``, sorted by ``fraud_norm`` descending.
    """
    df[index] = df[index].fillna('NaN')
    df_pivot = df.pivot_table(index=index, columns='isFraud', aggfunc='size')
    # Guarantee both label columns exist (in the order valid, fraud) even when one
    # class never occurs; otherwise the two-name assignment below would raise.
    df_pivot = df_pivot.reindex(columns=[0, 1])
    df_pivot = df_pivot.fillna(0)
    df_pivot.columns = ['valid', 'fraud']
    df_pivot['count'] = df_pivot['valid'] + df_pivot['fraud']
    # Row-wise shares: divide both counts by the category total
    df_pivot[['valid_norm', 'fraud_norm']] = df_pivot[['valid', 'fraud']].div(other=df_pivot['count'], axis=0)
    df_pivot = df_pivot.sort_values(by='fraud_norm', ascending=False)
    return df_pivot
    


In [None]:
# Encoding ProductCD (product code feature)
# make_pivot also fills missing ProductCD values in train_transaction with the string 'NaN'.
pivot_prodcd = make_pivot(df=train_transaction, index='ProductCD')
pivot_prodcd

# NOTE Fraud rate in descending order: C -> S -> H -> R -> W
# C : 11.7% : 68519
# S :  5.9% : 11628
# H :  4.8% : 33024
# R :  3.8% : 37699
# W :  2.0% : 439670

# Could encode as an ordered (ordinal) scale: (C, S, H, R, W, NaN) -> (5, 4, 3, 2, 1, 0)

In [None]:
# Encoding card4 (payment card type)
# make_pivot also fills missing card4 values in train_transaction with the string 'NaN'.
pivot_card4 = make_pivot(df=train_transaction, index='card4')
pivot_card4

# NOTE Fraud rate in descending order: discover -> visa -> mastercard -> american express
# discover          0.077282
# visa              0.034756
# mastercard        0.034331
# american express  0.028698

# Could encode as (discover, visa, mastercard, american express, NaN) -> (4, 3, 2, 1, 0)

In [None]:
# Encoding card6
# make_pivot also fills missing card6 values in train_transaction with the string 'NaN'.
pivot_card6 = make_pivot(df=train_transaction, index='card6')
pivot_card6

# NOTE Idea: create two flag columns, credit and debit
#   - every 'debit or credit' value becomes 'debit'
#   - 'charge card' is encoded as (0, 0)
# NOTE(review): the encoding cell further down maps card6 to its fraud rate instead of
# building these flags - confirm which of the two is intended.


In [None]:
# Encoding P_emaildomain
pivot_pdomain = make_pivot(df=train_transaction, index='P_emaildomain')
# 'scranton.edu' does not occur in the train P_emaildomain values (see the table in the
# next cell), so append it with the overall fraud rate as its 'fraud_norm'.
scranton_row = pd.DataFrame({'fraud_norm': [FRAUD_RATE]}, index=['scranton.edu'])
pivot_pdomain = pd.concat([pivot_pdomain, scranton_row])
pivot_pdomain.head(60)
# Ideas for how to encode:
#   1. Split domain strings into host name, ending, country code and possibly other parts as separate features
#       - whether it ends with com / net
#       - whether it ends with a 2-letter word (country code)
#       - the first item is the domain host
#   2. Order by fraud rate and partition into groups at some threshold values
#   3. Map each full domain string to its fraud rate - the domains happen to match the test data quite well

# Sketch of splitting domain strings (the dot must be matched literally, not as a regex wildcard)...
# train_transaction.loc[train_transaction['P_emaildomain'].str.contains('.', regex=False), 'P_emaildomain'].str.split('.')
# df['domain_type'] = df['email'].str.extract(r'\.(com|org)$')


In [None]:
# train P                    train R                   test P                     test R                   
# aim.com                315 aim.com                36 aim.com                153 aim.com                41
# anonymous.com        36998 anonymous.com       20529 anonymous.com        34064 anonymous.com       19115
# aol.com              28289 aol.com              3701 aol.com              24048 aol.com              3538
# att.net               4033 att.net               430 att.net               3614 att.net               440
# bellsouth.net         1909 bellsouth.net         422 bellsouth.net         1528 bellsouth.net         373
# cableone.net           159 cableone.net           27 cableone.net           152 cableone.net           19
# centurylink.net        205 centurylink.net        12 centurylink.net        181 centurylink.net        16
# cfl.rr.com             172 cfl.rr.com             37 cfl.rr.com             146 cfl.rr.com             20
# charter.net            816 charter.net           127 charter.net            627 charter.net           136
# comcast.net           7888 comcast.net          1812 comcast.net           6586 comcast.net          1701
# cox.net               1393 cox.net               459 cox.net               1264 cox.net               395
# earthlink.net          514 earthlink.net          79 earthlink.net          465 earthlink.net          91
# embarqmail.com         260 embarqmail.com         68 embarqmail.com         204 embarqmail.com         72
# frontier.com           280 frontier.com           52 frontier.com           314 frontier.com           58
# frontiernet.net        195 frontiernet.net        14 frontiernet.net        202 frontiernet.net        24
# gmail                  496 gmail                  95 gmail                  497 gmail                 101
# gmail.com           228355 gmail.com           57147 gmail.com           207448 gmail.com           61738
# gmx.de                 149 gmx.de                147 gmx.de                 149 gmx.de                150
# hotmail.co.uk          112 hotmail.co.uk         105 hotmail.co.uk          222 hotmail.co.uk         212
# hotmail.com          45250 hotmail.com         27509 hotmail.com          40399 hotmail.com         25657
# hotmail.de              43 hotmail.de             42 hotmail.de              87 hotmail.de             88
# hotmail.es             305 hotmail.es            292 hotmail.es             322 hotmail.es            303
# hotmail.fr             295 hotmail.fr            293 hotmail.fr             379 hotmail.fr            374
# icloud.com            6267 icloud.com           1398 icloud.com            6049 icloud.com           1422
# juno.com               322 juno.com               53 juno.com               252 juno.com               58
# live.com              3041 live.com              762 live.com              2679 live.com              682
# live.com.mx            749 live.com.mx           754 live.com.mx            721 live.com.mx           710
# live.fr                 56 live.fr                55 live.fr                 50 live.fr                50
# mac.com                436 mac.com               218 mac.com                426 mac.com               212
# mail.com               559 mail.com              122 mail.com               597 mail.com              219
# me.com                1522 me.com                556 me.com                1191 me.com                539
# msn.com               4092 msn.com               852 msn.com               3388 msn.com               846
# netzero.com            230 netzero.com            14 netzero.com            157 netzero.com            10
# netzero.net            196 netzero.net             9 netzero.net            123 netzero.net            10
# optonline.net         1011 optonline.net         187 optonline.net          926 optonline.net         163
# outlook.com           5096 outlook.com          2507 outlook.com           4838 outlook.com          2504
# outlook.es             438 outlook.es            433 outlook.es             425 outlook.es            420
# prodigy.net.mx         207 prodigy.net.mx        207 prodigy.net.mx          96 prodigy.net.mx         96
# protonmail.com          76 protonmail.com         41 protonmail.com          83 protonmail.com         34
# ptd.net                 68 ptd.net                27 ptd.net                 72 ptd.net                43
# q.com                  189 q.com                  25 q.com                  173 q.com                  20
# roadrunner.com         305 roadrunner.com         53 roadrunner.com         278 roadrunner.com         48
# rocketmail.com         664 rocketmail.com         69 rocketmail.com         441 rocketmail.com         57
# sbcglobal.net         2970 sbcglobal.net         552 sbcglobal.net         2797 sbcglobal.net         611
# sc.rr.com              164 sc.rr.com               8 sc.rr.com              113 sc.rr.com               6
#   * NOTE MISSING *         scranton.edu           63 scranton.edu             2 scranton.edu            6
# servicios-ta.com        35 servicios-ta.com       35 servicios-ta.com        45 servicios-ta.com       45
# suddenlink.net         175 suddenlink.net         25 suddenlink.net         148 suddenlink.net         30
# twc.com                230 twc.com                29 twc.com                209 twc.com                32
# verizon.net           2705 verizon.net           620 verizon.net           2306 verizon.net           582
# web.de                 240 web.de                237 web.de                 278 web.de                277
# windstream.net         305 windstream.net         47 windstream.net         247 windstream.net         57
# yahoo.co.jp             32 yahoo.co.jp            33 yahoo.co.jp             69 yahoo.co.jp            71
# yahoo.co.uk             49 yahoo.co.uk            39 yahoo.co.uk             54 yahoo.co.uk            43
# yahoo.com           100934 yahoo.com           11842 yahoo.com            81850 yahoo.com            9563
# yahoo.com.mx          1543 yahoo.com.mx         1508 yahoo.com.mx          1284 yahoo.com.mx         1235
# yahoo.de                74 yahoo.de               75 yahoo.de                63 yahoo.de               64
# yahoo.es               134 yahoo.es               57 yahoo.es               138 yahoo.es               67
# yahoo.fr               143 yahoo.fr              137 yahoo.fr               201 yahoo.fr              178
# ymail.com             2396 ymail.com             207 ymail.com             1679 ymail.com             198

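# Encoding plan: map each domain name to its fraud rate in the training data
# scranton.edu (missing from train P_emaildomain) is imputed with the mean fraud rate
# any other unknown domain -> 0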

In [None]:
# Encode R_emaildomain (receiver email domain)
# make_pivot also fills missing R_emaildomain values in train_transaction with the string 'NaN'.
pivot_rdomain = make_pivot(df=train_transaction, index='R_emaildomain')
pivot_rdomain

# Same method here as for P_emaildomain
# NOTE(review): no R_emaildomain mapping appears in the encoding cells below, and the raw
# column is removed when the remaining object columns are dropped - confirm this is intended.

In [None]:
# Row count per R_emaildomain value in the test set, ordered alphabetically by domain
test_transaction['R_emaildomain'].value_counts().sort_index()

In [None]:
# Encode M4 (match 4)
# make_pivot also fills missing M4 values in train_transaction with the string 'NaN'.
pivot_m4 = make_pivot(df=train_transaction, index='M4')
pivot_m4

# Ordered encoding (M2, M0, M1, NaN) -> (...rates...)
# i.e. map each value to its fraud rate

### Encoding the Features to Numerical Values
---

In [None]:
# Work on a copy so the encoded ("pp" = preprocessed) frame is kept separate from train_transaction.
# Note that train_transaction already has the 'NaN' string fills applied by the make_pivot calls above.
train_transaction_pp = train_transaction.copy()
train_transaction_pp.shape

In [None]:
# Encode binary features T/F/NaN as 1/0/-1
# Per-column Series.map with a lookup table is used instead of DataFrame.map, which only
# exists in pandas >= 2.1. Every value other than 'T'/'F' (including NaN) becomes -1.
binary_value_codes = {'T': 1, 'F': 0}
train_transaction_pp[binary_object_columns] = train_transaction[binary_object_columns].apply(
    lambda column: column.map(binary_value_codes).fillna(-1).astype('int64')
)
train_transaction_pp[binary_object_columns]

In [None]:
# Map each category to its fraud rate (pivot index -> 'fraud_norm' value).
# Missing categories were replaced by the string 'NaN' in train_transaction by make_pivot,
# so they are mapped to the fraud rate of the 'NaN' row of the corresponding pivot.
train_transaction_pp['ProductCD'] = train_transaction['ProductCD'].map(pivot_prodcd['fraud_norm'])
train_transaction_pp['card4'] = train_transaction['card4'].map(pivot_card4['fraud_norm'])
train_transaction_pp['card6'] = train_transaction['card6'].map(pivot_card6['fraud_norm'])
train_transaction_pp['M4'] = train_transaction['M4'].map(pivot_m4['fraud_norm'])

# encode emaildomain features

# Parked sketch: split the domain string into host / top-level domain (TLD) features
# p_domain_lists = train_transaction['P_emaildomain'].fillna('NaN').str.split('.')

# NOTE EXCLUDE NOTE  p_country_code = p_domain_lists.transform(lambda l: l[-1] if len(l[-1])==2 else np.nan) # NOTE Majority is NaN! Only 4% has country codes...
# p_email_host = p_domain_lists.transform(lambda l: l[0] if l[0]!='NaN' else np.nan)
# p_email_tld = p_domain_lists.transform(lambda l: 'com' if 'com' in l else 'net' if 'net' in l else np.nan) # TLD stands for 'top level domain'
# train_transaction_pp['P_email_host'] = p_email_host
# train_transaction_pp['P_email_tld'] = p_email_tld

# map domain names to fraud rate - scranton.edu was imputed in pivot_pdomain
train_transaction_pp['P_domain_fraud_rate'] = train_transaction['P_emaildomain'].map(pivot_pdomain['fraud_norm'])

# partition fraud rates into degree of risks
# <1%           low risk 0
# >=1% & <5% medium risk 1
# >=5%         high risk 2
# NaNs       medium risk 1
# Vectorised with np.select: the first matching condition wins, and comparisons with NaN
# are all False, so NaN rates fall through to the default (medium risk), as the table
# above specifies. This also avoids the np.NaN alias, which NumPy 2.0 removed.
domain_fraud_rate = train_transaction_pp['P_domain_fraud_rate']
train_transaction_pp['P_domain_risk_group'] = np.select(
    condlist=[
        domain_fraud_rate.lt(0.01),
        domain_fraud_rate.lt(0.05),
        domain_fraud_rate.ge(0.05),
    ],
    choicelist=[0, 1, 2],
    default=1,
)

In [None]:
# Inspect the columns written by the encoding cell above
diverse_encoded_columns = ['ProductCD', 'card4', 'card6', 'P_domain_fraud_rate', 'P_domain_risk_group', 'M4']
train_transaction_pp[diverse_encoded_columns]

In [None]:
# Drop whatever object (string) columns are still left after encoding. Dropping the few
# object columns is quicker than selecting the ~390 numerical ones.
leftover_object_columns = train_transaction_pp.select_dtypes(object).columns
train_transaction_pp = train_transaction_pp.drop(columns=leftover_object_columns)
train_transaction_pp.select_dtypes(object).columns # should be no object columns left!

In [None]:
# Preprocessing steps
#   1. Encoding features as numerical
#   2. Filling NaNs
#   3. NOTE SKIP! FE, transforming, enriching, creating features
#   4. Selecting features
#   5. SMOTE and other over-/undersampling techniques
#   6. Outlier detection and filtering
# Done!

### Handling NaNs
---

In [None]:
# Number of columns that contain at least one NaN
train_transaction_pp.isna().any(axis=0).sum()

In [None]:
# Simple solution: impute NaNs with -1 in every column that does not already contain
# negative values.
# NOTE lazy solution but a quick fix and can be iterated on further down the line
is_nan = train_transaction_pp.isna().any(axis=0)   # per column: has at least one NaN
is_neg = train_transaction_pp.lt(0).any(axis=0)    # per column: has at least one negative value
# "special": columns with NaNs to patch that also hold negative values, needing a different solution
special_mask = (is_nan & is_neg).to_numpy()
# "normal": columns with NaNs and no negative values, safe to fill with -1
normal_mask = (is_nan & ~is_neg).to_numpy()
columns_nan_special = train_transaction_pp.columns[special_mask]
columns_nan_normal = train_transaction_pp.columns[normal_mask]
print(f'Normal imputing (-1): {len(columns_nan_normal)}')
print(f'Special imputing (x): {len(columns_nan_special)}')


In [None]:
# Fill the "normal" NaN columns with the sentinel -1, then report how many columns still
# hold NaNs (those are the ones requiring special handling).
normal_filled = train_transaction_pp[columns_nan_normal].fillna(-1)
train_transaction_pp[columns_nan_normal] = normal_filled
nan_columns_left = train_transaction_pp.isna().any(axis=0).sum()
print(f'NaN cols remaining: {nan_columns_left}')

In [None]:
# Number of negative values per "special" column.
# Observation: remarkably few negative values in each of the columns...
# XXX Are negative Timedeltas reasonable in this scenario or improper data?
train_transaction_pp[columns_nan_special].lt(0).sum(axis=0)

In [None]:
# Summary statistics per "special" column (transposed: one row per column).
# Observation: all 25th percentiles are at 0, so 0 is probably the most reasonable
# imputation value besides -1.
train_transaction_pp[columns_nan_special].describe().T

In [None]:
# Rows where at least one "special" column is negative, shown together with the label
has_negative_special = train_transaction_pp[columns_nan_special].lt(0).any(axis=1)
train_transaction_pp.loc[has_negative_special, list(columns_nan_special)+['isFraud']]

In [None]:
# Non-null counts and dtypes of the "special" columns.
# Considering the still substantial number of values, the few negative ones may as well be
# treated as outliers. We'll wait with this claim for now and see later...
train_transaction_pp[columns_nan_special].info()

In [None]:
# Visualise where the NaNs sit in the "special" columns (rows x columns mask of missing values)
sns.heatmap(data=train_transaction_pp[columns_nan_special].isna(), cbar=True)
plt.show()

In [None]:
# One histogram per "special" column. The grid is sized from the number of columns, so
# the cell works for any count (a fixed 3x2 grid fails with fewer than 6 columns and
# silently skips any beyond the 6th). With 6 columns the layout is 3x2 at 14x12 inches.
n_special = len(columns_nan_special)
n_grid_cols = 2
n_grid_rows = max(1, -(-n_special // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(14, 4 * n_grid_rows),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
fig.suptitle("NaN special case columns - Distribution of Values", fontweight="bold", fontsize=16)
flat_axes = axes.ravel()
for ax, col in zip(flat_axes, columns_nan_special):
    # Draw on the explicit axes instead of relying on the implicit "current axes"
    train_transaction_pp[col].hist(bins=20, ax=ax)
    ax.set_title(col)
for unused_ax in flat_axes[n_special:]:
    unused_ax.set_visible(False)  # hide grid slots without a column
plt.show()

In [None]:
# -1 still seems to be the best impute value (alternatives considered: 0 / the mode)
special_filled = train_transaction_pp[columns_nan_special].fillna(-1)
train_transaction_pp[columns_nan_special] = special_filled
nan_columns_left = train_transaction_pp.isna().any(axis=0).sum()
print(f'NaN cols remaining: {nan_columns_left}')

In [None]:
# Final check after imputation: dtype and non-null count for every column.
# Seems OK!
train_transaction_pp.info(verbose=True, show_counts=True)

### Feature Engineering, SKIP For now!
---

In [None]:
# to be implemented...
# NOTE for the future:
#   - add the DXn columns as shown by the winning competitors; those versions may be more interpretable for the model

### Feature Selection
---

#### Correlation Analysis

In [None]:
# Correlation matrices: split the columns into the V1..V339 family and everything else
vesta_names = [f"V{n}" for n in range(1, 340)]
columns_vesta = np.array(vesta_names)
columns_non_vesta = train_transaction_pp.columns.drop(labels=columns_vesta)

In [None]:
def plot_corr_heatmap(df:pd.DataFrame, annot=False, figsize=(12, 10)):
    """Plot the lower triangle of the correlation matrix of ``df`` as a heatmap.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose pairwise column correlations (``df.corr()``) are plotted.
    annot : bool, default False
        Whether to write the correlation value into each cell.
    figsize : tuple, default (12, 10)
        Size of the created figure.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    corr_matrix = df.corr()

    # Hide the diagonal and the upper triangle: they only repeat the lower triangle
    upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

    plt.figure(figsize=figsize)
    sns.heatmap(
        data=corr_matrix,
        mask=upper_triangle,
        vmin=-1,
        center=0,
        vmax=1,
        cmap="coolwarm",
        cbar=True,
        annot=annot,
        fmt=".2g",
    )
    plt.show()

In [None]:
# Correlation heatmap of all non-Vesta columns
plot_corr_heatmap(df=train_transaction_pp[columns_non_vesta])
# From this investigation, the suspect column families are:
#   : count columns (CX)
#   : match columns (MX)
# We view them separately, because viewing all columns at once becomes unreadable

In [None]:
columns_cx_drop = ["C1", "C2", "C6", "C7", "C8", "C9", "C10", "C11", "C14"] # candidates to drop (author's note: no correlation over 90%)
columns_cx = [f"C{i}" for i in range(1, 15)]
# Keep the original column order; a set difference would reorder the columns differently
# on every kernel start (string hashing is randomised), changing the heatmap layout.
columns_cx_keep = [col for col in columns_cx if col not in columns_cx_drop]
plot_corr_heatmap(df=train_transaction_pp[["isFraud"]+columns_cx_keep], annot=True, figsize=(6, 5))

In [None]:
columns_mx_drop = ["M2", "M3", "M7", "M9"] # candidates to drop (author's note: no correlation over 90%)
columns_mx = [f"M{i}" for i in range(1, 10)]
# Keep the original column order; a set difference would reorder the columns differently
# on every kernel start (string hashing is randomised), changing the heatmap layout.
columns_mx_keep = [col for col in columns_mx if col not in columns_mx_drop]
plot_corr_heatmap(df=train_transaction_pp[["isFraud"]+columns_mx_keep], annot=True, figsize=(6, 5))

In [None]:
# D1 and D2 are strongly correlated; drop D1 since it has the weaker correlation with isFraud.
# The matrix is printed explicitly: a bare expression that is not the last statement of a
# cell is evaluated but never rendered.
print(train_transaction_pp[["isFraud", "D1", "D2"]].corr())
columns_dx_drop = ["D1"]
columns_dx = [f"D{i}" for i in range(1, 16)]
# Keep the original column order; a set difference would reorder the columns differently
# on every kernel start (string hashing is randomised), changing the heatmap layout.
columns_dx_keep = [col for col in columns_dx if col not in columns_dx_drop]
plot_corr_heatmap(df=train_transaction_pp[["isFraud"]+columns_dx_keep], annot=True, figsize=(12, 10))

In [None]:
# Correlation matrix of the label together with all Vesta (V*) columns; too large to plot
# as one heatmap, so only its shape is shown here.
vesta_with_target = ['isFraud', *columns_vesta]
corr_vesta = train_transaction_pp[vesta_with_target].corr()
corr_vesta.shape

In [None]:
# might keep, many corr:
# 'V173', 'V188'!, 'V191', 'V194', 'V195', 'V197', 'V198'
# 'V223', 'V240', 'V241', 'V242'!, 'V244'!, 'V247', 'V248', 'V249', 'V250', 'V251', 'V252'

# might keep, some corr:
# ... only above 0.2
# 'V170', 'V171', 'V176', 'V188', 'V189', 'V190', 'V199', 'V200', 'V201',
# 'V228', 'V230', 'V242', 'V243', 'V244', 'V246', 'V257', 'V258',

# might keep, few corr:
# 'V44', 'V45', 'V111', 'V123', 'V125', 'V169', 'V170', 'V171', 'V175', 'V181', 'V185', 'V187',
# 'V220', 'V222', 'V229', 'V235', 'V238', 'V239', 'V258'!!, 'V282', 'V283', 'V303', 'V304',


# Per column: how many OTHER columns it is highly correlated with (|r| > 0.9).
# Minus 1 since every column has 100% correlation with itself.
abs_corr_vesta = corr_vesta.abs()
num_high_corr_by_column = abs_corr_vesta.gt(0.9).sum(axis=0) - 1

# Quick filter for features to drop:
# less than 0.07 correlation with isFraud AND more than 3 highly correlated features -> drop
feature_corr_threshold = num_high_corr_by_column.gt(3)  # multicollinearity
# Compare the magnitude: a strongly negative correlation is informative too and must not
# be flagged as "too little impact" just because it is below +0.07.
isfraud_corr_threshold = abs_corr_vesta['isFraud'].lt(0.07) # too little impact

vesta_drop_mask = feature_corr_threshold & isfraud_corr_threshold

columns_vx_drop = num_high_corr_by_column.loc[vesta_drop_mask].index.to_list()
# NOTE: corr_vesta contains the 'isFraud' row as well, so 'isFraud' itself ends up in the keep list
columns_vx_keep = num_high_corr_by_column.loc[~vesta_drop_mask].index.to_list()

len(columns_vx_drop), len(columns_vx_keep)

In [None]:
# Top 20 entries of corr_vesta ranked by correlation with isFraud.
# corr_vesta contains isFraud itself, so the label takes one of the 20 slots.
top_isfraud_corr = corr_vesta['isFraud'].sort_values(ascending=False).iloc[:20]
plot_corr_heatmap(df=train_transaction_pp[top_isfraud_corr.index], annot=True, figsize=(16, 14))


In [None]:
# All columns selected for removal by the correlation analysis (C, M, D and V families),
# followed by how many there are.
columns_corr_drop = [
    *columns_cx_drop,
    *columns_mx_drop,
    *columns_dx_drop,
    *columns_vx_drop,
]
len(columns_corr_drop)

In [None]:
# Name explanation: preprocessed (pp) train transaction data after 1st round of feature selection (fs1)
# DataFrame.drop already returns a new frame, so no extra .copy() is needed (an additional
# copy only doubles peak memory for a frame of this size).
train_transaction_pp_fs1 = train_transaction_pp.drop(columns=columns_corr_drop)
train_transaction_pp_fs1.shape

In [None]:
# Memory footprint after the first feature-selection round.
# Author's observation: the dataset is now 1.2 GB instead of 1.7 GB!
train_transaction_pp_fs1.info()

#### Using LR w. Lasso (L1) regularisation for Feature Selection
* LR is practical for binary labels (isFraud)

In [None]:
# Settings for the L1-regularised logistic regression used for feature selection.
logreg_params = {
    "penalty": "l1",                # Lasso (L1) regularisation
    "solver": "saga",               # better for large dataset, liblinear for small datasets
    "C": 1,
    "max_iter": 100,                # adjust for strong regularisation (especially for saga solver)
    "verbose": True,
    "n_jobs": -1,
    "class_weight": {0: 1, 1: 4},
}
# Parked ideas from an earlier draft (not passed to the model):
#   Cs=2  - inverse regularisation strengths, logarithmic range of 10 values
#   cv=3  - CV folds
logreg = LogisticRegression(**logreg_params)

logreg

In [None]:
# Scaler with both centering (with_mean) and scaling (with_std) enabled; it is fitted in a later cell.
sc = StandardScaler(with_mean=True, with_std=True)

sc

In [None]:
# Separate the label from the features, then standardise the features while keeping the
# original row index and column names.
target_column = 'isFraud'
y = train_transaction_pp_fs1[target_column].copy()
X = train_transaction_pp_fs1.drop(columns=target_column).copy()

scaled_values = sc.fit_transform(X=X)
X_sc = pd.DataFrame(data=scaled_values, index=X.index, columns=X.columns)

X_sc.shape, y.shape

In [None]:
# NOTE RELEVANT FOR LATER ON
# logreg.fit(X_sc, y)

In [None]:
# NOTE RELEVANT FOR LATER ON
# feature_importance = pd.Series(data=logreg.coef_[0], index=X_sc.columns).abs().sort_values(ascending=False)
# for (index, importance) in feature_importance.items():
#     print(index, importance)

In [None]:
# Preview the frame that is written to disk in the cells below
train_transaction_pp_fs1

In [None]:
# Output folder. It can be overridden with the IEEE_FRAUD_DATA_DIR environment variable;
# the default is the original local path. Joining with "" guarantees a trailing path
# separator, so appending a file name to this string keeps working.
kaggle_dataset_path = os.path.join(
    os.environ.get(
        "IEEE_FRAUD_DATA_DIR",
        "/Users/oskarwallberg/Desktop/kaggle-datasets/ieee-fraud-detection/",
    ),
    "",
)
os.listdir(path=kaggle_dataset_path)

In [None]:
# Persist the preprocessed, feature-selected training data (the TransactionID index is kept).
name_data_pp_fs1 = "train_transaction_pp_fs1.csv"
# os.path.join works whether or not the folder string ends with a path separator,
# unlike plain string concatenation.
path_data_pp_fs1 = os.path.join(kaggle_dataset_path, name_data_pp_fs1)
train_transaction_pp_fs1.to_csv(path_or_buf=path_data_pp_fs1, index=True)