In [6]:
import pandas as pd

In [8]:
train_transactions = pd.read_csv("../ieee_fraud_data/train_transaction.csv")
test_transactions = pd.read_csv("../ieee_fraud_data/test_transaction.csv")

In [152]:
t1 = pd.read_csv("../ieee_fraud_data/test_transaction.csv")

In [153]:
t1 = t1["TransactionID"]

In [11]:
### TransactionDT seems to be in seconds, we can transform to days
train_transactions["TransactionDay"] = (train_transactions["TransactionDT"]/86400).astype(int)
test_transactions["TransactionDay"] = (test_transactions["TransactionDT"]/86400).astype(int)

In [108]:
y_train = train_transactions["isFraud"].copy()
train_transactions = train_transactions.drop(columns=["isFraud"])

##### What features to drop first ?

In [12]:
train_transactions[["dist1", "dist2"]].isna().sum()/len(train_transactions)
# This yields dist1, dist2 having > 50% of values NA, will drop them completely
train_transactions = train_transactions.drop(columns=["dist1", "dist2"])
test_transactions = test_transactions.drop(columns=["dist1", "dist2"])

In [13]:
train_transactions[[f"M{i}" for i in range(1, 10)]].isna().sum()/len(train_transactions)
# This yields M7, M8, M9 having > 50% of values NA, will drop them completely
train_transactions = train_transactions.drop(columns=["M7", "M8", "M9"])
test_transactions = test_transactions.drop(columns=["M7", "M8", "M9"])

In [14]:
train_transactions[[f"D{i}" for i in range(1, 16)]].isna().sum()/len(train_transactions)
# This yields D5, D6, D7, D8, D9, D12, D13, D14 having > 50% of values NA, will drop them completely
train_transactions = train_transactions.drop(columns=["D5", "D6", "D7", "D8", "D9", "D12", "D13", "D14"])
test_transactions = test_transactions.drop(columns=["D5", "D6", "D7", "D8", "D9", "D12", "D13", "D14"])

In [15]:
# V* columns are too many and would take ages to figure things out. First thing we could try is
# to try running PCA and reduce the dimension, but will refrain from this unless the rest of the
# features are completely useless
train_transactions = train_transactions.drop(columns=[f"V{i}" for i in range(1, 340)])
test_transactions = test_transactions.drop(columns=[f"V{i}" for i in range(1, 340)])

In [39]:
train_transactions.columns

Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
       'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
       'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3',
       'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14',
       'D1', 'D2', 'D3', 'D4', 'D10', 'D11', 'D15', 'M1', 'M2', 'M3', 'M4',
       'M5', 'M6', 'TransactionDay', 'card1_addr1',
       'card1_addr1_P_emaildomain'],
      dtype='object')

In [73]:
def encode_labels(train, test, column):
    all_labels, _ = pd.concat([train[column], test[column]], axis=0).factorize(sort=True)
    
    train[column] = all_labels[:len(train)].astype(int)
    test[column] = all_labels[len(train):].astype(int)

def frequency_encode(train, test, columns):
    # Apparently this gives good results in lots of kaggle competitions
    # I couldn't find much online why the frequency is actually useful
    # as a feature. My intuition is that the model might match low or
    # high frequencies with card fraud in a case or another. Eg. if 
    # most of the frauds are coming from a certain email domain, the
    # frequency is going to be a useful feature.
    n = len(train) + len(test)
    for column in columns:
        f = (pd.concat(
                [train[column], test[column]], axis=0
            ).value_counts(dropna=True)/n).to_dict()    
        train[f"{column}_FE"] = train[column].map(f).astype(float)
        test[f"{column}_FE"] = test[column].map(f).astype(float)

def concatenate(train, test, col1, col2):
    new_name = f"{col1}_{col2}"
    train[new_name] = train[col1].astype(str)+ "_" + train[col2].astype(str)
    test[new_name] = test[col1].astype(str) + "_" + test[col2].astype(str)
    
    encode_labels(train, test, new_name)
    

def groupby(train, test, index, column, aggregation):
    # We want for example, the mean transaction value for every card1_address1 pair
    new_name = f"{column}_{index}_{aggregation}"
    
    f = pd.concat([train[[index, column]], test[[index, column]]])
    f = f.groupby([index])[column].agg([aggregation]).reset_index().rename(columns={aggregation: new_name})
    f.index = list(f[index])
    f = f[new_name].to_dict()
    train[new_name] = train[index].map(f).astype(float)
    test[new_name] = train[index].map(f).astype(float)
    train[new_name].fillna(-1, inplace=True)
    test[new_name].fillna(-1, inplace=True)

In [86]:
train_transactions[train_transactions["card1_addr1"] == 13832][["card1", "TransactionAmt", "addr1", "P_emaildomain"]]

Unnamed: 0,card1,TransactionAmt,addr1,P_emaildomain
0,13926,68.5,315.0,
55544,13926,40.0,315.0,gmail.com
263717,13926,317.5,315.0,comcast.net


In [110]:
train_transactions = train_transactions.drop(columns=["TransactionID", "TransactionDT"])
test_transactions = test_transactions.drop(columns=["TransactionID", "TransactionDT"])

In [104]:
concatenate(train_transactions, test_transactions, 'card1','addr1')
concatenate(train_transactions, test_transactions, "card1_addr1", "P_emaildomain")
frequency_encode(train_transactions, test_transactions, ['addr1','card1','card2','card3','P_emaildomain'])

groupby(train_transactions, test_transactions, "card1", "TransactionAmt", "mean")
groupby(train_transactions, test_transactions, "card1", "TransactionAmt", "std")
groupby(train_transactions, test_transactions, "card1_addr1", "TransactionAmt", "mean")
groupby(train_transactions, test_transactions, "card1_addr1", "TransactionAmt", "std")
groupby(train_transactions, test_transactions, "card1_addr1_P_emaildomain", "TransactionAmt", "mean")
groupby(train_transactions, test_transactions, "card1_addr1_P_emaildomain", "TransactionAmt", "std")

groupby(train_transactions, test_transactions, "card1", "D11", "mean")
groupby(train_transactions, test_transactions, "card1", "D11", "std")
groupby(train_transactions, test_transactions, "card1_addr1", "D11", "mean")
groupby(train_transactions, test_transactions, "card1_addr1", "D11", "std")
groupby(train_transactions, test_transactions, "card1_addr1_P_emaildomain", "D11", "mean")
groupby(train_transactions, test_transactions, "card1_addr1_P_emaildomain", "D11", "std")

In [141]:
from sklearn.ensemble import HistGradientBoostingClassifier

In [139]:
train_transactions.isna().sum()

TransactionAmt                                        0
card1                                                 0
card2                                              8933
card3                                              1565
addr1                                             65706
addr2                                             65706
C1                                                    0
C2                                                    0
C3                                                    0
C4                                                    0
C5                                                    0
C6                                                    0
C7                                                    0
C8                                                    0
C9                                                    0
C10                                                   0
C11                                                   0
C12                                             

In [136]:
idxT = train_transactions.index[:3*len(train_transactions)//4]
idxV = test_transactions.index[3*len(test_transactions)//4:]

In [144]:
xgb = HistGradientBoostingClassifier(
    #n_estimators=2000,
    max_depth=12,
    learning_rate=0.02, 
    #subsample=0.8,
#     colsample_bytree=0.4, 
#     missing=-1, 
#    eval_metric='auc'
)
xgb.fit(train_transactions.loc[idxT], y_train[idxT])

HistGradientBoostingClassifier(learning_rate=0.02, max_depth=12)

In [145]:
xgb.score(train_transactions.loc[idxV],y_train[idxV])

0.9712172286122536

In [157]:
#res = xgb.predict(test_transactions)
prediction = pd.DataFrame({"isFraud": xgb.predict(test_transactions), "TransactionId": t1})

In [162]:
prediction.to_csv("result.csv", index=False)