In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw train and test transaction tables from the competition data folder.
train_transactions, test_transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../ieee_fraud_data/train_transaction.csv",
        "../ieee_fraud_data/test_transaction.csv",
    )
)

In [3]:
# Only the TransactionID column is needed later (for the submission file), so restrict
# parsing to that column instead of loading the full test table a second time.
t1 = pd.read_csv("../ieee_fraud_data/test_transaction.csv", usecols=["TransactionID"])

In [None]:
# Reduce the ID frame to a plain Series. The guard keeps the cell safe to re-run:
# once t1 is already a Series, indexing it by "TransactionID" again would raise KeyError.
if isinstance(t1, pd.DataFrame):
    t1 = t1["TransactionID"]

In [4]:
# TransactionDT appears to be measured in seconds; derive a whole-day counter from it.
SECONDS_PER_DAY = 86400
for frame in (train_transactions, test_transactions):
    frame["TransactionDay"] = (frame["TransactionDT"] / SECONDS_PER_DAY).astype(int)

In [5]:
# Separate the label from the features: keep a copy of it, then remove it from the frame.
target_column = "isFraud"
y_train = train_transactions[target_column].copy()
train_transactions = train_transactions.drop(columns=[target_column])

##### Which features should be dropped first?

In [None]:
# Fraction of missing values in each distance column.
dist_columns = ["dist1", "dist2"]
train_transactions[dist_columns].isna().sum() / len(train_transactions)
# Per the original check, dist1 and dist2 are NA in more than 50% of rows, so drop both.
train_transactions = train_transactions.drop(columns=dist_columns)
test_transactions = test_transactions.drop(columns=dist_columns)

In [None]:
# Fraction of missing values in each of M1..M9.
m_columns = [f"M{i}" for i in range(1, 10)]
train_transactions[m_columns].isna().sum() / len(train_transactions)
# Per the original check, M7, M8 and M9 are NA in more than 50% of rows, so drop them.
sparse_m_columns = ["M7", "M8", "M9"]
train_transactions = train_transactions.drop(columns=sparse_m_columns)
test_transactions = test_transactions.drop(columns=sparse_m_columns)

In [None]:
# Fraction of missing values in each of D1..D15.
d_columns = [f"D{i}" for i in range(1, 16)]
train_transactions[d_columns].isna().sum() / len(train_transactions)
# Per the original check, these D columns are NA in more than 50% of rows, so drop them.
sparse_d_columns = ["D5", "D6", "D7", "D8", "D9", "D12", "D13", "D14"]
train_transactions = train_transactions.drop(columns=sparse_d_columns)
test_transactions = test_transactions.drop(columns=sparse_d_columns)

In [None]:
# There are too many V* columns to analyse one by one. Reducing them with PCA is an option,
# but it is postponed unless the remaining features turn out to be useless.
# For now V1..V339 are removed from both frames.
v_columns = [f"V{i}" for i in range(1, 340)]
train_transactions = train_transactions.drop(columns=v_columns)
test_transactions = test_transactions.drop(columns=v_columns)

In [None]:
# Inspect which columns are left in the training frame at this point.
train_transactions.columns

In [6]:
def encode_labels(train, test, column):
    """Replace `column` in both frames with integer codes shared across train and test.

    The values of both frames are stacked and factorized together (sorted order),
    so equal values receive the same code in `train` and `test`. Missing values
    receive the factorize sentinel -1. Both frames are modified in place.
    """
    n_train = len(train)
    stacked = pd.concat([train[column], test[column]], axis=0)
    codes = stacked.factorize(sort=True)[0]

    # The first n_train codes belong to the train rows, the remainder to test.
    train[column] = codes[:n_train].astype(int)
    test[column] = codes[n_train:].astype(int)

def frequency_encode(train, test, columns):
    """Add a `<column>_FE` feature holding each value's relative frequency.

    The frequency is the count of the value over train and test combined, divided
    by the total number of rows in both frames. Missing values are not counted
    and map to NaN. Both frames are modified in place.

    Author's rationale: this reportedly works well in many Kaggle competitions.
    The intuition is that a model can associate unusually low or high frequencies
    with fraud, e.g. if most frauds come from a certain email domain, the
    frequency becomes a useful feature.
    """
    total_rows = len(train) + len(test)
    for column in columns:
        stacked = pd.concat([train[column], test[column]], axis=0)
        share_by_value = (stacked.value_counts(dropna=True) / total_rows).to_dict()

        feature_name = f"{column}_FE"
        train[feature_name] = train[column].map(share_by_value).astype(float)
        test[feature_name] = test[column].map(share_by_value).astype(float)

def concatenate(train, test, col1, col2):
    """Create a combined `<col1>_<col2>` column and label-encode it.

    In each frame the two columns are cast to strings and joined with "_";
    the resulting column is then converted to integer codes by `encode_labels`.
    Both frames are modified in place.
    """
    new_name = f"{col1}_{col2}"
    for frame in (train, test):
        frame[new_name] = frame[col1].astype(str) + "_" + frame[col2].astype(str)

    encode_labels(train, test, new_name)
    

def groupby(train, test, index, column, aggregation):
    """Add a per-group aggregate of `column` as a new feature on both frames.

    Example: the mean transaction amount for every card1_addr1 pair.

    The aggregate is computed over train and test combined, grouped by `index`,
    and stored as `<column>_<index>_<aggregation>`. Rows whose group has no
    aggregate value (e.g. the std of a single-row group) are filled with -1.
    Both frames are modified in place.

    Parameters:
        train, test: DataFrames containing `index` and `column`.
        index: name of the column to group by.
        column: name of the column to aggregate.
        aggregation: aggregation name understood by pandas (e.g. "mean", "std").
    """
    new_name = f"{column}_{index}_{aggregation}"

    combined = pd.concat([train[[index, column]], test[[index, column]]])
    lookup = combined.groupby(index)[column].agg(aggregation).to_dict()

    # Each frame is mapped through its own group keys; assigning the filled result
    # (instead of fillna(inplace=True) on a column selection) guarantees the fill sticks.
    train[new_name] = train[index].map(lookup).astype(float).fillna(-1)
    test[new_name] = test[index].map(lookup).astype(float).fillna(-1)

def groupby_unique(train, test, columns, card_id):
    """Add, per column, the number of distinct values seen for each `card_id`.

    Author's rationale: a card_id identifies a card, and a card usually transacts
    from only a handful of email addresses, locations and similar details, so the
    number of unique values per card is worth having as a feature.

    The distinct count is computed over train and test combined and stored as
    float32 in `<column>_<card_id>_count`. Both frames are modified in place.
    """
    for column in columns:
        pair = [card_id, column]
        stacked = pd.concat([train[pair], test[pair]], axis=0)
        distinct_per_card = stacked.groupby(card_id)[column].nunique().to_dict()

        feature_name = f"{column}_{card_id}_count"
        train[feature_name] = train[card_id].map(distinct_per_card).astype("float32")
        test[feature_name] = test[card_id].map(distinct_per_card).astype("float32")

In [None]:
# Spot-check the rows belonging to one encoded card1/addr1 group (code 13832).
# NOTE(review): "card1_addr1" is only created by the concatenate(...) call further down,
# so on a fresh top-to-bottom run this cell raises KeyError — confirm where it belongs.
train_transactions[train_transactions["card1_addr1"] == 13832][["card1", "TransactionAmt", "addr1", "P_emaildomain"]]

In [None]:
# Remove the row identifier and the raw timestamp from both feature frames.
id_and_time_columns = ["TransactionID", "TransactionDT"]
train_transactions = train_transactions.drop(columns=id_and_time_columns)
test_transactions = test_transactions.drop(columns=id_and_time_columns)

In [9]:
# Express every D column relative to the transaction day: D{i}_norm = TransactionDay - D{i}.
for i in range(1, 16):
    d_col = f"D{i}"
    # Several D columns are dropped in an earlier cell; skip any that are no longer present
    # so the cell also works on a top-to-bottom run.
    if d_col not in train_transactions.columns or d_col not in test_transactions.columns:
        continue
    train_transactions[f"{d_col}_norm"] = train_transactions["TransactionDay"] - train_transactions[d_col]
    # The test feature must be built from the test frame's own D values.
    test_transactions[f"{d_col}_norm"] = test_transactions["TransactionDay"] - test_transactions[d_col]

In [11]:
# Build the combined card identifiers: card1+addr1, then card1+addr1+P_emaildomain.
for left, right in (("card1", "addr1"), ("card1_addr1", "P_emaildomain")):
    concatenate(train_transactions, test_transactions, left, right)

# Relative frequency of each raw and combined identifier.
frequency_columns = [
    "addr1", "card1", "card2", "card3", "P_emaildomain",
    "card1_addr1", "card1_addr1_P_emaildomain",
]
frequency_encode(train_transactions, test_transactions, frequency_columns)

# Mean and std of TransactionAmt and of D11 within each identifier group.
# Loop order (value column -> group key -> statistic) keeps the new columns in the
# same order as writing the twelve calls out one by one.
group_keys = ("card1", "card1_addr1", "card1_addr1_P_emaildomain")
for value_column in ("TransactionAmt", "D11"):
    for group_key in group_keys:
        for statistic in ("mean", "std"):
            groupby(train_transactions, test_transactions, group_key, value_column, statistic)

In [12]:
# Combine the card1/addr1 identifier with D1_norm into one label-encoded column.
concatenate(
    train=train_transactions,
    test=test_transactions,
    col1="card1_addr1",
    col2="D1_norm",
)

In [None]:
# Remove the raw identifiers and the intermediate combined keys from both frames.
# The derived "card1_addr1_FE" and "card1_addr1_P_emaildomain_FE" features are
# deliberately kept (they were commented out of the drop list).
helper_columns = [
    "card1", "addr1", "P_emaildomain", "D1",
    "card1_addr1", "card1_addr1_P_emaildomain",
]
train_transactions = train_transactions.drop(columns=helper_columns)
test_transactions = test_transactions.drop(columns=helper_columns)

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GroupKFold

In [None]:
# TODO: decide what to do with these columns (currently kept; the drop below is disabled)
# train_transactions = train_transactions.drop(columns=[
#     "ProductCD", "M1", "M2", "M3", "M4", "M5", "M6", "P_emaildomain", "R_emaildomain", "card4", "card5", "card6"]
# )
# test_transactions = test_transactions.drop(columns=[
#     "ProductCD", "M1", "M2", "M3", "M4", "M5", "M6", "P_emaildomain", "R_emaildomain", "card4", "card5", "card6"]
# )

In [None]:
# Simple hold-out split of the training rows: the first three quarters are used for
# fitting (idxT) and the last quarter for validation (idxV). Both index sets must come
# from train_transactions because they are used to select rows of it and of y_train.
split_at = 3 * len(train_transactions) // 4
idxT = train_transactions.index[:split_at]
idxV = train_transactions.index[split_at:]

In [None]:
# Fit a histogram-based gradient boosting baseline on the idxT rows.
hgbc_params = {"max_depth": 12, "learning_rate": 0.02}
hgbc = HistGradientBoostingClassifier(**hgbc_params)

X_fit = train_transactions.loc[idxT]
y_fit = y_train[idxT]
hgbc.fit(X_fit, y_fit)

In [None]:
# Score the fitted baseline on the idxV rows.
X_holdout, y_holdout = train_transactions.loc[idxV], y_train[idxV]
hgbc.score(X_holdout, y_holdout)

In [None]:
# Build the submission frame: one fraud probability per test TransactionID.
# The ID header is spelled "TransactionID", matching the column name in the source data,
# and the positive-class probability is used instead of hard 0/1 labels.
prediction = pd.DataFrame({
    "TransactionID": t1,
    "isFraud": hgbc.predict_proba(test_transactions)[:, 1],
})

In [None]:
# Write the current prediction frame to result.csv, without the DataFrame index.
prediction.to_csv("result.csv", index=False)

#### XGBoost
- faster than the gradient boosting implementation in sklearn
- can handle missing values
- can fit categorical features directly, without prior encoding

In [None]:
import xgboost as xgb

In [None]:
# Baseline XGBoost configuration; the value -1 is treated as "missing".
xgb_params = dict(
    n_estimators=2000,
    max_depth=12,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.4,
    missing=-1,
    eval_metric='auc',
)
# Optional device settings, left disabled:
#   CPU: nthread=4, tree_method='hist'
#   GPU: tree_method='gpu_hist'
clf = xgb.XGBClassifier(**xgb_params)

In [None]:
# LabelEncoder instance for turning categorical values into integer codes.
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [None]:
# Integer-encode M7..M9 with one mapping shared by train and test (encode_labels
# factorizes both frames together; fitting a separate encoder per frame can give the
# same category different codes). An earlier cell may already have dropped these
# columns, so only the ones still present in both frames are encoded.
for column in ["M7", "M8", "M9"]:
    if column in train_transactions.columns and column in test_transactions.columns:
        encode_labels(train_transactions, test_transactions, column)

In [None]:
# Integer-encode the remaining categorical columns with one mapping shared by train
# and test. encode_labels factorizes both frames together, so a category gets the same
# code in both; missing values become -1, the value the XGBoost models here treat as missing.
for column in ["ProductCD", "card4", "card6", "R_emaildomain",
               "M1", "M2", "M3", "M4", "M5", "M6",]:
    encode_labels(train_transactions, test_transactions, column)

In [None]:
import datetime
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

# DT_M is a running month index: (year - 2017) * 12 + month of the transaction date.
# The date is derived from TransactionDay (whole days since START_DATE) rather than
# TransactionDT, because TransactionDT is dropped by an earlier cell. START_DATE is at
# midnight, so adding whole days lands on the same calendar date, and hence the same
# month, as adding the raw seconds would.
for frame in (train_transactions, test_transactions):
    transaction_date = pd.Timestamp(START_DATE) + pd.to_timedelta(frame["TransactionDay"], unit="D")
    frame["DT_M"] = (transaction_date.dt.year - 2017) * 12 + transaction_date.dt.month

In [None]:
# Month-grouped cross-validation: each fold withholds the rows of one DT_M group.
# oof   - out-of-fold fraud probability for every training row
# preds - test-set fraud probability, averaged over the folds
oof = np.zeros(len(train_transactions))
preds = np.zeros(len(test_transactions))

skf = GroupKFold(n_splits=6)

for i, (idxT, idxV) in enumerate(skf.split(train_transactions, y_train, groups=train_transactions['DT_M'])):
    month = train_transactions.iloc[idxV]['DT_M'].iloc[0]
    print('Fold', i, 'withholding month', month)
    print(' rows of train =', len(idxT), 'rows of holdout =', len(idxV))
    clf = xgb.XGBClassifier(
        n_estimators=5000,
        max_depth=12,
        learning_rate=0.02,
        subsample=0.8,
        colsample_bytree=0.4,
        missing=-1,
        eval_metric='auc',
        # USE CPU
        #nthread=4,
        #tree_method='hist'
        # USE GPU
        #tree_method='gpu_hist'
    )
    # NOTE(review): newer xgboost releases expect early_stopping_rounds on the
    # constructor rather than on fit() — confirm against the installed version.
    h = clf.fit(train_transactions.iloc[idxT], y_train.iloc[idxT],
                eval_set=[(train_transactions.iloc[idxV], y_train.iloc[idxV])],
                verbose=100, early_stopping_rounds=200)

    oof[idxV] += clf.predict_proba(train_transactions.iloc[idxV])[:, 1]
    # Accumulate TEST predictions: preds has one slot per test row, so predicting on the
    # training frame here would be both the wrong data and the wrong length.
    preds += clf.predict_proba(test_transactions)[:, 1] / skf.n_splits

In [None]:
# Refit the current classifier on the idxT rows, monitoring AUC on the idxV rows.
holdout_set = [(train_transactions.loc[idxV], y_train[idxV])]
h = clf.fit(
    train_transactions.loc[idxT],
    y_train[idxT],
    eval_set=holdout_set,
    verbose=50,
    early_stopping_rounds=100,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pair every feature with its importance and keep the 50 most important ones.
importance_pairs = sorted(zip(clf.feature_importances_, train_transactions.columns))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value', 'Feature'])
top_features = feature_imp.sort_values(by="Value", ascending=False).iloc[:50]

# Horizontal bar chart, most important feature first.
plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=top_features)
plt.title('XGB95 Most Important Features')
plt.tight_layout()
plt.show()

In [13]:
# Build the submission frame from the XGBoost classifier.
# predict_proba returns one column per class; only the positive-class (fraud) column
# is kept so that the DataFrame column is 1-dimensional. The ID header is spelled
# "TransactionID", matching the column name in the source data.
prediction = pd.DataFrame({
    "TransactionID": t1,
    "isFraud": clf.predict_proba(test_transactions)[:, 1],
})

NameError: name 'clf' is not defined

In [None]:
# Preview the stored test TransactionIDs (first rows only).
t1.head()

In [None]:
# Write the current prediction frame to result_xgb.csv, without the DataFrame index.
prediction.to_csv("result_xgb.csv", index=False)

In [None]:
# The reference date is given; elsewhere in this notebook TransactionDT seconds are added to it.
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')
# TODO: unfinished lookup — the original `train_transactions[""]` names no column and
# always raised KeyError; fill in the intended column before re-enabling it.

In [None]:
# Preview the engineered test frame (first rows only rather than the whole table).
test_transactions.head()