# Example of using ftcv

In [1]:
import pandas as pd
import os 
from datetime import datetime, timedelta
import warnings
import numpy as np
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Silence all library warnings so the example output stays readable.
warnings.filterwarnings('ignore')

# The data file is read through the relative path "example/...", so the
# working directory must be the one that contains the "example" folder.
# The guard keeps this cell idempotent: re-running it (or starting the kernel
# from that directory already) no longer climbs one level too far.
if not os.path.isdir("example"):
    os.chdir("..")

  from pandas import MultiIndex, Int64Index


Use synthetic data generated from synthetic-transaction-generator

In [2]:
# Read the simulated transactions (the first CSV column is used as the index),
# parse both date columns, then order the rows by transaction date and
# replace the index with a fresh 0..n-1 range.
df = (
    pd.read_csv("example/sim_data_2020-01-01_2022-07-01.csv", index_col=0)
    .assign(
        transaction_date=lambda t: pd.to_datetime(t['transaction_date']),
        fraud_identified_date=lambda t: pd.to_datetime(t['fraud_identified_date']),
    )
    .sort_values('transaction_date')
    .reset_index(drop=True)
)

# Label column and the two date references, all taken from the sorted frame.
y = df['fraud']
date_ref = df['transaction_date']
fraud_detected_date_ref = df['fraud_identified_date']

# Feature matrix: the transaction amount plus the columns named '1' to '8'.
X = df[
    [
        'transaction_amount',
        '1', '2', '3', '4',
        '5', '6', '7', '8',
    ]
]

Obtain the row indices of each train/test fold with FraudTimeSplit

In [3]:
from ftcv.FraudTimeSplit import FraudTimeSplit

# Build a 5-split splitter with fraud_lag=90 and print the index of the
# train side and the test side of every fold it yields.
ftscv = FraudTimeSplit(n_splits=5, fraud_lag=90)
fold_indices = ftscv.split(X, y, date_ref, fraud_detected_date_ref)
for fold in fold_indices:
    train_idx, test_idx = fold
    print("TRAIN", train_idx)
    print("TEST", test_idx)

TRAIN Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            67468, 67510, 67527, 67529, 67887, 67986, 68111, 68309, 68523,
            68763],
           dtype='int64', length=29710)
TEST Int64Index([ 28024,  28025,  28026,  28028,  28029,  28030,  28031,  28032,
             28033,  28034,
            ...
            136825, 136828, 136868, 136897, 136939, 136940, 137151, 137153,
            137154, 137401],
           dtype='int64', length=69169)
TRAIN Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            136825, 136828, 136868, 136897, 136939, 136940, 137151, 137153,
            137154, 137401],
           dtype='int64', length=98879)
TEST Int64Index([ 97253,  97254,  97255,  97256,  97257,  97258,  97259,  97260,
             97261,  97262,
            ...
            205240, 205241, 205249, 205266, 205579, 205581, 205601, 205627,
       

Obtain the row indices of each train/test fold with FraudTimeSplit and a shorter fraud_lag. You will see that more training rows are selected (for example, 55730 instead of 29710 in the first fold).

In [4]:
from ftcv.FraudTimeSplit import FraudTimeSplit

# Same 5-split setup, but with fraud_lag lowered to 30; print the train and
# test index of each fold so the sizes can be compared with the 90 case.
ftscv = FraudTimeSplit(n_splits=5, fraud_lag=30)
short_lag_folds = ftscv.split(X, y, date_ref, fraud_detected_date_ref)
for fold in short_lag_folds:
    train_idx, test_idx = fold
    print("TRAIN", train_idx)
    print("TEST", test_idx)

TRAIN Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            67468, 67510, 67527, 67529, 67887, 67986, 68111, 68309, 68523,
            68763],
           dtype='int64', length=55730)
TEST Int64Index([ 55349,  55350,  55351,  55352,  55353,  55354,  55355,  55357,
             55358,  55359,
            ...
            136825, 136828, 136868, 136897, 136939, 136940, 137151, 137153,
            137154, 137401],
           dtype='int64', length=69177)
TRAIN Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            136825, 136828, 136868, 136897, 136939, 136940, 137151, 137153,
            137154, 137401],
           dtype='int64', length=124907)
TEST Int64Index([124538, 124539, 124541, 124542, 124543, 124544, 124545, 124546,
            124547, 124548,
            ...
            205240, 205241, 205249, 205266, 205579, 205581, 205601, 205627,
      

Carry out cross-validation on XGBClassifier

In [5]:
from ftcv.fraud_time_cross_validate import fraud_time_cross_validate

# Cross-validate an XGBClassifier with default settings, using 3 splits and
# fraud_lag=90. The returned value is shown as the cell output.
xgb = XGBClassifier()
xgb_cv_result = fraud_time_cross_validate(
    xgb, X, y, date_ref, fraud_detected_date_ref, n_splits=3, fraud_lag=90
)
xgb_cv_result



[0.9740034494118559, 0.9762788539612574, 0.9768259103075116]

You can also use sklearn.pipeline

In [6]:
# Chain a StandardScaler and a LogisticRegression into a single estimator.
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('lr', LogisticRegression())])

# Cross-validate the pipeline with 3 splits and fraud_lag=90; the returned
# value is shown as the cell output.
fraud_time_cross_validate(
    pipe, X, y, date_ref, fraud_detected_date_ref, n_splits=3, fraud_lag=90
)

[0.9689634790360558, 0.970396358259155, 0.9694431608133087]

The same pipeline with a different fraud_lag (30 instead of 90)

In [7]:
# Fresh scaler + logistic-regression pipeline for the shorter-lag run.
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('lr', LogisticRegression())])

# Cross-validate the pipeline with 3 splits and fraud_lag=30; the returned
# value is shown as the cell output.
short_lag_cv_result = fraud_time_cross_validate(
    pipe, X, y, date_ref, fraud_detected_date_ref, n_splits=3, fraud_lag=30
)
short_lag_cv_result

[0.9694081713529031, 0.9702573898278922, 0.9690822283218]