# Prepare services

In [2]:
import boto3
import sagemaker
from sagemaker.session import Session

# Region configured for the default boto3 session; reused by every client below.
region = boto3.Session().region_name

boto_session = boto3.Session(region_name=region)

# SageMaker control-plane client and Feature Store runtime (record read/write) client.
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

# S3 client used later to read the raw CSV files and to poll the offline store.
s3 = boto3.client('s3')

# SageMaker session wired to the explicit clients created above.
# NOTE(review): `feature_store_session` is not referenced again in the visible
# notebook; the FeatureGroup objects below are built with `sagemaker_session`
# instead — confirm which session is intended.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

In [3]:
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

In [None]:
print(role)

In [5]:
# unneeded, kept for reference
default_bucket = sagemaker_session.default_bucket()

# Inspect data

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io

# Location of one day of raw simulated transactions in S3.
data_bucket = 'creditcardfraud-project'
prefix = 'simulated_data/simulated-data-raw/data'
sample_file_name = '2018-04-01.csv'
sample_file_ref = f"{prefix}/{sample_file_name}"

# Download the object fully into memory, then parse the bytes as CSV.
sample_data_object = s3.get_object(Bucket=data_bucket, Key=sample_file_ref)
sample_csv_bytes = sample_data_object['Body'].read()
transactions_df = pd.read_csv(io.BytesIO(sample_csv_bytes))

In [11]:
transactions_df.dtypes

TRANSACTION_ID         int64
TX_DATETIME           object
CUSTOMER_ID            int64
TERMINAL_ID            int64
TX_AMOUNT            float64
TX_TIME_SECONDS        int64
TX_TIME_DAYS           int64
TX_FRAUD               int64
TX_FRAUD_SCENARIO      int64
dtype: object

## Data conversion:
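Feature Store has no feature type for pandas `object` columns, so `TX_DATETIME` must be converted to the pandas `string` dtype before loading feature definitions. See https://sagemaker.readthedocs.io/en/stable/amazon_sagemaker_featurestore.html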

In [59]:
transactions_df['TX_DATETIME'] = transactions_df['TX_DATETIME'].astype('string')

# Feature store


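## Creation (online and offline store)

The feature group below is created with `enable_online_store=True` and an `s3_uri`, so records are available from the online store for low-latency lookups (used in the "Retrieve data" section) and are also persisted to the offline store. As the documentation mentions: The offline store is an append-only store, enabling Feature Store to maintain a historical record of all feature values. Data is stored in the offline store in Parquet format for optimized storage and query access.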

In [6]:
from time import gmtime, strftime, sleep

transaction_data_feature_group_name = "transactions-feature-group-" + strftime("%d-%H-%M-%S", gmtime())

SDK documentation for Feature group:
https://sagemaker.readthedocs.io/en/stable/api/prep_data/feature_store.html?highlight=featurestore#feature-group

In [59]:
from sagemaker.feature_store.feature_group import FeatureGroup

transaction_feature_group = FeatureGroup(
    name=transaction_data_feature_group_name, sagemaker_session=sagemaker_session
)

In [63]:
import time

current_time_sec = int(round(time.time()))

record_identifier_feature_name = "TRANSACTION_ID"

In [63]:
# append EventTime feature
# A scalar is broadcast to every row whatever the frame's index labels are.
# Assigning a freshly built pd.Series instead is aligned on index labels and
# silently leaves NaN for any row whose label is not in 0..n-1.
transactions_df["EventTime"] = float(current_time_sec)

In [64]:
transactions_df

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,EventTime
0,0,2018-04-01 00:00:31,596,3156,57.16,31,0,0,0,1.638561e+09
1,1,2018-04-01 00:02:10,4961,3412,81.51,130,0,0,0,1.638561e+09
2,2,2018-04-01 00:07:56,2,1365,146.00,476,0,0,0,1.638561e+09
3,3,2018-04-01 00:09:29,4128,8737,64.49,569,0,0,0,1.638561e+09
4,4,2018-04-01 00:10:34,927,9906,50.99,634,0,0,0,1.638561e+09
...,...,...,...,...,...,...,...,...,...,...
9483,9483,2018-04-01 23:56:50,3289,6699,48.69,86210,0,0,0,1.638561e+09
9484,9484,2018-04-01 23:58:14,3230,6664,85.97,86294,0,0,0,1.638561e+09
9485,9485,2018-04-01 23:58:31,296,3702,120.88,86311,0,0,0,1.638561e+09
9486,9486,2018-04-01 23:59:28,2557,146,8.02,86368,0,0,0,1.638561e+09


In [65]:
# Load feature definitions to the feature group.
transaction_feature_group.load_feature_definitions(data_frame=transactions_df)

[FeatureDefinition(feature_name='TRANSACTION_ID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_DATETIME', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='CUSTOMER_ID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TERMINAL_ID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_AMOUNT', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='TX_TIME_SECONDS', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_TIME_DAYS', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_FRAUD', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_FRAUD_SCENARIO', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='EventTime', feature_type=<FeatureTypeEnum.FRACTIO

In [66]:
baseURI = 'creditcardfraud-project'
offline_feature_store_bucket = 's3://{}/{}'.format(baseURI, prefix)

In [None]:
# Create feature group

prefix = 'featurestore'

transaction_feature_group.create(
    s3_uri=f"s3://{data_bucket}/{prefix}",
    record_identifier_name='TRANSACTION_ID',
    event_time_feature_name="EventTime",
    role_arn=role,
    enable_online_store=True,
)

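The feature group creation request has now been submitted. Creation is asynchronous, so check its status (next section) before ingesting data. Offline-store data will be written under the `s3_uri` given above.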

## Check status

In [None]:
transaction_feature_group.describe()

In [None]:
# We use the boto client to list FeatureGroups
sagemaker_session.boto_session.client('sagemaker', region_name=region).list_feature_groups()

In [119]:
def check_feature_group_status(feature_group, poll_interval_sec=5, timeout_sec=None):
    """Block until ``feature_group`` leaves the ``Creating`` state.

    Args:
        feature_group: Object exposing ``describe()`` (returning a dict with a
            ``FeatureGroupStatus`` key) and a ``name`` attribute.
        poll_interval_sec: Seconds to sleep between two ``describe()`` calls.
        timeout_sec: Maximum number of seconds to keep polling. ``None``
            (the default) polls until the status changes.

    Raises:
        RuntimeError: If the final status is anything other than ``Created``.
            The message carries the status and the ``FailureReason`` reported
            by ``describe()`` (when present) so the cause is visible directly
            in the notebook.
        TimeoutError: If ``timeout_sec`` is set and the group is still
            ``Creating`` once it has elapsed.
    """
    deadline = None if timeout_sec is None else time.monotonic() + timeout_sec

    description = feature_group.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating":
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Feature group {feature_group.name} still in status 'Creating' "
                f"after {timeout_sec} seconds"
            )
        print("Waiting for Feature Group to be Created")
        time.sleep(poll_interval_sec)
        # Keep the whole description so the failure reason is available below.
        description = feature_group.describe()
        status = description.get("FeatureGroupStatus")

    if status != "Created":
        reason = description.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name}: "
            f"status={status!r}, reason={reason}"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")

check_feature_group_status(transaction_feature_group)

FeatureGroup transactions-feature-group-03-19-47-20 successfully created.


## Ingest data into Feature Store

In [79]:
transaction_feature_group.ingest(
    data_frame=transactions_df, max_workers=3, wait=True
)

IngestionManagerPandas(feature_group_name='transactions-feature-group-03-19-47-20', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7feec1afc1d0>, max_workers=3, max_processes=1, _async_result=<multiprocess.pool.MapResult object at 0x7feebd025e10>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

## Retrieve data

In [82]:
# Check a sample record
transaction_id = 35
sample_record = sagemaker_session.boto_session.client('sagemaker-featurestore-runtime', region_name=region)\
                .get_record(FeatureGroupName=transaction_data_feature_group_name, RecordIdentifierValueAsString=str(transaction_id))
print(sample_record)

{'ResponseMetadata': {'RequestId': '19801580-6399-42bd-8a57-6149e9aaf1d9', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '19801580-6399-42bd-8a57-6149e9aaf1d9', 'content-type': 'application/json', 'content-length': '563', 'date': 'Fri, 03 Dec 2021 22:54:07 GMT'}, 'RetryAttempts': 0}, 'Record': [{'FeatureName': 'TRANSACTION_ID', 'ValueAsString': '35'}, {'FeatureName': 'TX_DATETIME', 'ValueAsString': '2018-04-01 00:31:51'}, {'FeatureName': 'CUSTOMER_ID', 'ValueAsString': '1753'}, {'FeatureName': 'TERMINAL_ID', 'ValueAsString': '8676'}, {'FeatureName': 'TX_AMOUNT', 'ValueAsString': '115.78'}, {'FeatureName': 'TX_TIME_SECONDS', 'ValueAsString': '1911'}, {'FeatureName': 'TX_TIME_DAYS', 'ValueAsString': '0'}, {'FeatureName': 'TX_FRAUD', 'ValueAsString': '0'}, {'FeatureName': 'TX_FRAUD_SCENARIO', 'ValueAsString': '0'}, {'FeatureName': 'EventTime', 'ValueAsString': '1638560847.0'}]}


In [83]:
several_records = sagemaker_session.boto_session.client(
    "sagemaker-featurestore-runtime", region_name=region
).batch_get_record(
    Identifiers=[
        {
            "FeatureGroupName": transaction_data_feature_group_name,
            "RecordIdentifiersValueAsString": ["10", "11", "12"],
        },
    ]
)

print(several_records)

{'ResponseMetadata': {'RequestId': '3a0f9430-b254-4b0c-872b-7063ac7bf614', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '3a0f9430-b254-4b0c-872b-7063ac7bf614', 'content-type': 'application/json', 'content-length': '2033', 'date': 'Fri, 03 Dec 2021 22:58:05 GMT'}, 'RetryAttempts': 0}, 'Records': [{'FeatureGroupName': 'transactions-feature-group-03-19-47-20', 'RecordIdentifierValueAsString': '12', 'Record': [{'FeatureName': 'TRANSACTION_ID', 'ValueAsString': '12'}, {'FeatureName': 'TX_DATETIME', 'ValueAsString': '2018-04-01 00:18:01'}, {'FeatureName': 'CUSTOMER_ID', 'ValueAsString': '1948'}, {'FeatureName': 'TERMINAL_ID', 'ValueAsString': '3372'}, {'FeatureName': 'TX_AMOUNT', 'ValueAsString': '54.51'}, {'FeatureName': 'TX_TIME_SECONDS', 'ValueAsString': '1081'}, {'FeatureName': 'TX_TIME_DAYS', 'ValueAsString': '0'}, {'FeatureName': 'TX_FRAUD', 'ValueAsString': '0'}, {'FeatureName': 'TX_FRAUD_SCENARIO', 'ValueAsString': '0'}, {'FeatureName': 'EventTime', 'ValueAsString': '16

## Clean up/deletion

In [84]:
# uncomment next line and run to remove the feature group
#transaction_feature_group.delete()

## List feature groups

In [186]:
response = sagemaker_client.list_feature_groups()
for fg in response['FeatureGroupSummaries']:
    fg_name = fg['FeatureGroupName']
    print(f'Found feature group: {fg_name}')

Found feature group: transactions-feature-group-03-19-47-20
Found feature group: transactions-engineered-feature-group-05-18-35-51
Found feature group: transactions-engineered-feature-group-04-01-05-59


# Data transformation

In [86]:
transactions_df['TX_DATETIME'] = pd.to_datetime(transactions_df['TX_DATETIME'])

In [47]:
def is_weekend(tx_datetime):
    """Return 1 if ``tx_datetime`` falls on a Saturday or Sunday, else 0.

    ``tx_datetime`` must expose ``weekday()``, which numbers Monday as 0 and
    Sunday as 6; values 5 and 6 are therefore the weekend.
    """
    return 1 if tx_datetime.weekday() >= 5 else 0

In [90]:
transactions_df['TX_DURING_WEEKEND']=transactions_df.TX_DATETIME.apply(is_weekend)

In [48]:
def is_night(tx_datetime):
    """Return 1 if the transaction hour is 6 or earlier (00:00-06:59), else 0.

    ``tx_datetime`` must expose an ``hour`` attribute.

    NOTE(review): the comparison is ``hour <= 6``, so 06:00-06:59 counts as
    night. An earlier comment described this as "hour less than 6" — confirm
    which boundary is intended before changing it, since ingested features
    were computed with the current rule.
    """
    return 1 if tx_datetime.hour <= 6 else 0

In [93]:
transactions_df['TX_DURING_NIGHT']=transactions_df.TX_DATETIME.apply(is_night)

In [49]:
def get_customer_spending_behaviour_features(customer_transactions, windows_size_in_days=[1,7,30]):
    """Add rolling transaction-count and average-amount features for one customer.

    Args:
        customer_transactions: DataFrame with at least ``TX_DATETIME``,
            ``TX_AMOUNT`` and ``TRANSACTION_ID`` columns. ``TX_DATETIME`` must
            be datetime-like, since it is used as the index of time-based
            rolling windows.
        windows_size_in_days: Window lengths in days. For each length ``n`` the
            columns ``CUSTOMER_ID_NB_TX_<n>DAY_WINDOW`` and
            ``CUSTOMER_ID_AVG_AMOUNT_<n>DAY_WINDOW`` are added.

    Returns:
        A new DataFrame, sorted by ``TX_DATETIME`` and indexed by
        ``TRANSACTION_ID``, carrying the added columns. The input frame is not
        modified, because ``sort_values`` returns a new frame.
    """
    # Chronological order is required by the time-based rolling windows.
    history = customer_transactions.sort_values('TX_DATETIME')

    # Offsets such as "7d" only work against a datetime-like index.
    history.index = history.TX_DATETIME

    for n_days in windows_size_in_days:
        amounts_in_window = history['TX_AMOUNT'].rolling(f'{n_days}d')

        tx_count = amounts_in_window.count()
        # The window always contains the current row, so the count is at least 1
        # (assuming TX_AMOUNT is not null) and the division is safe.
        mean_amount = amounts_in_window.sum() / tx_count

        # Assign positionally (as plain lists) to avoid index alignment issues.
        history[f'CUSTOMER_ID_NB_TX_{n_days}DAY_WINDOW'] = list(tx_count)
        history[f'CUSTOMER_ID_AVG_AMOUNT_{n_days}DAY_WINDOW'] = list(mean_amount)

    # Hand the frame back keyed by transaction id.
    history.index = history.TRANSACTION_ID

    return history

In [95]:
transactions_df=transactions_df.groupby('CUSTOMER_ID').apply(lambda x: get_customer_spending_behaviour_features(x, windows_size_in_days=[1,7,30]))
transactions_df=transactions_df.sort_values('TX_DATETIME').reset_index(drop=True)

In [50]:
def get_count_risk_rolling_window(terminal_transactions, delay_period=7, windows_size_in_days=[1,7,30], feature="TERMINAL_ID"):
    """Add delayed rolling transaction-count and fraud-risk features for one group.

    For each window length ``n`` the features cover the ``n`` days that
    precede the most recent ``delay_period`` days. They are computed as the
    difference between a ``delay_period + n`` day rolling window and a
    ``delay_period`` day rolling window over ``TX_FRAUD``.

    Args:
        terminal_transactions: DataFrame with at least ``TX_DATETIME``
            (datetime-like), ``TX_FRAUD`` and ``TRANSACTION_ID`` columns.
        delay_period: Number of most recent days excluded from each window.
        windows_size_in_days: Window lengths in days.
        feature: Prefix of the generated column names
            (``<feature>_NB_TX_<n>DAY_WINDOW`` and ``<feature>_RISK_<n>DAY_WINDOW``).

    Returns:
        A new DataFrame sorted by ``TX_DATETIME`` and indexed by
        ``TRANSACTION_ID``. Every NaN in the frame (not only in the new risk
        columns) is replaced by 0.
    """
    history = terminal_transactions.sort_values('TX_DATETIME')

    # Time-based rolling windows need the timestamps as the index.
    history.index = history.TX_DATETIME

    fraud_flags = history['TX_FRAUD']

    # Totals over the delay period alone; subtracted from every wider window.
    delay_window = fraud_flags.rolling(f'{delay_period}d')
    frauds_in_delay = delay_window.sum()
    tx_in_delay = delay_window.count()

    for n_days in windows_size_in_days:
        extended_window = fraud_flags.rolling(f'{delay_period + n_days}d')

        frauds_in_window = extended_window.sum() - frauds_in_delay
        tx_in_window = extended_window.count() - tx_in_delay

        # 0/0 yields NaN when no transaction falls in the window; zero-filled below.
        risk = frauds_in_window / tx_in_window

        history[f'{feature}_NB_TX_{n_days}DAY_WINDOW'] = list(tx_in_window)
        history[f'{feature}_RISK_{n_days}DAY_WINDOW'] = list(risk)

    history.index = history.TRANSACTION_ID

    # Undefined risk scores (empty window) become 0.
    history = history.fillna(0)

    return history

In [98]:
transactions_df=transactions_df.groupby('TERMINAL_ID').apply(lambda x: get_count_risk_rolling_window(x, delay_period=7, windows_size_in_days=[1,7,30], feature="TERMINAL_ID"))
transactions_df=transactions_df.sort_values('TX_DATETIME').reset_index(drop=True)

# Feature group for engineered data

In [123]:
transactions_engineered_feature_group_name = "transactions-engineered-feature-group-" + strftime("%d-%H-%M-%S", gmtime())
transactions_engineered_feature_group_name

'transactions-engineered-feature-group-04-01-05-59'

In [124]:
transactions_engineered_feature_group = FeatureGroup(
    name=transactions_engineered_feature_group_name, sagemaker_session=sagemaker_session
)

In [125]:
transactions_df.dtypes

TRANSACTION_ID                           int64
TX_DATETIME                             string
CUSTOMER_ID                              int64
TERMINAL_ID                              int64
TX_AMOUNT                              float64
TX_TIME_SECONDS                          int64
TX_TIME_DAYS                             int64
TX_FRAUD                                 int64
TX_FRAUD_SCENARIO                        int64
EventTime                              float64
TX_DURING_WEEKEND                        int64
TX_DURING_NIGHT                          int64
CUSTOMER_ID_NB_TX_1DAY_WINDOW          float64
CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW     float64
CUSTOMER_ID_NB_TX_7DAY_WINDOW          float64
CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW     float64
CUSTOMER_ID_NB_TX_30DAY_WINDOW         float64
CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW    float64
TERMINAL_ID_NB_TX_1DAY_WINDOW          float64
TERMINAL_ID_RISK_1DAY_WINDOW           float64
TERMINAL_ID_NB_TX_7DAY_WINDOW          float64
TERMINAL_ID_R

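Feature Store has no feature type for the pandas datetime dtype that `TX_DATETIME` was converted to during the transformation step, so the column must be converted back to the `string` dtype before loading the feature definitions.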

In [126]:
transactions_df['TX_DATETIME'] = transactions_df['TX_DATETIME'].astype('string')

In [127]:
current_time_sec = int(round(time.time()))

# record identifier and event time feature names
record_identifier_feature_name = "TRANSACTION_ID"
event_time_feature_name = "EventTime"

# append EventTime feature
# A scalar is broadcast to every row whatever the frame's index labels are.
# Assigning a freshly built pd.Series instead is aligned on index labels and
# silently leaves NaN for any row whose label is not in 0..n-1.
transactions_df[event_time_feature_name] = float(current_time_sec)

In [128]:
# Load feature definitions to the feature group.
transactions_engineered_feature_group.load_feature_definitions(data_frame=transactions_df)

[FeatureDefinition(feature_name='TRANSACTION_ID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_DATETIME', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='CUSTOMER_ID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TERMINAL_ID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_AMOUNT', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='TX_TIME_SECONDS', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_TIME_DAYS', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_FRAUD', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_FRAUD_SCENARIO', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='EventTime', feature_type=<FeatureTypeEnum.FRACTIO

In [None]:
transactions_engineered_feature_group.create(
    s3_uri=f"s3://{data_bucket}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

# check_feature_group_status(feature_group=transactions_engineered_feature_group)

In [130]:
check_feature_group_status(feature_group=transactions_engineered_feature_group)

Waiting for Feature Group to be Created
Waiting for Feature Group to be Created
FeatureGroup transactions-engineered-feature-group-04-01-05-59 successfully created.


In [None]:
transactions_engineered_feature_group.describe()

In [132]:
transactions_engineered_feature_group.ingest(data_frame=transactions_df, max_workers=5, wait=True)

IngestionManagerPandas(feature_group_name='transactions-engineered-feature-group-04-01-05-59', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7feec1afc1d0>, max_workers=5, max_processes=1, _async_result=<multiprocess.pool.MapResult object at 0x7feeb7451810>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

Let’s wait for the data to appear in our offline store before moving forward to creating a dataset. This will take approximately 5 minutes.

In [None]:
account_id = boto3.client("sts").get_caller_identity()["Account"]
print(account_id)

# Resolve the offline-store location of the *engineered* feature group, i.e.
# the group that was ingested just above. Describing `transaction_feature_group`
# (the raw group) here would poll a different S3 prefix, which may already
# hold data, so the wait could finish before the engineered data has landed.
transactions_engineered_feature_group_resolved_output_s3_uri = (
    transactions_engineered_feature_group.describe()
    .get("OfflineStoreConfig")
    .get("S3StorageConfig")
    .get("ResolvedOutputS3Uri")
)

# list_objects expects a key prefix, not a full s3:// URI.
transactions_engineered_feature_group_s3_prefix = transactions_engineered_feature_group_resolved_output_s3_uri.replace(
    f"s3://{data_bucket}/", ""
)

# Poll once a minute, but give up eventually instead of hanging the kernel.
max_wait_minutes = 30

offline_store_contents = None
for _ in range(max_wait_minutes):
    objects_in_bucket = s3.list_objects(
        Bucket=data_bucket, Prefix=transactions_engineered_feature_group_s3_prefix
    )
    if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
        offline_store_contents = objects_in_bucket["Contents"]
        break
    print("Waiting for data in offline store...\n")
    sleep(60)

if offline_store_contents is None:
    raise TimeoutError(
        f"No data under s3://{data_bucket}/{transactions_engineered_feature_group_s3_prefix} "
        f"after {max_wait_minutes} minutes"
    )

print("Data available.")

# Proceed to transform and add to feature store the whole dataset

In [None]:
# s3 = boto3.client('s3')
# data_bucket = 'creditcardfraud-project'
# folder = 'simulated_data/simulated-data-raw/data'
# sample_file_name = '2018-04-01.csv'
# sample_file_ref = "{}/{}".format(folder, sample_file_name)

# sample_data_object = s3.get_object(Bucket=data_bucket, Key=sample_file_ref)
# transactions_df = pd.read_csv(io.BytesIO(sample_data_object['Body'].read()))

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io
from tqdm import tqdm

In [8]:
data_bucket = 'creditcardfraud-project'
prefix = 'simulated_data/simulated-data-raw/data'

object_list = s3.list_objects_v2(Bucket=data_bucket, Prefix=prefix)

In [40]:
len(object_list['Contents'])

183

In [41]:
# In case there are more than 1000 objects, use this:

# The inbuilt boto3 Paginator class is the easiest way to overcome the 1000 record limitation of list-objects-v2. This can be implemented as follows

# s3 = boto3.client('s3')

# paginator = s3.get_paginator('list_objects_v2')
# pages = paginator.paginate(Bucket='bucket', Prefix='prefix')

# for page in pages:
#     for obj in page['Contents']:
#         print(obj['Size'])

In [42]:
# Read first file
file_ref = object_list['Contents'][0]['Key']
data_object = s3.get_object(Bucket=data_bucket, Key=file_ref)

transactions_df = pd.read_csv(io.BytesIO(data_object['Body'].read()))
#transactions_df

In [43]:
# Read every daily file and concatenate them in one pass.
#
# - Frames are collected in a list and concatenated once: growing a DataFrame
#   inside the loop copies all previous rows on every iteration, and
#   DataFrame.append no longer exists in pandas >= 2.0.
# - The listing is not mutated (no pop), and the first file is read again
#   here, so re-running this cell always yields the same frame.
daily_frames = []
for file_data in tqdm(object_list['Contents']):
    data_object = s3.get_object(Bucket=data_bucket, Key=file_data['Key'])
    daily_frames.append(pd.read_csv(io.BytesIO(data_object['Body'].read())))

# Row order and per-file index labels match appending the files one by one.
transactions_df = pd.concat(daily_frames)

100%|██████████| 182/182 [00:30<00:00,  5.87it/s]


In [44]:
transactions_df

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO
0,0,2018-04-01 00:00:31,596,3156,57.16,31,0,0,0
1,1,2018-04-01 00:02:10,4961,3412,81.51,130,0,0,0
2,2,2018-04-01 00:07:56,2,1365,146.00,476,0,0,0
3,3,2018-04-01 00:09:29,4128,8737,64.49,569,0,0,0
4,4,2018-04-01 00:10:34,927,9906,50.99,634,0,0,0
...,...,...,...,...,...,...,...,...,...
9644,1754150,2018-09-30 23:56:36,161,655,54.24,15810996,182,0,0
9645,1754151,2018-09-30 23:57:38,4342,6181,1.23,15811058,182,0,0
9646,1754152,2018-09-30 23:58:21,618,1502,6.62,15811101,182,0,0
9647,1754153,2018-09-30 23:59:52,4056,3067,55.40,15811192,182,0,0


## Data transformation

In [46]:
transactions_df['TX_DATETIME'] = pd.to_datetime(transactions_df['TX_DATETIME'])

In [51]:
transactions_df['TX_DURING_WEEKEND']=transactions_df.TX_DATETIME.apply(is_weekend)

In [52]:
transactions_df['TX_DURING_NIGHT']=transactions_df.TX_DATETIME.apply(is_night)

In [53]:
transactions_df=transactions_df.groupby('CUSTOMER_ID').apply(lambda x: get_customer_spending_behaviour_features(x, windows_size_in_days=[1,7,30]))
transactions_df=transactions_df.sort_values('TX_DATETIME').reset_index(drop=True)

In [54]:
transactions_df=transactions_df.groupby('TERMINAL_ID').apply(lambda x: get_count_risk_rolling_window(x, delay_period=7, windows_size_in_days=[1,7,30], feature="TERMINAL_ID"))
transactions_df=transactions_df.sort_values('TX_DATETIME').reset_index(drop=True)

## Prepare and Save to feature store

In [57]:
transactions_engineered_feature_group_name = "transactions-engineered-feature-group-" + strftime("%d-%H-%M-%S", gmtime())
transactions_engineered_feature_group_name

'transactions-engineered-feature-group-05-18-35-51'

In [60]:
transactions_engineered_feature_group = FeatureGroup(
    name=transactions_engineered_feature_group_name, sagemaker_session=sagemaker_session
)

In [61]:
transactions_df['TX_DATETIME'] = transactions_df['TX_DATETIME'].astype('string')

In [64]:
current_time_sec = int(round(time.time()))

# record identifier and event time feature names
record_identifier_feature_name = "TRANSACTION_ID"
event_time_feature_name = "EventTime"

# append EventTime feature
# A scalar is broadcast to every row whatever the frame's index labels are.
# Assigning a freshly built pd.Series instead is aligned on index labels and
# silently leaves NaN for any row whose label is not in 0..n-1.
transactions_df[event_time_feature_name] = float(current_time_sec)

In [65]:
# Load feature definitions to the feature group.
transactions_engineered_feature_group.load_feature_definitions(data_frame=transactions_df)

[FeatureDefinition(feature_name='TRANSACTION_ID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_DATETIME', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='CUSTOMER_ID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TERMINAL_ID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_AMOUNT', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='TX_TIME_SECONDS', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_TIME_DAYS', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_FRAUD', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_FRAUD_SCENARIO', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TX_DURING_WEEKEND', feature_type=<FeatureTypeEnum

In [None]:
prefix = 'featurestore'

transactions_engineered_feature_group.create(
    s3_uri=f"s3://{data_bucket}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

In [77]:
check_feature_group_status(feature_group=transactions_engineered_feature_group)

FeatureGroup transactions-engineered-feature-group-05-18-35-51 successfully created.


In [78]:
# NOTE: ingesting the full dataset is slow. It was not timed, but it likely takes more than an hour; put %%time on the first line of this cell to measure it.
transactions_engineered_feature_group.ingest(data_frame=transactions_df, max_workers=5, wait=True)

IngestionManagerPandas(feature_group_name='transactions-engineered-feature-group-05-18-35-51', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f58f01d5bd0>, max_workers=5, max_processes=1, _async_result=<multiprocess.pool.MapResult object at 0x7f58f0b44cd0>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

# Build training set

Data is retrieved from the feature store through a query from Athena.

The training set uses one week of data. The following week is left out as a delay period, and the week after that is used as the validation set (see the corresponding [book section for references](https://fraud-detection-handbook.github.io/fraud-detection-handbook/Chapter_3_GettingStarted/BaselineModeling.html#defining-the-training-and-test-sets)). The data is filtered to these three weeks in SQL; otherwise the query would produce a CSV file of around 0.5 gigabytes, which is not needed. Also note that Athena queries cost $5 per terabyte of data scanned.

In [None]:
sagemaker_client.list_feature_groups()

In [11]:
# Run this cell if restarting the notebook from the build training set part

from sagemaker.feature_store.feature_group import FeatureGroup

transactions_engineered_feature_group_name = 'transactions-engineered-feature-group-05-18-35-51'

transactions_engineered_feature_group = FeatureGroup(
    name=transactions_engineered_feature_group_name, sagemaker_session=sagemaker_session
)

In [12]:
transaction_query = transactions_engineered_feature_group.athena_query()

transaction_table = transaction_query.table_name

In [13]:
transaction_table

'transactions-engineered-feature-group-05-18-35-51-1638729795'

**Hint**: according to Athena documentation, it uses Presto SQL. To add date intervals, check: https://prestodb.io/docs/current/functions/datetime.html

Additionally, head to https://console.aws.amazon.com/athena/ to inspect some queries (the table is already available through the Glue catalog, which is created by default).

In [17]:
# Select the 21 days starting on 2018-07-25 as a half-open interval
# [start, start + 21 days). BETWEEN is inclusive on both ends and would also
# return a transaction stamped exactly at midnight of day 22.
query_string = (
    'SELECT * '
    + 'FROM "' + transaction_table + '" '
    + "WHERE date_parse(tx_datetime, '%Y-%m-%d %H:%i:%s') >= DATE('2018-07-25') "
    + "AND date_parse(tx_datetime, '%Y-%m-%d %H:%i:%s') < DATE('2018-07-25') + interval '21' day "
    + 'ORDER BY transaction_id'
)

# S3 key prefix under which Athena stores the query results.
prefix = 'train_data_query'

# Run the Athena query, block until it finishes, then load the result into a
# pandas DataFrame.
transaction_query.run(
    query_string=query_string,
    output_location="s3://" + data_bucket + "/" + prefix + "/query_results/",
)

transaction_query.wait()
dataset = transaction_query.as_dataframe()

dataset

Unnamed: 0,transaction_id,tx_datetime,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario,tx_during_weekend,...,terminal_id_nb_tx_1day_window,terminal_id_risk_1day_window,terminal_id_nb_tx_7day_window,terminal_id_risk_7day_window,terminal_id_nb_tx_30day_window,terminal_id_risk_30day_window,eventtime,write_time,api_invocation_time,is_deleted
0,1102483,2018-07-25 00:00:29,1111,2328,40.77,9936029,115,0,0,0,...,1.0,0.0,10.0,0.0,32.0,0.0,1.638730e+09,2021-12-05 19:08:05.133,2021-12-05 19:07:12.000,False
1,1102484,2018-07-25 00:01:08,676,6846,9.62,9936068,115,0,0,0,...,2.0,0.0,5.0,0.0,27.0,0.0,1.638730e+09,2021-12-05 19:08:04.636,2021-12-05 19:07:12.000,False
2,1102485,2018-07-25 00:01:35,402,4771,81.55,9936095,115,0,0,0,...,0.0,0.0,12.0,0.0,37.0,0.0,1.638730e+09,2021-12-05 19:08:05.011,2021-12-05 19:07:12.000,False
3,1102486,2018-07-25 00:01:43,4218,863,23.10,9936103,115,0,0,0,...,2.0,0.0,6.0,0.0,22.0,0.0,1.638730e+09,2021-12-05 19:08:05.215,2021-12-05 19:07:12.000,False
4,1102487,2018-07-25 00:02:26,3711,3599,59.25,9936146,115,0,0,0,...,1.0,0.0,10.0,0.0,38.0,0.0,1.638730e+09,2021-12-05 19:08:05.240,2021-12-05 19:07:12.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201290,1303773,2018-08-14 23:57:03,460,6133,16.72,11750223,135,0,0,0,...,1.0,0.0,3.0,0.0,20.0,0.0,1.638730e+09,2021-12-05 19:48:05.039,2021-12-05 19:43:41.000,False
201291,1303774,2018-08-14 23:58:24,3101,3229,38.16,11750304,135,0,0,0,...,2.0,0.0,13.0,0.0,42.0,0.0,1.638730e+09,2021-12-05 19:48:04.740,2021-12-05 19:43:41.000,False
201292,1303775,2018-08-14 23:58:24,4783,7511,69.85,11750304,135,0,0,0,...,0.0,0.0,3.0,0.0,34.0,0.0,1.638730e+09,2021-12-05 19:48:05.079,2021-12-05 19:43:41.000,False
201293,1303776,2018-08-14 23:58:45,2883,8550,14.99,11750325,135,0,0,0,...,3.0,0.0,7.0,0.0,20.0,0.0,1.638730e+09,2021-12-05 19:48:04.370,2021-12-05 19:43:41.000,False


Notice the set of columns added: `eventtime`, `write_time`, `api_invocation_time`, `is_deleted`

In [18]:
query_execution = transaction_query.get_query_execution()

In [19]:
# Same location that was passed to transaction_query.run(), plus the execution id.
# The previous literal contained a stray double quote right after "s3://".
print(f"output_location: s3://{data_bucket}/{prefix}/query_results/{query_execution['QueryExecution']['QueryExecutionId']}.csv")

output_location: s3://"creditcardfraud-project/train_data_query/query_results/48029c9a-5b11-41f9-ac86-7435d8efac5a.csv


Obtain the train/test set split. Observe the Note in the code: `Cards known to be frauded after the delay period are removed from the test set`

In [23]:
def get_train_test_set(transactions_df,
                       start_date_training,
                       delta_train=7,delta_delay=7,delta_test=7):
    """Split transactions into a training set and a delayed test set.

    The training set covers ``delta_train`` days starting at
    ``start_date_training``. The test set covers the ``delta_test`` days that
    start ``delta_train + delta_delay`` days after the first training day.

    Args:
        transactions_df: DataFrame with ``tx_datetime`` (datetime-like),
            ``tx_time_days``, ``tx_fraud``, ``customer_id`` and
            ``transaction_id`` columns.
        start_date_training: First instant of the training period; anything
            ``pd.Timestamp`` accepts (``datetime``, ``Timestamp``, ISO string).
        delta_train: Length of the training period in days.
        delta_delay: Length of the delay period in days.
        delta_test: Length of the test period in days. ``0`` yields an empty
            test set.

    Returns:
        ``(train_df, test_df)``, both sorted by ascending ``transaction_id``.
    """
    # pandas time types keep this function independent of a module-level
    # `import datetime` living in another cell.
    train_start = pd.Timestamp(start_date_training)
    train_end = train_start + pd.Timedelta(days=delta_train)

    # Get the training set data
    train_df = transactions_df[(transactions_df.tx_datetime >= train_start) &
                               (transactions_df.tx_datetime < train_end)]

    # Note: Cards known to be frauded after the delay period are removed from the test set
    # That is, for each test day, all frauds known at (test_day-delay_period) are removed

    # First, get known frauded customers from the training set
    known_frauded_customers = set(train_df[train_df.tx_fraud==1].customer_id)

    # Get the relative starting day of training set (easier than TX_DATETIME to collect test data)
    start_tx_time_days_training = train_df.tx_time_days.min()

    test_days = []
    for day in range(delta_test):

        # Get test data for that day
        test_day = start_tx_time_days_training + delta_train + delta_delay + day
        test_df_day = transactions_df[transactions_df.tx_time_days == test_day]

        # Frauded cards from that test day, minus the delay period, are added to the pool of known frauded customers
        newly_known_day = start_tx_time_days_training + delta_train + day - 1
        test_df_day_delay_period = transactions_df[transactions_df.tx_time_days == newly_known_day]

        new_frauded_customers = set(test_df_day_delay_period[test_df_day_delay_period.tx_fraud==1].customer_id)
        known_frauded_customers = known_frauded_customers.union(new_frauded_customers)

        test_df_day = test_df_day[~test_df_day.customer_id.isin(known_frauded_customers)]

        test_days.append(test_df_day)

    # pd.concat raises on an empty list, so fall back to an empty frame with
    # the same columns when no test day was requested.
    test_df = pd.concat(test_days) if test_days else transactions_df.iloc[0:0]

    # Sort data sets by ascending order of transaction ID
    train_df = train_df.sort_values('transaction_id')
    test_df = test_df.sort_values('transaction_id')

    return (train_df, test_df)

In [38]:
import datetime

# First day of the training period.
start_date_training = datetime.datetime.strptime("2018-07-25", "%Y-%m-%d")

# get_train_test_set compares tx_datetime with datetime values, so convert the column to pandas datetimes first.
dataset['tx_datetime'] = pd.to_datetime(dataset['tx_datetime'])

# No delta_* arguments are passed here, so whatever defaults get_train_test_set defines are used.
train_dataset, test_dataset = get_train_test_set(dataset, start_date_training)

In [39]:
# Preview the first rows of the training split.
train_dataset.head()

Unnamed: 0,transaction_id,tx_datetime,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario,tx_during_weekend,...,terminal_id_nb_tx_1day_window,terminal_id_risk_1day_window,terminal_id_nb_tx_7day_window,terminal_id_risk_7day_window,terminal_id_nb_tx_30day_window,terminal_id_risk_30day_window,eventtime,write_time,api_invocation_time,is_deleted
0,1102483,2018-07-25 00:00:29,1111,2328,40.77,9936029,115,0,0,0,...,1.0,0.0,10.0,0.0,32.0,0.0,1638730000.0,2021-12-05 19:08:05.133,2021-12-05 19:07:12.000,False
1,1102484,2018-07-25 00:01:08,676,6846,9.62,9936068,115,0,0,0,...,2.0,0.0,5.0,0.0,27.0,0.0,1638730000.0,2021-12-05 19:08:04.636,2021-12-05 19:07:12.000,False
2,1102485,2018-07-25 00:01:35,402,4771,81.55,9936095,115,0,0,0,...,0.0,0.0,12.0,0.0,37.0,0.0,1638730000.0,2021-12-05 19:08:05.011,2021-12-05 19:07:12.000,False
3,1102486,2018-07-25 00:01:43,4218,863,23.1,9936103,115,0,0,0,...,2.0,0.0,6.0,0.0,22.0,0.0,1638730000.0,2021-12-05 19:08:05.215,2021-12-05 19:07:12.000,False
4,1102487,2018-07-25 00:02:26,3711,3599,59.25,9936146,115,0,0,0,...,1.0,0.0,10.0,0.0,38.0,0.0,1638730000.0,2021-12-05 19:08:05.240,2021-12-05 19:07:12.000,False


In [40]:
# (rows, columns) of the training split.
train_dataset.shape

(67240, 27)

In [41]:
# Preview the last rows of the test split.
test_dataset.tail()

Unnamed: 0,transaction_id,tx_datetime,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario,tx_during_weekend,...,terminal_id_nb_tx_1day_window,terminal_id_risk_1day_window,terminal_id_nb_tx_7day_window,terminal_id_risk_7day_window,terminal_id_nb_tx_30day_window,terminal_id_risk_30day_window,eventtime,write_time,api_invocation_time,is_deleted
201290,1303773,2018-08-14 23:57:03,460,6133,16.72,11750223,135,0,0,0,...,1.0,0.0,3.0,0.0,20.0,0.0,1638730000.0,2021-12-05 19:48:05.039,2021-12-05 19:43:41.000,False
201291,1303774,2018-08-14 23:58:24,3101,3229,38.16,11750304,135,0,0,0,...,2.0,0.0,13.0,0.0,42.0,0.0,1638730000.0,2021-12-05 19:48:04.740,2021-12-05 19:43:41.000,False
201292,1303775,2018-08-14 23:58:24,4783,7511,69.85,11750304,135,0,0,0,...,0.0,0.0,3.0,0.0,34.0,0.0,1638730000.0,2021-12-05 19:48:05.079,2021-12-05 19:43:41.000,False
201293,1303776,2018-08-14 23:58:45,2883,8550,14.99,11750325,135,0,0,0,...,3.0,0.0,7.0,0.0,20.0,0.0,1638730000.0,2021-12-05 19:48:04.370,2021-12-05 19:43:41.000,False
201294,1303777,2018-08-14 23:59:43,3901,8047,55.99,11750383,135,0,0,0,...,0.0,0.0,7.0,0.0,23.0,0.0,1638730000.0,2021-12-05 19:48:04.757,2021-12-05 19:43:41.000,False


In [42]:
# (rows, columns) of the test split.
test_dataset.shape

(58264, 27)

In [43]:
# Shape of the test rows labelled as fraud (tx_fraud == 1); the first number is how many there are.
test_dataset[test_dataset.tx_fraud==1].shape

(385, 27)

# Model training

We will be using the XGBoost algorithm included with SageMaker, so we need to accommodate some changes to the input data with respect to the original notebook.

## Train/test set preparation and split

In [44]:
# Columns kept for modelling, in the order they will appear in the exported data.
# The first entry is the target (TX_FRAUD); every other entry is a model input.
# The names are lower-cased because the data frames in this notebook use lower-case column names.
features = [
    column_name.lower()
    for column_name in (
        "TX_FRAUD",
        'TX_AMOUNT',
        'TX_DURING_WEEKEND',
        'TX_DURING_NIGHT',
        'CUSTOMER_ID_NB_TX_1DAY_WINDOW',
        'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW',
        'CUSTOMER_ID_NB_TX_7DAY_WINDOW',
        'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW',
        'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
        'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW',
        'TERMINAL_ID_NB_TX_1DAY_WINDOW',
        'TERMINAL_ID_RISK_1DAY_WINDOW',
        'TERMINAL_ID_NB_TX_7DAY_WINDOW',
        'TERMINAL_ID_RISK_7DAY_WINDOW',
        'TERMINAL_ID_NB_TX_30DAY_WINDOW',
        'TERMINAL_ID_RISK_30DAY_WINDOW',
    )
]

In [45]:
# Display the lower-cased column list to check its content and order.
features

['tx_fraud',
 'tx_amount',
 'tx_during_weekend',
 'tx_during_night',
 'customer_id_nb_tx_1day_window',
 'customer_id_avg_amount_1day_window',
 'customer_id_nb_tx_7day_window',
 'customer_id_avg_amount_7day_window',
 'customer_id_nb_tx_30day_window',
 'customer_id_avg_amount_30day_window',
 'terminal_id_nb_tx_1day_window',
 'terminal_id_risk_1day_window',
 'terminal_id_nb_tx_7day_window',
 'terminal_id_risk_7day_window',
 'terminal_id_nb_tx_30day_window',
 'terminal_id_risk_30day_window']

In [46]:
# Keep only the modelling columns, in the order given by `features` (target first).
# Note: this rebinds train_dataset, so the other columns are no longer available from this name.
train_dataset = train_dataset[features]
train_dataset

Unnamed: 0,tx_fraud,tx_amount,tx_during_weekend,tx_during_night,customer_id_nb_tx_1day_window,customer_id_avg_amount_1day_window,customer_id_nb_tx_7day_window,customer_id_avg_amount_7day_window,customer_id_nb_tx_30day_window,customer_id_avg_amount_30day_window,terminal_id_nb_tx_1day_window,terminal_id_risk_1day_window,terminal_id_nb_tx_7day_window,terminal_id_risk_7day_window,terminal_id_nb_tx_30day_window,terminal_id_risk_30day_window
0,0,40.77,0,1,2.0,29.005000,24.0,29.191250,105.0,30.141810,1.0,0.0,10.0,0.0,32.0,0.000000
1,0,9.62,0,1,4.0,25.492500,15.0,19.151333,61.0,18.253607,2.0,0.0,5.0,0.0,27.0,0.000000
2,0,81.55,0,1,6.0,130.778333,11.0,183.875455,50.0,112.001800,0.0,0.0,12.0,0.0,37.0,0.000000
3,0,23.10,0,1,8.0,19.600000,24.0,21.201667,93.0,19.987849,2.0,0.0,6.0,0.0,22.0,0.000000
4,0,59.25,0,1,5.0,84.656000,27.0,80.427407,110.0,74.158455,1.0,0.0,10.0,0.0,38.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67235,0,60.66,0,0,6.0,49.216667,25.0,35.644400,106.0,35.563396,0.0,0.0,13.0,0.0,52.0,0.000000
67236,0,47.20,0,0,4.0,31.502500,30.0,33.665667,118.0,32.563136,1.0,0.0,8.0,0.0,33.0,0.030303
67237,0,47.74,0,0,3.0,30.410000,14.0,32.170000,78.0,35.270000,2.0,0.0,6.0,0.0,19.0,0.000000
67238,0,22.26,0,0,1.0,22.260000,4.0,10.825000,35.0,15.423429,3.0,0.0,7.0,0.0,20.0,0.000000


In [47]:
# Apply the same column selection and ordering to the test split.
# Note: this rebinds test_dataset, so the other columns are no longer available from this name.
test_dataset = test_dataset[features]
test_dataset

Unnamed: 0,tx_fraud,tx_amount,tx_during_weekend,tx_during_night,customer_id_nb_tx_1day_window,customer_id_avg_amount_1day_window,customer_id_nb_tx_7day_window,customer_id_avg_amount_7day_window,customer_id_nb_tx_30day_window,customer_id_avg_amount_30day_window,terminal_id_nb_tx_1day_window,terminal_id_risk_1day_window,terminal_id_nb_tx_7day_window,terminal_id_risk_7day_window,terminal_id_nb_tx_30day_window,terminal_id_risk_30day_window
134215,0,42.32,0,1,4.0,68.422500,34.0,67.468529,120.0,64.611750,2.0,0.0,9.0,0.0,31.0,0.000000
134216,0,108.19,0,1,8.0,75.686250,33.0,79.955455,124.0,85.365645,2.0,0.0,10.0,0.0,23.0,0.000000
134218,0,26.13,0,1,1.0,26.130000,12.0,56.269167,61.0,49.095902,1.0,0.0,10.0,0.0,26.0,0.076923
134219,0,65.81,0,1,1.0,65.810000,3.0,62.643333,11.0,63.287273,1.0,0.0,6.0,0.0,23.0,0.000000
134220,0,55.22,0,1,2.0,63.065000,13.0,71.993846,72.0,59.899306,1.0,0.0,5.0,0.0,21.0,0.523810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201290,0,16.72,0,0,1.0,16.720000,7.0,19.210000,21.0,22.987619,1.0,0.0,3.0,0.0,20.0,0.000000
201291,0,38.16,0,0,7.0,40.305714,28.0,46.151429,97.0,45.018144,2.0,0.0,13.0,0.0,42.0,0.000000
201292,0,69.85,0,0,3.0,92.500000,24.0,95.770417,101.0,89.886040,0.0,0.0,3.0,0.0,34.0,0.000000
201293,0,14.99,0,0,3.0,22.343333,19.0,18.450000,77.0,15.848052,3.0,0.0,7.0,0.0,20.0,0.000000


In [48]:
# NOTE: `prefix` is rebound here; from this cell on it names the S3 folder holding the exported splits.
prefix = 'train_dataset'

# Each split is written locally as a CSV without header row or index column,
# then uploaded under the same S3 folder (train first, then test).
for split_frame, csv_name in (
    (train_dataset, "train_dataset.csv"),
    (test_dataset, "test_dataset.csv"),
):
    split_frame.to_csv(csv_name, header=False, index=False)
    s3.upload_file(csv_name, data_bucket, prefix + "/" + csv_name)

# S3 URI of the folder that now contains both CSV files (with trailing slash).
dataset_uri_prefix = "s3://{}/{}/".format(data_bucket, prefix)

## Model training

In [55]:
# Look up the container image URI for the "xgboost" framework, version "1.0-1", in the current region.
training_image = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.0-1",
)
training_image

'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3'

In [56]:
from sagemaker.estimator import Estimator

# NOTE: `prefix` is rebound here; from this cell on it names the S3 folder for training output.
prefix = 'training_output'
training_output_path = "s3://{}/{}".format(data_bucket, prefix)

# Estimator for a single-instance training job using the container image resolved above.
training_model = Estimator(
    image_uri=training_image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    volume_size=5,
    max_run=3600,
    input_mode="File",
    output_path=training_output_path,
    sagemaker_session=feature_store_session,
)

In [57]:
# Binary classification objective (the target tx_fraud is 0/1) with 50 boosting rounds.
training_model.set_hyperparameters(objective="binary:logistic", num_round=50)

In [58]:
# Show the S3 URI prefix under which the train/test CSV files were uploaded.
dataset_uri_prefix

's3://creditcardfraud-project/train_dataset/'

In [59]:
def _csv_channel(object_prefix):
    """Return a fully replicated CSV input channel for the S3 prefix ``dataset_uri_prefix + object_prefix``."""
    return sagemaker.inputs.TrainingInput(
        dataset_uri_prefix + object_prefix,
        distribution="FullyReplicated",
        content_type="text/csv",
        s3_data_type="S3Prefix",
    )


# The S3 data type is "S3Prefix", so 'train_dataset' / 'test_dataset' are key prefixes
# (no file extension) pointing at the CSV files uploaded earlier.
train_data = _csv_channel('train_dataset')
test_data = _csv_channel('test_dataset')

# The test split is supplied as the 'validation' channel of the training job.
data_channels = {'train': train_data, 'validation': test_data}

In [60]:
# Start the training job on the 'train' and 'validation' channels; logs=True asks for the job logs in the cell output.
training_model.fit(inputs=data_channels, logs=True)

2021-12-06 17:55:05 Starting - Starting the training job...
2021-12-06 17:55:33 Starting - Launching requested ML instancesProfilerReport-1638813305: InProgress
......
2021-12-06 17:56:33 Starting - Preparing the instances for training.........
2021-12-06 17:58:03 Downloading - Downloading input data...
2021-12-06 17:58:34 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[17:58:32] 