In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import seaborn as sns

In [3]:
# Custom modules

import sys
import os
ROOT_DIR = os.path.abspath('../')
sys.path.insert(0, ROOT_DIR + "/src")

# Constant

In [4]:
RAW_INPUT_PATH = '../data/raw/bs140513_032310.csv'
IGNORE_COLS = ['zipcodeOri', 'zipMerchant']

In [5]:
TEST_RATIO = 0.2

In [6]:
OUTPUT_DIR = '../data/process/'

In [53]:
LABEL = 'gender'

# Load input 

In [7]:
raw_df = pd.read_csv(RAW_INPUT_PATH,
                     quotechar="'",
                     usecols=lambda c: c not in IGNORE_COLS)

# Clean

## Drop noise (based on `01_eda` notebook)

### In label `gender`

In [8]:
clean_df = raw_df.query('gender in ("F", "M")')

### In label `age`

In [9]:
clean_df = clean_df.query('age != "U"')

# Divide train-test by customer

In [15]:
customer_list = clean_df['customer'].unique()

In [16]:
num_train_customer = round((1 - TEST_RATIO) * len(customer_list))
train_customer = np.random.choice(customer_list,
                                  size=num_train_customer,
                                  replace=False)
test_customer = set(customer_list) - set(train_customer)
test_customer = np.array(list(test_customer))

In [17]:
train_customer.shape

(3280,)

In [18]:
test_customer.shape

(820,)

In [19]:
train_df = clean_df.loc[clean_df['customer'].isin(train_customer)]
test_df = clean_df.loc[clean_df['customer'].isin(test_customer)]

In [20]:
train_df.shape

(472480, 8)

In [21]:
test_df.shape

(120470, 8)

# Featuretools

In [12]:
import featuretools as ft
import featuretools.variable_types as vtypes

In [24]:
train_df.reset_index(inplace=True)

In [115]:
es = ft.EntitySet(id='Transaction Logs')
variable_types = { 
      'step': vtypes.NumericTimeIndex,
      'age': vtypes.Categorical,
      'merchant': vtypes.Categorical,
      'category': vtypes.Categorical,
      'amount': vtypes.Numeric,
      'fraud': vtypes.Boolean}

In [116]:
es.entity_from_dataframe(entity_id='txn_logs', dataframe=train_df,
                         index='index', variable_types=variable_types)

Entityset: Transaction Logs
  Entities:
    txn_logs [Rows: 472480, Columns: 9]
  Relationships:
    No relationships

In [117]:
es.normalize_entity(base_entity_id='txn_logs',
                    new_entity_id='merchants',
                    index='merchant',
                    additional_variables=['category'])

Entityset: Transaction Logs
  Entities:
    txn_logs [Rows: 472480, Columns: 8]
    merchants [Rows: 50, Columns: 2]
  Relationships:
    txn_logs.merchant -> merchants.merchant

In [118]:
es.normalize_entity(base_entity_id='txn_logs',
                    new_entity_id='customer_metadata',
                    index='customer',
                    additional_variables=['age'])

Entityset: Transaction Logs
  Entities:
    txn_logs [Rows: 472480, Columns: 7]
    merchants [Rows: 50, Columns: 2]
    customer_metadata [Rows: 3280, Columns: 2]
  Relationships:
    txn_logs.merchant -> merchants.merchant
    txn_logs.customer -> customer_metadata.customer

In [119]:
es.normalize_entity(base_entity_id='txn_logs',
                    new_entity_id='customer_label',
                    index='customer',
                    additional_variables=['gender'])

Entityset: Transaction Logs
  Entities:
    txn_logs [Rows: 472480, Columns: 6]
    merchants [Rows: 50, Columns: 2]
    customer_metadata [Rows: 3280, Columns: 2]
    customer_label [Rows: 3280, Columns: 2]
  Relationships:
    txn_logs.merchant -> merchants.merchant
    txn_logs.customer -> customer_metadata.customer
    txn_logs.customer -> customer_label.customer

In [121]:
feature_matrix, feature_names = ft.dfs(
    entityset=es, 
    target_entity='customer_metadata',
    max_depth=3,
    verbose=1,
    n_jobs=1,
    drop_exact=['gender'],
    ignore_entities=['customer_label'],
    ignore_variables={'customer':['customer']})

Built 111 features
Elapsed: 00:16 | Progress: 100%|██████████


In [122]:
feature_matrix

Unnamed: 0_level_0,age,SUM(txn_logs.amount),SUM(txn_logs.step),STD(txn_logs.amount),STD(txn_logs.step),MAX(txn_logs.amount),MAX(txn_logs.step),SKEW(txn_logs.amount),SKEW(txn_logs.step),MIN(txn_logs.amount),...,MEAN(txn_logs.merchants.STD(txn_logs.step)),MEAN(txn_logs.merchants.MIN(txn_logs.amount)),MEAN(txn_logs.merchants.MEAN(txn_logs.step)),MEAN(txn_logs.merchants.STD(txn_logs.amount)),MEAN(txn_logs.merchants.MIN(txn_logs.step)),MEAN(txn_logs.merchants.SUM(txn_logs.step)),MEAN(txn_logs.merchants.MAX(txn_logs.amount)),MEAN(txn_logs.merchants.SKEW(txn_logs.amount)),NUM_UNIQUE(txn_logs.merchants.MODE(txn_logs.customer)),MODE(txn_logs.merchants.MODE(txn_logs.customer))
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1093826151,4,4716.20,14681,18.035708,51.648467,95.91,179,0.788957,0.042490,0.44,...,50.567665,0.006467,76.876088,20.426210,0.035928,1.152764e+07,158.128503,0.801931,7,C747590731
C352968107,2,5519.77,15401,23.194176,52.222232,135.86,179,1.390664,-0.055003,0.53,...,48.790943,0.007278,94.075895,21.450638,0.017751,1.778805e+07,151.033195,0.729204,11,C222098023
C2054744914,4,2693.84,2494,35.455397,37.327340,189.59,168,2.147027,2.202405,0.32,...,48.978982,0.021692,91.432112,42.584909,0.061538,1.323777e+07,318.684769,1.060497,13,C747590731
C757503768,5,4596.03,14099,23.550817,48.040350,168.45,179,1.851214,-0.053151,0.01,...,49.246813,0.011034,89.742316,22.834724,0.027586,1.606864e+07,163.170000,0.791981,14,C747590731
C1315400589,3,5739.56,15827,32.125813,51.752124,276.85,179,4.017118,0.003551,0.27,...,47.346242,0.008229,106.661550,22.259138,0.051429,2.113912e+07,142.158914,0.626424,7,C222098023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C650117238,3,2917.90,963,653.109788,39.449561,1914.56,169,2.440494,0.195306,19.31,...,51.639711,0.368750,77.685898,365.352891,0.000000,9.064425e+04,1975.906250,1.985177,7,C1978250683
C2095815783,2,2854.58,9566,76.472538,22.942134,546.19,177,5.283233,-0.505690,0.90,...,50.646456,0.024118,76.620188,33.964030,0.044118,1.066388e+07,235.705735,0.893096,7,C747590731
C1014783484,1,2008.11,10407,19.191099,22.991109,96.49,179,1.210393,-0.012028,0.09,...,47.868524,0.010946,103.496525,20.590465,0.000000,2.109699e+07,135.159730,0.639952,6,C222098023
C849065220,4,390.57,960,25.845704,22.872316,93.98,171,0.339886,-0.000005,24.70,...,49.714011,1.171429,79.924462,138.067776,1.857143,7.450457e+04,880.510000,2.419127,5,C1275518867


In [123]:
feature_names

[<Feature: age>,
 <Feature: SUM(txn_logs.amount)>,
 <Feature: SUM(txn_logs.step)>,
 <Feature: STD(txn_logs.amount)>,
 <Feature: STD(txn_logs.step)>,
 <Feature: MAX(txn_logs.amount)>,
 <Feature: MAX(txn_logs.step)>,
 <Feature: SKEW(txn_logs.amount)>,
 <Feature: SKEW(txn_logs.step)>,
 <Feature: MIN(txn_logs.amount)>,
 <Feature: MIN(txn_logs.step)>,
 <Feature: MEAN(txn_logs.amount)>,
 <Feature: MEAN(txn_logs.step)>,
 <Feature: COUNT(txn_logs)>,
 <Feature: PERCENT_TRUE(txn_logs.fraud)>,
 <Feature: NUM_UNIQUE(txn_logs.merchant)>,
 <Feature: MODE(txn_logs.merchant)>,
 <Feature: NUM_UNIQUE(txn_logs.merchants.category)>,
 <Feature: MODE(txn_logs.merchants.category)>,
 <Feature: SUM(txn_logs.merchants.COUNT(txn_logs))>,
 <Feature: SUM(txn_logs.merchants.NUM_UNIQUE(txn_logs.customer))>,
 <Feature: SUM(txn_logs.merchants.MAX(txn_logs.step))>,
 <Feature: SUM(txn_logs.merchants.SKEW(txn_logs.step))>,
 <Feature: SUM(txn_logs.merchants.MEAN(txn_logs.amount))>,
 <Feature: SUM(txn_logs.merchants.PERCEN

In [124]:
train_df

Unnamed: 0,index,step,customer,age,gender,merchant,category,amount,fraud
0,0,0,C1093826151,4,M,M348934600,es_transportation,4.55,0
1,1,0,C352968107,2,M,M348934600,es_transportation,39.68,0
2,2,0,C2054744914,4,F,M1823072687,es_transportation,26.89,0
3,4,0,C757503768,5,M,M348934600,es_transportation,35.72,0
4,5,0,C1315400589,3,F,M348934600,es_transportation,25.81,0
...,...,...,...,...,...,...,...,...,...
472475,594637,179,C748358246,2,M,M1823072687,es_transportation,51.17,0
472476,594638,179,C1753498738,3,F,M1823072687,es_transportation,20.53,0
472477,594639,179,C650108285,4,F,M1823072687,es_transportation,50.73,0
472478,594640,179,C123623130,2,F,M349281107,es_fashion,22.44,0


In [97]:
feature_matrix_enc, features_enc = ft.encode_features(
    feature_matrix, feature_names)

In [98]:
feature_matrix_enc

Unnamed: 0_level_0,age = 2,age = 3,age = 4,age = 5,age = 1,age = 6,age = 0,age is unknown,SUM(txn_logs.step),SUM(txn_logs.amount),...,MODE(txn_logs.merchant) = M1741626453,MODE(txn_logs.merchant) is unknown,NUM_UNIQUE(txn_logs.merchants.category),MODE(txn_logs.merchants.category) = es_transportation,MODE(txn_logs.merchants.category) = es_health,MODE(txn_logs.merchants.category) = es_wellnessandbeauty,MODE(txn_logs.merchants.category) = es_sportsandtoys,MODE(txn_logs.merchants.category) = es_fashion,MODE(txn_logs.merchants.category) = es_home,MODE(txn_logs.merchants.category) is unknown
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1093826151,0,0,1,0,0,0,0,0,14681,4716.20,...,0,0,5,1,0,0,0,0,0,0
C352968107,1,0,0,0,0,0,0,0,15401,5519.77,...,0,0,10,1,0,0,0,0,0,0
C2054744914,0,0,1,0,0,0,0,0,2494,2693.84,...,0,0,6,1,0,0,0,0,0,0
C757503768,0,0,0,1,0,0,0,0,14099,4596.03,...,0,0,8,1,0,0,0,0,0,0
C1315400589,0,1,0,0,0,0,0,0,15827,5739.56,...,0,0,6,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C650117238,0,1,0,0,0,0,0,0,963,2917.90,...,0,0,6,0,0,0,1,0,0,0
C2095815783,1,0,0,0,0,0,0,0,9566,2854.58,...,0,0,5,1,0,0,0,0,0,0
C1014783484,0,0,0,0,1,0,0,0,10407,2008.11,...,0,0,5,1,0,0,0,0,0,0
C849065220,0,0,1,0,0,0,0,0,960,390.57,...,0,0,4,0,0,0,1,0,0,0


In [99]:
features_enc

[<Feature: age = 2>,
 <Feature: age = 3>,
 <Feature: age = 4>,
 <Feature: age = 5>,
 <Feature: age = 1>,
 <Feature: age = 6>,
 <Feature: age = 0>,
 <Feature: age is unknown>,
 <Feature: SUM(txn_logs.step)>,
 <Feature: SUM(txn_logs.amount)>,
 <Feature: STD(txn_logs.step)>,
 <Feature: STD(txn_logs.amount)>,
 <Feature: MAX(txn_logs.step)>,
 <Feature: MAX(txn_logs.amount)>,
 <Feature: SKEW(txn_logs.step)>,
 <Feature: SKEW(txn_logs.amount)>,
 <Feature: MIN(txn_logs.step)>,
 <Feature: MIN(txn_logs.amount)>,
 <Feature: MEAN(txn_logs.step)>,
 <Feature: MEAN(txn_logs.amount)>,
 <Feature: COUNT(txn_logs)>,
 <Feature: NUM_UNIQUE(txn_logs.fraud)>,
 <Feature: NUM_UNIQUE(txn_logs.merchant)>,
 <Feature: MODE(txn_logs.fraud) = 0>,
 <Feature: MODE(txn_logs.fraud) = 1>,
 <Feature: MODE(txn_logs.fraud) is unknown>,
 <Feature: MODE(txn_logs.merchant) = M1823072687>,
 <Feature: MODE(txn_logs.merchant) = M348934600>,
 <Feature: MODE(txn_logs.merchant) = M480139044>,
 <Feature: MODE(txn_logs.merchant) = M153

#### Generate label

In [100]:
_tmp = es['customer_label'].df.copy()
_tmp.drop(columns=['customer'], inplace=True)

agg_train_df = feature_matrix_enc.join(_tmp, how='left')

# Apply same transformation to test

In [103]:
test_df.reset_index(inplace=True)

In [104]:
es_test = ft.EntitySet(id='Transaction Logs')
variable_types = { 
      'step': vtypes.NumericTimeIndex,
      'age': vtypes.Categorical,
      'merchant': vtypes.Categorical,
      'category': vtypes.Categorical,
      'amount': vtypes.Numeric,
      'fraud': vtypes.Categorical}

In [105]:
es_test.entity_from_dataframe(entity_id='txn_logs', dataframe=test_df,
                              index='index', variable_types=variable_types)

Entityset: Transaction Logs
  Entities:
    txn_logs [Rows: 120470, Columns: 10]
  Relationships:
    No relationships

In [106]:
es_test.normalize_entity(base_entity_id='txn_logs',
                         new_entity_id='merchants',
                         index='merchant',
                         additional_variables=['category'])

Entityset: Transaction Logs
  Entities:
    txn_logs [Rows: 120470, Columns: 9]
    merchants [Rows: 49, Columns: 2]
  Relationships:
    txn_logs.merchant -> merchants.merchant

In [107]:
es_test.normalize_entity(base_entity_id='txn_logs',
                         new_entity_id='customer_metadata',
                         index='customer',
                         additional_variables=['age'])

Entityset: Transaction Logs
  Entities:
    txn_logs [Rows: 120470, Columns: 8]
    merchants [Rows: 49, Columns: 2]
    customer_metadata [Rows: 820, Columns: 2]
  Relationships:
    txn_logs.merchant -> merchants.merchant
    txn_logs.customer -> customer_metadata.customer

In [108]:
es_test.normalize_entity(base_entity_id='txn_logs',
                         new_entity_id='customer_label',
                         index='customer',
                         additional_variables=['gender'])

Entityset: Transaction Logs
  Entities:
    txn_logs [Rows: 120470, Columns: 7]
    merchants [Rows: 49, Columns: 2]
    customer_metadata [Rows: 820, Columns: 2]
    customer_label [Rows: 820, Columns: 2]
  Relationships:
    txn_logs.merchant -> merchants.merchant
    txn_logs.customer -> customer_metadata.customer
    txn_logs.customer -> customer_label.customer

In [109]:
feature_matrix_test = ft.calculate_feature_matrix(features=features_enc,
                                                  entities=['txn_logs', 'merchants',
                                                            'customer_metadata'],
                                                  entityset=es_test)

In [110]:
feature_matrix_test

Unnamed: 0_level_0,age = 2,age = 3,age = 4,age = 5,age = 1,age = 6,age = 0,age is unknown,SUM(txn_logs.step),SUM(txn_logs.amount),...,MODE(txn_logs.merchant) = M1741626453,MODE(txn_logs.merchant) is unknown,NUM_UNIQUE(txn_logs.merchants.category),MODE(txn_logs.merchants.category) = es_transportation,MODE(txn_logs.merchants.category) = es_health,MODE(txn_logs.merchants.category) = es_wellnessandbeauty,MODE(txn_logs.merchants.category) = es_sportsandtoys,MODE(txn_logs.merchants.category) = es_fashion,MODE(txn_logs.merchants.category) = es_home,MODE(txn_logs.merchants.category) is unknown
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1760612790,False,True,False,False,False,False,False,False,15303,5187.93,...,False,False,8,True,False,False,False,False,False,False
C202531238,False,False,True,False,False,False,False,False,15328,4942.43,...,False,False,7,True,False,False,False,False,False,False
C39858251,False,False,False,True,False,False,False,False,11747,4208.52,...,False,False,7,True,False,False,False,False,False,False
C98707741,False,False,True,False,False,False,False,False,13208,4635.87,...,False,False,9,True,False,False,False,False,False,False
C623601481,False,True,False,False,False,False,False,False,12094,8335.52,...,False,False,8,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C1455309687,False,False,True,False,False,False,False,False,9435,4996.16,...,False,False,9,True,False,False,False,False,False,False
C197543842,False,False,True,False,False,False,False,False,3159,2011.93,...,False,False,8,True,False,False,False,False,False,False
C1404665203,False,False,False,False,True,False,False,False,593,877.45,...,False,True,4,False,True,False,False,False,False,False
C1555447628,True,False,False,False,False,False,False,False,14100,4400.99,...,False,False,8,True,False,False,False,False,False,False


#### Generate label

In [112]:
_tmp = es_test['customer_label'].df.copy()
_tmp.drop(columns=['customer'], inplace=True)

agg_test_df = feature_matrix_test.join(_tmp, how='left')

# Persist

In [114]:
agg_train_df.to_csv(OUTPUT_DIR + "/train_data_fe3.csv", index=False)
agg_test_df.to_csv(OUTPUT_DIR + "/test_data_fe3.csv", index=False)

# Archive