In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import seaborn as sns

In [2]:
# Custom modules

import sys
import os
# Put `<ROOT_DIR>/src` at the front of the import path so modules stored there
# can be imported from this notebook.
ROOT_DIR = os.path.abspath('../')
_src_dir = os.path.join(ROOT_DIR, "src")
# Guard the insert so re-running this cell does not pile up duplicate entries.
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)

# Constants

In [3]:
# Columns that are skipped when the raw CSV is read.
IGNORE_COLS = ['zipcodeOri', 'zipMerchant']
# Raw transaction file, relative to the notebook's working directory.
RAW_INPUT_PATH = '../data/raw/bs140513_032310.csv'

In [22]:
# Fraction of the unique customers held out for the test split
# (the remaining 1 - TEST_RATIO go to train).
TEST_RATIO = 0.2

In [62]:
# Directory the aggregated train/test CSV files are written to.
OUTPUT_DIR = '../data/process/'

# Load input 

In [4]:
# Read the raw CSV, treating ' as the quote character and leaving out every
# column whose name is listed in IGNORE_COLS.
raw_df = pd.read_csv(
    RAW_INPUT_PATH,
    usecols=lambda col_name: not (col_name in IGNORE_COLS),
    quotechar="'",
)

# Clean

## Drop noise (based on `01_eda` notebook)

### In label `gender`

In [8]:
# Keep only the rows whose gender is "F" or "M"; rows with any other value are dropped.
clean_df = raw_df.loc[
    raw_df['gender'].isin(["F", "M"])
]

### In label `age`

In [9]:
# Drop the rows whose age value is "U".
clean_df = clean_df.loc[
    clean_df['age'] != "U"
]

# Divide train-test by customer

In [20]:
# Distinct customer ids left after cleaning, in order of first appearance.
customer_list = pd.unique(clean_df['customer'])

In [39]:
# Split the customers (not the individual transactions) into train and test groups.
# A fixed seed keeps the split identical across kernel restarts; without it every
# run draws different customers and the persisted train/test files are not reproducible.
RANDOM_SEED = 42
split_rng = np.random.default_rng(RANDOM_SEED)

num_train_customer = round((1 - TEST_RATIO) * len(customer_list))
train_customer = split_rng.choice(customer_list,
                                  size=num_train_customer,
                                  replace=False)
# Every customer not drawn for training goes to test. np.setdiff1d returns the
# values sorted, so the result order is deterministic (a Python set difference
# comes back in arbitrary order).
test_customer = np.setdiff1d(customer_list, train_customer)

In [42]:
# Sanity check: how many customers were drawn for the training split.
train_customer.shape

(3280,)

In [43]:
# Sanity check: how many customers are left for the test split.
test_customer.shape

(820,)

In [None]:
# Route every transaction to the split its customer was assigned to.
customer_col = clean_df['customer']
train_df = clean_df[customer_col.isin(train_customer)]
test_df = clean_df[customer_col.isin(test_customer)]

In [47]:
# (rows, columns) of the training transactions.
train_df.shape

(475657, 8)

In [48]:
# (rows, columns) of the test transactions.
test_df.shape

(117293, 8)

# Agg transformation per customer

- Perform a quick aggregation per customer to fit a baseline model
- Ignore detailed information about merchants and categories (only the number of distinct values is kept)

#### Aggregate

In [64]:
# Aggregation spec for the per-customer groupby: column name -> functions to apply.
# The keys are listed in the same order as before.
agg_ops = dict(
    step=['count', 'mean', 'median'],
    age=['first'],
    merchant=['nunique'],
    category=['nunique'],
    amount=['mean', 'median', 'max', 'min', 'std'],
    fraud=['sum', 'mean'],
    gender=['first'],
)

In [65]:
# Collapse each split to one row per customer using the agg_ops specification.
train_agg_df, test_agg_df = [
    split_df.groupby('customer').agg(agg_ops)
    for split_df in (train_df, test_df)
]

#### Flatten column names

In [66]:
def _flatten_columns(columns):
    """Return flat `<column>_<aggregation>` labels for a two-level column index.

    Labels that are already flat are returned unchanged. Without this guard,
    re-running the cell would apply '_'.join to plain strings and split every
    label into single characters.
    """
    if not isinstance(columns, pd.MultiIndex):
        return columns
    return columns.map('_'.join).str.strip()


train_agg_df.columns = _flatten_columns(train_agg_df.columns)
test_agg_df.columns = _flatten_columns(test_agg_df.columns)

# Persist

In [None]:
# Create the output directory if it is missing; to_csv does not create it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# os.path.join avoids the doubled separator that OUTPUT_DIR + "/..." produces
# when OUTPUT_DIR already ends with a slash.
# NOTE(review): index=False leaves the index out of the file. After
# groupby('customer') the customer id is the index, so it is not persisted.
# Confirm that downstream consumers do not need it.
train_agg_df.to_csv(os.path.join(OUTPUT_DIR, "train_agg.csv"), index=False)
test_agg_df.to_csv(os.path.join(OUTPUT_DIR, "test_agg.csv"), index=False)