# Data Pipeline

## Load Libraries

In [1]:
import pandas as pd
import joblib
import yaml

from sklearn.model_selection import train_test_split

import sys
sys.path.append('../src')  
import utils as util

utils good


## Load Configuration File

In [2]:
# Load the project settings once; later cells read data locations, column lists
# and split parameters from `config`.
config = util.load_config()

## Data Collection

In [3]:
def read_raw_data(config: dict, raw_dataset_dir) -> pd.DataFrame:
    """Read the raw dataset from a UTF-8 encoded CSV file.

    Parameters
    ----------
    config : dict
        Project configuration. Accepted for interface compatibility; it is
        not read by this function.
    raw_dataset_dir : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The parsed CSV contents.
    """
    return pd.read_csv(raw_dataset_dir, encoding='utf-8')

In [4]:
# Location of the source CSV, relative to the notebook folder, assembled from
# the data_source section of the configuration.
raw_dataset_dir = ''.join((
    '../',
    config['data_source']['directory'],
    config['data_source']['file_name'],
))

# Read the CSV into memory.
raw_dataset = read_raw_data(config=config, raw_dataset_dir=raw_dataset_dir)

In [5]:
# Raise the column display limit to 500 so wide frames are shown without
# column truncation.
pd.set_option('display.max_columns', 500)
# Preview the first rows of the raw data.
raw_dataset.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [6]:
# Keep only the columns listed in the configuration. This rebinds `raw_dataset`,
# so the full frame loaded from the CSV is no longer available under this name.
raw_dataset = raw_dataset[config['data_source']['columns']]
raw_dataset.head()

Unnamed: 0,browser,source,age,sex,purchase_value,class
0,Chrome,SEO,39,M,34,0
1,Chrome,Ads,53,F,16,0
2,Opera,SEO,53,M,15,1
3,Safari,SEO,41,M,44,0
4,Safari,Ads,45,M,39,0


In [7]:
# Column dtypes and non-null counts for the selected columns.
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   browser         151112 non-null  object
 1   source          151112 non-null  object
 2   age             151112 non-null  int64 
 3   sex             151112 non-null  object
 4   purchase_value  151112 non-null  int64 
 5   class           151112 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 6.9+ MB


In [8]:
# Destination for the column-filtered raw dataset, taken from the
# train_test_data section of the configuration.
# NOTE: this rebinds `raw_dataset_dir`, which earlier held the source CSV path.
raw_dataset_dir = ''.join((
    '../',
    config['train_test_data']['directory'],
    config['train_test_data']['raw_dataset'],
))

# Persist the frame through the project's pickle_dump helper.
util.pickle_dump(raw_dataset, raw_dataset_dir)

In [9]:
# raw_dataset['purchase_value'].min()

## Data Definition

browser: [object] ['Chrome', 'IE', 'Safari', 'FireFox', 'Opera'] -> browser used for transactions

source: [object] ['SEO', 'Ads', 'Direct'] -> transaction source

age: [integer] [18-76] -> age of the user who made the transaction

sex: [object] ['M', 'F'] -> user gender

purchase_value: [integer] [9-154] -> total value of the transaction

class: [integer] [0-1] -> fraud status

## Data Validation

### Data Type

In [10]:
# Compare the actual column dtypes against those listed under "Data Definition".
raw_dataset.dtypes

browser           object
source            object
age                int64
sex               object
purchase_value     int64
class              int64
dtype: object

### Data Range

In [11]:
# Summary statistics for the numeric columns; min/max give the observed value ranges.
raw_dataset.describe()

Unnamed: 0,age,purchase_value,class
count,151112.0,151112.0,151112.0
mean,33.140704,36.935372,0.093646
std,8.617733,18.322762,0.291336
min,18.0,9.0,0.0
25%,27.0,22.0,0.0
50%,33.0,35.0,0.0
75%,39.0,49.0,0.0
max,76.0,154.0,1.0


### Data Dimension

In [12]:
# (number of rows, number of columns)
raw_dataset.shape

(151112, 6)

## Data Defence

In [13]:
def check_data(input_data, config):
    """Validate every row of ``input_data`` against the ``data_defense`` rules in ``config``.

    Categorical columns (``browser``, ``source``, ``sex``) may only contain
    non-empty values listed under ``config['data_defense'][<column>]['value']``.
    Numeric columns (``age``, ``purchase_value``) must fall inside the inclusive
    ``[lower, upper]`` pair stored under ``config['data_defense'][<column>]``.

    Checks are evaluated over whole columns, so the result does not depend on
    the frame's index labels (it works on shuffled train/test splits) and an
    invalid value in any row is detected, not only one in the first row.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame with the columns ``browser``, ``source``, ``age``, ``sex`` and
        ``purchase_value``.
    config : dict
        Configuration holding the ``data_defense`` section described above.

    Raises
    ------
    AssertionError
        If any row violates one of the rules.
    """
    rules = config['data_defense']

    # browser
    # A value must be BOTH in the allowed list AND non-empty. Missing values
    # (NaN) are not members of the list, so they are rejected as well.
    allowed_browsers = rules['browser']['value']
    browser_ok = input_data['browser'].isin(allowed_browsers) & input_data['browser'].ne('')
    assert browser_ok.all(),\
        f"Browser must be in list {allowed_browsers}, and cannot be empty."

    # source
    allowed_sources = rules['source']['value']
    source_ok = input_data['source'].isin(allowed_sources) & input_data['source'].ne('')
    assert source_ok.all(),\
        f"Source must be in list {allowed_sources}, and cannot be empty."

    # age -- `between` is inclusive on both ends; NaN compares as out of range.
    assert input_data['age'].between(rules['age'][0], rules['age'][1]).all(),\
        "an error occurs in Age range."

    # sex
    allowed_sexes = rules['sex']['value']
    sex_ok = input_data['sex'].isin(allowed_sexes) & input_data['sex'].ne('')
    assert sex_ok.all(),\
        f"Sex must be in list {allowed_sexes}, and cannot be empty."

    # purchase_value
    assert input_data['purchase_value'].between(
        rules['purchase_value'][0], rules['purchase_value'][1]
    ).all(),\
        "an error occurs in Purchase Value range."

In [14]:
# Raises AssertionError if the raw dataset violates the configured
# data_defense rules; produces no output on success.
check_data(raw_dataset, config)

## Data Splitting

In [15]:
# Feature and target column names both come from the data_source section.
data_source_cfg = config['data_source']

# Copies are taken so the feature frame and target are independent of raw_dataset.
X = raw_dataset[data_source_cfg['features']].copy()
y = raw_dataset[data_source_cfg['target_name']].copy()

In [16]:
# Confirm which columns, dtypes and non-null counts ended up in the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   browser         151112 non-null  object
 1   source          151112 non-null  object
 2   age             151112 non-null  int64 
 3   sex             151112 non-null  object
 4   purchase_value  151112 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 5.8+ MB


In [17]:
# Class distribution of the target; the output shows class 1 is the minority,
# and the split below stratifies on y.
y.value_counts()

0    136961
1     14151
Name: class, dtype: int64

In [18]:
# Hold-out split driven by the configuration; the seed is fixed through
# random_state and the split is stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=config['data_source']['test_size'],
    random_state=config['data_source']['random_state'],
    stratify=y,
)

In [19]:
# Display the training features; pandas abbreviates long frames in the rendered output.
X_train

Unnamed: 0,browser,source,age,sex,purchase_value
61049,Chrome,Ads,31,F,42
141787,Chrome,Ads,32,M,38
84407,Safari,SEO,36,M,12
115800,IE,SEO,19,M,12
61857,Chrome,SEO,28,F,41
...,...,...,...,...,...
79502,Opera,SEO,31,M,52
137723,Chrome,Ads,30,M,46
13126,Chrome,Ads,49,M,21
22089,Chrome,Direct,19,M,26


## Dump Data

In [20]:
# Every split is written to the same configured directory, relative to the
# notebook folder; only the file name differs per split.
split_cfg = config['train_test_data']
split_dir = '../' + split_cfg['directory']

X_train_path = split_dir + split_cfg['X_train']
y_train_path = split_dir + split_cfg['y_train']

X_test_path = split_dir + split_cfg['X_test']
y_test_path = split_dir + split_cfg['y_test']

# Persist each split to its own file: training pair first, then the test pair.
for split, destination in (
    (X_train, X_train_path),
    (y_train, y_train_path),
    (X_test, X_test_path),
    (y_test, y_test_path),
):
    util.pickle_dump(split, destination)