# Data Pipeline

## Import Libraries

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

import sys
sys.path.append('../src')  
import utils as util

utils good


In [2]:
# Collect every name the util module exposes.
all_attributes = dir(util)

# Keep only the public names, i.e. those without a leading underscore.
user_defined = list(filter(lambda name: not name.startswith('_'), all_attributes))

# Print one name per line.
for attr in user_defined:
    print(attr)

OneHotEncoder
config_dir
datetime
dump_dataset
joblib
load_config
load_dataset
ohe_input
pd
pickle_dump
pickle_load
ros_fit_resample
rus_fit_resample
smote_fit_resample
yaml


## Load Configuration File

In [3]:
# Load the project configuration through the util helper. The resulting
# mapping is indexed below for the data source, the data-defense rules and
# the train/test output locations.
config_data = util.load_config()

## Data Collection

In [4]:
def read_raw_data(config: dict) -> pd.DataFrame:
    """Read the raw CSV dataset described by the ``data_source`` config section.

    The file location is built by concatenating ``'../'``, the configured
    ``directory`` and the configured ``file_name`` (no separator is inserted
    between them, so the directory value must already end with one).

    Parameters
    ----------
    config : dict
        Configuration mapping containing ``config['data_source']['directory']``
        and ``config['data_source']['file_name']``.

    Returns
    -------
    pd.DataFrame
        The contents of the CSV file, decoded as UTF-8.
    """
    source_cfg = config['data_source']
    csv_path = '../' + source_cfg['directory'] + source_cfg['file_name']
    return pd.read_csv(csv_path, encoding='utf-8')

In [5]:
# Read the raw CSV into memory using the locations from the loaded config.
raw_dataset = read_raw_data(config_data)

In [6]:
# Display the loaded frame (the notebook shows a truncated view of the rows).
raw_dataset

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1
...,...,...,...,...,...,...,...,...,...
4648,Bachelors,2013,Bangalore,3,26,Female,No,4,0
4649,Masters,2013,Pune,2,37,Male,No,2,1
4650,Masters,2018,New Delhi,3,27,Male,No,5,1
4651,Bachelors,2012,Bangalore,3,30,Male,Yes,2,0


In [7]:
# Build the output location for the untouched raw dataset, then persist it.
raw_dataset_path = (
    '../'
    + config_data['train_test_data']['directory']
    + config_data['train_test_data']['raw_dataset']
)
util.pickle_dump(raw_dataset, raw_dataset_path)

## Data Definition

Education         :
    [object]
    ['Bachelors', 'Masters', 'PHD']
    pendidikan terakhir karyawan

JoiningYear         :
    [integer]
    [2012 - 2018]
    tahun karyawan bergabung

City            :
    [object]
    ['Bangalore', 'Pune', 'New Delhi']
    kota asal karyawan

PaymentTier            :
    [integer]
    [1 - 3]
    tingkat (tier) pembayaran karyawan

Age             :
    [integer]
    [22 - 41]
    usia karyawan

Gender              :
    [object]
    ['Male', 'Female']
    jenis kelamin karyawan

EverBenched              :
    [object]
    ['No', 'Yes']
    indikasi apakah karyawan sempat menganggur

ExperienceInCurrentDomain             :
    [integer]
    [0 - 7]
    jumlah tahun pengalaman karyawan pada domain pekerjaan saat ini

LeaveOrNot             :
    [integer]
    [0 - 1]
    status apakah churn atau tidak

## Data Validation

### Data Type

In [8]:
# Inspect the dtype pandas assigned to each column.
raw_dataset.dtypes

Education                    object
JoiningYear                   int64
City                         object
PaymentTier                   int64
Age                           int64
Gender                       object
EverBenched                  object
ExperienceInCurrentDomain     int64
LeaveOrNot                    int64
dtype: object

### Data Range

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, used to eyeball the value ranges.
raw_dataset.describe()

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,LeaveOrNot
count,4653.0,4653.0,4653.0,4653.0,4653.0
mean,2015.06297,2.698259,29.393295,2.905652,0.343864
std,1.863377,0.561435,4.826087,1.55824,0.475047
min,2012.0,1.0,22.0,0.0,0.0
25%,2013.0,3.0,26.0,2.0,0.0
50%,2015.0,3.0,28.0,3.0,0.0
75%,2017.0,3.0,32.0,4.0,1.0
max,2018.0,3.0,41.0,7.0,1.0


### Data Dimension

In [10]:
# Dataset dimensions as (rows, columns).
raw_dataset.shape

(4653, 9)

## Data Defense

In [11]:
def check_data(input_data, config):
    """Validate a dataset against the ``data_defense`` rules in the configuration.

    Every row of ``input_data`` is checked:

    * ``Education``, ``City``, ``Gender`` and ``EverBenched``: each value must
      be one of ``config['data_defense'][<column>]['value']`` and must not be
      an empty string.
    * ``JoiningYear``, ``PaymentTier``, ``Age`` and
      ``ExperienceInCurrentDomain``: each value must lie within the inclusive
      range whose bounds are elements ``[0]`` and ``[1]`` of
      ``config['data_defense'][<column>]``.

    Parameters
    ----------
    input_data : pd.DataFrame
        Frame containing the eight columns listed above. Any index is
        accepted; rows are not looked up by label.
    config : dict
        Configuration mapping with a ``data_defense`` section as described.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If any value violates its rule. The message names the offending
        column.
    """
    defense = config['data_defense']

    # Categorical columns: a value is valid only if it is in the allowed list
    # AND non-empty. Combining the two with "or" would accept every non-empty
    # string, so membership would never actually be enforced.
    for column in ('Education', 'City', 'Gender', 'EverBenched'):
        allowed = defense[column]['value']
        values = input_data[column]
        assert (values.isin(allowed) & (values != '')).all(),\
            f"{column} must be in list {allowed}, and cannot be empty."

    # Numeric columns: every value must fall inside the configured bounds
    # (Series.between is inclusive on both ends by default).
    for column in ('JoiningYear', 'PaymentTier', 'Age', 'ExperienceInCurrentDomain'):
        lower, upper = defense[column][0], defense[column][1]
        assert input_data[column].between(lower, upper).all(),\
            f"an error occurs in {column} range."

In [12]:
# Run the data-defense assertions on the raw dataset; an AssertionError here
# means the data does not match the configured rules.
check_data(raw_dataset, config_data)

## Data Splitting

In [13]:
# Separate predictors and target using the column names from the config.
# Copies are taken so the raw frame is left untouched.
data_source_cfg = config_data['data_source']
feature_columns = data_source_cfg['features']
target_column = data_source_cfg['target_name']

X = raw_dataset[feature_columns].copy()
y = raw_dataset[target_column].copy()

In [14]:
# Confirm the feature frame's columns, non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 290.9+ KB


In [15]:
# Class distribution of the target; the split below stratifies on y.
y.value_counts()

LeaveOrNot
0    3053
1    1600
Name: count, dtype: int64

In [16]:
# Stratified hold-out split; the test fraction and seed come from the config
# so the split is reproducible.
split_cfg = config_data['data_source']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_cfg['test_size'],
    random_state=split_cfg['random_state'],
    stratify=y,
)

In [21]:
# y_train

## Dump Data

In [19]:
# All split artifacts share one output directory; build that prefix once.
train_test_cfg = config_data['train_test_data']
train_test_dir = '../' + train_test_cfg['directory']

X_train_path = train_test_dir + train_test_cfg['X_train']
y_train_path = train_test_dir + train_test_cfg['y_train']

X_test_path = train_test_dir + train_test_cfg['X_test']
y_test_path = train_test_dir + train_test_cfg['y_test']

# Persist each split to its configured location, in the same order as before.
for split_data, split_path in (
    (X_train, X_train_path),
    (y_train, y_train_path),
    (X_test, X_test_path),
    (y_test, y_test_path),
):
    util.pickle_dump(split_data, split_path)