In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os

sns.set_context('talk')

In [2]:
# Path to the preprocessed training transactions, relative to the working directory.
input_train_path = os.path.join('..', 'dataset', 'preprocessed', 'transact_train.csv')

# Load the full training table into memory.
df_train = pd.read_csv(input_train_path)

# Preview the first rows as the cell output.
df_train.head()

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,...,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,order
0,1,6.0,5.0,0.0,1.0,59.99,59.99,59.99,1.0,59.99,...,-99,1.0,600.0,70.0,21.0,1.0,43.0,1,49.0,1
1,1,6.0,5.0,11.94,1.0,59.99,59.99,59.99,1.0,59.99,...,completely orderable,1.0,600.0,70.0,21.0,1.0,43.0,1,49.0,1
2,1,6.0,5.0,39.887,1.0,59.99,59.99,59.99,1.0,59.99,...,completely orderable,1.0,600.0,70.0,21.0,1.0,43.0,1,49.0,1
3,2,6.0,5.0,0.0,0.0,-99.0,-99.0,-99.0,0.0,-99.0,...,completely orderable,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99,-99.0,1
4,2,6.0,5.0,15.633,0.0,-99.0,-99.0,-99.0,0.0,-99.0,...,completely orderable,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99,-99.0,1


In [3]:
# Show the first row as a Series (one field per line) to see every column at once.
df_train.iloc[0]

sessionNo              1
startHour            6.0
startWeekday         5.0
duration             0.0
cCount               1.0
cMinPrice          59.99
cMaxPrice          59.99
cSumPrice          59.99
bCount               1.0
bMinPrice          59.99
bMaxPrice          59.99
bSumPrice          59.99
bStep              -99.0
onlineStatus         -99
availability         -99
customerNo           1.0
maxVal             600.0
customerScore       70.0
accountLifetime     21.0
payments             1.0
age                 43.0
address                1
lastOrder           49.0
order                  1
Name: 0, dtype: object

In [4]:
## create categorical values based on startHour
start_hour = df_train['startHour']

# One entry per bucket: (label, inclusive lower bound, exclusive upper bound).
time_of_day_bins = [
    ("early_morning", 0, 6),
    ("morning", 6, 12),
    ("afternoon", 12, 18),
    ("evening", 18, 25),
]

# Boolean mask per bucket, in the same order as the labels below.
condition_list = [
    ((start_hour >= lower) & (start_hour < upper))
    for _, lower, upper in time_of_day_bins
]

choice_list = [label for label, _, _ in time_of_day_bins]

# Rows matching no bucket (e.g. values outside [0, 25)) are labelled "unknown".
df_train["time_of_day"] = np.select(condition_list, choice_list, default="unknown")

In [5]:
## use one hot encoder for categorical values
from sklearn.preprocessing import OneHotEncoder

selected_feature_list = ['availability','address','time_of_day','onlineStatus']

# One encoded frame per feature; they are joined onto df_train in a single
# concat after the loop instead of growing df_train on every iteration.
encoded_frame_list = []

for selected_feature in selected_feature_list:
    # Progress log: one line per encoded feature.
    print(selected_feature)

    ohe = OneHotEncoder()
    feature_array = df_train[selected_feature].to_numpy().reshape(-1, 1)
    feature_encoded = ohe.fit_transform(feature_array).toarray()

    # Categories are plain scalars (strings or numbers), not pandas objects,
    # so they have no `.str` accessor: convert with str() before replacing
    # spaces to get column names such as `availability_completely_orderable`.
    column_names = [
        selected_feature + '_' + str(category).replace(' ', '_')
        for category in ohe.categories_[0]
    ]

    feature_encoded = pd.DataFrame(
        data=feature_encoded,
        columns=column_names,
        # Reuse df_train's index so the axis=1 concat below lines up row by
        # row even if df_train does not carry a default RangeIndex.
        index=df_train.index,
    )

    # Drop the last category column: it is implied by the remaining ones.
    feature_encoded = feature_encoded.iloc[:, :-1]
    encoded_frame_list.append(feature_encoded)

encoded_features = pd.concat(encoded_frame_list, axis=1)

# Remove encoded columns left over from a previous run of this cell so that
# re-running it does not append duplicate columns.
df_train = pd.concat(
    [
        df_train.drop(columns=encoded_features.columns, errors='ignore'),
        encoded_features,
    ],
    axis=1,
)

availability
address
time_of_day
onlineStatus


In [6]:
# Show the first row again to inspect the columns present after the encoding cell above.
df_train.iloc[0]

sessionNo                                         1
startHour                                       6.0
startWeekday                                    5.0
duration                                        0.0
cCount                                          1.0
cMinPrice                                     59.99
cMaxPrice                                     59.99
cSumPrice                                     59.99
bCount                                          1.0
bMinPrice                                     59.99
bMaxPrice                                     59.99
bSumPrice                                     59.99
bStep                                         -99.0
onlineStatus                                    -99
availability                                    -99
customerNo                                      1.0
maxVal                                        600.0
customerScore                                  70.0
accountLifetime                                21.0
payments    

In [7]:
# Toggle: set to True to write the engineered training frame to disk.
SAVE_DATA = False

if SAVE_DATA:
    output_train_path = os.path.join(
        '..',
        'dataset',
        'feature-engineering',
        'iteration-1.csv'
    )
    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing.
    os.makedirs(os.path.dirname(output_train_path), exist_ok=True)
    df_train.to_csv(output_train_path, index=False)