In [154]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [155]:
# Load the raw train and test user tables; paths are relative to the notebook.
TRAIN_CSV = './data/train_users_2.csv'
TEST_CSV = './data/test_users.csv'
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [156]:
# Preview the first rows of the training table to see the raw columns.
train_df.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [157]:
# Inspect column dtypes: the date columns load as object and
# timestamp_first_active as int64, so both need explicit parsing.
train_df.dtypes

id                          object
date_account_created        object
timestamp_first_active       int64
date_first_booking          object
gender                      object
age                        float64
signup_method               object
signup_flow                  int64
language                    object
affiliate_channel           object
affiliate_provider          object
first_affiliate_tracked     object
signup_app                  object
first_device_type           object
first_browser               object
country_destination         object
dtype: object

### Feature Engineering Flow
1. datetime
2. age
3. categorical
    - One-hot encoding
        - ```pd.get_dummies()```
4. remove columns with many NaN values
    - drop

### 1. Datetime

In [158]:
# Parse the account-creation date strings into datetime values for both splits.
for frame in (train_df, test_df):
    frame['date_account_created'] = pd.to_datetime(frame['date_account_created'])

In [159]:
# timestamp_first_active is stored as an integer such as 20090319043255
# (YYYYMMDDHHMMSS). Cast it to str first: raw integers handed to
# pd.to_datetime are treated as epoch offsets, not as calendar digits.
# The explicit format avoids relying on the parser guessing the layout of
# a separator-free string. This cell expects the raw integer column, so it
# should be run once per freshly loaded frame.
TFA_FORMAT = '%Y%m%d%H%M%S'

tr_tfa_str = train_df.timestamp_first_active.values.astype('str')
train_df['timestamp_first_active'] = pd.to_datetime(tr_tfa_str, format=TFA_FORMAT)
te_tfa_str = test_df.timestamp_first_active.values.astype('str')
test_df['timestamp_first_active'] = pd.to_datetime(te_tfa_str, format=TFA_FORMAT)

### 2. Drop NaN columns

In [160]:
# Remove columns that are not used as features: date_first_booking is dropped
# from both splits, country_destination from the training split only.
# NOTE(review): country_destination looks like the prediction target — confirm
# it is kept somewhere else before this cell discards it.
train_df = train_df.drop(columns=['date_first_booking', 'country_destination'])
test_df = test_df.drop(columns=['date_first_booking'])

### 3. Concat train_df & test_df

In [171]:
# Stack train and test rows into one frame with a fresh 0..n-1 index; the
# feature steps below operate on this combined frame.
df = pd.concat((train_df, test_df), ignore_index=True)
df.shape

(275547, 14)

### 4. One-hot Encoding

#### 4.1 timestamp_first_active

In [172]:
# Split the first-active timestamp into calendar parts using the vectorized
# .dt accessor rather than a Python-level loop over every row.
df['tfa_year'] = df['timestamp_first_active'].dt.year
df['tfa_month'] = df['timestamp_first_active'].dt.month
df['tfa_day'] = df['timestamp_first_active'].dt.day

In [173]:
# One-hot encode the weekday of first activity.
# dt.dayofweek is 0 (Mon) .. 6 (Sun); adding 1 matches isoweekday()'s
# 1 (Mon) .. 7 (Sun), so the indicator columns keep the tfa_wd_<n> names.
tfa_wd_df = pd.get_dummies(df['timestamp_first_active'].dt.dayofweek + 1, prefix='tfa_wd')
# New df = previous df + one-hot columns. Any tfa_wd_* columns left by an
# earlier run of this cell are dropped first, so re-running the cell does not
# create duplicate columns.
df = pd.concat((df.drop(columns=tfa_wd_df.columns, errors='ignore'), tfa_wd_df), axis=1)

In [174]:
def season(tfa):
    """Return the season name for an object exposing a ``month`` attribute.

    Months 3-5 map to 'Spring', 6-8 to 'Summer', 9-11 to 'Autumn'; any other
    month value maps to 'Winter'.
    """
    season_by_month = {
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Autumn', 10: 'Autumn', 11: 'Autumn',
    }
    # Everything not listed above (12, 1, 2, or an unexpected value) is Winter.
    return season_by_month.get(tfa.month, 'Winter')
    
# One-hot encode the season of first activity; labels come from season().
tfa_season_df = pd.get_dummies(df['timestamp_first_active'].apply(season), prefix='tfa_season')
# Append the indicator columns. Any tfa_season_* columns left by an earlier
# run of this cell are dropped first, so re-running the cell does not create
# duplicate columns.
df = pd.concat((df.drop(columns=tfa_season_df.columns, errors='ignore'), tfa_season_df), axis=1)

In [175]:
# Preview the combined frame with the new weekday and season indicator columns.
df.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,...,tfa_wd_2,tfa_wd_3,tfa_wd_4,tfa_wd_5,tfa_wd_6,tfa_wd_7,tfa_season_Autumn,tfa_season_Spring,tfa_season_Summer,tfa_season_Winter
0,gxn3p5htnn,2010-06-28,2009-03-19 04:32:55,-unknown-,,facebook,0,en,direct,direct,...,0,0,1,0,0,0,0,1,0,0
1,820tgsjxq7,2011-05-25,2009-05-23 17:48:09,MALE,38.0,facebook,0,en,seo,google,...,0,0,0,0,1,0,0,1,0,0
2,4ft3gnwmtx,2010-09-28,2009-06-09 23:12:47,FEMALE,56.0,basic,3,en,direct,direct,...,1,0,0,0,0,0,0,0,1,0
3,bjjt8pjhuk,2011-12-05,2009-10-31 06:01:29,FEMALE,42.0,facebook,0,en,direct,direct,...,0,0,0,0,1,0,1,0,0,0
4,87mebub9p4,2010-09-14,2009-12-08 06:11:05,-unknown-,41.0,basic,0,en,direct,direct,...,1,0,0,0,0,0,0,0,0,1


#### 4.2 date_account_created