In [1]:
# data manipulation
import pandas as pd
import numpy as np

# data visualization 
import matplotlib.pyplot as plt
# json
import json
# sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics

# lgb
import lightgbm as lgb

# datetime
import datetime
%matplotlib inline

In [2]:
def load_df(csv_path, nrows=None):
    '''
    Load a csv file and expand its JSON-encoded columns into flat columns.

    The columns 'device', 'geoNetwork', 'totals' and 'trafficSource' are
    parsed with json.loads while the file is read. Each of them is then
    replaced by one column per (possibly nested) JSON key, named
    '<column>.<key>'.

    Parameters
    ----------
    csv_path : path or buffer accepted by pandas.read_csv
    nrows : int or None
        Number of rows to read; None reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The flattened frame. 'fullVisitorId' is read as a string, so ids keep
        any leading zeros.
    '''
    json_columns = ['device', 'geoNetwork', 'totals', 'trafficSource']
    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in json_columns},
                     dtype={'fullVisitorId': 'str'},
                     nrows=nrows)

    # pandas 1.0 moved json_normalize to the top-level namespace and
    # deprecated the pandas.io.json location. Prefer the top-level function
    # and only fall back to the old path on pandas versions that lack it.
    json_normalize = getattr(pd, 'json_normalize', None)
    if json_normalize is None:
        json_normalize = pd.io.json.json_normalize

    for column in json_columns:
        column_df = json_normalize(list(df[column].values))
        column_df.columns = [f'{column}.{sub_column}' for sub_column in column_df.columns]
        # Both frames carry the default 0..n-1 index, so join aligns row by row.
        df = df.drop(column, axis=1).join(column_df)

    return df

In [3]:
# read the train and test csv files and flatten their JSON columns
train_df = load_df(csv_path='train.csv')
test_df = load_df(csv_path='test.csv')

In [4]:
def run_lgb(X_train, y_train, X_dev, y_dev, X_test):
    '''
    Train a LightGBM regressor on the train split, using the dev split for
    early stopping, then predict the dev and test sets.

    Parameters
    ----------
    X_train, y_train : training features and targets
    X_dev, y_dev : validation features and targets used for early stopping
    X_test : features to predict after training

    Returns
    -------
    (model, pred_y_dev, pred_y_test)
        The trained booster and its predictions for X_dev and X_test, both
        taken at model.best_iteration.
    '''
    # The original dict also set 'subsample' and 'colsample_bytree'. Those are
    # LightGBM aliases of 'bagging_fraction' and 'feature_fraction', which are
    # set below; passing both spellings makes LightGBM warn and ignore the
    # alias, so only the primary names are kept.
    # NOTE(review): per the LightGBM docs bagging_fraction only takes effect
    # when bagging_freq > 0, and no bagging_freq is set here - confirm whether
    # row subsampling was intended.
    params = {'learning_rate': 0.03,
            'objective':'regression',
            'metric':'rmse',
            'num_leaves': 31,
            'verbose': 1,
            "random_state":42,
            'max_depth': 15,
            'lambda_l2': 0.02085548700474218,
            'lambda_l1': 0.004107624022751344,
            'bagging_fraction': 0.7934712636944741,
            'feature_fraction': 0.686612409641711,
            'min_child_samples': 21}

    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_dev = lgb.Dataset(X_dev, label=y_dev)

    # lgb.train no longer accepts early_stopping_rounds= / verbose_eval= in
    # LightGBM 4; the same behaviour is requested through callbacks. Older
    # releases expose the logging callback as print_evaluation.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    callbacks = [lgb.early_stopping(stopping_rounds=100), log_evaluation(period=100)]

    # NOTE(review): num_boost_round is left at lgb.train's default (100), so a
    # patience of 100 rounds cannot end training early - confirm whether a
    # larger round budget was intended.
    model = lgb.train(params, lgb_train, valid_sets=[lgb_dev], callbacks=callbacks)
    pred_y_test = model.predict(X_test, num_iteration=model.best_iteration)
    pred_y_dev = model.predict(X_dev, num_iteration=model.best_iteration)
    return model, pred_y_dev, pred_y_test

In [8]:
from sklearn.model_selection import GroupKFold


def get_fold(train_df, kfold = 5):
    '''
    kfold based on unique fullVisitorId

    Splits the unique visitor ids into `kfold` folds and maps each fold back
    to row positions, so all rows of one visitor land on the same side of a
    split.

    Returns a list with one [train_positions, dev_positions] pair per fold;
    both entries are integer arrays of row positions into train_df.
    '''
    unique_vis = np.array(sorted(train_df['fullVisitorId'].unique()))
    folds = GroupKFold(n_splits=kfold)
    row_positions = np.arange(train_df.shape[0])
    visitor_of_row = train_df['fullVisitorId']

    fold_ids = []
    for train_vis, dev_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
        fold_ids.append([
            row_positions[visitor_of_row.isin(unique_vis[train_vis]).values],
            row_positions[visitor_of_row.isin(unique_vis[dev_vis]).values]
        ])
    return fold_ids


def get_oof(lgb_model, train_df, test_df, select_features):
    '''
    Produce out-of-fold predictions for train_df and fold-averaged predictions
    for test_df, training one model per visitor-grouped fold with run_lgb.

    Parameters
    ----------
    lgb_model : accepted for interface compatibility; not used, training
        always goes through run_lgb.
    train_df : frame with 'fullVisitorId', 'totals.transactionRevenue' and the
        columns named in select_features.
    test_df : frame with the columns named in select_features.
    select_features : list of feature column names.

    Returns
    -------
    (oof_pred, sub_pred)
        oof_pred holds, for every train row, the prediction of the model that
        did not see that row; sub_pred is the mean test prediction over all
        folds. Both are on the revenue scale (expm1 of the model output).

    Models are fit on log1p(revenue); missing revenue is treated as 0.
    '''
    oof_pred = np.zeros(train_df.shape[0])
    sub_pred = np.zeros(test_df.shape[0])
    fold_ids = get_fold(train_df)

    visitor_ids = train_df['fullVisitorId'].values
    features = train_df[select_features]
    # Cast explicitly: the revenue column may hold strings and NaN.
    revenue = train_df['totals.transactionRevenue'].astype(float).fillna(0.0).values
    test_X = test_df[select_features]

    for i, (train_idx, dev_idx) in enumerate(fold_ids):
        print('processing {} fold: '.format(i))

        train_X = features.iloc[train_idx]
        train_y = np.log1p(revenue[train_idx])

        dev_X = features.iloc[dev_idx]
        dev_revenue = revenue[dev_idx]
        dev_y = np.log1p(dev_revenue)

        model, pred_dev, pred_test = run_lgb(train_X, train_y, dev_X, dev_y, test_X)
        # Revenue cannot be negative, so clip the log-scale predictions at 0.
        pred_dev[pred_dev < 0] = 0
        pred_test[pred_test < 0] = 0

        oof_pred[dev_idx] = np.expm1(pred_dev)
        sub_pred += np.expm1(pred_test) / len(fold_ids)

        # n fold rmse, scored per visitor: sum raw revenue per visitor first,
        # then compare on the log1p scale. Summing the already log-transformed
        # targets and applying log1p again would score a different quantity.
        pred_dev_df = pd.DataFrame({'fullVisitorId': visitor_ids[dev_idx],
                                    'transactionRevenue': dev_revenue,
                                    'predictedRevenue': np.expm1(pred_dev)})
        pred_dev_df = pred_dev_df.groupby('fullVisitorId')[['transactionRevenue', 'predictedRevenue']].sum()
        rmse_score = rmse(np.log1p(pred_dev_df['transactionRevenue'].values),
                          np.log1p(pred_dev_df['predictedRevenue'].values))
        print('fold {} rmse: {}'.format(i, rmse_score))
    return oof_pred, sub_pred

In [9]:
# define the metrics: rmse(root mean square error)
def rmse(y, y0):
    '''
    Root mean square error between two equally long arrays.

    Raises AssertionError when the lengths differ.
    '''
    assert len(y) == len(y0)
    squared_error = np.power(y - y0, 2)
    mean_squared_error = np.mean(squared_error)
    return np.sqrt(mean_squared_error)

In [10]:
# preprocess data
def preprocess(train_df, test_df):
    '''
    Label-encode the categorical columns and cast the numeric columns to
    float, returning new frames.

    Each categorical column is converted to strings and encoded with integer
    codes 0..k-1 assigned to the sorted unique strings found in train and test
    together, so both frames share one code table per column. Missing values
    take part as their string form (e.g. 'nan').

    The inputs are not modified. Encoding in place would make a second call on
    the same frames re-encode the integer codes as strings and silently change
    them.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Must contain every column listed in num_columns and cat_columns.

    Returns
    -------
    (train_df, test_df) : the encoded copies.
    '''
    num_columns = ['totals.bounces', 'totals.hits', 'totals.newVisits', 'totals.pageviews', 'visitNumber', 'visitStartTime']
    cat_columns = ['channelGrouping',
                   'device.browser',
                   'device.deviceCategory',
                   'device.operatingSystem',
                   'geoNetwork.city',
                   'geoNetwork.continent',
                   'geoNetwork.country',
                   'geoNetwork.metro',
                   'geoNetwork.networkDomain',
                   'geoNetwork.region',
                   'geoNetwork.subContinent',
                   'trafficSource.adContent',
                   'trafficSource.adwordsClickInfo.adNetworkType',
                   'trafficSource.adwordsClickInfo.gclId',
                   'trafficSource.adwordsClickInfo.isVideoAd',
                   'trafficSource.adwordsClickInfo.page',
                   'trafficSource.adwordsClickInfo.slot',
                   'trafficSource.campaign',
                   'trafficSource.isTrueDirect',
                   'trafficSource.keyword',
                   'trafficSource.medium',
                   'trafficSource.referralPath',
                   'trafficSource.source']

    train_out = train_df.copy()
    test_out = test_df.copy()
    n_train = len(train_out)

    for column in cat_columns:
        train_values = np.asarray(train_df[column]).astype(str)
        test_values = np.asarray(test_df[column]).astype(str)
        # One pass over the union: np.unique sorts the distinct strings and
        # return_inverse gives each value's position in that sorted table,
        # i.e. the label code a LabelEncoder fitted on train+test would give.
        _, codes = np.unique(np.concatenate([train_values, test_values]), return_inverse=True)
        train_out[column] = codes[:n_train]
        test_out[column] = codes[n_train:]

    for column in num_columns:
        train_out[column] = train_out[column].astype(float)
        test_out[column] = test_out[column].astype(float)

    return train_out, test_out

In [11]:
# encode categorical columns and cast numeric columns; rebinds both names
train_df, test_df = preprocess(train_df=train_df, test_df=test_df)

KeyboardInterrupt: 