# Introduction



### This kernel is mainly made up of two parts:
* [**1. Data loading**](#Data loading)
* [**2. Model building**](#Model building)

* Data are generated from this script : https://www.kaggle.com/qnkhuat/make-data-ready 
* Stacking part is from this script: https://www.kaggle.com/ashishpatel26/updated-bayesian-lgbm-xgb-cat-fe-kfold-cv

## Data loading

In [1]:
import numpy as np 
import pandas as pd 
import scipy
from datetime import datetime

import sys
import os
from os.path import join as pjoin

data_root = '../input/user-level-training-and-test-data-prepare'
print(os.listdir(data_root))

pd.set_option('display.max_rows',200)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns

['__notebook__.ipynb', 'train_clean8', 'train_clean6', '__output__.json', 'train_clean1', 'custom.css', 'test_clean', 'train_clean2', 'train_clean5', '__results__.html', 'train_clean7', 'train_clean0', 'train_clean3', 'train_clean4']


In [2]:
def load_data(data='train',n=2):
    df = pd.DataFrame()
    for i in range(n) :
        if data=='train':
            if i > 8 :
                break
            dfpart = pd.read_pickle(pjoin(data_root,f'train_clean{i}'))
        elif data=='test':
            if i > 2 :
                break
            dfpart = pd.read_pickle(pjoin(data_root,f'test_{i}.pkl'))
        df = pd.concat([df,dfpart])
        del dfpart
    return df       

In [3]:
df_train = load_data(n=9)
df_test = pd.read_pickle(pjoin(data_root,f'test_clean'))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
df_train.shape, df_test.shape

((3134406, 61), (296530, 59))

In [5]:
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                      df_train:  1.9 GiB
                       df_test: 176.1 MiB
                  LabelEncoder:  1.0 KiB
               StratifiedKFold:  1.0 KiB
                           _i5:  576.0 B
                           _i1:  556.0 B
                          _iii:  490.0 B
                           _i2:  490.0 B
                      datetime:  400.0 B
                           _oh:  240.0 B


## Model building

In [6]:
import lightgbm as lgb
# from xgboost import XGBRegressor
# from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error

In [7]:
df_train['month'] = df_train['month'].astype('category')
df_train['year'] = df_train['year'].astype('category')

In [8]:
df_test['month'] = df_test['month'].astype('category')
df_test['year'] = df_test['year'].astype('category')

In [9]:
# print(df_train.columns)
# col = 'adwordsClickInfo_page'
# print(len(df_train[col].unique()))
# print(df_train[col].mode())
# print(df_train[col].describe)

In [10]:
to_category = ['channelGrouping', 'device_browser', 'deviceCategory', 'operatingSystem',
              'networkDomain', 'city', 'metro', 'region', 'country', 'continent', 
              'adwordsClickInfo_gclId', 'keyword', 'medium', 'referralPath', 
              'source', 'adwordsClickInfo_page', 'totals_sessionQualityDim'] 

for col in to_category:
    df_train[col] = df_train[col].astype('category')
    df_test[col] = df_test[col].astype('category')

In [11]:
params_lgb2 = {
        "objective" : "binary",
        "metric" : "binary_logloss",
        "max_leaves": 256,
        "num_leaves" : 15,
        "min_child_samples" : 1,
        "learning_rate" : 0.01,
        "bagging_fraction" : 0.9,
        "feature_fraction" : 0.8,
        "bagging_frequency" : 1           
    }

In [12]:
params_lgb3 = {
        "objective" : "regression",
        "metric" : "rmse", 
        "max_leaves": 256,
        "num_leaves" : 9,
        "min_child_samples" : 1,
        "learning_rate" : 0.01,
        "bagging_fraction" : 0.9,
        "feature_fraction" : 0.8,
        "bagging_frequency" : 1      
    }

In [13]:
target_cols = ['totals_transactionRevenue', 'ret', 'fullVisitorId']

dtrain_all = lgb.Dataset(df_train.drop(target_cols, axis=1), label=df_train['ret'])

dtrain_ret = lgb.Dataset(df_train.drop(target_cols, axis=1)[df_train['ret']==1], 
                         label= np.log1p(df_train['totals_transactionRevenue'][df_train['ret']==1]))

In [14]:
test = df_test.drop('fullVisitorId', axis=1)

In [15]:
pr_lgb_sum = 0

num_ite = 1
print('Training and predictions')
for i in range(num_ite):
    print('Interation number ', i)
    lgb_model1 = lgb.train(params_lgb2, dtrain_all, num_boost_round=1250) # original 1200
    pr_lgb = lgb_model1.predict(test)
    
    lgb_model2 = lgb.train(params_lgb3, dtrain_ret, num_boost_round=390) # original 368
    pr_lgb_ret = lgb_model2.predict(test)
    
    pr_lgb_sum = pr_lgb_sum + pr_lgb*pr_lgb_ret

pr_final2 = pr_lgb_sum/num_ite

Training and predictions
Interation number  0


## Save result

In [17]:
sub_df = pd.DataFrame(df_test['fullVisitorId'])
sub_df["PredictedLogRevenue"] = pr_final2
sub_df.head()

Unnamed: 0,fullVisitorId,PredictedLogRevenue
0,18966949534117,0.003024
1,39738481224681,0.000299
2,73585230191399,0.000379
3,87588448856385,5e-05
4,149787903119437,5e-05


In [18]:
sub_df.to_csv("stacked_result.csv", index=False)

In [19]:
len(sub_df)

296530