In [156]:
from os import path
from fastai.structured import *
from fastai.column_data import *
import pandas as pd # to manipulate data frames 
import numpy as np # to work with matrix

In [157]:
# Root directory of the competition data; other cells build the input and output file paths from it.
PATH=Path('data/kg-google/')

In [158]:
# Locations of the pre-extracted train/test CSVs.
# A forward slash is used as the separator: it is accepted on Windows and
# POSIX alike, whereas the previous hard-coded backslash only worked on
# Windows (and `\e` is an invalid string escape sequence).
train_file_name = f'{PATH}/extracted_fields_train.csv'
test_file_name = f'{PATH}/extracted_fields_test.csv'

In [159]:
# Row-chunk size setting.
# NOTE(review): no other cell visible here reads this value — confirm whether it is still needed.
chunksize=24000

In [160]:
# Columns read from the training extract; anything else in the CSV is skipped.
train_usecols = [
    'channelGrouping', 'date', 'fullVisitorId', 'visitId', 'visitNumber',
    'totals.transactions', 'totals.timeOnSite', 'totals.visits',
    'visitStartTime',
    'device.browser', 'device.deviceCategory', 'device.isMobile',
    'device.operatingSystem',
    'geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country',
    'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
    'geoNetwork.subContinent',
    'totals.bounces', 'totals.hits', 'totals.newVisits', 'totals.pageviews',
    'totals.transactionRevenue',
    'trafficSource.adContent', 'trafficSource.campaign',
    'trafficSource.isTrueDirect', 'trafficSource.keyword',
    'trafficSource.medium', 'trafficSource.referralPath',
    'trafficSource.source',
]

# fullVisitorId is read as text so pandas does not parse the ids as numbers.
df_train = pd.read_csv(
    train_file_name,
    dtype={'fullVisitorId': 'str'},
    usecols=train_usecols,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [161]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,...,totals.newVisits,totals.pageviews,totals.transactionRevenue,trafficSource.adContent,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20171016,3162355547410993243,1508198000.0,1.0,1508198000.0,Firefox,desktop,0.0,Windows,...,1.0,1.0,,,(not set),,water bottle,organic,,google
1,Referral,20171016,8934116514970143966,1508176000.0,6.0,1508176000.0,Chrome,desktop,0.0,Chrome OS,...,,2.0,,,(not set),,,referral,/a/google.com/transportation/mtv-services/bike...,sites.google.com
2,Direct,20171016,7992466427990357681,1508202000.0,1.0,1508202000.0,Chrome,mobile,1.0,Android,...,1.0,2.0,,,(not set),1.0,,(none),,(direct)
3,Organic Search,20171016,9075655783635761930,1508170000.0,1.0,1508170000.0,Chrome,desktop,0.0,Windows,...,1.0,2.0,,,(not set),,(not provided),organic,,google
4,Organic Search,20171016,6960673291025684308,1508191000.0,1.0,1508191000.0,Chrome,desktop,0.0,Windows,...,1.0,2.0,,,(not set),,(not provided),organic,,google


In [162]:
# Load the test extract with every column (no `usecols` filter), keeping
# fullVisitorId as text like in the training load.
df_test = pd.read_csv(test_file_name, dtype={'fullVisitorId': 'str'})

In [163]:
# df_test.head()


In [164]:
# df_test.size

In [165]:
# df_test.columns

In [166]:
# Columns removed from both frames before feature engineering.
# The other trafficSource.* fields (isTrueDirect, keyword, medium, source)
# are deliberately not listed here.
to_drop = [
    'visitId',
    'visitNumber',
    'geoNetwork.metro',
    'geoNetwork.networkDomain',
    'trafficSource.adContent',
    'trafficSource.campaign',
    'trafficSource.referralPath',
]

In [167]:
# Remove the unwanted columns from the train and test frames in place.
for frame in (df_train, df_test):
    frame.drop(columns=to_drop, inplace=True)

In [168]:
from datetime import datetime

# This function is to extract date features
def date_process(df):
    """Add calendar features derived from ``date`` and ``visitStartTime``.

    The frame is modified in place and also returned, so both
    ``date_process(df)`` and ``df = date_process(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date`` (values parseable with ``%Y%m%d``) and
        ``visitStartTime`` (POSIX timestamp in seconds).

    Returns
    -------
    pandas.DataFrame
        The same frame with ``date`` converted to datetime and the new
        columns ``_weekday``, ``_day``, ``_month``, ``_year`` and
        ``_visitHour`` added.
    """
    df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")  # YYYYMMDD -> datetime64
    dates = df["date"].dt
    df["_weekday"] = dates.weekday  # Monday=0 ... Sunday=6
    df["_day"] = dates.day          # day of the month
    df["_month"] = dates.month      # month number
    df["_year"] = dates.year        # calendar year

    # Hour of the visit, computed vectorised and in UTC. The previous
    # `datetime.fromtimestamp` call used the local timezone of whichever machine
    # ran the notebook, so the feature changed from host to host.
    visit_start = pd.to_datetime(df["visitStartTime"], unit="s")
    df["_visitHour"] = visit_start.dt.hour.astype(int)

    return df

In [169]:
# Add the date-derived feature columns to both frames.
df_train = date_process(df_train)
df_test = date_process(df_test)

In [170]:
def NumericalColumns(df):
    """Fill missing values and fix the dtypes of the numeric/boolean columns.

    The frame is modified in place and also returned.

    Fill rules:
      * ``totals.pageviews`` -> 1
      * ``totals.newVisits``, ``totals.transactions``, ``totals.bounces`` -> 0
      * ``trafficSource.isTrueDirect`` -> False
      * ``trafficSource.adwordsClickInfo.isVideoAd`` -> True (only when the
        column exists; the training load does not read it)
      * ``totals.transactionRevenue`` -> 1.0 (not 0), so that a later
        ``np.log`` of the target gives 0 for visits without revenue

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``totals.*`` and ``trafficSource.*`` columns above.

    Returns
    -------
    pandas.DataFrame
        The same frame after the transformations.
    """
    # Results are assigned back to the column: `df[col].fillna(..., inplace=True)`
    # operates on a temporary and does nothing under pandas copy-on-write.
    fill_values = {
        'totals.pageviews': 1,
        'totals.newVisits': 0,
        'totals.transactions': 0,
        'totals.bounces': 0,
        'trafficSource.isTrueDirect': False,
    }
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

    # Optional column: absent from the training frame, so guard against KeyError.
    video_ad = 'trafficSource.adwordsClickInfo.isVideoAd'
    if video_ad in df.columns:
        df[video_ad] = df[video_ad].fillna(True)

    # Missing revenue becomes 1.0 (see docstring), then force a float dtype.
    df['totals.transactionRevenue'] = df['totals.transactionRevenue'].fillna(1.0).astype(float)

    # Dtype normalisation.
    df['totals.pageviews'] = df['totals.pageviews'].astype(int)
    df['totals.newVisits'] = df['totals.newVisits'].astype(int)
    df['totals.hits'] = df['totals.hits'].astype(float)
    df['totals.visits'] = df['totals.visits'].astype(int)

    return df

In [171]:
# Fill missing values and normalise numeric dtypes in both frames.
df_train = NumericalColumns(df_train)
df_test = NumericalColumns(df_test)


In [172]:
# Show the first five training rows transposed (one line per column, at most 40 columns).
df_train.head().T.head(40)

Unnamed: 0,0,1,2,3,4
channelGrouping,Organic Search,Referral,Direct,Organic Search,Organic Search
date,2017-10-16 00:00:00,2017-10-16 00:00:00,2017-10-16 00:00:00,2017-10-16 00:00:00,2017-10-16 00:00:00
fullVisitorId,3162355547410993243,8934116514970143966,7992466427990357681,9075655783635761930,6960673291025684308
visitStartTime,1.5082e+09,1.50818e+09,1.5082e+09,1.50817e+09,1.50819e+09
device.browser,Firefox,Chrome,Chrome,Chrome,Chrome
device.deviceCategory,desktop,desktop,mobile,desktop,desktop
device.isMobile,0,0,1,0,0
device.operatingSystem,Windows,Chrome OS,Android,Windows,Windows
geoNetwork.city,not available in demo dataset,Cupertino,not available in demo dataset,not available in demo dataset,not available in demo dataset
geoNetwork.continent,Europe,Americas,Americas,Asia,Americas


In [173]:
# Same transposed preview for the test frame.
df_test.head().T.head(40)

Unnamed: 0,0,1,2,3,4
channelGrouping,Organic Search,Direct,Organic Search,Direct,Organic Search
date,2018-05-11 00:00:00,2018-05-11 00:00:00,2018-05-11 00:00:00,2018-05-11 00:00:00,2018-05-11 00:00:00
fullVisitorId,7460955084541987166,460252456180441002,3461808543879602873,975129477712150630,8381672768065729990
visitStartTime,1.5261e+09,1.52606e+09,1.52607e+09,1.52611e+09,1.52606e+09
device.browser,Chrome,Chrome,Chrome,Chrome,Internet Explorer
device.deviceCategory,mobile,desktop,desktop,mobile,tablet
device.isMobile,1,0,0,1,1
device.operatingSystem,Android,Macintosh,Chrome OS,iOS,Windows
geoNetwork.city,(not set),San Francisco,not available in demo dataset,Houston,Irvine
geoNetwork.continent,Asia,Americas,Americas,Americas,Americas


In [174]:
# Features treated as categorical (each one gets an embedding in the model).
cat_vars = [
    'channelGrouping',
    # device
    'device.browser',
    'device.operatingSystem',
    # geography
    'geoNetwork.city',
    'geoNetwork.continent',
    'geoNetwork.country',
    'geoNetwork.region',
    'geoNetwork.subContinent',
    # calendar features created by date_process
    '_weekday',
    '_day',
    '_month',
    '_year',
    '_visitHour',
]

# Features treated as continuous.
# totals.timeOnSite, totals.transactions, totals.visits and fullVisitorId are
# deliberately left out of the model inputs.
contin_vars = [
    'totals.hits',
    'totals.newVisits',
    'totals.pageviews',
]

In [175]:
# Name of the target column.
dep = 'totals.transactionRevenue'

# Keep only the model inputs, the target and `date` (a later cell uses `date` as the index).
train_cols = cat_vars + contin_vars + [dep, 'date']
df_train = df_train[train_cols].copy()

In [176]:
# The test frame gets a placeholder target of 0.0 so it has the same columns as train.
df_test[dep] = 0.0

# Same column selection as the training frame, plus fullVisitorId for the submission.
test_cols = cat_vars + contin_vars + [dep, 'date', 'fullVisitorId']
df_test = df_test[test_cols].copy()

In [177]:
# Convert every categorical feature of the training frame to an ordered pandas category.
for v in cat_vars:
    as_category = df_train[v].astype('category')
    df_train[v] = as_category.cat.as_ordered()

In [178]:
# fastai helper called with the test frame first and the training frame second.
# NOTE(review): presumably re-encodes the test columns with the training categories — confirm against fastai.structured.
apply_cats(df_test, df_train)

In [179]:
# Continuous features: replace remaining NaNs with 0 and store as float32 in both frames.
for frame in (df_train, df_test):
    for v in contin_vars:
        filled = frame[v].fillna(0)
        frame[v] = filled.astype('float32')

In [180]:
# Use the whole training frame (no sub-sampling): samp_size is its row count.
samp_size = len(df_train)
# Move `date` from a column to the index.
df_samp = df_train.set_index("date")

In [181]:
# fastai pre-processing of the training frame with scaling enabled. `nas` and `mapper`
# are passed again when the test frame is processed so both frames are treated alike.
df, y, nas, mapper = proc_df(df_samp, 'totals.transactionRevenue', do_scale=True)
# Log-transformed target. Missing revenue was filled with 1.0 in NumericalColumns,
# so those rows become log(1) = 0.
yl = np.log(y)

In [182]:
# Index the test frame by `date`, mirroring the training frame.
df_test = df_test.set_index("date")

In [183]:
# Process the test frame with the NA dictionary and mapper obtained from the training
# frame; fullVisitorId is skipped. Note that `nas` and `mapper` are rebound to the
# values returned by this call.
df_test1, _, nas, mapper = proc_df(df_test, dep, do_scale=True,skip_flds=['fullVisitorId'],mapper=mapper, na_dict=nas)

In [184]:
# Fraction of rows used for training (0.9 was tried as an alternative).
train_ratio = 0.75
train_size = int(samp_size * train_ratio)

# Validation set: every row position after the first `train_size` rows.
val_idx = [*range(train_size, len(df))]

In [185]:
def inv_y(a):
    """Undo the log transform of the target (element-wise ``exp``)."""
    return np.exp(a)


def exp_rmspe(y_pred, targ):
    """Root-mean-square percentage error on the original (non-log) scale.

    Both arguments are given in log space; they are mapped back with
    ``inv_y`` before the relative error ``(target - prediction) / target``
    is squared, averaged and square-rooted.
    """
    actual = inv_y(targ)
    predicted = inv_y(y_pred)
    relative_error = (actual - predicted) / actual
    mean_squared = (relative_error ** 2).mean()
    return math.sqrt(mean_squared)

# Output range handed to the learner: from 0 up to 1.2 times the largest log target.
max_log_y = np.max(yl)
y_range = (0, max_log_y*1.2)

In [186]:
# fastai model data built from the processed training frame and the float32 log target,
# with `val_idx` as validation rows, batch size 128 and the processed test frame attached.
md = ColumnarModelData.from_data_frame(PATH, val_idx, df, yl.astype(np.float32), cat_flds=cat_vars, bs=128,
                                       test_df=df_test1)

In [187]:
# (column name, number of categories + 1) for every categorical feature.
# NOTE(review): the +1 presumably reserves a slot for missing/unknown values — confirm.
cat_sz = [(c, len(df_samp[c].cat.categories)+1) for c in cat_vars]

In [188]:
# Display the cardinalities.
cat_sz

[('channelGrouping', 9),
 ('device.browser', 120),
 ('device.operatingSystem', 25),
 ('geoNetwork.city', 941),
 ('geoNetwork.continent', 7),
 ('geoNetwork.country', 229),
 ('geoNetwork.region', 479),
 ('geoNetwork.subContinent', 24),
 ('_weekday', 8),
 ('_day', 32),
 ('_month', 13),
 ('_year', 4),
 ('_visitHour', 25)]

In [189]:
# Embedding shape per categorical feature: (cardinality, width), where the width is
# half the cardinality rounded up and capped at 50.
emb_szs = [
    (cardinality, min(50, (cardinality + 1) // 2))
    for _, cardinality in cat_sz
]

In [190]:
# Display the embedding shapes.
emb_szs

[(9, 5),
 (120, 50),
 (25, 13),
 (941, 50),
 (7, 4),
 (229, 50),
 (479, 50),
 (24, 12),
 (8, 4),
 (32, 16),
 (13, 7),
 (4, 2),
 (25, 13)]

In [191]:
# Learner used for the learning-rate search: number of continuous inputs, embedding
# dropout 0.04, a single output, hidden layers of 1000 and 500 units with dropouts
# 0.001 and 0.01 (matches the model printed below). No y_range is passed here.
m = md.get_learner(emb_szs,len(df.columns)-len(cat_vars),0.04,1,[1000,500],[0.001,0.01])

In [192]:
# Print the network architecture.
m.model

MixedInputModel(
  (embs): ModuleList(
    (0): Embedding(9, 5)
    (1): Embedding(120, 50)
    (2): Embedding(25, 13)
    (3): Embedding(941, 50)
    (4): Embedding(7, 4)
    (5): Embedding(229, 50)
    (6): Embedding(479, 50)
    (7): Embedding(24, 12)
    (8): Embedding(8, 4)
    (9): Embedding(32, 16)
    (10): Embedding(13, 7)
    (11): Embedding(4, 2)
    (12): Embedding(25, 13)
  )
  (lins): ModuleList(
    (0): Linear(in_features=279, out_features=1000, bias=True)
    (1): Linear(in_features=1000, out_features=500, bias=True)
  )
  (bns): ModuleList(
    (0): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True)
    (1): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True)
  )
  (outp): Linear(in_features=500, out_features=1, bias=True)
  (emb_drop): Dropout(p=0.04)
  (drops): ModuleList(
    (0): Dropout(p=0.001)
    (1): Dropout(p=0.01)
  )
  (bn): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True)
)

In [214]:
# Run the learning-rate finder. The `lr` set here is replaced by 1e-4 in a later
# cell before any call to `fit`.
lr=1e-3
m.lr_find()

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      3.088454   3.355305  


In [213]:
# Plot the learning-rate finder result.
# NOTE(review): the recorded AttributeError ('LossRecorder' object has no attribute
# 'plot') and the out-of-order execution counter suggest this cell was last run when
# `m.sched` was no longer the LR finder; run it directly after `m.lr_find()`.
m.sched.plot(100)

AttributeError: 'LossRecorder' object has no attribute 'plot'

In [195]:
# Rebuild the learner with the same architecture, now passing `y_range` as well.
m = md.get_learner(emb_szs, len(df.columns)-len(cat_vars),
                   0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)
# Learning rate used for training.
lr = 1e-4

In [196]:
# Train for 3 epochs, reporting exp_rmspe on the validation set after each one.
m.fit(lr, 3, metrics=[exp_rmspe])

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                                                                             
    0      2.523746   2.460339   4437.702657
    1      2.384632   2.423964   1552.556618                                                                           
    2      2.553737   2.415122   38538.491944                                                                          


[array([2.41512]), 38538.49194369111]

In [198]:
# Save the trained weights under the name 'model-workingv2'.
m.save('model-workingv2')

In [199]:
# Reload the weights saved above.
m.load('model-workingv2')

In [200]:
# Predictions and targets from the learner (both are fed to exp_rmspe in the next cell).
# Caution: this rebinds `y`, which previously held the target returned by proc_df.
x,y=m.predict_with_targs()

In [201]:
# Metric for the predictions obtained above.
exp_rmspe(x,y)

1359551.2874827487

In [202]:
# NOTE(review): `True` presumably selects the attached test set — confirm against the fastai API.
pred_test=m.predict(True)
# Predictions are in log space; map them back with exp.
pred_test = np.exp(pred_test)

In [203]:
# Store the predictions as the test target. The -1 turns the "no revenue" prediction
# exp(0) = 1 into 0, mirroring the 1.0 used to fill missing revenue before the log.
# NOTE(review): it also lowers every non-zero prediction by 1, because the target was
# trained as log(revenue) rather than log(revenue + 1).
df_test[dep]=pred_test-1

In [205]:
# Path of the per-visit prediction file.
csv_fn=f'{PATH}/tmp/subv2.csv'

In [206]:
# Write one row per visit (visitor id and predicted revenue), without the date index.
df_test[['fullVisitorId',dep]].to_csv(csv_fn, index=False)

In [207]:
# Re-read the per-visit predictions. fullVisitorId must stay text, exactly as in the
# train/test loads: if it is parsed as a number, ids lose their leading zeros and no
# longer match the ids of the original data.
df_try_unique = pd.read_csv(csv_fn, low_memory=False, dtype={'fullVisitorId': 'str'})

In [208]:
# `.size` is the number of cells (rows x columns), not the number of rows.
df_try_unique.size

803178

In [209]:
# Sum the per-visit predictions for each visitor.
df_try_unique1 = df_try_unique.groupby('fullVisitorId').sum()

In [210]:
# Turn the fullVisitorId group key back into a regular column.
df_try_unique1 = df_try_unique1.reset_index()

In [211]:
# Cell count (rows x columns) after aggregation; the groupby already leaves one row per visitor.
df_try_unique1.size

593060

In [212]:
# Write the per-visitor totals to the final submission file.
csv_fn1=f'{PATH}/tmp/submission_newv2.csv'
df_try_unique1[['fullVisitorId','totals.transactionRevenue']].to_csv(csv_fn1, index=False)