In [198]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas FutureWarning/DeprecationWarning output
# raised by the cells below, so behaviour changes between pandas versions go unseen.
warnings.filterwarnings('ignore')

In [199]:
import numpy as np
import pandas as pd
import feather
import pickle

In [200]:
# Notebook display limits: show up to 200 columns and 60 rows of a frame.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 60)

In [201]:
# Load the cleaned training data from its feather file (the step that
# produces this file is not part of this notebook).
clean_train_df = feather.read_dataframe('data/clean_train_v2.feather')

In [202]:
# Load the cleaned test data from its feather file (the step that
# produces this file is not part of this notebook).
clean_test_df = feather.read_dataframe('data/clean_test_v2.feather')

In [203]:
#im_df = clean_train_df.loc[(clean_train_df['fullVisitorId'] == '8934116514970143966')| (clean_train_df['fullVisitorId'] =='9075655783635761930')]

In [204]:
# Per-visitor maximum and minimum of the session date, start time and visit number.
# Group with the default as_index=True: combining as_index=False with a list of
# aggregations is handled differently across pandas releases (when the flag is
# honoured the key is already a column, so reset_index() adds a stray 'index'
# column and a fixed-length rename fails).
date_train = clean_train_df.groupby('fullVisitorId')[['date', 'visitStartTime', 'visitNumber']].agg(['max', 'min'])
# Flatten the (column, aggregation) pairs into names such as 'date_max' and
# 'visitNumber_min' instead of hard-coding the list, so names cannot drift out
# of step with the aggregation order.
date_train.columns = ['_'.join(pair) for pair in date_train.columns]
# Turn the visitor key back into an ordinary 'fullVisitorId' column.
date_train = date_train.reset_index()
# Whole days between the largest and smallest session date of each visitor.
date_train['interval_dates'] = (date_train['date_max'] - date_train['date_min']).dt.days

In [205]:
# Session-level categorical attributes, kept together with the visitor id and
# the session start time (the two columns later used as merge keys).
train_cat_cols = [
    'fullVisitorId', 'visitStartTime',
    'channelGrouping',
    'device.browser', 'device.operatingSystem', 'device.isMobile', 'device.deviceCategory',
    'geoNetwork.continent', 'geoNetwork.subContinent', 'geoNetwork.country',
    'geoNetwork.region', 'geoNetwork.metro', 'geoNetwork.city', 'geoNetwork.networkDomain',
    'trafficSource.source', 'trafficSource.medium', 'trafficSource.isTrueDirect',
    'trafficSource.adwordsClickInfo.isVideoAd',
]
train_cats = clean_train_df[train_cat_cols]

In [206]:
# Per-visitor sum/mean/min/max/std of the numeric session totals.
# Group with the default as_index=True: combining as_index=False with a list of
# aggregations is handled differently across pandas releases, which changes
# whether 'fullVisitorId' ends up in the index or as a ('fullVisitorId', '')
# column and breaks the flattening below.
train_num = clean_train_df.groupby('fullVisitorId')[['totals.hits', 'totals.pageviews', 'totals.sessionQualityDim', 'totals.timeOnSite']].agg(['sum', 'mean', 'min', 'max', 'std'])
# Flatten (column, aggregation) pairs into names such as 'totals.hits_sum'.
# Iterating the MultiIndex yields the tuples directly; no .ravel() is needed.
train_num.columns = ['_'.join(pair) for pair in train_num.columns]
# Turn the visitor key back into an ordinary 'fullVisitorId' column.
train_num = train_num.reset_index()
# Replace missing aggregates with 0.0 (for example the std of a single session is NaN).
train_num = train_num.fillna(0.0)

In [207]:
# Per-visitor totals of transaction count and transaction revenue; the visitor
# id stays an ordinary column because of as_index=False.
train_trans = (
    clean_train_df
    .groupby('fullVisitorId', as_index = False)
    [['totals.transactions', 'totals.transactionRevenue']]
    .sum()
)

In [208]:
#train_cats.drop(columns = 'fullVisitorId')

In [209]:
# Combine the three per-visitor aggregates into one frame.
# Join on the visitor key instead of concatenating side by side: a positional
# concat pairs rows purely by index position, so it would silently mix up
# visitors if the three frames ever differed in row order or length.
# validate='one_to_one' makes a duplicated key raise instead of multiplying rows.
# Column order is unchanged: date columns, numeric aggregates, transaction totals.
grp_train_im = (
    date_train
    .merge(train_num, on = 'fullVisitorId', how = 'inner', validate = 'one_to_one')
    .merge(train_trans, on = 'fullVisitorId', how = 'inner', validate = 'one_to_one')
)

In [210]:
# Attach the categorical attributes of the session whose start time equals the
# visitor's visitStartTime_max; the inner join keeps only matching rows.
grp_train = pd.merge(
    grp_train_im,
    train_cats,
    how = 'inner',
    left_on = ['fullVisitorId', 'visitStartTime_max'],
    right_on = ['fullVisitorId', 'visitStartTime'],
)

In [211]:
# Sanity check: the merged frame has exactly one row per distinct visitor id.
len(pd.unique(clean_train_df['fullVisitorId'])) == grp_train.shape[0]

True

In [212]:
# --- Per-visitor date/visit extremes -------------------------------------
# Group with the default as_index=True: combining as_index=False with a list of
# aggregations is handled differently across pandas releases (when the flag is
# honoured the key is already a column, so reset_index() adds a stray 'index'
# column and a fixed-length rename fails).
date_test = clean_test_df.groupby('fullVisitorId')[['date', 'visitStartTime', 'visitNumber']].agg(['max', 'min'])
# Flatten (column, aggregation) pairs into names such as 'date_max'.
date_test.columns = ['_'.join(pair) for pair in date_test.columns]
date_test = date_test.reset_index()
# Whole days between the largest and smallest session date of each visitor.
date_test['interval_dates'] = (date_test['date_max'] - date_test['date_min']).dt.days

# --- Session-level categorical attributes --------------------------------
# Kept together with the visitor id and session start time, which are the
# merge keys used at the end of this cell.
test_cats = clean_test_df[['fullVisitorId','visitStartTime','channelGrouping', 'device.browser','device.operatingSystem'
                                                  ,'device.isMobile', 'device.deviceCategory' ,'geoNetwork.continent','geoNetwork.subContinent'
                                                  , 'geoNetwork.country' ,'geoNetwork.region', 'geoNetwork.metro' ,'geoNetwork.city'
                                                  ,'geoNetwork.networkDomain', 'trafficSource.source',  'trafficSource.medium', 'trafficSource.isTrueDirect'
                                                  ,'trafficSource.adwordsClickInfo.isVideoAd']]

# --- Per-visitor numeric aggregates --------------------------------------
test_num = clean_test_df.groupby('fullVisitorId')[['totals.hits', 'totals.pageviews', 'totals.sessionQualityDim', 'totals.timeOnSite']].agg(['sum', 'mean', 'min', 'max', 'std'])
# Flatten into names such as 'totals.hits_sum'; iterating the MultiIndex
# yields the tuples directly, so no .ravel() is needed.
test_num.columns = ['_'.join(pair) for pair in test_num.columns]
test_num = test_num.reset_index()
# Replace missing aggregates with 0.0 (for example the std of a single session is NaN).
test_num = test_num.fillna(0.0)

# --- Per-visitor transaction totals --------------------------------------
test_trans = clean_test_df.groupby('fullVisitorId', as_index = False)[['totals.transactions', 'totals.transactionRevenue']].sum()

# --- Combine -------------------------------------------------------------
# Join on the visitor key instead of concatenating side by side: a positional
# concat pairs rows purely by index position and would silently mix up
# visitors if the frames ever differed in row order or length.
# validate='one_to_one' makes a duplicated key raise instead of multiplying rows.
grp_test_im = (
    date_test
    .merge(test_num, on = 'fullVisitorId', how = 'inner', validate = 'one_to_one')
    .merge(test_trans, on = 'fullVisitorId', how = 'inner', validate = 'one_to_one')
)
# Attach the categorical attributes of the session whose start time equals the
# visitor's visitStartTime_max.
grp_test = grp_test_im.merge(test_cats, how = 'inner',left_on = ['fullVisitorId', 'visitStartTime_max'], right_on = ['fullVisitorId', 'visitStartTime'])

In [213]:
# Sanity check: the merged frame has exactly one row per distinct visitor id.
len(pd.unique(clean_test_df['fullVisitorId'])) == grp_test.shape[0]

True

In [214]:
# Save the merged per-visitor frames as feather files. This happens before the
# cells below rewrite 'device.browser' / 'device.operatingSystem', so the files
# hold the original category values.
feather.write_dataframe(grp_train, 'data/grp_train.feather')
feather.write_dataframe(grp_test, 'data/grp_test.feather')

In [215]:
# Print "<column> <number of distinct values>" for every object-dtype column.
for column, n_unique in grp_train.select_dtypes(include = 'object').nunique().items():
    print(column + ' ' + str(n_unique))

fullVisitorId 1323730
channelGrouping 8
device.browser 129
device.operatingSystem 24
device.deviceCategory 3
geoNetwork.continent 6
geoNetwork.subContinent 23
geoNetwork.country 228
geoNetwork.region 483
geoNetwork.metro 123
geoNetwork.city 956
geoNetwork.networkDomain 41157
trafficSource.source 334
trafficSource.medium 7


In [216]:
# Countries ranked after the ten most frequent ones in grp_train, where the
# frequency is the non-null 'date_min' count per country.
# NOTE(review): no visible cell uses country_not_top10; 'geoNetwork.country'
# is one-hot encoded below without being collapsed the way browser and
# operating system are. Confirm whether a replace step is missing.
country_not_top10 = (
    grp_train
    .groupby('geoNetwork.country')
    .count()
    .sort_values('date_min', ascending = False)
    .index[10:]
)

In [217]:
# Browsers ranked after the ten most frequent ones in grp_train (frequency =
# non-null 'date_min' count per browser).
device_not_top10 = grp_train.groupby('device.browser').count().sort_values('date_min', ascending = False).index[10:]
# Collapse those rare browsers into 'Other'.
# Assign the result back to the column rather than calling
# replace(..., inplace=True) on grp_train['device.browser']: that chained call
# mutates a temporary Series and, with pandas Copy-on-Write, leaves grp_train
# unchanged (the accompanying warning is hidden by the global warning filter).
# NOTE(review): re-running this cell ranks 'Other' as a category of its own,
# so run it once per fresh grp_train.
grp_train['device.browser'] = grp_train['device.browser'].replace(device_not_top10, 'Other')

In [218]:
# Operating systems ranked after the ten most frequent ones in grp_train
# (frequency = non-null 'date_min' count per operating system).
os_not_top10 = grp_train.groupby('device.operatingSystem').count().sort_values('date_min', ascending = False).index[10:]
# Collapse those rare operating systems into 'Other'.
# Assign the result back to the column rather than calling
# replace(..., inplace=True) on grp_train['device.operatingSystem']: that
# chained call mutates a temporary Series and, with pandas Copy-on-Write,
# leaves grp_train unchanged (the accompanying warning is hidden by the global
# warning filter).
# NOTE(review): re-running this cell ranks 'Other' as a category of its own,
# so run it once per fresh grp_train.
grp_train['device.operatingSystem'] = grp_train['device.operatingSystem'].replace(os_not_top10, 'Other')

In [249]:
# One-hot encode the selected categorical columns and append the indicator
# columns to the right of the existing grp_train columns.
one_hot_cols = [
    'channelGrouping', 'device.browser', 'device.operatingSystem',
    'geoNetwork.country', 'geoNetwork.subContinent', 'trafficSource.medium',
]
encode_grp_train = pd.concat(
    [grp_train, pd.get_dummies(grp_train[one_hot_cols])],
    axis = 1,
)

In [250]:
# Every object-dtype column of grp_train except the visitor id; these are the
# columns removed from the encoded frames.
object_cols = grp_train.select_dtypes(include = 'object').columns.tolist()
cat_cols_to_drop = [name for name in object_cols if name != 'fullVisitorId']

In [251]:
# Remove the object-dtype columns from the encoded training frame.
encode_grp_train = encode_grp_train.drop(columns = cat_cols_to_drop)

In [252]:
# Browser values present in the training frame, excluding the 'Other' bucket.
device_train_values = list(
    filter(lambda browser: browser != 'Other', grp_train['device.browser'].unique())
)
# Any test browser outside that list is mapped to 'Other'.
browser_seen_in_train = grp_test['device.browser'].isin(device_train_values)
grp_test['device.browser'] = np.where(browser_seen_in_train, grp_test['device.browser'], 'Other')

In [253]:
# Operating-system values present in the training frame, excluding the 'Other' bucket.
os_train_values = list(
    filter(lambda os_name: os_name != 'Other', grp_train['device.operatingSystem'].unique())
)
# Any test operating system outside that list is mapped to 'Other'.
os_seen_in_train = grp_test['device.operatingSystem'].isin(os_train_values)
grp_test['device.operatingSystem'] = np.where(os_seen_in_train, grp_test['device.operatingSystem'], 'Other')

In [254]:
# One-hot encode the same categorical columns for the test frame and append
# the indicator columns to the right of the existing grp_test columns.
one_hot_cols = [
    'channelGrouping', 'device.browser', 'device.operatingSystem',
    'geoNetwork.country', 'geoNetwork.subContinent', 'trafficSource.medium',
]
encode_grp_test = pd.concat(
    [grp_test, pd.get_dummies(grp_test[one_hot_cols])],
    axis = 1,
)

In [256]:
# Columns that exist in the encoded test frame but not in the encoded training frame.
encode_grp_test.columns[~encode_grp_test.columns.isin(encode_grp_train.columns)].tolist()

['channelGrouping',
 'device.browser',
 'device.operatingSystem',
 'device.deviceCategory',
 'geoNetwork.continent',
 'geoNetwork.subContinent',
 'geoNetwork.country',
 'geoNetwork.region',
 'geoNetwork.metro',
 'geoNetwork.city',
 'geoNetwork.networkDomain',
 'trafficSource.source',
 'trafficSource.medium',
 'geoNetwork.country_Palau']

In [257]:
# Remove whichever of the object-dtype columns are present in the encoded test
# frame; names that are absent are skipped.
encode_grp_test = encode_grp_test.drop(columns = cat_cols_to_drop, errors = 'ignore')

In [258]:
# Drop test columns that have no counterpart in the encoded training frame.
encode_grp_test = encode_grp_test.drop(
    columns = [name for name in encode_grp_test.columns if name not in encode_grp_train.columns]
)

In [267]:
# Drop training columns that have no counterpart in the encoded test frame.
encode_grp_train = encode_grp_train.drop(
    columns = [name for name in encode_grp_train.columns if name not in encode_grp_test.columns]
)

In [259]:
# Names of the training columns that contain at least one missing value.
[name for name, has_missing in encode_grp_train.isna().any().items() if has_missing]

[]

In [239]:
# Names of the test columns that contain at least one missing value.
[name for name, has_missing in encode_grp_test.isna().any().items() if has_missing]

[]

In [268]:
def date_processing(df):
    """Add calendar features derived from the min/max date and start-time columns.

    For each prefix in ``("min", "max")``:

    * ``date_<prefix>`` is parsed with ``pd.to_datetime(..., format='%Y%m%d')``
      and written back over the original column;
    * ``<prefix>_weekday``, ``<prefix>_day``, ``<prefix>_month`` and
      ``<prefix>_year`` are taken from that parsed date;
    * ``<prefix>_visitHour`` is the hour of ``visitStartTime_<prefix>``
      interpreted as seconds since the epoch (no timezone conversion is applied).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_min``, ``date_max``, ``visitStartTime_min`` and
        ``visitStartTime_max``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    for prefix in ("min", "max"):
        date_col = "date_" + prefix
        df[date_col] = pd.to_datetime(df[date_col], format = '%Y%m%d')

        # Calendar parts are added in a fixed order so the column layout is stable.
        for part in ("weekday", "day", "month", "year"):
            df[prefix + "_" + part] = getattr(df[date_col].dt, part)

        start_time = pd.to_datetime(df["visitStartTime_" + prefix], unit = "s")
        df[prefix + "_visitHour"] = start_time.dt.hour
    return df

In [269]:
# Add the min/max calendar and visit-hour columns to the encoded training frame
# (date_processing modifies the frame it is given and returns it).
encode_grp_train = date_processing(encode_grp_train)

In [270]:
# Add the min/max calendar and visit-hour columns to the encoded test frame
# (date_processing modifies the frame it is given and returns it).
encode_grp_test = date_processing(encode_grp_test)

In [271]:
# Save the encoded training frame as a feather file.
feather.write_dataframe(encode_grp_train, 'data/encode_grp_train.feather')

In [272]:
# Save the encoded test frame as a feather file.
feather.write_dataframe(encode_grp_test, 'data/encode_grp_test.feather')