In [1]:
import json
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
from sklearn.preprocessing import LabelEncoder

In [2]:
def load_df(csv_path, nrows=None):
    """Read a CSV file and flatten its JSON-encoded columns into plain columns.

    The columns ``device``, ``geoNetwork``, ``totals`` and ``trafficSource``
    are parsed with ``json.loads`` while reading. Each one is then expanded
    into one column per JSON key, named ``<column>_<key>`` (nested keys are
    joined with ``.`` by ``json_normalize``), and the original JSON column is
    dropped.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int, optional
        Number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The flattened frame. Its shape is also printed.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    # ``pandas.io.json.json_normalize`` has been deprecated since pandas 1.0 in
    # favour of the top-level ``pd.json_normalize``. Prefer the top-level
    # function and only fall back to the old location on old pandas versions.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize
    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'},  # Important!! read the id as text, not as a number
                     nrows=nrows)
    for column in JSON_COLUMNS:
        # A plain list of dicts is the documented input of json_normalize and
        # yields a fresh 0..n-1 index that lines up with read_csv's index.
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}_{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df


def isgooglemc(day,num):
    """Return the entry of ``num`` labelled ``day``, or 0 when ``day`` is not in its index.

    ``num`` is a pandas Series whose index holds the known ``day`` labels.
    """
    return num[day] if day in num.index.values else 0

In [3]:
def cleandata(path,isdf):
    """Read a feather file and build the feature frame used by this notebook.

    Steps, in order: parse ``date`` and derive day-of-week / day-of-year,
    drop columns whose every value equals the first row's value, add boolean
    flags from the geoNetwork and trafficSource columns, drop rows without
    ``totals_pageviews``, cast the ``totals_*`` counters to float, add the
    per-day 'Google Merchandise Collection' count and a holiday-window flag,
    and finally drop the raw columns listed in ``droplist``.

    Parameters
    ----------
    path : str
        Location of the feather file to read.
    isdf : bool
        Truthy for the training file: uses the 2016 holiday window and drops
        ``trafficSource_campaignCode``. Falsy for the test file: uses the
        2017 holiday window.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.

    Raises
    ------
    KeyError
        If a column that is explicitly dropped is not present.
    """
    df = pd.read_feather(path)
    #Time
    df['date'] = pd.to_datetime(df.date, format='%Y%m%d')
    # Vectorised accessors instead of a Python lambda per row.
    # Note that 'week' holds the day of the week (Monday=0), not a week number.
    df['week'] = df.date.dt.dayofweek
    df['day'] = df.date.dt.dayofyear
    #drop repeated feature
    # A column is dropped when every value equals its first value. Columns
    # whose first value is missing never compare equal and are therefore kept.
    # NOTE(review): this is decided per file, so train and test could end up
    # with different column sets - confirm they match after cleaning.
    if len(df):
        constant_cols = [col for col in df.columns
                         if (df[col] == df[col].iloc[0]).all()]
        df.drop(constant_cols, axis=1, inplace=True)
    #geoNetwork
    df['isnotavailable'] = df.geoNetwork_city == 'not available in demo dataset'
    df['unknown'] = df.geoNetwork_networkDomain == 'unknown.unknown'
    df['notset'] = df.geoNetwork_networkDomain == '(not set)'
    df['notsetcity'] = df.geoNetwork_city == '(not set)'
    df['notsetmetro'] = df.geoNetwork_metro == '(not set)'
    df['isUS'] = df.geoNetwork_country == 'United States'
    #totals
    # NOTE(review): rows without totals_pageviews are removed for both files;
    # for the test file their fullVisitorId disappears from the returned frame
    # as well - confirm this is intended.
    df.dropna(axis=0, inplace=True, subset=['totals_pageviews'])
    df['totals_bounces'] = df.totals_bounces.fillna(0).astype('float')
    df['totals_newVisits'] = df.totals_newVisits.fillna(0).astype('float')
    df['totals_hits'] = df.totals_hits.astype('float')
    df['totals_pageviews'] = df.totals_pageviews.astype('float')
    #trafficSource
    df['havegclid'] = df['trafficSource_adwordsClickInfo.gclId'].notnull()
    df['istruedirect'] = (df.trafficSource_isTrueDirect.notnull()) & (df.channelGrouping != 'Direct')
    # source + referral path; the result is missing when the source is missing.
    df['path'] = df.trafficSource_source + df.trafficSource_referralPath.fillna('')
    df['isgoogleplex'] = df.path == 'mall.googleplex.com/'

    # For every row: how many rows on the same day-of-year carry the
    # 'Google Merchandise Collection' ad content (0 if none).
    adcontentidx = df.trafficSource_adContent == 'Google Merchandise Collection'
    num = df[adcontentidx].day.value_counts()
    df['keyadcontent'] = df.day.apply(isgooglemc, args=(num,))
    # Holiday window from late November up to 24 December of the file's year
    # (presumably the Thanksgiving-to-Christmas season - confirm).
    if isdf:
        df['holiidx'] = (df.date > '2016-11-24') & (df.date < '2016-12-25')
        df.drop(['trafficSource_campaignCode'], axis=1, inplace=True)
    else:
        df['holiidx'] = (df.date > '2017-11-23') & (df.date < '2017-12-25')
    #drop
    droplist = [
        'sessionId',
        'date',
        'device_isMobile',
        'visitStartTime',
        'geoNetwork_city',
        'geoNetwork_region',
        'geoNetwork_metro',
        'geoNetwork_country',
        'geoNetwork_networkDomain',
        'trafficSource_adContent',
        'trafficSource_adwordsClickInfo.gclId',
        'trafficSource_medium',
        'trafficSource_adwordsClickInfo.isVideoAd',
        'trafficSource_adwordsClickInfo.adNetworkType',
        'trafficSource_adwordsClickInfo.page',
        'trafficSource_adwordsClickInfo.slot',
        'trafficSource_keyword',
        'trafficSource_campaign',
        'trafficSource_source',
        'trafficSource_referralPath',
        'trafficSource_isTrueDirect',
        'path',
        'visitId',
    ]
    df.drop(droplist, axis=1, inplace=True)
    return df
    

In [4]:
# Training set: machine-specific location of the feather file, cleaned with
# the training-file settings of cleandata.
trainpath = '/home/leechh/桌面/tempfile/R/train.feather'
train = cleandata(trainpath, isdf=True)

  return feather.read_dataframe(path, nthreads=nthreads)


In [5]:
# Test set: same cleaning pipeline, with the test-file settings of cleandata.
testpath = '/home/leechh/桌面/tempfile/R/test.feather'
test = cleandata(testpath, isdf=False)

In [None]:
# Preview the first rows of the cleaned training frame.
train.head()

In [6]:
#device
# Keep the ten most frequent train categories of each device column and
# collapse every other value into 'other'. Selecting by "not in the top ten"
# (rather than "in train's rare list") also buckets categories that occur
# only in the test set, which would otherwise survive as separate levels.
devicelist = ['device_browser', 'device_operatingSystem']
for i in devicelist:
    counts = train[i].value_counts()
    deviceidx = counts.index[10:].values  # train categories outside the top ten
    keep = counts.index[:10]
    for frame in (train, test):
        # Missing values are left untouched; only real, non-top-ten values change.
        rare = frame[i].notnull() & ~frame[i].isin(keep)
        frame[i] = frame[i].mask(rare, 'other')

In [7]:
# Split the revenue column off as the target: pop() returns the column and
# removes it from train in one step.
target = train.pop('totals_transactionRevenue')

In [8]:
# Keep the test visitor ids aside, then remove the id column from both
# frames so it is not part of the feature matrices.
testid = test.pop('fullVisitorId')
del train['fullVisitorId']

In [9]:
# Columns that are turned into integer codes.
lelist = [
    'channelGrouping',
    'device_browser',
    'device_deviceCategory',
    'device_operatingSystem',
    'geoNetwork_continent',
    'geoNetwork_subContinent',
]

for col in lelist:
    # Fit one encoder per column on the train and test values together so
    # both frames share the same value -> code mapping.
    le = LabelEncoder()
    train_values = train[col].tolist()
    test_values = test[col].tolist()
    fitlabel = le.fit(train_values + test_values)
    train[col] = le.transform(train_values)
    test[col] = le.transform(test_values)

In [11]:
# Cast every remaining column of both frames to float.
train = train.astype(float)
test = test.astype(float)

In [13]:
# (rows, columns) of the final training feature frame.
train.shape

(903553, 24)

In [16]:
# Number of training rows that have a (non-null) transaction revenue.
int(target.notnull().sum())

11515

In [19]:
# Persist the feature matrices, the binary target (1.0 where a revenue value
# exists, else 0.0) and the test visitor ids as .npy files.
_npy_outputs = {
    '/home/leechh/tempfile/R/class2/x_train.npy': train.values,
    '/home/leechh/tempfile/R/class2/y_train.npy': target.notnull().astype('float').values,
    '/home/leechh/tempfile/R/class2/x_test.npy': test.values,
    '/home/leechh/tempfile/R/class2/Id.npy': testid.values,
}
for _npy_path, _npy_array in _npy_outputs.items():
    np.save(_npy_path, _npy_array)