In [13]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import feather
import json
import pickle

In [2]:
# Let pandas display up to 60 columns before it starts eliding them.
pd.set_option('display.max_columns', 60)

In [165]:
# Scratch look at the raw training file: only the first 1000 rows are read.
test = pd.read_csv(filepath_or_buffer='data/train_v2.csv', nrows=1000)

In [168]:
# Show the raw `totals` cell of the first row (index label 0).
test.loc[0, 'totals']

'{"visits": "1", "hits": "1", "pageviews": "1", "bounces": "1", "newVisits": "1", "sessionQualityDim": "1"}'

# Reading and Cleaning Data

## Dealing with JSON columns

In [3]:
def convert_json(df):
    """Flatten the four JSON columns of ``df`` into ``<column>.<key>`` columns.

    The columns ``device``, ``geoNetwork``, ``totals`` and ``trafficSource``
    are each expanded with ``json_normalize``; nested keys are joined with
    ``.`` and every new column is prefixed with the name of the column it
    came from. The original JSON columns are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four JSON columns. Each cell may be an
        already-parsed dict or a JSON string (strings are parsed here).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and the same index as ``df``.
    """
    # pandas >= 1.0 exposes the flattener as ``pd.json_normalize``; older
    # versions only have the name imported at the top of the notebook.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        normalize = json_normalize

    json_columns = ['device', 'geoNetwork', 'totals', 'trafficSource']
    for column in json_columns:
        records = [json.loads(cell) if isinstance(cell, str) else cell
                   for cell in df[column].tolist()]
        column_as_df = normalize(records)
        column_as_df.columns = [column + "." + subcolumn for subcolumn in column_as_df.columns]
        # The flattened frame is numbered 0..n-1. Re-label it with df's own
        # index so rows pair up positionally even when df does not carry a
        # default index (an index-based merge would otherwise drop or
        # mismatch rows).
        column_as_df.index = df.index
        df = pd.concat([df.drop(column, axis=1), column_as_df], axis=1)
    return df

## Date and Time Columns

In [4]:
def date_processing(df):
    """Parse the visit date and add calendar / hour-of-day feature columns.

    ``date`` is parsed with the ``%Y%m%d`` format and overwritten with the
    resulting datetimes. ``_weekday``, ``_day``, ``_month`` and ``_year`` are
    taken from that date; ``_visitHour`` is the hour of ``visitStartTime``
    interpreted as seconds since the epoch (no timezone conversion is done).

    The frame is modified in place and also returned.
    """
    visit_date = pd.to_datetime(df["date"], format = '%Y%m%d')
    df["date"] = visit_date

    # (new column, datetime accessor attribute) pairs, in output order.
    calendar_parts = (
        ("_weekday", "weekday"),
        ("_day", "day"),
        ("_month", "month"),
        ("_year", "year"),
    )
    for new_column, part in calendar_parts:
        df[new_column] = getattr(visit_date.dt, part)

    visit_start = pd.to_datetime(df["visitStartTime"], unit = "s")
    df["_visitHour"] = visit_start.dt.hour
    return df

## Data Type Conversions

In [181]:
def _fill_and_cast(df, column, default, dtype=None):
    """Fill missing values of ``df[column]`` with ``default``, optionally casting.

    If the column is absent altogether (the flattened JSON of a chunk never
    contained that key) it is created and set to ``default`` for every row.
    When ``dtype`` is given the values are converted to numbers first, so
    numeric strings such as ``"3"`` are accepted. Works in place on ``df``.
    """
    if column not in df.columns:
        df[column] = default
    elif dtype is None:
        df[column] = df[column].fillna(default)
    else:
        df[column] = pd.to_numeric(df[column]).fillna(default)
    if dtype is not None:
        df[column] = df[column].astype(dtype)


def df_numeric_bool_fillna(df, is_test_set = False):
    """Fill missing values and convert the numeric ``totals.*`` columns.

    Values are assigned back to the frame instead of calling
    ``df[col].fillna(..., inplace=True)``: that chained in-place form acts on
    a temporary object and does not update ``df`` when pandas Copy-on-Write
    is active.

    Defaults applied to missing values:

    * ``totals.pageviews`` -> 1 (int)
    * ``totals.bounces``, ``totals.newVisits`` -> 0 (int)
    * ``trafficSource.isTrueDirect`` -> False
    * ``trafficSource.adwordsClickInfo.isVideoAd`` -> True
    * ``totals.transactionRevenue`` -> 0.0 (float)
    * only when ``is_test_set`` is true:
      ``totals.totalTransactionRevenue`` -> 0.0 (float) and
      ``totals.transactions`` -> 0 (int)

    ``totals.visits`` and ``totals.hits`` are only cast to int; they must be
    present and free of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with flattened JSON columns. Modified in place.
    is_test_set : bool, default False
        Also clean the two extra transaction columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for column in ("totals.visits", "totals.hits"):
        df[column] = pd.to_numeric(df[column]).astype(int)

    _fill_and_cast(df, "totals.pageviews", 1, int)
    _fill_and_cast(df, "totals.bounces", 0, int)
    _fill_and_cast(df, "totals.newVisits", 0, int)
    _fill_and_cast(df, "trafficSource.isTrueDirect", False)
    _fill_and_cast(df, "trafficSource.adwordsClickInfo.isVideoAd", True)
    _fill_and_cast(df, "totals.transactionRevenue", 0.0, float)

    if is_test_set:
        _fill_and_cast(df, "totals.totalTransactionRevenue", 0.0, float)
        _fill_and_cast(df, "totals.transactions", 0, int)
    return df

## Normalisation

Only the transaction revenue probably needs to be log-normalised, but the step is wrapped in a function in case other columns need the same treatment later.

In [91]:
def df_normalise(df, is_test_set = False):
    """Log-transform ``totals.transactionRevenue`` with ``log1p``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a numeric ``totals.transactionRevenue`` column.
        Modified in place unless ``is_test_set`` is true.
    is_test_set : bool, default False
        When true the frame is returned untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    if is_test_set:
        return df
    # One vectorised call over the whole column instead of a Python-level
    # lambda per row; the float cast keeps object-typed columns working.
    df["totals.transactionRevenue"] = np.log1p(df["totals.transactionRevenue"].astype(float))
    return df

## Dropping constant, mostly-null, `hits` and `customDimensions` columns

In [130]:
def df_remove_cols(df, cols_to_drop = None):
    """Drop uninformative columns from ``df`` in place.

    When ``cols_to_drop`` is ``None`` or empty, the list is computed from
    ``df`` itself:

    * columns with exactly one distinct non-null value
      (``totals.visits`` is exempt),
    * columns where more than half of the values are null,
    * ``hits`` and ``customDimensions``.

    When a non-empty list is supplied, it is applied as is. In both cases
    only the listed columns that actually exist in ``df`` are dropped, so a
    frame lacking some of them does not raise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. Modified in place.
    cols_to_drop : list of str, optional
        Previously computed list to reuse.

    Returns
    -------
    (pandas.DataFrame, list)
        The same ``df`` object and the list of columns to drop. A computed
        list has no duplicates and always contains ``hits`` and
        ``customDimensions`` so it can be reused on frames that have them.
    """
    if cols_to_drop is None or len(cols_to_drop) == 0:
        constant_cols = [col for col in df.columns
                         if col != "totals.visits" and df[col].nunique() == 1]
        null_fraction = df.isnull().mean()
        null_cols = [col for col in df.columns if null_fraction[col] > 0.5]
        # A column can be both constant and mostly null; keep each name once,
        # in first-seen order.
        cols_to_drop = list(dict.fromkeys(constant_cols + null_cols + ['hits', 'customDimensions']))

    wanted = set(cols_to_drop)
    present = [col for col in df.columns if col in wanted]
    df.drop(present, axis = 1, inplace = True)
    return df, cols_to_drop

## Flagging visitor ids as spenders

In [8]:
def df_flag_spender(df):
    """Attach per-visitor revenue totals and a spender flag to every row.

    ``totals.transactionRevenue`` is summed per ``fullVisitorId`` using the
    values as they currently stand in ``df``. The sum is stored in
    ``totalUserRev`` and ``spender`` is True where that sum is greater than
    zero. Both columns are merged back onto ``df`` by ``fullVisitorId``.

    Returns a new frame; ``df`` itself is not modified.
    """
    per_user = df.groupby('fullVisitorId', as_index = False)['totals.transactionRevenue'].sum()
    per_user = per_user.rename(columns = {'totals.transactionRevenue': 'totalUserRev'})
    per_user['spender'] = per_user['totalUserRev'].gt(0.0)
    return df.merge(per_user, on = 'fullVisitorId')

In [9]:
def output_df(df, output_file_name):
    """Write ``df`` to a feather file plus a 10% sample next to it.

    The sample is drawn with ``random_state = 1`` and written to the same
    path with ``_sample`` inserted before the file extension, e.g.
    ``data/clean.feather`` -> ``data/clean_sample.feather``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    output_file_name : str
        Destination path of the full file.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    # splitext only splits at the final extension, so paths such as
    # './data/x.feather', names containing several dots, and names without
    # any extension all yield a sensible sample path.
    stem, extension = os.path.splitext(output_file_name)
    sample_file_name = stem + "_sample" + extension

    feather.write_dataframe(df, output_file_name)
    feather.write_dataframe(df.sample(frac=0.1, random_state = 1), sample_file_name)

## Combining PreProcessing Steps

In [112]:
def process(df, is_test_set, cols_to_drop = []):
    """Run every preprocessing stage on one raw chunk.

    Stages, in order: ``convert_json``, ``date_processing``,
    ``df_numeric_bool_fillna``, ``df_normalise``, ``df_remove_cols``.

    Returns the ``(df, cols_to_drop)`` pair produced by ``df_remove_cols``.
    """
    flattened = convert_json(df)
    dated = date_processing(flattened)
    typed = df_numeric_bool_fillna(dated, is_test_set)
    scaled = df_normalise(typed, is_test_set)
    pruned, cols_to_drop = df_remove_cols(scaled, cols_to_drop)
    return pruned, cols_to_drop

In [149]:
def df_load(raw_file_name, output_file_name, chunksize, is_test_set = False, cols_to_drop = [], cols_to_drop_file = 'data/cols_to_drop.pickle'):
    """Read a raw CSV in chunks, preprocess it, and write the result to feather.

    Each chunk goes through ``process``. The ``cols_to_drop`` list returned
    for one chunk is passed on to the next, so once a non-empty list exists
    the same columns are removed from every later chunk. The processed
    chunks are concatenated once at the end.

    For a training set (``is_test_set`` false) the combined frame is passed
    through ``df_flag_spender`` and the final ``cols_to_drop`` list is
    pickled to ``cols_to_drop_file``. In both cases the frame is written
    with ``output_df`` and returned.

    Parameters
    ----------
    raw_file_name : str
        Path of the CSV to read.
    output_file_name : str
        Path of the feather file to write.
    chunksize : int
        Number of rows per chunk.
    is_test_set : bool, default False
        Skip the spender flag and the pickle of ``cols_to_drop``.
    cols_to_drop : list of str
        Columns to remove; when empty, ``process`` derives the list from the
        first chunk. The default list is never mutated here, only rebound.
    cols_to_drop_file : str
        Where the column list is pickled for a training set.

    Returns
    -------
    pandas.DataFrame
        The fully processed frame.
    """
    json_columns = ['device', 'geoNetwork', 'totals', 'trafficSource']
    df_reader = pd.read_csv(
        raw_file_name,
        converters = {column: json.loads for column in json_columns},
        dtype = {'fullVisitorId': 'str'},
        chunksize = chunksize,
    )
    # A chunksize of 100,000 is known to work. With smaller chunks some JSON
    # keys may not occur in a chunk at all, so the matching flattened columns
    # are missing from that chunk.

    processed_chunks = []
    rows_loaded = 0
    for chunk_id, df in enumerate(df_reader):
        df.reset_index(drop=True, inplace=True)
        df, cols_to_drop = process(df, is_test_set, cols_to_drop)

        # Collect and concatenate once after the loop; concatenating onto a
        # growing frame re-copies all earlier rows for every chunk.
        processed_chunks.append(df)
        rows_loaded += df.shape[0]

        if chunk_id % 5 == 0:
            print('{}: rows loaded: {}'.format(chunk_id, rows_loaded))

    if processed_chunks:
        df_proc = pd.concat(processed_chunks, axis = 0, sort=False).reset_index(drop=True)
    else:
        # No chunks were read; pd.concat would raise on an empty list.
        df_proc = pd.DataFrame()
    del processed_chunks

    if not is_test_set:
        df_proc = df_flag_spender(df_proc)

    output_df(df_proc, output_file_name)

    if not is_test_set:
        # Context manager so the handle is flushed and closed.
        with open(cols_to_drop_file, 'wb') as handle:
            pickle.dump(cols_to_drop, handle)
    return df_proc

In [12]:
# Build the cleaned training frame (also writes the feather files and the dropped-column list).
clean_train_df = df_load(
    raw_file_name = 'data/train_v2.csv',
    output_file_name = 'data/clean_train_v2.feather',
    chunksize = 100000,
)

0: rows loaded: 100000
5: rows loaded: 600000
10: rows loaded: 1100000
15: rows loaded: 1600000


In [182]:
# Reuse the column list chosen while processing the training set. df_load
# pickles it to 'data/cols_to_drop.pickle', so read that same file back.
# NOTE: pickle.load executes arbitrary code; only load files this notebook wrote.
with open('data/cols_to_drop.pickle', 'rb') as cols_file:
    cols_to_drop = pickle.load(cols_file)
clean_test_df = df_load('data/test_v2.csv', 'data/clean_test_v2.feather', chunksize = 100000, is_test_set = True, cols_to_drop = cols_to_drop)

0: rows loaded: 100000


Note: the chunksize is deliberately large, which is a bit of a cheat to avoid the problem of the JSON columns not containing every key in each chunk.
A key that is missing from the JSON produces no column at all rather than a column of null values, so this will possibly need to be revisited and handled properly.