In [1]:
import json
import os

import feather
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

In [2]:
# Show up to 60 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 60)

# Reading and Cleaning Data

## Dealing with JSON columns

In [3]:
def read_csv_with_json(path_to_csv, json_columns = ('device', 'geoNetwork', 'totals', 'trafficSource')):
    """Read a CSV file and expand its JSON-encoded columns into flat columns.

    Every column listed in ``json_columns`` is decoded with ``json.loads``
    while the file is read, flattened with pandas' ``json_normalize`` and
    replaced by the resulting columns, each renamed to
    ``"<column>.<normalised name>"``. ``fullVisitorId`` is read as a string.

    Parameters
    ----------
    path_to_csv : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    json_columns : iterable of str, optional
        Names of the columns holding JSON text. Defaults to the four columns
        this notebook has always expanded.

    Returns
    -------
    pandas.DataFrame
        The non-JSON columns in their original order, followed by the
        flattened columns of each JSON column in ``json_columns`` order.
    """
    json_columns = list(json_columns)

    # json_normalize is exposed as pd.json_normalize from pandas 1.0 onwards;
    # fall back to the legacy location only when the top-level name is absent.
    normalize = getattr(pd, "json_normalize", None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    df = pd.read_csv(
        path_to_csv,
        converters = {column: json.loads for column in json_columns},
        dtype = {'fullVisitorId': 'str'},
    )

    flattened = []
    for column in json_columns:
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [column + "." + subcolumn for subcolumn in column_as_df.columns]
        # Pin the row labels so the column-wise concat below aligns row for row.
        column_as_df.index = df.index
        flattened.append(column_as_df)

    # One concat instead of re-merging the whole frame once per JSON column.
    return pd.concat([df.drop(json_columns, axis=1)] + flattened, axis=1)

## Date and Time Columns

In [4]:
def date_processing(df):
    """Parse the visit date and add calendar / hour-of-visit feature columns.

    The ``date`` column is parsed with the ``%Y%m%d`` format and overwritten
    with the resulting datetimes. ``_weekday``, ``_day``, ``_month`` and
    ``_year`` are taken from that parsed date; ``_visitHour`` is the hour of
    ``visitStartTime`` interpreted as seconds since the Unix epoch.

    The frame is modified in place and also returned.
    """
    df["date"] = pd.to_datetime(df["date"], format = '%Y%m%d')

    visit_date = df["date"].dt
    calendar_parts = (
        ("_weekday", "weekday"),
        ("_day", "day"),
        ("_month", "month"),
        ("_year", "year"),
    )
    for feature_name, date_attribute in calendar_parts:
        df[feature_name] = getattr(visit_date, date_attribute)

    session_start = pd.to_datetime(df["visitStartTime"], unit = "s")
    df["_visitHour"] = session_start.dt.hour
    return df

## Data Type Conversions

In [5]:
def df_numeric_bool_fillna(df, is_test_set = False):
    """Fill missing values and convert the ``totals.*`` counters to numbers.

    * ``totals.visits`` and ``totals.hits`` are converted to ``int``.
    * ``totals.pageviews`` (missing -> 1), ``totals.bounces`` (missing -> 0)
      and ``totals.newVisits`` (missing -> 0) are filled, then converted to
      ``int``.
    * ``trafficSource.isTrueDirect`` (missing -> False) and
      ``trafficSource.adwordsClickInfo.isVideoAd`` (missing -> True) are
      filled only; their dtype is not forced.
    * Unless ``is_test_set`` is true, ``totals.transactionRevenue``
      (missing -> 0.0) is filled and converted to ``float``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns above. It is modified in place.
    is_test_set : bool, optional
        When true the revenue column is left untouched (it is not read at all).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # (column, replacement for missing values); None means "do not fill".
    integer_columns = (
        ("totals.visits", None),
        ("totals.hits", None),
        ("totals.pageviews", 1),
        ("totals.bounces", 0),
        ("totals.newVisits", 0),
    )
    for column, default in integer_columns:
        # Results are assigned back to df[column] instead of calling
        # df[column].fillna(..., inplace=True): the chained in-place form only
        # updates a temporary Series under pandas Copy-on-Write, which would
        # leave the NaNs in df and make the int conversion fail.
        values = pd.to_numeric(df[column])
        if default is not None:
            values = values.fillna(default)
        df[column] = values.astype(int)

    df["trafficSource.isTrueDirect"] = df["trafficSource.isTrueDirect"].fillna(False)
    df["trafficSource.adwordsClickInfo.isVideoAd"] = (
        df["trafficSource.adwordsClickInfo.isVideoAd"].fillna(True)
    )

    if not is_test_set:
        revenue = pd.to_numeric(df["totals.transactionRevenue"])
        df["totals.transactionRevenue"] = revenue.fillna(0.0).astype(float)
    return df

## Normalisation

Probably only the transaction revenue needs to be log-normalised, but the step is wrapped in a function anyway in case other columns need the same treatment later.

In [6]:
def df_normalise(df):
    """Replace ``totals.transactionRevenue`` with ``log1p`` of its values.

    The column is converted to ``float`` and transformed with a single
    vectorised ``np.log1p`` call rather than a Python-level lambda per row.

    The frame is modified in place and also returned.
    """
    revenue = df["totals.transactionRevenue"].astype(float)
    df["totals.transactionRevenue"] = np.log1p(revenue)
    return df

## Constant Columns

In [7]:
def df_remove_cols(df, cols_to_drop = None):
    """Drop uninformative columns from ``df`` in place.

    When ``cols_to_drop`` is ``None`` or empty, the columns to remove are
    derived from ``df`` itself:

    * columns with exactly one distinct non-null value, except
      ``"totals.visits"``, which is always kept;
    * columns in which more than half of the rows are null.

    Otherwise exactly the supplied columns are dropped, which lets a list
    derived from one frame be re-applied to another.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune; modified in place.
    cols_to_drop : list of str, optional
        Explicit columns to remove.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The same ``df`` object and the list of dropped column names, each
        name appearing once.
    """
    if cols_to_drop is None or len(cols_to_drop) == 0:
        constant_cols = [
            col for col in df.columns
            if col != "totals.visits" and df[col].nunique() == 1
        ]
        already_listed = set(constant_cols)
        # A column can be both constant and mostly null (nunique ignores
        # nulls); list it only once.
        null_cols = [
            col for col in df.columns
            if col not in already_listed and df[col].isnull().mean() > 0.5
        ]
        cols_to_drop = constant_cols + null_cols
    else:
        cols_to_drop = list(cols_to_drop)

    df.drop(cols_to_drop, axis = 1, inplace = True)
    return df, cols_to_drop

## Flagging visitor ids as spenders

In [8]:
def df_flag_spender(df):
    im_df = pd.DataFrame(df.groupby('fullVisitorId', as_index = False)['totals.transactionRevenue'].sum())
    im_df.columns = ['fullVisitorId', 'totalUserRev']
    im_df['spender'] = np.where(im_df['totalUserRev']>0.0,True,False)
    df = df.merge(im_df, on = 'fullVisitorId')
    return df

In [10]:
def output_df(df, output_file_name, cols_to_drop = None):
    """Write ``df`` and a 10% sample of it to feather files.

    Files written:

    * ``output_file_name`` - the full frame;
    * ``<root>_sample<ext>`` - ``df.sample(frac=0.1, random_state=1)``, where
      ``root``/``ext`` come from ``os.path.splitext(output_file_name)``
      (e.g. ``data/clean_train.feather`` ->
      ``data/clean_train_sample.feather``);
    * ``data/cols_to_drop.feather`` - only when ``cols_to_drop`` is given: a
      one-column frame holding those column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    output_file_name : str
        Destination path for the full frame.
    cols_to_drop : iterable of str, optional
        Dropped-column names to persist alongside the data.
    """
    feather.write_dataframe(df, output_file_name)

    # This file used to receive a second full copy of df; it now holds the
    # column names themselves and is written only when they are supplied.
    if cols_to_drop is not None:
        feather.write_dataframe(
            pd.DataFrame({'cols_to_drop': list(cols_to_drop)}),
            'data/cols_to_drop.feather',
        )

    # splitext only splits at the final extension, so dots elsewhere in the
    # path (e.g. "./data/x.feather") and extension-less names are handled.
    root, ext = os.path.splitext(output_file_name)
    feather.write_dataframe(df.sample(frac=0.1, random_state = 1), root + "_sample" + ext)

## Combining PreProcessing Steps

In [11]:
def df_pre_process_and_dump(raw_file_name, output_file_name, is_test_set = False, cols_to_drop = None):
    """Run the full cleaning pipeline on a raw CSV and write the result.

    Steps, in order: ``read_csv_with_json`` -> ``date_processing`` ->
    ``df_numeric_bool_fillna`` -> ``df_normalise`` (training runs only) ->
    ``df_remove_cols`` -> ``df_flag_spender`` (training runs only) ->
    ``output_df``.

    Parameters
    ----------
    raw_file_name : str
        Path of the raw CSV file.
    output_file_name : str
        Path handed to ``output_df`` for the cleaned frame.
    is_test_set : bool, optional
        When true, the revenue-dependent steps (``df_normalise`` and
        ``df_flag_spender``) are skipped.
    cols_to_drop : list of str, optional
        Columns to remove. When omitted or empty, ``df_remove_cols`` derives
        the list from the data.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # Always hand df_remove_cols a real list; an empty one means "derive it".
    cols_to_drop = [] if cols_to_drop is None else list(cols_to_drop)

    df = read_csv_with_json(raw_file_name)
    df = date_processing(df)
    df = df_numeric_bool_fillna(df, is_test_set)

    if not is_test_set:
        # df_numeric_bool_fillna leaves the revenue column untouched on
        # test-set runs, so it is only normalised on training runs.
        df = df_normalise(df)

    # Forward the caller's list so the same columns can be dropped from
    # different files; previously the argument was silently ignored.
    df, cols_to_drop = df_remove_cols(df, cols_to_drop)

    if not is_test_set:
        df = df_flag_spender(df)

    output_df(df, output_file_name)

    if not is_test_set and len(cols_to_drop) > 0:
        # Written after output_df() so this file's final contents are the
        # dropped column names, available for a later test-set run.
        feather.write_dataframe(
            pd.DataFrame({'cols_to_drop': list(cols_to_drop)}),
            'data/cols_to_drop.feather',
        )
    return df

In [13]:
# Clean the raw training CSV and write the result to feather.
clean_train_df = df_pre_process_and_dump(
    raw_file_name = 'data/train.csv',
    output_file_name = 'data/clean_train.feather',
)