In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import gc
import sys
import math

from pandas.io.json import json_normalize
from datetime import datetime

import os
# Show which files are present in the input directory before loading anything.
print(os.listdir("../input"))

In [None]:
# Make sure the cyclic garbage collector is switched on.
gc.enable()

# Columns that load_df selects from every flattened chunk.
# Dotted names are "<json column>.<key>" produced when the JSON columns are flattened.
features = [
    # top-level CSV columns
    'channelGrouping',
    'date',
    'fullVisitorId',
    'visitId',
    'visitNumber',
    'visitStartTime',
    # device.*
    'device.browser',
    'device.deviceCategory',
    'device.isMobile',
    'device.operatingSystem',
    # geoNetwork.*
    'geoNetwork.city',
    'geoNetwork.continent',
    'geoNetwork.country',
    'geoNetwork.metro',
    'geoNetwork.networkDomain',
    'geoNetwork.region',
    'geoNetwork.subContinent',
    # totals.*
    'totals.bounces',
    'totals.hits',
    'totals.newVisits',
    'totals.pageviews',
    'totals.transactionRevenue',
    # trafficSource.*
    'trafficSource.adContent',
    'trafficSource.campaign',
    'trafficSource.isTrueDirect',
    'trafficSource.keyword',
    'trafficSource.medium',
    'trafficSource.referralPath',
    'trafficSource.source',
    # remaining top-level column
    'customDimensions',
]


def load_df(csv_path, columns=None, chunksize=100000):
    """Read a CSV in chunks, flatten its JSON columns and keep selected columns.

    The columns 'device', 'geoNetwork', 'totals' and 'trafficSource' are parsed
    with ``json.loads`` and flattened into "<column>.<key>" columns. Each chunk
    is then reduced to the requested columns, and all chunks are concatenated
    once at the end.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    columns : list of str, optional
        Columns to keep, in output order. Defaults to the module-level
        ``features`` list.
    chunksize : int, optional
        Number of rows read per chunk (default 100000).

    Returns
    -------
    pandas.DataFrame
        The selected columns for every row, with a fresh 0..n-1 index.
        An empty frame carrying the requested columns if no chunk was read.

    Raises
    ------
    KeyError
        If a requested column does not appear in any chunk of the file.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    wanted = list(features if columns is None else columns)

    # pandas.io.json.json_normalize is deprecated since pandas 1.0 in favour of
    # pandas.json_normalize; prefer the new location, fall back for old pandas.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    reader = pd.read_csv(csv_path, sep=',',
            converters={column: json.loads for column in JSON_COLUMNS},
            dtype={'fullVisitorId': 'str'},  # Important!! keep the ids as strings
            chunksize=chunksize)

    parts = []
    seen = set()
    for chunk in reader:
        # Chunks keep their position in the file as index; restart at 0 so the
        # index-based merge with the flattened JSON frames lines up row by row.
        chunk = chunk.reset_index(drop=True)
        for column in JSON_COLUMNS:
            column_as_df = normalize(chunk[column].tolist())
            column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
            chunk = chunk.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)

        seen.update(chunk.columns)
        # Flattened columns exist only for keys present in this chunk, so a
        # sparse key can be missing here; reindex fills it with NaN instead of
        # failing on the chunk.
        parts.append(chunk.reindex(columns=wanted))
        del chunk
        gc.collect()

    if not parts:
        return pd.DataFrame(columns=wanted)

    # A column that never showed up in any chunk is a real error, not sparsity.
    missing = [name for name in wanted if name not in seen]
    if missing:
        raise KeyError(f"{missing} not found in {csv_path}")

    # One concat at the end instead of re-copying the accumulated frame per chunk.
    return pd.concat(parts, axis=0, ignore_index=True)

In [None]:
%%time
# Load both competition files through the chunked JSON-flattening loader.
train = load_df('../input/train_v2.csv')
test = load_df('../input/test_v2.csv')

# Report the smallest and largest 'date' value found in each frame.
print('train date:', train['date'].min(), 'to', train['date'].max())
print('test date:', test['date'].min(), 'to', test['date'].max())

In [None]:
# Preview the first rows of the loaded test frame.
test.head()

# Preliminary Checks

## Dimensions of train and test set

In [None]:
# Print the (rows, columns) shape of each frame; the "\n" leaves a blank line between them.
print("Structure of train:", train.shape,"\n")
print("Structure of test:", test.shape)

## Percentage of Missing Values in the train and test sets

In [None]:
train_columns = train.columns
test_columns = test.columns

# Share of null entries per column, expressed as a percentage of the row count.
train_percent_missing = train.isnull().sum()*100/len(train)
test_percent_missing = test.isnull().sum()*100/len(test)

missing_values_train = pd.DataFrame({"column_name": train_columns,
                                     "percent_missing_train": train_percent_missing.round(2)})
missing_values_test = pd.DataFrame({"column_name": test_columns,
                                    "percent_missing_test": test_percent_missing.round(2)})

# Left join keeps every train column; sort so the most incomplete columns come first.
# drop=True discards the pre-sort positions instead of adding them as an extra
# "index" column to the displayed table.
combined_missing = (
    pd.merge(missing_values_train, missing_values_test, how='left', on="column_name")
    .sort_values(by=['percent_missing_train'], ascending=False)
    .reset_index(drop=True)
)
combined_missing


## Check for columns with Constant Values

In [None]:
# A column is constant when it holds exactly one distinct value
# (dropna=False counts NaN as a value, so an all-NaN column is constant too).
const_cols_train = [c for c in train.columns if train[c].nunique(dropna=False) == 1]
const_cols_test = [c for c in test.columns if test[c].nunique(dropna=False) == 1]

## Replace NAs with 0 in transaction revenue

In [None]:
# Sessions without a recorded revenue value get 0 in both frames.
for frame in (train, test):
    frame['totals.transactionRevenue'] = frame['totals.transactionRevenue'].fillna(0)

## Table of Data types, Unique Values and Missing Values

In [None]:
# For train set
num_unique = [train[c].nunique(dropna=False) for c in train.columns]
data_types = train.dtypes
missing_values = train.isnull().sum()

pd.DataFrame({"Data_Types":data_types,"Count_Unique_Values":num_unique,"Missing_Values":missing_values})

## Change Data Types

In [None]:
# Cast the revenue column to 64-bit floats in both frames.
train['totals.transactionRevenue'] = train['totals.transactionRevenue'].astype(float)
test['totals.transactionRevenue'] = test['totals.transactionRevenue'].astype(float)

In [None]:
# Columns whose missing entries are replaced by 0 in the train set.
# NOTE(review): some of these look like text columns (e.g. keyword, referralPath) —
# confirm that the integer 0 is the intended placeholder there.
zero_fill_cols = [
    'totals.bounces',
    'totals.newVisits',
    'totals.pageviews',
    'trafficSource.adContent',
    'trafficSource.isTrueDirect',
    'trafficSource.keyword',
    'trafficSource.referralPath',
]
train[zero_fill_cols] = train[zero_fill_cols].fillna(0)

In [None]:
# Columns whose missing entries are replaced by 0 in the test set.
# NOTE(review): some of these look like text columns (e.g. keyword, referralPath) —
# confirm that the integer 0 is the intended placeholder there.
zero_fill_cols = [
    'totals.bounces',
    'totals.newVisits',
    'totals.pageviews',
    'trafficSource.adContent',
    'trafficSource.isTrueDirect',
    'trafficSource.keyword',
    'trafficSource.referralPath',
]
test[zero_fill_cols] = test[zero_fill_cols].fillna(0)

In [None]:
# Parse the YYYYMMDD 'date' values into datetimes in both frames.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'], format='%Y%m%d')

In [None]:
# Order the rows of each frame by date, earliest first.
train = train.sort_values('date')
test = test.sort_values('date')

In [None]:
# Move 'date' out of the columns and use it as the row index of each frame.
train = train.set_index(keys='date')
test = test.set_index(keys='date')

In [None]:
# Names of every train column stored with the generic 'object' dtype
# (all such columns are collected, whatever their name prefix).
trafficSource_catCols = [
    col_name for col_name, col_dtype in train.dtypes.items() if col_dtype == 'object'
]
trafficSource_catCols