In [None]:
import numpy as np
import pandas as pd
import os
import time
import json
from pandas.io.json import json_normalize
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

### 第一步，我们先把json格式展开

In [None]:
def load_df(csv_path, nrows=None):
    """Read a CSV file and flatten its JSON-encoded columns.

    The columns named in ``JSON_COLUMNS`` are parsed with ``json.loads`` while
    the file is read. Each of them is then expanded into one column per JSON
    key, named ``<column>_<key>``, and the original JSON column is dropped.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file to read.
    nrows : int, optional
        Forwarded to ``pd.read_csv``; ``None`` (the default) reads every row.

    Returns
    -------
    pandas.DataFrame
        The non-JSON columns followed by the flattened JSON columns.

    Side effects
    ------------
    Prints the file's base name and the shape of the resulting frame.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(
        csv_path,
        converters={column: json.loads for column in JSON_COLUMNS},
        # Important: force the visitor id to be read as a string instead of
        # letting pandas infer a numeric dtype for it.
        dtype={'fullVisitorId': 'str'},
        nrows=nrows,
    )

    for column in JSON_COLUMNS:
        # ``pd.json_normalize`` is the public entry point (the
        # ``pandas.io.json`` import path is deprecated). A plain list of
        # records yields a fresh 0..n-1 index, which lines up with the
        # default index produced by ``read_csv`` for the merge below.
        column_as_df = pd.json_normalize(df[column].tolist())
        column_as_df.columns = [f"{column}_{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [None]:
# Load the training set; nrows is left at its default, so every row is read.
# NOTE(review): this is an absolute, machine-specific path — consider moving
# it to a configurable data directory.
train_csv_path = '/home/leechh/data/R/train.csv'
train = load_df(train_csv_path)

In [None]:
# Preview the first five rows of the flattened frame (5 is head()'s default).
train.head()

In [None]:
# List the distinct channelGrouping values before they are label-encoded.
train['channelGrouping'].unique()

In [None]:
# Replace the channelGrouping values with the integer codes produced by
# LabelEncoder; `le` stays available for mapping codes back to labels.
le = LabelEncoder()
encoded_channels = le.fit_transform(train['channelGrouping'])
train['channelGrouping'] = encoded_channels

# Histogram of the encoded values, titled on the Axes that hist() returns.
channel_ax = train['channelGrouping'].hist(bins=15, figsize=[15, 5])
channel_ax.set_title('channelGrouping')

### 我觉得 sessionId 应该就是 fullVisitorId 与 visitId 的结合。验证一下，如果是的话，我们就可以把这个 feature 去掉了。

In [None]:
# Split sessionId on '_' once and count the rows where each half differs from
# fullVisitorId / visitId respectively.
session_parts = train['sessionId'].str.split('_', expand=True)
visitor_mismatches = int((session_parts[0] != train['fullVisitorId']).sum())
visit_mismatches = int((session_parts[1] != train['visitId'].astype('str')).sum())
print('fullvisitId sess diff:', visitor_mismatches)
print('visitId sess diff:', visit_mismatches)

In [None]:
# Remove the sessionId column from the working frame.
train = train.drop(columns=['sessionId'])

### 接下来，把时间date转化为时间格式

In [None]:
# Parse the date column, which is laid out as YYYYMMDD, into datetimes.
parsed_dates = pd.to_datetime(train['date'], format='%Y%m%d')
train['date'] = parsed_dates

### socialEngagementType

In [None]:
# Flag the rows whose socialEngagementType is anything other than
# 'Not Socially Engaged', then show how many such rows exist.
idx = train['socialEngagementType'].ne('Not Socially Engaged')
int(idx.sum())

In [None]:
# Remove the socialEngagementType column from the working frame.
train = train.drop(columns=['socialEngagementType'])

In [None]:
# Preview the first five rows after the column removals above.
train.head(n=5)

### visitNumber

In [None]:
# Number of rows for each distinct visitNumber value.
visits_by_number = train.groupby('visitNumber')
visits_by_number.size()

In [None]:
# Histogram of visitNumber using 500 bins on a 15x5 figure.
visit_number = train['visitNumber']
visit_number.hist(bins=500, figsize=[15, 5])
plt.show()

我们可以看到，visitNumber 极其不平衡，大多数为 1。根据 Kaggle 官方提供的说明（If this is the first session, then this is set to 1），大多数记录都是用户的初次访问。

### visitStartTime
The timestamp is expressed as POSIX time, so we convert it to a pandas `Timestamp` (using the machine's local time zone).

In [None]:
def totime(t):
    """Convert a POSIX timestamp to a naive ``pd.Timestamp`` in local time.

    Parameters
    ----------
    t : number
        Seconds since the epoch, as accepted by ``time.localtime``.

    Returns
    -------
    pandas.Timestamp
        Built from the year, month, day, hour, minute and second fields of
        ``time.localtime(t)``; no time-zone information is attached.
    """
    # The first six struct_time fields are exactly the calendar components
    # that pd.Timestamp takes positionally.
    year, month, day, hour, minute, second = time.localtime(t)[:6]
    return pd.Timestamp(year, month, day, hour, minute, second)


# Apply totime to every visitStartTime value, replacing the column in place.
start_times = train['visitStartTime'].map(totime)
train['visitStartTime'] = start_times

### 去掉取值恒定的 features（整列只有同一个值）

In [None]:
# Drop every column in which all values equal the column's first value.
# The comparison is vectorised (`.all()`) rather than summed element by
# element in Python, and the first value is taken by position (`.iloc[0]`).
# A missing value never compares equal, so a column that starts with or
# contains NaN is kept.
constant_columns = [
    column
    for column in train.columns
    if (train[column] == train[column].iloc[0]).all()
]
train.drop(columns=constant_columns, inplace=True)

In [None]:
# (rows, columns) remaining after the constant columns were removed.
len(train.index), len(train.columns)

### device

In [None]:
def dfbar(df,feature):
    """Unfinished helper: as written it has no effect and returns ``None``.

    NOTE(review): the name suggests a bar plot of ``feature`` was planned,
    but no plotting code exists here — confirm the intent or remove it.
    """
    # NOTE(review): `.unique` is referenced without parentheses, so this binds
    # the method object itself rather than the array of distinct values.
    unique = df[feature].unique
    

In [None]:
# Preview the flattened device columns. They are selected by their
# 'device_' name prefix instead of fixed positions, so the preview stays
# correct when columns are added or dropped elsewhere in the notebook.
train.filter(regex='^device_').head()

In [None]:
# Remove device_isMobile from the working frame.
train = train.drop(columns=['device_isMobile'])

### geoNetwork

In [None]:
# Preview the flattened geoNetwork columns, selected by their 'geoNetwork_'
# name prefix instead of fixed positions that shift when columns are dropped.
train.filter(regex='^geoNetwork_').head()

### totals

In [None]:
# Preview the flattened totals columns, selected by their 'totals_' name
# prefix instead of fixed positions that shift when columns are dropped.
train.filter(regex='^totals_').head()

In [None]:
# Number of rows for each distinct totals_transactionRevenue value.
revenue_groups = train.groupby('totals_transactionRevenue')
revenue_groups.size()

### trafficSource

In [None]:
# Preview the flattened trafficSource columns, selected by their
# 'trafficSource_' name prefix instead of a fixed starting position.
train.filter(regex='^trafficSource_').head()

In [None]:
# Print the per-value row counts of every trafficSource column. The columns
# are picked by name prefix, so the loop no longer depends on a hard-coded
# positional range.
for column in train.filter(regex='^trafficSource_').columns:
    print('\n', train.groupby(column).size())

In [None]:
# Final preview of the first five rows of the cleaned frame.
train.head(n=5)