In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from collections import defaultdict
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Render figures inside the notebook and set plot defaults used by every cell below.
%matplotlib inline
plt.rcParams['axes.grid'] = True  # draw a grid on every axes
plt.rcParams['figure.figsize'] = (20.0, 13.0)  # default figure size (width, height)

In [None]:
def is_useless(df, col):
    """Return True when column `col` of `df` carries no information.

    A column is useless when it has at most one distinct value, counting a
    missing value as a value of its own.  That way a flag column such as
    {'1', NaN} is NOT reported as constant, while an empty or entirely
    missing column is.

    Raises whatever `Series.nunique` raises for unhashable cell values
    (e.g. dicts); callers that scan arbitrary columns should handle that.
    """
    return df[col].nunique(dropna=False) <= 1

def bad_percent(df, col, bads):
    """Percentage (rounded to 2 decimals) of rows whose `col` is null or listed in `bads`."""
    values = df[col]
    is_bad = values.isnull() | values.isin(bads)
    share = 100 * is_bad.sum() / float(len(df))
    return round(share, 2)

def find_useful_cols(df, bads=('nan', '(not set)', 'not available in demo dataset'),
                     max_bad_percent=90):
    """Print every column of `df` that looks worth keeping.

    A column is reported when `is_useless` says it is not constant and fewer
    than `max_bad_percent` percent of its rows are null or one of the
    placeholder values in `bads`.

    Parameters
    ----------
    df : DataFrame to scan.
    bads : placeholder values counted as missing (defaults match the
        original hard-coded list).
    max_bad_percent : exclusive upper bound on the bad-row percentage.

    Columns that cannot be analysed (for instance because their cells are
    unhashable objects) are reported as "can't parse" instead of aborting
    the scan.  Returns None; the result is the printed report.
    """
    placeholders = list(bads)
    for col in df.columns:
        try:
            if is_useless(df, col):
                continue
            bad_perc = bad_percent(df, col, placeholders)
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt and
            # SystemExit still propagate, unlike with a bare `except:`.
            print('can\'t parse {}'.format(col))
            continue
        if bad_perc < max_bad_percent:
            print('column {} is useful, bad_percent={}'.format(col, bad_perc))

def countplot(df, col, index, begin=0, end=8):
    """Draw two stacked count plots of `col`: all rows on top, rows selected by `index` below.

    Each panel shows only the values ranked `begin`..`end` by frequency
    within that panel's own data, so the two panels may list different values.
    """
    panels = ((1, 'all visits', False), (2, 'visits with transactions', True))
    for slot, heading, filtered in panels:
        plt.subplot(2, 1, slot)
        plt.title(heading)
        subset = df[index] if filtered else df
        ranking = subset[col].value_counts().iloc[begin:end].index
        sns.countplot(x=col, data=subset, order=ranking)

In [None]:
# Load the training sessions. The visitor id and the date are forced to str so
# pandas does not convert them to numbers while parsing.
raw_df = pd.read_csv('../input/train.csv', nrows=None, dtype={'fullVisitorId': str, 'date': str})
raw_df.head()

In [None]:
# Expand the JSON-encoded `totals` column into one column per key.
stats = pd.DataFrame.from_records([json.loads(blob) for blob in raw_df['totals']])
stats.head()

In [None]:
# List the `totals` fields that are neither constant nor mostly missing.
find_useful_cols(stats)

In [None]:
# Fraction of sessions that have no transactionRevenue value.
stats['transactionRevenue'].isnull().mean()

In [None]:
# Parse the yyyymmdd strings into timestamps.
raw_df['date'] = pd.to_datetime(raw_df['date'], format='%Y%m%d')

# Put the per-session hit count and revenue next to the raw columns;
# a missing revenue is treated as 0.
raw_df['hits'] = stats['hits'].astype(int)
raw_df['revenue'] = stats['transactionRevenue'].fillna(0).astype(float)

# Boolean mask of the sessions that produced any revenue.
has_transaction = raw_df['revenue'].gt(0)

In [None]:
# Total revenue per calendar day: the full range first, then October and November only.
z = raw_df.groupby('date', as_index=False)['revenue'].sum()
z.plot(x='date', y='revenue', figsize=(20, 10))
z.loc[z['date'].dt.month.isin([10, 11])].plot(x='date', y='revenue', figsize=(20, 10))

In [None]:
# Distribution of log(total revenue + 1) per visitor, over paying sessions only.
sns.distplot(np.log(raw_df.loc[has_transaction].groupby('fullVisitorId')['revenue'].sum() + 1))

In [None]:
# 50 most frequent `hits` values: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(stats, 'hits', index=has_transaction, end=50)

In [None]:
# 50 most frequent `pageviews` values: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(stats, 'pageviews', index=has_transaction, end=50)

Columns:
* fullVisitorId
* channelGrouping -- The channel via which the user came to the Store
* date -- The date on which the user visited the Store
* device -- The specifications for the device used to access the Store
* geoNetwork -- This section contains information about the geography of the user
* sessionId -- A unique identifier for this visit to the store
* socialEngagementType -- Engagement type, either "Socially Engaged" or "Not Socially Engaged"
* totals -- This section contains aggregate values across the session
* trafficSource -- This section contains information about the Traffic Source from which the session originated
* visitId -- An identifier for this session. This is part of the value usually stored as the _utmb cookie. This is only unique to the user. For a completely unique ID, you should use a combination of fullVisitorId and visitId
* visitNumber -- The session number for this user. If this is the first session, then this is set to 1
* visitStartTime -- The timestamp (expressed as POSIX time)

In [None]:
# Session counts per channel grouping: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(raw_df, 'channelGrouping', index=has_transaction, end=50)

In [None]:
# Expand the JSON-encoded `device` column into one column per key.
device = pd.DataFrame.from_records([json.loads(blob) for blob in raw_df['device']])
device.head()

In [None]:
# List the `device` fields that are neither constant nor mostly missing.
find_useful_cols(device)

In [None]:
# 8 most frequent browsers (the countplot default): all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(device, 'browser', index=has_transaction)

In [None]:
# Counts by `isMobile` value: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(device, 'isMobile', index=has_transaction, end=50)

In [None]:
# 8 most frequent operating systems: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(device, 'operatingSystem', index=has_transaction, end=8)

In [None]:
# Expand the JSON-encoded `geoNetwork` column into one column per key.
geo = pd.DataFrame.from_records([json.loads(blob) for blob in raw_df['geoNetwork']])
geo.head()

In [None]:
# List the `geoNetwork` fields that are neither constant nor mostly missing.
find_useful_cols(geo)

In [None]:
# 8 most frequent countries: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(geo, 'country', has_transaction)

In [None]:
# 8 most frequent cities: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(geo, 'city', has_transaction)

In [None]:
# 8 most frequent `metro` values: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(geo, 'metro', has_transaction)

In [None]:
# 8 most frequent network domains: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(geo, 'networkDomain', has_transaction)

In [None]:
# 8 most frequent regions: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(geo, 'region', has_transaction)

In [None]:
# Expand the JSON-encoded `trafficSource` column into one column per key;
# keys absent from a session are filled with the string 'nan'.
traffic = pd.DataFrame.from_records(
    [json.loads(blob) for blob in raw_df['trafficSource']]
).fillna('nan')
traffic.head()

In [None]:
# List the `trafficSource` fields that are neither constant nor mostly missing
# (missing entries were replaced by the string 'nan' when `traffic` was built).
find_useful_cols(traffic)

In [None]:
# Counts by `isTrueDirect` value: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(traffic, 'isTrueDirect', has_transaction)

In [None]:
# 8 most frequent traffic media: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(traffic, 'medium', has_transaction)

In [None]:
# 8 most frequent referral paths: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(traffic, 'referralPath', has_transaction)

In [None]:
# 8 most frequent keywords: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(traffic, 'keyword', has_transaction)

In [None]:
# 8 most frequent traffic sources: all sessions (top) vs. sessions with revenue > 0 (bottom).
countplot(traffic, 'source', has_transaction)

In [None]:
# `adwordsClickInfo` is a nested record inside trafficSource: expand it into
# its own frame and report which of its fields are informative.
adwords = pd.DataFrame.from_records(list(traffic['adwordsClickInfo']))

find_useful_cols(adwords)

In [None]:
def _expand_json(column):
    """Parse a column of JSON strings into a DataFrame that shares the column's index."""
    return pd.DataFrame(column.apply(json.loads).tolist(), index=column.index)


def _keep_or_other(values, keep):
    """Return `values` with every entry outside `keep` (including missing ones) set to 'Other'."""
    return values.where(values.isin(keep), 'Other')


def data_prepare(df, train=True):
    """Turn raw session rows into one feature row per visitor.

    Parameters
    ----------
    df : DataFrame with the raw columns `fullVisitorId`, `channelGrouping`,
        `visitNumber`, `date` (yyyymmdd strings or datetimes) and the
        JSON-encoded columns `device`, `geoNetwork`, `trafficSource`,
        `totals`.  It is only read, never modified.
    train : when True the summed `transactionRevenue` target is included.

    Returns
    -------
    DataFrame indexed by `fullVisitorId` with the categorical features
    (rare values bucketed as 'Other'), activity counters and one counter per
    weekday (`day_0` .. `day_6`, Monday first).
    """
    device = _expand_json(df['device'])
    geo = _expand_json(df['geoNetwork'])
    traffic = _expand_json(df['trafficSource'])
    stats = _expand_json(df['totals'])

    # Build the per-session features in a fresh frame so the caller's
    # DataFrame keeps its columns (including `date`) untouched.
    sessions = df[['fullVisitorId', 'visitNumber']].copy()

    sessions['isMobile'] = device['isMobile']
    sessions['browser'] = _keep_or_other(device['browser'], ['Chrome', 'Safari', 'Firefox'])
    sessions['operatingSystem'] = _keep_or_other(
        device['operatingSystem'],
        ['Windows', 'Macintosh', 'Android', 'iOS', 'Linux', 'Chrome OS'],
    )
    sessions['channelGrouping'] = _keep_or_other(
        df['channelGrouping'],
        ['Direct', 'Organic Search', 'Referral', 'Paid Search', 'Display', 'Social'],
    )
    sessions['country'] = _keep_or_other(geo['country'], ['United States', 'Canada'])
    sessions['city'] = _keep_or_other(geo['city'], ['New York', 'Mountain View', 'San Francisco'])
    sessions['source'] = _keep_or_other(
        traffic['source'],
        ['mall.googleplex.com', '(direct)', 'google', 'youtube.com'],
    )
    # 1 when the referral path is exactly '/', 0 otherwise (missing included).
    sessions['referralPath'] = (traffic['referralPath'] == '/').astype(int)

    # Missing counters are treated as a single hit / page view.
    sessions['hits'] = pd.to_numeric(stats['hits']).fillna(1).astype(int)
    sessions['pageviews'] = pd.to_numeric(stats['pageviews']).fillna(1).astype(int)
    # NOTE: this is 1 when the session has NO `bounces` entry, so the
    # per-visitor sum counts sessions without a bounce flag.  The column name
    # and meaning are kept because the rest of the notebook relies on them.
    sessions['bounces'] = stats['bounces'].isnull().astype(int)
    if train:
        sessions['transactionRevenue'] = (
            pd.to_numeric(stats['transactionRevenue']).fillna(0.0).astype(float)
        )

    # Always create all seven weekday indicators, even when some weekday does
    # not occur in `df`; otherwise the aggregation below fails on a missing column.
    weekday = pd.to_datetime(df['date'], format='%Y%m%d').dt.dayofweek
    for i in range(7):
        sessions['day_{}'.format(i)] = (weekday == i).astype(int)

    # String categoricals take the visitor's first recorded value: 'any' would
    # reduce them to a boolean that is True for every visitor.  The two
    # flag-like columns keep 'any'.
    agg_rules = {
        'isMobile': 'any',
        'city': 'first',
        'browser': 'first',
        'operatingSystem': 'first',
        'channelGrouping': 'first',
        'country': 'first',
        'referralPath': 'any',
        'source': 'first',
        'pageviews': 'sum',
        'hits': 'sum',
        'visitNumber': 'max',
        'bounces': 'sum',
        'day_0': 'sum',
        'day_1': 'sum',
        'day_2': 'sum',
        'day_3': 'sum',
        'day_4': 'sum',
        'day_5': 'sum',
        'day_6': 'sum'
    }
    if train:
        agg_rules['transactionRevenue'] = 'sum'

    return sessions.groupby('fullVisitorId').agg(agg_rules)

In [None]:
class Cleaner:
    """Turn the per-visitor feature frame into a scaled numeric matrix.

    The columns in `cols_to_encode` are label-encoded (one encoder per
    column), then one-hot encoded; the result is stacked with the columns in
    `other_cols` and standardised.  `fit` learns all encoders and the scaler
    and returns the transformed training matrix; `transform` applies the
    learned steps to another frame with the same columns.
    """

    def __init__(self):
        self.cols_to_encode = ['browser', 'operatingSystem', 'country', 'source', 'city', 'channelGrouping']
        self.other_cols = ['isMobile', 'pageviews', 'hits', 'visitNumber', 'bounces', 'referralPath']
        self.other_cols.extend(['day_{}'.format(i) for i in range(7)])
        # One LabelEncoder per categorical column, created on first use.
        self.les = defaultdict(LabelEncoder)
        # Newer scikit-learn releases name the dense-output switch
        # `sparse_output`; older ones only know `sparse`.  Try the new
        # spelling first and fall back so both keep working.
        try:
            self.ohe = OneHotEncoder(sparse_output=False)
        except TypeError:
            self.ohe = OneHotEncoder(sparse=False)
        self.scaler = StandardScaler()

    def _run(self, df, step):
        """Apply `step` ('fit_transform' or 'transform') through the whole pipeline."""
        codes = df[self.cols_to_encode].apply(
            lambda column: getattr(self.les[column.name], step)(column)
        ).values
        one_hot = getattr(self.ohe, step)(codes)
        stacked = np.hstack([one_hot, df[self.other_cols].values])
        return getattr(self.scaler, step)(stacked)

    def fit(self, df):
        """Fit every encoder and the scaler on `df`; return the transformed matrix."""
        return self._run(df, 'fit_transform')

    def transform(self, df):
        """Transform `df` with the already fitted encoders and scaler."""
        return self._run(df, 'transform')


def find_best_param(train, params, param, values, verbose=True):
    """Pick the value of one LightGBM parameter with the lowest 3-fold CV RMSE.

    Parameters
    ----------
    train : dataset passed straight to `lgb.cv`.
    params : dict of LightGBM parameters.  On success `params[param]` is
        overwritten IN PLACE with the winning value.
    param : name of the parameter to tune.
    values : iterable of candidate values.
    verbose : when True, print the iteration counter and a final notice.

    Returns
    -------
    (best_value, best_rmse); both are None when `values` is empty, in which
    case `params` is left unchanged.
    """
    best_cv = None
    best_param = None

    # Candidates are tried on a copy so `params` only changes once, at the end.
    trial_params = params.copy()
    for i, val in enumerate(values, 1):
        if verbose:
            print('iter_num={}'.format(i))
        trial_params[param] = val

        history = lgb.cv(
            trial_params,
            train,
            metrics='rmse',
            nfold=3,
            stratified=False
        )
        # The result key is 'rmse-mean' in the releases this notebook was
        # written for; accept the 'valid '-prefixed spelling as well.
        if 'rmse-mean' in history:
            scores = history['rmse-mean']
        else:
            scores = history['valid rmse-mean']
        cv = scores[-1]  # score after the last boosting round
        if best_cv is None or cv < best_cv:
            best_cv = cv
            best_param = val

    if best_cv is None:
        # Nothing was evaluated: do not clobber the caller's setting with None.
        return best_param, best_cv

    params[param] = best_param
    if verbose:
        print('param {} has been set'.format(param))
    return best_param, best_cv

In [None]:
# Collapse the session rows into one feature row per visitor
# (train defaults to True, so the revenue target is included).
train_df = data_prepare(raw_df)
train_df.head()

In [None]:
# Fit the encoders/scaler on the training visitors and build the model inputs.
cleaner = Cleaner()
X_train = cleaner.fit(train_df)
# Target: natural log of (total revenue per visitor + 1).
y_train = np.log(train_df['transactionRevenue'].values + 1)

In [None]:
# Drop the names of the two frames that are not used by the cells below;
# earlier cells that read them need the data to be loaded again afterwards.
del train_df, raw_df

In [None]:
# no CV :(
# Hyper-parameters are set by hand. random_state pins the forest's random
# sampling so a restart-and-run-all reproduces the same model.

model = RandomForestRegressor(
    n_estimators=500,
    max_depth=12,
    min_samples_leaf=25,
    n_jobs=-1,
    verbose=1,
    max_leaf_nodes=200,
    random_state=42
)
model.fit(X_train, y_train)

In [None]:
# Load the test sessions with the same dtypes as the training file and build
# the per-visitor features; train=False leaves out the revenue target.
test_df = data_prepare(pd.read_csv('../input/test.csv', nrows=None, dtype={'fullVisitorId': str, 'date': str}), train=False)
test_df.head()

In [None]:
# Encode the test visitors with the encoders fitted on the training data.
X_test = cleaner.transform(test_df)
ids = test_df.index.values
y_pred = model.predict(X_test)

# Visitors whose aggregated `bounces` feature is 0 get a prediction of exactly 0.
zero_bounce_feature = (test_df['bounces'] == 0).values
y_pred[zero_bounce_feature] = 0

# One row per visitor id, written without the positional index.
submission = pd.DataFrame(data={'PredictedLogRevenue': y_pred, 'fullVisitorId': ids})
submission.to_csv('submission.csv', index=False)
submission.head()