In [229]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import namedtuple
Dataset = namedtuple("Dataset", "data target")

from collections import defaultdict
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate, cross_val_predict

from sklearn.linear_model import (ARDRegression, BayesianRidge, ElasticNet, ElasticNetCV, HuberRegressor, Lars, LarsCV,
                                  Lasso, LassoCV, LassoLars, LassoLarsCV, LassoLarsIC, LinearRegression,
                                  MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLasso, MultiTaskLassoCV,
                                  OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor,
                                  RANSACRegressor, Ridge, RidgeCV, SGDRegressor, TheilSenRegressor)

from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, RandomForestRegressor)
from sklearn.cross_decomposition import (CCA, PLSCanonical, PLSRegression)
from sklearn.tree import (DecisionTreeRegressor, ExtraTreeRegressor)
from sklearn.gaussian_process import (GaussianProcessRegressor)
from sklearn.neighbors import (KNeighborsRegressor, RadiusNeighborsRegressor)
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import (LinearSVR, NuSVR, SVR)

def RMSE(actual,predicted):
    return np.sqrt(metrics.mean_squared_error(actual,predicted))

In [3]:
IN_TRAIN = 'in/train-wrangled.csv'

In [4]:
dataframe = pd.read_csv(IN_TRAIN, dtype={'fullVisitorId': 'str'}, low_memory=False)

In [15]:
print("Number of unique users: {}".format(len(dataframe)))

Number of unique users: 714167


In [123]:
train_df, _ = train_test_split(dataframe, train_size=10000, shuffle=True, random_state=42)



In [124]:
train_df = train_df.drop(columns=["fullVisitorId", "sessionId", "visitId", "visitStartTime"])
train_df = train_df.fillna(0)
train_df.head()

Unnamed: 0,date,visitNumber,device.isMobile,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.visits,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.isTrueDirect,...,geoNetwork.subContinent Western Asia,geoNetwork.subContinent Western Europe,trafficSource.source (direct),trafficSource.source Other,trafficSource.source Partners,trafficSource.source analytics.google.com,trafficSource.source google,trafficSource.source mall.googleplex.com,trafficSource.source youtube.com,totals.transactionRevenue
77442,0.0115,0.0,0,0.0,0.016032,0.333333,0.001683,0.0,0,0,...,0,1,0,0,0,0,1,0,0,0.0
420331,0.9626,0.0,0,0.0,0.004008,0.333333,0.000721,0.0,0,0,...,0,0,0,0,0,0,1,0,0,0.0
84153,0.9601,0.0,0,0.0,0.002004,0.333333,0.000481,0.0,0,0,...,1,0,0,0,0,0,1,0,0,0.0
377332,0.981,0.0,0,0.0,0.0,0.333333,0.00024,0.0,0,0,...,0,0,1,0,0,0,0,0,0,0.0
239423,0.992,3.2e-05,0,0.0,0.004008,0.333333,0.000962,0.00361,0,0,...,0,0,1,0,0,0,0,0,0,0.0


In [125]:
def load_csv_dataset(df):
    target = df["totals.transactionRevenue"].values
    data = df.drop(columns="totals.transactionRevenue").values
    return Dataset(data=data, target=target)

In [223]:
dataset = load_csv_dataset(train_df)

In [127]:
%%time
from sklearn.ensemble import IsolationForest
isolation = IsolationForest()
isolation.fit(dataset.data, y=dataset.target)
anomalies = isolation.predict(dataset.data)
anomalies

CPU times: user 2.4 s, sys: 44 ms, total: 2.44 s
Wall time: 2.44 s


In [128]:
len(anomalies[anomalies == 1]) / len(anomalies)

0.9

In [129]:
X = np.delete(dataset.data, np.where(anomalies == 1)[0], axis=0)
y = np.delete(dataset.target, np.where(anomalies == 1)[0])
dataset.data.shape, X.shape

((10000, 162), (1000, 162))

In [170]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X = pca.fit_transform(dataset.data)
y = dataset.target

(10, 162)

In [224]:
%%time
params = {
    
}

estimator = RandomForestRegressor()
estimator.set_params(**params)

# estimator.fit(X, y)
# results = estimator.predict(dataset.data)
results = cross_val_predict(estimator, dataset.data, dataset.target)

CPU times: user 1.12 s, sys: 4 ms, total: 1.12 s
Wall time: 1.13 s


In [172]:
results[results < 0] = 0
np.mean(results)

2519423.3

In [230]:
RMSE(dataset.target, results)

36725877.69969698

In [174]:
len(results[results==0])/len(results)

0.944

In [175]:
estimator.fit(dataset.data, dataset.target)
importances = estimator.feature_importances_
feature_names = np.array(list(train_df)[:-1])
feature_names[importances > 0.05]

array(['date', 'totals.pageviews', 'totals.visits',
       'geoNetwork.city not available in demo dataset',
       'geoNetwork.country Canada'], dtype='<U46')

In [221]:
estimator.fit(X, y)
importances = estimator.feature_importances_
importances

array([0.13763548, 0.05549551, 0.06512627, 0.05960663, 0.04523174,
       0.12558777, 0.24392239, 0.12970753, 0.09652947, 0.04115723])

In [220]:
f = results[ np.abs(results - dataset.target)/ (dataset.target+1) < 0.1 ]
print(len(f), np.unique(f))

9414 [0.00000e+00 4.10210e+07 6.74220e+07 1.03764e+08 1.24829e+08 1.27088e+08]
