In [1]:
import pandas as pd 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)

# https://www.kaggle.com/c/ga-customer-revenue-prediction

In [2]:
data = pd.read_csv("./data/data_pre.csv",index_col=0)

In [3]:
data['logRevenue'] = np.log(data['totals.transactionRevenue']+1)

In [4]:
data['logRevenue'].describe()

count    18514.000000
mean        17.770575
std          1.186022
min          9.210440
25%         16.953935
50%         17.645455
75%         18.420681
max         23.864375
Name: logRevenue, dtype: float64

In [5]:
X = data.copy()
X.drop(['totals.transactionRevenue','date','logRevenue','totals.totalTransactionRevenue','fullVisitorId','visitId','totals.transactions'],axis=1,inplace=True)
y = data['logRevenue']

In [6]:
X.head()

Unnamed: 0,channelGrouping,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,0,4,1508200705,0,0,False,0,0,0,0,0,0,0,0,9,0.0,9,13.0,261.0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0
1,1,11,1508192636,0,0,False,1,1,0,0,1,0,1,0,15,0.0,12,38.0,285.0,0,0,0,0,0.0,0,0,0.0,1,1,1,1,1
2,0,6,1508162218,1,1,True,2,2,0,0,2,1,1,0,15,0.0,15,42.0,1044.0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0
3,1,17,1508189401,0,0,False,0,2,0,0,2,0,1,0,18,0.0,16,77.0,514.0,0,0,0,0,0.0,0,0,0.0,0,1,1,1,1
4,1,1,1508190484,0,0,False,0,0,0,0,0,0,0,0,21,1.0,20,62.0,487.0,0,0,0,0,0.0,0,0,0.0,1,1,1,1,1


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)

# Using Random Forest

In [8]:
rf = RandomForestRegressor(n_estimators=100)

In [9]:
param = {'max_depth':[5,10,15,20,25,30]}

In [10]:
rf_cv = GridSearchCV(rf,param,cv=10,verbose=True,scoring='neg_mean_squared_log_error')

rf_cv.fit(X_train, y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 14.1min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_log_error', verbose=True)

In [23]:
rf_cv.grid_scores_



[mean: -0.00343, std: 0.00013, params: {'max_depth': 3},
 mean: -0.00337, std: 0.00013, params: {'max_depth': 4},
 mean: -0.00332, std: 0.00014, params: {'max_depth': 5},
 mean: -0.00328, std: 0.00014, params: {'max_depth': 6},
 mean: -0.00326, std: 0.00015, params: {'max_depth': 7},
 mean: -0.00324, std: 0.00015, params: {'max_depth': 8},
 mean: -0.00322, std: 0.00015, params: {'max_depth': 9},
 mean: -0.00322, std: 0.00016, params: {'max_depth': 10}]

In [24]:
best_model = rf_cv.best_estimator_
print(best_model)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)


In [26]:
predicted_train = best_model.predict(X_train)
best_model.score(X_train,y_train)

0.39468197524040616

In [13]:
rmse_train = np.sqrt(mean_squared_error(y_train, predicted_train))
print("RMSE: %f" % (rmse_train))

RMSE: 0.921405


In [14]:
predicted_train = best_model.predict(X_test)
best_model.score(X_test,y_test)

0.20196553754876723

In [15]:
rmse = np.sqrt(mean_squared_error(y_test, predicted_train))
print("RMSE: %f" % (rmse))

RMSE: 1.063005


In [16]:
predicted_vs_actual = y_test.copy()
predicted_vs_actual['predicted'] = np.array(predicted_train)

predicted_vs_actual.head()

6183     20.9055
15939    18.2364
14837    16.9061
3157     18.0461
11484    16.5868
Name: logRevenue, dtype: object

In [17]:
y_test.shape

(5555,)

In [18]:
feature_importances = pd.DataFrame(best_model.feature_importances_,index = X_train.columns,columns=['importance']).sort_values('importance',ascending=False)
feature_importances.head()

Unnamed: 0,importance
totals.hits,0.248179
visitNumber,0.136764
visitStartTime,0.115872
totals.timeOnSite,0.084665
totals.pageviews,0.050865


# Use XGBoost

In [31]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

In [35]:
xg_reg = XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 100)

In [36]:
xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_test)



In [37]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 1.060898


In [None]:
# Model Selector


In [None]:
#from sklearn.feature_selection import SelectFromModel

In [None]:
#sel = SelectFromModel(RandomForestClassifier(n_estimators = 1))
#sel.fit(X_train, y_train)

In [None]:
#sel.get_support()

In [None]:
#selected_feat= X_train.columns[(sel.get_support())]
#len(selected_feat)

In [None]:
#print(selected_feat)

In [None]:
#pd.series(sel.estimator_,feature_importances_,.ravel()).hist()