In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the flattened train/test exports. The two id columns are read as
# strings instead of being parsed as numbers.
id_dtypes = {'fullVisitorId': 'str', 'visitId': 'str'}
df = pd.read_csv("data/train.flat.csv", dtype=id_dtypes)
df_t = pd.read_csv("data/test.flat.csv", dtype=id_dtypes)

# A training column with a single distinct value (missing values counted as
# '') cannot help a model, so report it and remove it from the training frame.
constant_cols = [name for name in df.columns if df[name].fillna('').nunique() == 1]
for name in constant_cols:
    print("column skipped (cardinality = 1):", name)
df = df.drop(columns=constant_cols)

In [None]:
# Distribution of log(revenue + 1) over the rows that have a revenue value.
observed_revenue = df['totals.transactionRevenue'].dropna()
np.log(observed_revenue + 1).hist()

In [None]:
# Rows without a revenue value are treated as zero revenue from here on.
revenue_col = 'totals.transactionRevenue'
df[revenue_col] = df[revenue_col].fillna(0)

In [None]:
from datetime import datetime
def _encode_flag(value):
    """Return int(value) for a boolean-like cell, or 2 when the cell is NaN."""
    if np.isnan(value):
        return 2
    return int(value)


# Feature frames for the training and test sets, filled column by column.
sdf = pd.DataFrame()
sdf_t = pd.DataFrame()

# boolean
flag_columns = [
    'trafficSource.isTrueDirect',
    'trafficSource.adwordsClickInfo.isVideoAd',
    'device.isMobile',
    'totals.newVisits',
]
for col in flag_columns:
    sdf[col] = df[col].map(_encode_flag)
    sdf_t[col] = df_t[col].map(_encode_flag)
        
# numeric
# Min-max scale each numeric column using statistics from the TRAINING frame
# only, and apply the same transform to the test frame. Test values outside
# the training range therefore fall outside [0, 1]. Missing values become 0
# (i.e. the training minimum) after scaling.
for col in ['visitNumber', 'visitStartTime', 'totals.hits', 'totals.pageviews']:
    minv = df[col].min()
    maxv = df[col].max()
    # Divide by the range (max - min), not by max: dividing by max does not
    # map the column onto [0, 1] and squeezes large-offset columns such as
    # epoch timestamps into a tiny interval.
    span = maxv - minv
    if not (span > 0):
        # Constant or all-missing column: avoid dividing by zero / NaN.
        span = 1
    sdf[col] = ((df[col] - minv) / span).fillna(0)
    sdf_t[col] = ((df_t[col] - minv) / span).fillna(0)
    
from sklearn.preprocessing import LabelEncoder
    
def _as_clean_str(column):
    """Return ``column`` as strings, with missing values mapped to ''.

    The blanking happens BEFORE the string conversion. Doing it afterwards is
    a no-op, because ``astype(str)`` has already turned NaN into the literal
    string 'nan' (or, depending on the pandas version, kept it missing).
    """
    return column.astype(object).where(column.notna(), '').astype(str)


# string
# Columns that are label-encoded. The commented-out entries are deliberately
# excluded; 'trafficSource.campaignCode' does not exist in the test set.
categorical_columns = [
    'trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.page',
    'channelGrouping',
    'geoNetwork.region',
    'geoNetwork.country',
    'geoNetwork.metro',
    'trafficSource.campaign',
    'totals.bounces',
    # 'trafficSource.referralPath',
    'device.deviceCategory',
    'device.browser',
    'trafficSource.adContent',
    'trafficSource.medium',
    'geoNetwork.city',
    'trafficSource.source',
    'trafficSource.adwordsClickInfo.slot',
    'fullVisitorId',
    'geoNetwork.subContinent',
    'device.operatingSystem',
    # 'geoNetwork.networkDomain',
    'trafficSource.adwordsClickInfo.gclId',
    # 'trafficSource.campaignCode',
    'trafficSource.keyword',
    'geoNetwork.continent',
]

for col in categorical_columns:
    # Convert each column once and reuse it for both fit and transform.
    train_values = _as_clean_str(df[col])
    test_values = _as_clean_str(df_t[col])

    # Fit on train + test together so every category seen in either frame has
    # a code and transform() cannot fail on an unseen label.
    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values]))
    sdf[col] = le.transform(train_values)
    sdf_t[col] = le.transform(test_values)

def add_calendar_features(features, raw):
    """Return a copy of ``features`` with calendar columns derived from ``raw``.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature frame built so far (one row per row of ``raw``).
    raw : pandas.DataFrame
        Source frame providing ``date`` (formatted YYYYMMDD) and
        ``visitStartTime`` (epoch seconds).

    Returns
    -------
    pandas.DataFrame
        ``features`` plus ``weekday``, ``is_holiday``, ``day``, ``month``,
        ``year`` and ``visitHour``, appended in that order.
    """
    visit_date = pd.to_datetime(raw["date"], format="%Y%m%d")

    out = features.copy()
    out["weekday"] = visit_date.dt.weekday
    # Despite the column name this is a weekend flag (Saturday=5, Sunday=6).
    out["is_holiday"] = (out["weekday"] >= 5).astype(int)
    out["day"] = visit_date.dt.day
    out["month"] = visit_date.dt.month
    out["year"] = visit_date.dt.year
    # Take the hour from the RAW epoch timestamp. The 'visitStartTime' column
    # in the feature frame has already been rescaled, so reading the hour from
    # it yields the same value for every row. The hour is taken in UTC so the
    # result does not depend on the timezone of the machine running the cell.
    out["visitHour"] = pd.to_datetime(raw["visitStartTime"], unit="s").dt.hour
    return out


sdf = add_calendar_features(sdf, df)
sdf_t = add_calendar_features(sdf_t, df_t)

In [None]:
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn import svm
from sklearn.metrics import roc_curve, auc
from matplotlib import pyplot as plt

# Binary classification target: True for rows with positive revenue.
has_revenue = df['totals.transactionRevenue'].gt(0)
label = pd.to_numeric(has_revenue)

In [None]:
import xgboost as xgb
# from sklearn.grid_search import GridSearchCV

# Binary classifier that predicts whether a row has positive revenue.
xgb_params = dict(max_depth=6, n_jobs=8, objective='binary:logistic', reg_alpha=0.1)
clf_xgb = xgb.XGBClassifier(**xgb_params)

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    sdf, label, test_size=0.2, random_state=10
)

In [None]:
# Train the classifier on the training portion of the split.
clf_xgb.fit(X=X_train, y=y_train)

In [None]:
# Class probabilities for the hold-out rows, one column per class.
pred = clf_xgb.predict_proba(X_test)

# Split the column-0 probabilities by the TRUE hold-out label. `pred` is
# positional (row i belongs to the i-th row of X_test), so the mask must come
# from y_test. Masking with a full-length `df` condition makes pandas reindex
# the mask by label and silently select the wrong rows.
# NOTE(review): column 0 is the first entry of clf_xgb.classes_, presumably
# the "no revenue" class -- confirm before interpreting the plot.
is_cv = np.asarray(y_test, dtype=bool)
cv = pd.Series(pred[is_cv, 0])
non_cv = pd.Series(pred[~is_cv, 0])
plt.hist([cv, non_cv], stacked=True, color=['g', 'r'], log=True, bins=20)
print('accuracy:', clf_xgb.score(X_test, y_test))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, clf_xgb.predict(X_test)))
print(pred.shape)

In [None]:
from xgboost import plot_tree
# Build a graphviz object from the fitted classifier. No tree index is passed,
# so this presumably shows the library's default tree -- TODO confirm which one.
graph1 = xgb.to_graphviz(clf_xgb)
# Request PNG output, then render to the 'data/tree' path.
graph1.format = 'png'
graph1.render('data/tree')

In [None]:
# Classifier output for every training row, used as an extra regression
# feature.
# NOTE(review): column 0 is the first entry of clf_xgb.classes_, presumably
# the "no revenue" class rather than the conversion probability the name
# 'cv_prob' suggests -- confirm. The test-set feature must use the same column.
class_probabilities = clf_xgb.predict_proba(sdf)
proba = class_probabilities[:, 0]
sdf_cv = sdf.assign(cv_prob=proba)

# Regression target: log(revenue + 1) per row.
raw_revenue = df['totals.transactionRevenue']
revenue = np.log(raw_revenue + 1)

In [None]:
from sklearn.metrics import mean_squared_error
def RMSE(y_true, y_pred):
    """Return the square root of the mean squared error of the two inputs.

    Both arguments are passed unchanged to ``mean_squared_error``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from sklearn import linear_model

# Re-split, this time with the augmented features and the log-revenue target.
X_train, X_test, y_train, y_test = train_test_split(
    sdf_cv, revenue, test_size=0.2, random_state=10
)

# Lasso regression on log(revenue + 1).
clf = linear_model.Lasso(alpha=0.0001)
clf.fit(X_train, y_train)

# Hold-out error first, then the per-feature coefficients as the cell output.
y_pred = clf.predict(X_test)
print('RMSE:', RMSE(y_pred, y_test))

ddf = pd.DataFrame({'feature': sdf_cv.columns, 'coef': clf.coef_})
ddf

In [None]:
# Add the classifier-probability feature to the test frame, taking the same
# column (0) of predict_proba that the training feature uses.
test_probabilities = clf_xgb.predict_proba(sdf_t)
sdf_t_cv = sdf_t.assign(cv_prob=test_probabilities[:, 0])

In [None]:
# Per-row regression predictions for the test set, on the log(revenue + 1)
# scale the regressor was trained on.
pre = clf.predict(sdf_t_cv)

# Input and output shapes, then the distribution of the predictions.
print(sdf_t_cv.shape)
print(pre.shape)
pd.DataFrame(pre).hist()

In [None]:
# `pre` holds per-row predictions on the log(revenue + 1) scale. A visitor's
# log revenue is log(1 + sum of that visitor's revenue), NOT the sum of the
# per-row logs, so convert back to revenue before aggregating and take the log
# afterwards. Negative regression outputs are clipped to 0 first, because
# revenue cannot be negative.
session_revenue = np.expm1(np.clip(pre, 0, None))

ans = pd.DataFrame({
    'fullVisitorId': df_t['fullVisitorId'].astype(str),
    'session_revenue': session_revenue,
})
ans = (
    ans.groupby('fullVisitorId')['session_revenue']
    .sum()
    .pipe(np.log1p)
    .rename('PredictedLogRevenue')
    .reset_index()
)

ans.to_csv('data/xgb+lasso.csv', index=False)
ans.hist(bins=50)
print(ans.shape)