In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import json
import gc
import sys
import math

from pandas.io.json import json_normalize
from datetime import datetime

import os
print(os.listdir("../input/ga-customer-revenue-prediction"))

In [None]:
# Function to load the data and flatten (normalize) the JSON-encoded columns

gc.enable()
  
features = ['channelGrouping', 'date', 'fullVisitorId','socialEngagementType', 'visitId',\
       'visitNumber', 'visitStartTime', 'device.browser',\
       'device.deviceCategory', 'device.isMobile', 'device.operatingSystem', \
       'geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country',\
       'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',\
       'geoNetwork.subContinent', 'totals.visits', 'totals.hits',\
       'totals.newVisits', 'totals.pageviews', 'totals.transactionRevenue','totals.totalTransactionRevenue',\
       'trafficSource.adContent', 'trafficSource.campaign', 'trafficSource.medium', \
       'trafficSource.source', 'customDimensions']

  
        
def load_df(csv_path, columns=None, chunksize=100000):
    """Read a sessions CSV in chunks, flatten its JSON columns and keep selected features.

    The columns 'device', 'geoNetwork', 'totals' and 'trafficSource' are parsed
    with ``json.loads`` and expanded into ``"<column>.<subfield>"`` columns.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file to read. 'fullVisitorId' is always read as a string.
    columns : list of str, optional
        Columns to keep, in order. Defaults to the module-level ``features`` list.
    chunksize : int, optional
        Number of CSV rows parsed per chunk (default 100000).

    Returns
    -------
    pandas.DataFrame
        One row per CSV row with exactly ``columns`` as columns and a fresh
        0..n-1 index. A requested column that a chunk does not contain (for
        example a JSON sub-field absent from every row of that chunk) is
        filled with NaN instead of raising ``KeyError``.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    if columns is None:
        columns = features

    reader = pd.read_csv(csv_path, sep=',',
            converters={column: json.loads for column in JSON_COLUMNS},
            dtype={'fullVisitorId': 'str'},
            chunksize=chunksize)

    # Collect the reduced chunks and concatenate once at the end; concatenating
    # inside the loop re-copies everything accumulated so far on every chunk.
    parts = []
    for chunk in reader:
        # Positional 0..n-1 index so the flattened JSON frames line up row by row.
        chunk = chunk.reset_index(drop=True)
        pieces = [chunk.drop(columns=JSON_COLUMNS)]
        for column in JSON_COLUMNS:
            # pd.json_normalize is the supported entry point (pandas >= 1.0);
            # a plain list of dicts avoids any dependence on the Series index.
            flat = pd.json_normalize(chunk[column].tolist())
            flat.columns = [f"{column}.{subcolumn}" for subcolumn in flat.columns]
            pieces.append(flat)

        wide = pd.concat(pieces, axis=1)
        parts.append(wide.reindex(columns=columns))
        del chunk, pieces, wide
        gc.collect()

    if not parts:
        return pd.DataFrame(columns=columns)
    return pd.concat(parts, axis=0, ignore_index=True)

In [None]:
# Reduce memory usage by downcasting each numeric column to the smallest dtype that can hold its values.

def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to smaller NumPy dtypes.

    Integer columns are narrowed to the first of int8/int16/int32 whose range
    contains the column's min and max (bounds inclusive); float columns are
    narrowed to float16 or float32 by the same range test. Columns are never
    widened. Bool, object, datetime, categorical and pandas extension dtypes
    are left untouched.

    Note: the float test checks range only, so narrowing a float column can
    lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    bytes_per_mb = 1024 ** 2  # memory_usage() reports bytes
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Initial Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy integer/float columns are candidates. In particular
        # bool columns must not be routed through the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'f':
            candidates, limits_of = float_candidates, np.finfo
        else:
            candidates, limits_of = int_candidates, np.iinfo

        for candidate in candidates:
            # Candidates are ordered by size; stop once they no longer save memory.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # which leaves the column unchanged.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df


# df_train / df_test must exist before they can be shrunk; no other cell in this
# notebook creates them, so load them here with load_df.
# NOTE(review): 'train_v2.csv' is assumed by analogy with the 'test_v2.csv' path
# read near the end of the notebook - confirm against the directory listing
# printed by the first cell.
df_train = load_df('../input/ga-customer-revenue-prediction/train_v2.csv')
df_test = load_df('../input/ga-customer-revenue-prediction/test_v2.csv')

df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

In [None]:
#Exploratory Data Analysis

In [None]:
print(df_train.shape)
print(df_test.shape)

In [None]:
df_train.columns

In [None]:
df_train.dtypes

In [None]:
df_train.select_dtypes(include='object').columns

In [None]:
df_train.select_dtypes(exclude='object').columns

In [None]:
df_train.head()

In [None]:
#Checking and Filling Nulls

In [None]:
#check columns contain nulls with their null percentage
# Percentage of missing values for every training column that has at least one null.
missing_per_column = df_train.isnull().sum()
null_percentage = pd.DataFrame()
for col, n_missing in missing_per_column[missing_per_column > 0].items():
    null_percentage.loc[col,'NullPercentage'] = n_missing/len(df_train) * 100
print(null_percentage)
        

In [None]:
df_train['totals.transactionRevenue'].value_counts()

In [None]:
#As this is a duplicate column
df_train.drop('totals.totalTransactionRevenue', axis=1, inplace=True)
df_test.drop('totals.totalTransactionRevenue', axis=1, inplace=True)

In [None]:
df_train.drop('trafficSource.adContent', axis=1, inplace=True)

In [None]:
#fill null in 'totals.pageviews' by mode
print(df_train['totals.pageviews'].isnull().sum())
print(df_train['totals.pageviews'].dtype)

print(df_train['totals.pageviews'] .mode())
print((df_train["totals.pageviews"]=='1').sum())
df_train['totals.pageviews'] = df_train['totals.pageviews'].fillna(1)

df_train['totals.pageviews'] = df_train['totals.pageviews'].astype(int)

In [None]:
#fill null in 'totals.newVisits'
print(df_train['totals.newVisits'].isnull().sum())
print(df_train['totals.newVisits'].dtype)

print(df_train['totals.newVisits'] .mode())
print((df_train["totals.newVisits"]=='1').sum())
# This cell used to fill and cast 'totals.pageviews' a second time (copy-paste
# slip), leaving the nulls in 'totals.newVisits' untouched.
# The only non-null value is '1', so filling with the mode would make the column
# constant. Missing is treated as "not a new visit" and filled with 0.
# NOTE(review): confirm that null means "returning visitor" in this export, and
# apply the same fill to df_test so the feature means the same in both frames.
df_train['totals.newVisits'] = df_train['totals.newVisits'].fillna(0)

df_train['totals.newVisits'] = df_train['totals.newVisits'].astype(int)

In [None]:
unique_customers_no = df_train['fullVisitorId'].nunique()
total_customers_no = df_train['visitId'].count()
print("No of unique customers {} from {} total customers".format(unique_customers_no , total_customers_no))

In [None]:
df_train['totals.transactionRevenue'] = df_train['totals.transactionRevenue'].astype(float)
df_train['totals.transactionRevenue'] = df_train['totals.transactionRevenue'].fillna(0)

print(df_train['totals.transactionRevenue'].sort_values().unique()[1])
print(df_train['totals.transactionRevenue'].sort_values().unique()[-1])

In [None]:
target = df_train.groupby('fullVisitorId')[['totals.transactionRevenue']].sum().reset_index()

customers_making_revenue = (target["totals.transactionRevenue"]>0).sum()

print("No of different customers with no zero revenue= {} from total {} customers".format(customers_making_revenue,unique_customers_no))
print("Percentage of customers with no zero revenue = ",round(customers_making_revenue/unique_customers_no*100 , 2))    

In [None]:
target.sort_values(by='totals.transactionRevenue' , ascending=False).head()

In [None]:
sns.set_theme(style="darkgrid")
ax = sns.countplot(x=pd.cut( df_train['totals.transactionRevenue'], [-10,0,2e11]) )
ax.set_xticklabels(["0$ revenue customers" , "revenue customers"])
ax.set_xlabel('Revenue' , fontsize=16 , color='Black')
ax.set_ylabel('Number of Customers' , fontsize=16 , color='Black')

In [None]:
# Work on a copy so that turning zero revenue into NaN (which lets 'count'
# count only revenue-generating visits) does not alter df_train itself.
# NOTE(review): df_train2 was never created in this notebook; a copy of
# df_train is assumed to be what was intended.
df_train2 = df_train.copy()
df_train2['totals.transactionRevenue'] = df_train2['totals.transactionRevenue'].replace(0,np.nan)

REVENUE_METRICS = ['total transactions no','non zero transactions no','average revenue']


def _revenue_summary(frame, by):
    """Per-group visit count, count of visits with revenue, and mean revenue of those visits."""
    summary = frame.groupby(by)[['totals.transactionRevenue']].agg(['size', 'count', 'mean'])
    summary.columns = REVENUE_METRICS
    return summary


dev_category = _revenue_summary(df_train2, 'device.deviceCategory')

dev_os = _revenue_summary(df_train2, 'device.operatingSystem')
dev_os = dev_os.sort_values(by='total transactions no', ascending=False)

dev_browser = _revenue_summary(df_train2, 'device.browser')
dev_browser = dev_browser.sort_values(by='total transactions no', ascending=False)


# One row of panels per device attribute, one column per metric. Labels and
# values of every panel come from the SAME summary table: the browser row
# previously plotted operating-system numbers against browser names, and the
# OS row mixed a 5-row `data=` frame with 6-element vectors.
fig , ax = plt.subplots(3 , 3 , figsize=(15,8))
panel_rows = [(dev_category, None), (dev_os, 6), (dev_browser, 6)]
for row_no, (summary, top_n) in enumerate(panel_rows):
    shown = summary if top_n is None else summary.head(top_n)
    for col_no, metric in enumerate(REVENUE_METRICS):
        panel = sns.barplot(y=shown.index , x=shown[metric], palette="viridis", ax=ax[row_no, col_no])
        if col_no > 0:
            # Only the left-most panel of each row keeps its y-axis label.
            panel.set(ylabel=None)

fig.show()

In [None]:
# Revenue summaries per geographic attribute, each sorted by number of visits.
geo_summaries = []
for geo_column in ('geoNetwork.continent', 'geoNetwork.country', 'geoNetwork.city',
                   'geoNetwork.region', 'geoNetwork.networkDomain'):
    geo_summary = df_train2.groupby(geo_column)[['totals.transactionRevenue']].agg(['size', 'count', 'mean'])
    geo_summary.columns = ['total transactions no','non zero transactions no','average revenue']
    geo_summaries.append(geo_summary.sort_values(by='total transactions no', ascending=False))
geo_continent, geo_country, geo_city, geo_region, geo_network = geo_summaries


# 5 x 3 grid: one row per attribute, one column per metric. Continents are shown
# in full; every other attribute is limited to its 10 most-visited values.
fig , ax = plt.subplots(5 , 3 , figsize=(15,40))
geo_panel_rows = [
    (geo_continent, None, "magenta"),
    (geo_country, 10, "cyan"),
    (geo_city, 10, "salmon"),
    (geo_region, 10, "green"),
    (geo_network, 10, "orange"),
]
geo_metrics = ['total transactions no', 'non zero transactions no', 'average revenue']
for row_no, (geo_summary, top_n, bar_color) in enumerate(geo_panel_rows):
    for col_no, metric in enumerate(geo_metrics):
        plt.subplot(5, 3, row_no * 3 + col_no + 1)
        if top_n is None:
            labels, values = geo_summary.index, geo_summary[metric]
        else:
            labels, values = geo_summary.index[0:top_n], geo_summary[metric].head(top_n)
        panel = sns.barplot(y=labels , x=values, color=bar_color)
        if col_no > 0:
            # Only the first panel of each row keeps its y-axis label.
            panel.set(ylabel=None)


fig.show()

In [None]:
# Revenue summaries per traffic source and medium, each sorted by number of visits.
tf_summaries = []
for tf_column in ('trafficSource.source', 'trafficSource.medium'):
    tf_summary = df_train2.groupby(tf_column)[['totals.transactionRevenue']].agg(['size', 'count', 'mean'])
    tf_summary.columns = ['total transactions no','non zero transactions no','average revenue']
    tf_summaries.append(tf_summary.sort_values(by='total transactions no', ascending=False))
tf_source, tf_medium = tf_summaries


# 2 x 3 grid: one row per attribute (top 10 values), one column per metric.
fig , ax = plt.subplots(2 , 3 , figsize=(15,10))
tf_metrics = ['total transactions no', 'non zero transactions no', 'average revenue']
for row_no, tf_summary in enumerate((tf_source, tf_medium)):
    for col_no, metric in enumerate(tf_metrics):
        plt.subplot(2, 3, row_no * 3 + col_no + 1)
        panel = sns.barplot(y=tf_summary.index[0:10] , x=tf_summary[metric].head(10), palette="Spectral")
        if col_no > 0:
            # Only the first panel of each row keeps its y-axis label.
            panel.set(ylabel=None)

fig.show()

In [None]:
 # check for constant columns

In [None]:
#df_train.select_dtypes(exclude='object').columns

remain_features= ['visitNumber', 'device.isMobile', 'totals.pageviews','socialEngagementType','channelGrouping'
                    ,'geoNetwork.metro','totals.visits', 'totals.hits','totals.newVisits', 
                    'trafficSource.campaign', 'customDimensions' ]

for col in remain_features:
    if len(df_train[col].unique()) == 1:
        print("{} is constant".format(col))

In [None]:
print(df_train['socialEngagementType'].unique())
print(df_train['totals.visits'].unique())
['Not Socially Engaged']
['1']

In [None]:
#drop columns that contain constant values.
df_train.drop('socialEngagementType',axis=1, inplace=True)
df_train.drop('totals.visits',axis=1, inplace=True)

In [None]:
sns.pairplot(df_train, diag_kind="hist")

In [None]:
df_train.columns

In [None]:
df_train['totals.hits']= df_train['totals.hits'].astype(int)

In [None]:
print(df_train['totals.newVisits'].value_counts())
print('-'*30)
print(df_train['visitNumber'].value_counts()[:10])
print('-'*30)
print(df_train['device.isMobile'].value_counts()[:10])

In [None]:
print(df_train['totals.hits'].value_counts()[:10])
print('-'*30)
print(df_train['totals.pageviews'].value_counts()[:10])

In [None]:
#Transform & Dealing with dates

In [None]:
print(df_train['visitStartTime'][0])
df_train['visitStartTime'] = pd.to_datetime(df_train['visitStartTime'], unit='s')
print(df_train['visitStartTime'][0])
df_train['vst_dayofweek'] = df_train['visitStartTime'].dt.dayofweek
df_train['vst_hours'] = df_train['visitStartTime'].dt.hour
df_train['vst_dayofmonth'] = df_train['visitStartTime'].dt.day
print(df_train['vst_dayofweek'][0], df_train['vst_hours'][0], df_train['vst_dayofmonth'][0])
df_train.drop('visitStartTime', axis = 1, inplace = True)
    

In [None]:
format_str = '%Y%m%d'
df_train['formated_date'] = df_train['date'].apply(lambda x: datetime.strptime(str(x), format_str))
df_train['year'] = df_train['formated_date'].apply(lambda x:x.year)
df_train['month'] = df_train['formated_date'].apply(lambda x:x.month)
df_train['quarterMonth'] = df_train['formated_date'].apply(lambda x:x.day//8)
df_train['day'] = df_train['formated_date'].apply(lambda x:x.day)
df_train['weekday'] = df_train['formated_date'].apply(lambda x:x.weekday())

df_train.drop(['date','formated_date'], axis=1, inplace=True)

In [None]:
#drop ID columns as they're irrelevant features
irrelavant_features = ['fullVisitorId', 'visitId']
for col in irrelavant_features:
    df_train.drop(col, axis = 1, inplace = True)

In [None]:
import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [None]:
#Transform categorical features to numerical ones using Label Encoding

In [None]:
%%time
le = LabelEncoder()
print('Categorical columns that will be converted:')
for col in df_train.columns:
    if df_train[col].dtype == 'O':
        print(col)
        #print(col, train[col].unique())
        # astype(str) gives the encoder uniformly typed input even when a
        # column mixes strings with NaN.
        train_values = df_train[col].astype(str)
        # Integer codes are only meaningful relative to the fitted vocabulary.
        # When df_test still holds the same raw categorical column, fit on the
        # union of both frames and encode both here, so a given code denotes
        # the same category in train and test.
        in_test = col in df_test.columns and df_test[col].dtype == 'O'
        if in_test:
            test_values = df_test[col].astype(str)
            le.fit(pd.concat([train_values, test_values], ignore_index=True))
            df_test[col] = le.transform(test_values)
        else:
            le.fit(train_values)
        # Plain column assignment replaces the column and its dtype;
        # `df.loc[:, col] = ...` writes into the existing object column on
        # recent pandas and leaves the dtype as object.
        df_train[col] = le.transform(train_values)

In [None]:
#Repeat the same analysis on the test set

In [None]:
print('No of columns in test set ',len(df_test.columns))
print('Columns in train and not in test are: ',set(df_train)-set(df_test))

null_percentage = pd.DataFrame()
for col in df_test.columns:
    if df_test[col].isnull().sum() > 0:
        null_percentage.loc[col,'NullPercentage'] = (df_test[col].isnull().sum())/len(df_test) * 100 
print(null_percentage)

In [None]:
#drop or fill
# Output captured from the previous cell (kept for reference):
# No of columns in test set  28
# Columns in train and not in test are:  {'vst_hours', 'weekday', 'year', 'quarterMonth', 'vst_dayofweek', 'day', 'month', 'vst_dayofmonth'}
#                            NullPercentage
# totals.newVisits                28.766724
# totals.pageviews                 0.025150
# totals.transactionRevenue 

In [None]:
print(df_test['totals.pageviews'].isnull().sum())
print(df_test['totals.pageviews'].dtype)

print(df_test['totals.pageviews'] .mode())
print((df_test["totals.pageviews"]=='1').sum())

df_test['totals.pageviews'] = df_test['totals.pageviews'].fillna(1)

df_test['totals.pageviews'] = df_test['totals.pageviews'].astype(int)
101

In [None]:
print(df_test['totals.newVisits'].isnull().sum())
print(df_test['totals.newVisits'].dtype)

print(df_test['totals.newVisits'] .mode())
print((df_test["totals.newVisits"]=='1').sum())
# This cell used to fill and cast 'totals.pageviews' a second time (copy-paste
# slip), leaving the nulls in 'totals.newVisits' untouched.
# Missing is treated as "not a new visit" and filled with 0.
# NOTE(review): confirm that null means "returning visitor" in this export, and
# keep df_train's 'totals.newVisits' on the same 0/1 representation.
df_test['totals.newVisits'] = df_test['totals.newVisits'].fillna(0)

df_test['totals.newVisits'] = df_test['totals.newVisits'].astype(int)

In [None]:
df_test.drop('totals.transactionRevenue', axis=1, inplace=True)

In [None]:
# Report columns of the test set that hold a single value.
# The previous condition compared the bound method `df_test.nunique` with 1,
# which is never true, so nothing was ever reported.
# The columns are only reported, not dropped: a later cell removes an explicit
# list of columns from df_test, and df_test must keep every column the model is
# trained on.
for col in df_test.columns:
    # dropna=False counts NaN as a value, matching the len(unique()) check
    # used for df_train.
    if df_test[col].nunique(dropna=False) == 1:
        print("{} is constant".format(col))

In [None]:
# Cast the TEST frame's hit counts to int, as was already done for df_train
# earlier in the notebook; this cell previously re-cast df_train, leaving
# df_test['totals.hits'] as strings.
df_test['totals.hits']= df_test['totals.hits'].astype(int)

In [None]:
print(df_test['visitStartTime'][0])
df_test['visitStartTime'] = pd.to_datetime(df_test['visitStartTime'], unit='s')
print(df_test['visitStartTime'][0])
df_test['vst_dayofweek'] = df_test['visitStartTime'].dt.dayofweek
df_test['vst_hours'] = df_test['visitStartTime'].dt.hour
df_test['vst_dayofmonth'] = df_test['visitStartTime'].dt.day
print(df_test['vst_dayofweek'][0], df_test['vst_hours'][0], df_test['vst_dayofmonth'][0])
df_test.drop('visitStartTime', axis = 1, inplace = True)

In [None]:
format_str = '%Y%m%d'
df_test['formated_date'] = df_test['date'].apply(lambda x: datetime.strptime(str(x), format_str))
df_test['year'] = df_test['formated_date'].apply(lambda x:x.year)
df_test['month'] = df_test['formated_date'].apply(lambda x:x.month)
df_test['quarterMonth'] = df_test['formated_date'].apply(lambda x:x.day//8)
df_test['day'] = df_test['formated_date'].apply(lambda x:x.day)
df_test['weekday'] = df_test['formated_date'].apply(lambda x:x.weekday())

df_test.drop(['date','formated_date'], axis=1, inplace=True)

In [None]:
#drop the ID columns and the two constant columns from the test set
# NOTE(review): the drop call was cut off in this cell; it is completed by
# analogy with the equivalent df_train loop earlier in the notebook.
irrelavant_features = ['fullVisitorId', 'visitId', 'socialEngagementType' , 'totals.visits']
for col in irrelavant_features:
    df_test.drop(col, axis = 1, inplace = True)

In [None]:
#Categorical columns that will be converted:
# channelGrouping
# device.browser
# device.deviceCategory
# device.operatingSystem
# geoNetwork.city
# geoNetwork.continent
# geoNetwork.country
# geoNetwork.metro
# geoNetwork.networkDomain
# geoNetwork.region
# geoNetwork.subContinent
# totals.hits
# totals.newVisits
# trafficSource.adContent
# trafficSource.campaign
# trafficSource.medium
# trafficSource.source
# customDimensions

In [None]:
le = LabelEncoder()
print('Categorical columns that will be converted:')
# NOTE(review): the encoder is fitted on df_test alone here, so the integer
# codes produced in this cell are not guaranteed to match codes fitted on
# df_train for the same column.
for col in df_test.columns:
    if df_test[col].dtype == 'O':
        print(col)
        #print(col, train[col].unique())
        # astype(str) gives the encoder uniformly typed input when a column
        # mixes strings with NaN. Plain column assignment replaces the object
        # column with the integer codes; `df.loc[:, col] = ...` keeps the
        # object dtype on recent pandas.
        df_test[col] = le.fit_transform(df_test[col].astype(str))

In [None]:
print('Columns in train and not in test are: ',set(df_train)-set(df_test))

In [None]:
#Modelling

In [None]:
model = lgb.LGBMRegressor(
        num_leaves = 31,  #(default = 31) – Maximum tree leaves for base learners.
        learning_rate = 0.03, #(default = 0.1) – Boosting learning rate. You can use callbacks parameter of fit method to shrink/adapt learning rate in training using 
                              #reset_parameter callback. Note, that this will ignore the learning_rate argument in training.
        n_estimators = 1000, #(default = 100) – Number of boosted trees to fit.
        subsample = .9, #(default = 1.) – Subsample ratio of the training instance.
        subsample_freq = 1, #(default = 0) – Frequency of subsampling; with the default 0 row subsampling is disabled and `subsample` is ignored.
        colsample_bytree = .9, #(default = 1.) – Subsample ratio of columns when constructing each tree
        random_state = 34
)

In [None]:
# Row counts of both frames and the size of the test set relative to the train set.
# NOTE(review): the second statement was cut off after `len(df_t`; df_train is
# assumed as the denominator.
print(len(df_train) , len(df_test))
print(len(df_test) / len(df_train))

In [None]:
print(len(df_train) - len(df_test))

In [None]:
# Target: per-visit transaction revenue (still on the raw scale; log1p is
# applied where it is compared with predictions).
# NOTE(review): this statement was cut off after `'totals.transactio`; the
# column name is completed from the drop in the following cells.
y = df_train['totals.transactionRevenue']

In [None]:
y

In [None]:
df_train.drop('totals.transactionRevenue' , axis=1, inplace=True)

In [None]:
X = df_train
X.shape

In [None]:
# Positional hold-out split: the first SPLIT_AT rows train the model, the rest validate it.
# Both halves use the same boundary; slicing validation from SPLIT_AT + 1
# silently dropped the row at position SPLIT_AT from both sets.
# NOTE(review): 1306748 presumably equals len(df_train) - len(df_test), as
# printed by an earlier cell - confirm.
SPLIT_AT = 1306748

X_train = X[:SPLIT_AT]
X_val = X[SPLIT_AT:]

y_train = y[:SPLIT_AT]
y_val = y[SPLIT_AT:]

print(X_train.shape , y_train.shape)
print(X_val.shape , y_val.shape)

In [None]:
# Positional hold-out split: the first SPLIT_AT rows train the model, the rest validate it.
# Both halves use the same boundary; slicing validation from SPLIT_AT + 1
# silently dropped the row at position SPLIT_AT from both sets.
# NOTE(review): the last print was cut off after `y_val.sha`; completed as
# `y_val.shape`, matching the line above it.
SPLIT_AT = 1306748

X_train = X[:SPLIT_AT]
X_val = X[SPLIT_AT:]

y_train = y[:SPLIT_AT]
y_val = y[SPLIT_AT:]

print(X_train.shape , y_train.shape)
print(X_val.shape , y_val.shape)

In [None]:
# NOTE(review): this cell contained only the echoed repr of the estimator
# (kept below), not the call that trains it, and `LGBMRegressor` is not a name
# imported in this notebook. The fit call is reconstructed: the target is
# assumed to be log1p(revenue) because predictions are later compared against
# np.log1p(y_val). Any eval_set / early-stopping arguments the original call
# used are unknown - confirm.
#
# LGBMRegressor(colsample_bytree=0.9, learning_rate=0.03, n_estimators=1000,
#               random_state=34, subsample=0.9)
model.fit(X_train, np.log1p(y_train))

In [None]:
#Check features Importance

In [None]:
features_importance = pd.DataFrame()
features_importance['feature'] = X_train.columns
features_importance['importance'] = model.booster_.feature_importance(importance_type = 'gain')
features_importance.sort_values(by = 'importance', ascending = False)[:10]

In [None]:
predictions = model.predict(X_val, num_iteration = model.best_iteration_)
predictions[predictions < 0] = 0

In [None]:
mean_squared_error(np.log1p(y_val), predictions)

In [None]:
test_predictions = pd.DataFrame()
test_predictions = model.predict(df_test[X_train.columns], num_iteration = model.best_iteration_)
test_predictions[test_predictions < 0] = 0
test_predictions

In [None]:
test_predictions = pd.DataFrame(test_predictions)
test_predictions.head()

In [None]:
# Read the visitor ids as strings from the start, exactly as load_df does.
# Letting pandas infer a numeric dtype and converting to str afterwards drops
# leading zeros, so the ids no longer match the original ones.
test_Ids = pd.read_csv('../input/ga-customer-revenue-prediction/test_v2.csv', usecols=['fullVisitorId'],
                       dtype={'fullVisitorId': str})
test_Ids.dtypes

In [None]:
submission = pd.concat([test_Ids , test_predictions] , axis=1)
submission.columns = ['fullVisitorId','PredictedLogRevenue']
submission.head(10)

In [None]:
# Aggregate per-visit predictions to one value per visitor.
# The predictions are on the log1p scale, and log values cannot simply be
# summed: convert each visit back to revenue, sum per visitor, then re-apply log1p.
# NOTE(review): assumes the model was trained on log1p(revenue), as the
# comparison against np.log1p(y_val) and the column name suggest - confirm.
visit_revenue = np.expm1(submission['PredictedLogRevenue'])
visitor_revenue = visit_revenue.groupby(submission['fullVisitorId']).sum()
submission = pd.DataFrame({
    'fullVisitorId': visitor_revenue.index,
    'PredictedLogRevenue': np.log1p(visitor_revenue.values),
})
submission.head(10)

In [None]:
submission.to_csv('submit.csv', index=False)