# Data Cleaning

## Import Libraries

In [None]:
import pandas as pd
from scripts.helper import reduce_mem_usage
from datetime import datetime
import glob
import numpy as np
from scripts.helper import reduce_mem_usage, load_df
pd.set_option("display.max_columns", 120)
import seaborn as sns
from pycaret.regression import *
from pycaret.utils import check_metric
from pycaret.datasets import get_data
import pickle
from sklearn import preprocessing

## Import Datasets

In [None]:
# Read the shrunk train and test CSVs. index_col=[0] turns each file's first
# column into the DataFrame index instead of a regular feature column.
dataset, dataset_test = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in ('data/shrunk_train_v2.csv', 'data/shrunk_test_v2.csv')
)

## Replacing missing values

In [None]:
# Count the missing entries in every column of the training set
dataset.isna().sum()

In [None]:
# Per-column defaults used to replace missing values (not all of them are
# zeros: text columns get a placeholder string, the flag column gets False).
# "totals.pageviews" is filled with the integer 1 rather than the string "1":
# a string fill would turn a numeric column into an object column, which the
# feature selection further down then leaves out of both the categorical
# list (name starts with 'total') and the numerical list (object dtype).
# NOTE(review): assumes totals.pageviews is read from the CSV as numeric -- confirm.
fill_values = {
    'totals.transactionRevenue': 0,
    "trafficSource.referralPath": "(not provided)",
    'totals.bounces': 0,
    'totals.newVisits': 0,
    "totals.pageviews": 1,
    'trafficSource.isTrueDirect': False,
    'trafficSource.adContent': 'None',
    'trafficSource.keyword': '(not provided)',
}

# Apply the same defaults, in place, to the train and the test split.
for frame in (dataset, dataset_test):
    frame.fillna(fill_values, inplace=True)

## Dropping unimportant features

In [None]:
# Columns removed from both splits, each with the stated reason.
dropped_columns = [
    'geoNetwork.metro',         # Too many missing values.
    'geoNetwork.continent',     # Too many missing values.
    'geoNetwork.subContinent',  # Too many missing values.
    'device.deviceCategory',    # Similar column to "device.isMobile".
]

dataset = dataset.drop(dropped_columns, axis=1)
dataset_test = dataset_test.drop(dropped_columns, axis=1)

## Export data

In [None]:
# Persist the cleaned training split (row index included) as its own CSV file
path = 'data/cleaned_train_v2.csv'
dataset.to_csv(path_or_buf=path)

In [None]:
# Persist the cleaned test split (row index included) as its own CSV file
path1 = 'data/cleaned_test_v2.csv'
dataset_test.to_csv(path_or_buf=path1)

# Baseline Model

## Dealing with POSIX time

In [None]:
def _replace_timestamp_with_date(frame):
    """Overwrite `date` with the day of `visitStartTime` and drop the latter.

    `visitStartTime` is parsed as seconds since the epoch (unit='s') and
    formatted as a 'YYYY-MM-DD' string. The `date` column is assigned on the
    frame that is passed in; the returned frame no longer has
    `visitStartTime`.
    """
    visit_start = pd.to_datetime(frame['visitStartTime'], unit='s')
    frame['date'] = visit_start.dt.strftime('%Y-%m-%d')
    return frame.drop('visitStartTime', axis=1)


# Convert the POSIX timestamps of both splits to calendar dates
dataset = _replace_timestamp_with_date(dataset)
dataset_test = _replace_timestamp_with_date(dataset_test)

In [None]:
# Inspect the dtype of every column in the training set; the dtypes decide
# below which columns count as categorical and which as numerical.
dataset.dtypes

## Categorical Columns

In [None]:
# Categorical features: object- or bool-typed columns, skipping every column
# whose name starts with 'total'.
categorical_cols = [
    column
    for column in dataset.columns
    if dataset[column].dtype in ('object', 'bool')
    and not column.startswith('total')
]

In [None]:
# Keep fullVisitorId out of the encoded features; it is used later, unencoded,
# as the per-visitor grouping key for the submission.
categorical_cols.remove('fullVisitorId')

In [None]:
# Display the final list of categorical feature names
categorical_cols

## Numerical Columns

In [None]:
# Numerical features: every column whose dtype is neither object nor bool.
numerical_cols = [
    column
    for column in dataset.columns
    if dataset[column].dtype not in ['object', 'bool']
]

In [None]:
# Exclude visitId from the numerical features
# NOTE(review): presumably because it is an identifier rather than a measurement -- confirm.
numerical_cols.remove('visitId')

In [None]:
# The revenue column becomes the prediction target below, so it must not also
# appear among the features.
numerical_cols.remove('totals.transactionRevenue')

In [None]:
# Display the final list of numerical feature names
numerical_cols

## Label Encoding

In [None]:
# Integer-encode every categorical feature in both splits and time the pass.
start_time = datetime.now()
for feature in categorical_cols:
    # Cast each split to str once; the arrays are reused for fit and transform.
    train_values = dataset[feature].values.astype('str')
    test_values = dataset_test[feature].values.astype('str')

    # Fit on train and test values together, so a value that occurs in only
    # one of the splits still gets a code.
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(np.concatenate([train_values, test_values]))

    # Replace the raw values by their integer codes.
    dataset[feature] = label_encoder.transform(train_values)
    dataset_test[feature] = label_encoder.transform(test_values)

    print("for this feature : {0} label-encoding was done succesfully".format(feature))
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

## Create Targets and Features

In [None]:
# Build the regression frames: the selected features plus the raw revenue as
# `Target`. `assign` returns a new DataFrame, so the target column is never
# written onto a column-subset of `dataset` / `dataset_test` (the pattern
# behind pandas' SettingWithCopyWarning).
feature_cols = categorical_cols + numerical_cols
reg_train = dataset[feature_cols].assign(Target=dataset['totals.transactionRevenue'])
reg_test = dataset_test[feature_cols].assign(Target=dataset_test['totals.transactionRevenue'])

In [None]:
# Keep a 30% sample (fixed seed) of the zero-revenue rows together with every
# non-zero-revenue row.
has_zero_revenue = reg_train['Target'] == 0
totals_transactionRevenue_zero = reg_train[has_zero_revenue].sample(frac=0.3, random_state=10)
totals_transactionRevenue_nonzero = reg_train[~has_zero_revenue]
reg_train = pd.concat(
    [totals_transactionRevenue_zero, totals_transactionRevenue_nonzero],
    axis=0,
)

In [None]:
# Put the target of both frames on the log(1 + revenue) scale.
for frame in (reg_train, reg_test):
    frame["Target"] = frame["Target"].apply(np.log1p)

## Regression Modeling

In [None]:
# The test frame serves as the unseen data for the final predictions.
data_unseen = reg_test
print('Data for Modeling: ', reg_train.shape, sep='')
print('Unseen Data For Predictions: ', data_unseen.shape, sep='')

### Setup for Pycaret

In [None]:
# Initialise the PyCaret regression experiment on the training frame. All
# selected features, the label-encoded ones included, are declared numeric.
exp_clf101 = setup(
    data=reg_train,
    target='Target',
    session_id=123,
    numeric_features=categorical_cols + numerical_cols,
)

In [None]:
# Show the models PyCaret makes available for this regression setup
models()

## Creating a Linear Regression Model

In [None]:
# Create the baseline model: 'lr' is PyCaret's ID for linear regression
lr = create_model('lr')

## Plot a Model

## Residual Plot

In [None]:
# No `plot` argument is passed, so PyCaret's default plot type is drawn
plot_model(lr) # Plot residuals for Linear Regression model

## Prediction Error Plot

In [None]:
# Prediction error plot for the linear regression model
plot_model(lr, plot = 'error') # Plot prediction error for Linear Regression model

## Feature Importance Plot

In [None]:
# Feature importance plot for the linear regression model
plot_model(lr, plot='feature') # Plot feature importance for Linear Regression model

## Predict on test / hold-out Sample

In [None]:
# No `data` argument is passed, so the model is scored on the hold-out sample
# kept aside by the experiment setup.
predict_model(lr)

## Finalize Model for Deployment

In [None]:
# Finalize the linear regression model; `final_lr` is the model used for the
# predictions on unseen data below.
final_lr = finalize_model(lr)

In [None]:
# Score the finalized model (again without a `data` argument)
predict_model(final_lr)

## Predict on unseen data

In [None]:
# Predict on the unseen test frame; the result is read below through its
# 'Target' (actual) and 'Label' (predicted) columns.
unseen_predictions = predict_model(final_lr, data=data_unseen)

In [None]:
# Visit-level RMSE between the actual log-scale Target and the predicted Label
check_metric(unseen_predictions["Target"], unseen_predictions.Label, 'RMSE')

# Final part - Creating submission file

## Saving the Predicted Target Values of the Model

In [None]:
# Keep only the predicted values (the 'Label' column) and preview them
sub_reg = unseen_predictions['Label']
sub_reg.head()

### Saving the result of prediction in CSV file

In [None]:
# index=False: write the predictions without the row index
sub_reg.to_csv("models/sub_reg_base.csv",index=False)

### Loading the Regression Result Back

In [None]:
# Reload the saved predictions; `sub_reg` is now a DataFrame with a 'Label'
# column and a fresh default row index.
sub_reg = pd.read_csv("models/sub_reg_base.csv")

In [None]:
# Predicted (log-scale) revenue per visit, as a Series
pred_test = sub_reg.Label

### Saving fullVisitorId to aggregate the submission file per customer

In [None]:
# Visitor IDs of the test rows as a plain array (row order of dataset_test),
# used as the first column of the submission frame.
test_id = dataset_test["fullVisitorId"].values
pred_target = pd.DataFrame({"fullVisitorId":test_id})

### Computing the predicted target value per fullVisitorId

In [None]:
# Removing negative values: clamp them to zero. `clip` returns a new Series
# instead of writing through a mask into a Series that was taken from
# `sub_reg`; whether such a write also changes `sub_reg` depends on the
# pandas version.
pred_test = pred_test.clip(lower=0)
# The predictions are on the log1p scale: convert them back with expm1 before
# summing all visits of each visitor.
pred_target["PredictedLogRevenue"] = np.expm1(pred_test)
pred_target = pred_target.groupby("fullVisitorId")["PredictedLogRevenue"].sum().reset_index()

### Submission file ready below!

In [None]:
# After the per-visitor sum the column holds summed revenue; move it back to
# the log1p scale for the submission.
pred_target.columns = ["fullVisitorId", "PredictedLogRevenue"]
pred_target["PredictedLogRevenue"] = np.log1p(pred_target["PredictedLogRevenue"])
# Write under "models/", the directory every other prediction output of this
# notebook uses (the path previously pointed at "model/").
pred_target.to_csv("models/submission_base.csv", index=False)

In [None]:
# Final submission file: preview the first rows (one row per fullVisitorId)
pred_target.head()

## RMSE Based on the Stakeholders' Definition

### Computing the actual target per fullVisitorId to compare with the predicted target

In [None]:
# Start the actual-revenue frame from the same visitor IDs (same row order)
# that were used for the predictions.
act_target = pd.DataFrame({"fullVisitorId":test_id})
act_target.head()

In [None]:
# Undo the log1p on the actual targets so revenue can be summed per visitor.
# `to_numpy()` assigns the values by position, which matches how `test_id`
# was taken from `dataset_test` (a positional array). Assigning the Series
# itself would align on index labels instead, and `act_target` carries a
# fresh default index.
act_target["Target_actual"] = np.expm1(unseen_predictions['Target'].to_numpy())
act_target = act_target.groupby("fullVisitorId")["Target_actual"].sum().reset_index()

In [None]:
# Bring the summed actual revenue back to the log1p scale, i.e. the same
# log(1 + revenue) transform that is applied to the summed predictions, so
# both columns are directly comparable (a zero-revenue visitor maps to 0).
# The earlier `np.log1p(1 + x)` computed log(2 + x) and biased the RMSE.
act_target.columns = ["fullVisitorId", "Target_actual"]
act_target["Target_actual"] = np.log1p(act_target["Target_actual"])
act_target.head()

In [None]:
# Defining the RMSE (based on Customer level).
# Both frames come out of a groupby on fullVisitorId over the same IDs, so
# their rows are in the same key order and are compared position by position.
# The mean is taken with NumPy rather than Python's builtin `sum`, which
# would iterate over the array one element at a time.
squared_errors = (act_target['Target_actual'].values - pred_target['PredictedLogRevenue'].values) ** 2
RMSE = np.sqrt(squared_errors.mean())

In [None]:
# Report the customer-level RMSE computed above
print(f'Your RMSE Score is: {RMSE}')