# Customer Revenue Prediction

## Baseline LightGBM Model
*Machine Learning Nanodegree Program | Capstone Project*

---

In this notebook I create a baseline model that will be used to evaluate the performance of the PyTorch model built as part of the project.

### Overview:
- Reading the data
- Initializing the LightGBM model
- Training the model with the train dataset
- Validating the model using the val dataset
- Predicting the revenue for customers in the test dataset
- Visualizing the results
- Saving the baseline results to a CSV file

First, import the relevant libraries into the notebook.

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

from os import path
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [None]:
files_dir = '../data/files'

# Fail fast: every file used below lives inside this directory.
if not path.exists(files_dir):
    raise Exception('{} directory not found.'.format(files_dir))


def _in_files_dir(file_name):
    """Return the location of ``file_name`` inside ``files_dir``."""
    return '{}/{}'.format(files_dir, file_name)


# Inputs read by this notebook.
train_file = _in_files_dir('train.zip')
val_file = _in_files_dir('val.zip')
test_file = _in_files_dir('test.zip')

# Outputs written by this notebook.
baseline_file = _in_files_dir('baseline.zip')
imp_features_file = _in_files_dir('lgbm_importances-01.png')

# Echo every resolved location so the run log shows what was used.
for label, location in (
    ('Train', train_file),
    ('Val', val_file),
    ('Test', test_file),
    ('Baseline', baseline_file),
    ('Important Features', imp_features_file),
):
    print('\n{} file: {}'.format(label, location))

In [None]:
def load_data(zip_path):
    """Read a zip-compressed CSV into a DataFrame and report its size.

    Args:
        zip_path: Location of the zip archive holding the CSV file.

    Returns:
        pandas.DataFrame with the archive's contents. The ``fullVisitorId``
        column is read with dtype ``str`` rather than inferred.
    """
    frame = pd.read_csv(
        zip_path,
        compression='zip',
        dtype={'fullVisitorId': 'str'}
    )

    n_rows, n_columns = frame.shape
    summary = '\nLoaded {} rows with {} columns from {}.\n'.format(
        n_rows, n_columns, zip_path
    )
    print(summary)

    return frame

In [None]:
%%time

train_df = load_data(train_file)
val_df = load_data(val_file)
test_df = load_data(test_file)

In [None]:
id_column = 'fullVisitorId'
target_column = 'totals.transactionRevenue'

# Visitor ids are kept aside so predictions can be regrouped per visitor later.
train_id = train_df[id_column].values
val_id = val_df[id_column].values
test_id = test_df[id_column].values

# The model is trained on log1p(revenue).
train_y = np.log1p(train_df[target_column].values)
val_y = np.log1p(val_df[target_column].values)

# Feature matrices: everything except the id and (where present) the target.
train_X = train_df.drop(columns=[target_column, id_column])
val_X = val_df.drop(columns=[target_column, id_column])
test_X = test_df.drop(columns=[id_column])

In [None]:
# Report the feature-matrix shape of each split; the last line keeps its
# trailing blank line.
for split_name, features, trailer in (
    ('train', train_X, ''),
    ('val', val_X, ''),
    ('test', test_X, '\n'),
):
    print('\nShape of the {} dataset: {}{}'.format(
        split_name, features.shape, trailer
    ))

In [None]:
def lgbm_model(train_X, train_y, val_X, val_y, test_X,
               num_boost_round=1000, early_stopping_rounds=100, log_period=100):
    """Train a LightGBM regressor with early stopping and predict val/test.

    Args:
        train_X: Training features.
        train_y: Training target.
        val_X: Validation features, used for early stopping.
        val_y: Validation target.
        test_X: Test features to predict on.
        num_boost_round: Maximum number of boosting rounds.
        early_stopping_rounds: Stop when the validation metric has not
            improved for this many rounds.
        log_period: Log the validation metric every ``log_period`` rounds.

    Returns:
        Tuple ``(pred_test_y, pred_val_y, model)``: predictions for
        ``test_X`` and ``val_X`` made at the best iteration, and the trained
        booster.
    """
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 30,
        'min_child_samples': 100,
        'learning_rate': 0.1,
        'bagging_fraction': 0.7,
        'feature_fraction': 0.5,
        # The LightGBM key is `bagging_freq`. Without a positive value here
        # bagging stays disabled and `bagging_fraction` has no effect.
        'bagging_freq': 5,
        'bagging_seed': 2020,
        'verbosity': -1
    }

    lg_train = lgb.Dataset(train_X, label=train_y)
    lg_val = lgb.Dataset(val_X, label=val_y)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments of
    # `lgb.train` were removed in LightGBM 4.0. `log_evaluation` is the newer
    # name of `print_evaluation`, so fall back when it is missing.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    model = lgb.train(
        params,
        lg_train,
        num_boost_round=num_boost_round,
        valid_sets=[lg_val],
        callbacks=[
            lgb.early_stopping(stopping_rounds=early_stopping_rounds),
            log_evaluation(period=log_period),
        ],
    )

    # Predict with the iteration that scored best on the validation set.
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    pred_val_y = model.predict(val_X, num_iteration=model.best_iteration)

    return pred_test_y, pred_val_y, model

In [None]:
# Train the baseline and keep its test predictions, validation predictions and fitted model.
pred_test, pred_val, model = lgbm_model(train_X, train_y, val_X, val_y, test_X)

In [None]:
# Revenue cannot be negative, so clamp negative (log-scale) predictions to 0.
pred_val[pred_val < 0] = 0

# One row per validation record: actual revenue next to the prediction
# converted back from the log1p scale.
pred_val_df = pd.DataFrame({
    'fullVisitorId': val_id,
    'transactionRevenue': val_df["totals.transactionRevenue"].values,
    'predictedRevenue': np.expm1(pred_val),
})

# Sum revenue per visitor. The columns are selected with a list because
# tuple-style selection (`[...]['a', 'b']`) raises on pandas >= 2.0.
pred_val_df = (
    pred_val_df
    .groupby('fullVisitorId')[['transactionRevenue', 'predictedRevenue']]
    .sum()
    .reset_index()
)

# RMSE between log1p of the per-visitor actual and predicted revenue.
val_rmse = np.sqrt(
    mean_squared_error(
        np.log1p(pred_val_df["transactionRevenue"].values),
        np.log1p(pred_val_df["predictedRevenue"].values)
    )
)

print(val_rmse)



In [None]:
# Revenue cannot be negative, so clamp negative (log-scale) predictions to 0.
pred_test[pred_test < 0] = 0

# One row per test record, with the prediction converted back from the
# log1p scale so it can be summed per visitor.
baseline_df = pd.DataFrame({
    'fullVisitorId': test_id,
    'predictedRevenue': np.expm1(pred_test),
})

baseline_df = baseline_df.groupby('fullVisitorId')['predictedRevenue'].sum().reset_index()

# Name the output columns explicitly (assign to `.columns`, keeping the frame).
baseline_df.columns = ['fullVisitorId', 'predictedRevenue']

# Store the per-visitor total on the log1p scale.
baseline_df['predictedRevenue'] = np.log1p(baseline_df['predictedRevenue'])

baseline_df.to_csv(baseline_file, index=False, compression='zip')

baseline_df.head()

In [None]:
# `lgb.train` returns a Booster: importances come from `feature_importance()`
# and the matching names from `feature_name()`, so the two stay aligned with
# the columns the model was actually trained on.
imp_features = pd.DataFrame({
    'Value': model.feature_importance(),
    'Feature': model.feature_name(),
}).sort_values(by='Value', ascending=False)

fig, ax = plt.subplots()
sns.barplot(x="Value", y="Feature", data=imp_features, ax=ax)
ax.set_title("LightGBM - Feature Importance", fontsize=12)

# Save before showing: with the inline backend `plt.show()` finishes the
# figure, so a later `savefig` would write an empty image.
fig.savefig(imp_features_file)
plt.show()