---
# Prediction Marketing ML
---


# Data
* raw data: mkt_wine_example.csv
* Amazon data for an online wine review
* This is time-series (non-i.i.d.) data, so cross-validation is not used.
* Instead, a conventional train/validation/test split (by row order) is used to train the ML models and to evaluate their out-of-sample predictions.
* Out-of-sample prediction means predicting potential consumers' preferences and/or purchase decisions.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

def label_feature_df(df, feature_lists, lables = 'binary'):
    '''
    # label_feature_df(df, feature_lists, lables = 'binary')
    # Split a dataframe into its label column and its feature columns
    # Return : label (column `lables` of df), feature_df (columns `feature_lists` of df)
    # NOTE: a later definition with the same name in this file replaces this one.
    '''
    return df[lables], df.loc[:, feature_lists]



def report(true_y, pred_y, ranking=3, target_names=None):
    '''
    # report(true_y, pred_y, ranking=3, target_names=None)
    # Reporting classification results: prints the overall accuracy and loss,
    #   the per-class classification report and the confusion matrix
    # true_y : ground-truth labels
    # pred_y : predicted labels
    # ranking : number of classes; when target_names is not given, the first
    #   `ranking` names out of 'rate 1' ... 'rate 5' are used as display names
    # target_names : optional list of display names (one per class, in sorted
    #   label order), e.g. ['unhappy', 'happy'] for a 0/1 label
    # Return : accuracy, loss (= 1-accuracy)
    '''
    # scikit-learn metrics take (y_true, y_pred); keep the same order in every call.
    accuracy = accuracy_score(true_y, pred_y)
    loss = 1 - accuracy

    print(f'Overall accuracy : {accuracy}')
    print(f'Overall loss : {loss}')

    if target_names is None:
        default_names = ['rate 1', 'rate 2', 'rate 3', 'rate 4', 'rate 5']
        target_names = default_names[:ranking]

    print(classification_report(true_y, pred_y, target_names=target_names))
    print(confusion_matrix(true_y, pred_y))
    return accuracy, loss

from sklearn.utils import class_weight # https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html

def class_weight_array(df_y):

    '''
    Per-sample weight vector for the labels of an imbalanced dataset.

    Each class gets the "balanced" weight
        weight(class) = n_samples / (n_classes * count(class))
    and every sample receives the weight of its own class.

    The weights are looked up by the actual label value, so any label coding
    works (0/1, 1..K, non-contiguous values), not only labels 1..K.
    Computed with numpy only, so it does not depend on the module-level name
    `class_weight` still referring to the sklearn module.

    df_y   : 1-D array-like of class labels (e.g. a pandas Series)
    Return : numpy float array with one weight per sample, in the order of df_y
    '''

    labels = np.asarray(df_y)
    # inverse[i] is the position of labels[i] inside the sorted unique classes
    classes, inverse, counts = np.unique(labels, return_inverse=True, return_counts=True)
    balanced = labels.shape[0] / (len(classes) * counts.astype(np.float64))
    class_weights = list(balanced)  # one weight per class, in sorted class order
    weight_array = balanced[inverse.reshape(-1)]
    print(f' class weights : {class_weights}')
    return weight_array

# Ref: https://datascience.stackexchange.com/questions/16342/unbalanced-multiclass-data-with-xgboost

# accuracy
def acc(y_pred, y_true):
    '''
    acc(y_pred,y_true)
    Takes the class with the largest log-softmax score along dim 1 as the
    prediction and compares it element-wise with y_true.
    Return : accuracy rate, error rate (= 1-accuracy)
    '''
    log_probs = torch.log_softmax(y_pred, dim=1)
    predicted_class = torch.max(log_probs, dim=1).indices  # y_hat

    hits = (predicted_class == y_true).float()
    accuracy_rate = hits.sum() / len(hits)
    return accuracy_rate, 1 - accuracy_rate


import matplotlib.pyplot as plt
def valid_loss_plot(valid_results, x_axis = 'index', y_axis='loss'):
    '''
    # valid_loss_plot(valid_results, x_axis = 'index', y_axis='loss')
    # Input: validation result dataframe; its index is turned into a column first
    # output: Line graph of y_axis against x_axis
    # NOTE: a later definition with the same name in this file replaces this one.
    '''
    indexed_results = valid_results.reset_index()
    indexed_results.plot.line(x=x_axis, y=y_axis)

import matplotlib.pyplot as plt
def valid_loss_scatter(valid_results, x_axis = 'index', y_axis='loss'):
    '''
    # valid_loss_scatter(valid_results, x_axis = 'index', y_axis='loss')
    # Input: validation set result dataframe, originally for RF;
    #        its index is turned into a column first
    # output: scatter plot of y_axis against x_axis
    '''
    indexed_results = valid_results.reset_index()
    indexed_results.plot.scatter(x=x_axis, y=y_axis)

import seaborn as sns; sns.set()
def valid_loss_plot_sns(df,x_axis ='depth', y_axis="loss", hues ="num_tree"):
    '''
    # valid_loss_plot_sns(df,x_axis ='depth', y_axis="loss", hues ="num_tree")
    # Input: dataframe holding the x_axis, y_axis and hues columns
    # output: seaborn relational plot of y_axis against x_axis, coloured by hues
    '''
    sns.relplot(data=df, x=x_axis, y=y_axis, hue=hues)
    
def valid_loss_plot(valid_loss):
    '''
    valid_loss_plot(valid_loss)
    Draws a single line graph of a list of validation losses (x axis: epochs).
    NOTE: this definition replaces the earlier valid_loss_plot(valid_results, x_axis, y_axis)
    defined above in this file.
    '''
    plt.figure(figsize=(10, 8))
    plt.plot(valid_loss, label='Valid Loss')

    # axis decoration
    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.grid(True)
    plt.legend()

    plt.tight_layout()
    plt.show()
    
def label_feature_df(df, feature_lists, label ='overall'):
    '''
    label_feature_df(df, feature_lists, label ='overall')
    Split a dataframe into the label column `label` and the feature columns `feature_lists`.
    return label, feature_df
    '''
    target_column = df[label]
    predictors = df.loc[:, feature_lists]
    return target_column, predictors

# Ref: https://seaborn.pydata.org/generated/seaborn.lineplot.html
# ref: https://towardsdatascience.com/pytorch-tabular-multiclass-classification-9f8211a123ab

def BCELoss_ClassWeights(input, target, class_weights):
    '''
    Class-weighted binary cross-entropy, averaged over the batch.
    input         : predicted probabilities, (n, d)
    target        : targets, (n, d)
    class_weights : per-class weights, (1, d)
    Return : scalar tensor
    '''
    eps = 1e-7
    # keep probabilities away from 0 and 1 so both logs stay finite
    probs = torch.clamp(input, min=eps, max=1 - eps)

    positive_term = target * torch.log(probs)
    negative_term = (1 - target) * torch.log(1 - probs)
    per_element = - positive_term - negative_term

    # weighted average over the class dimension, normalised by the total weight
    weight_total = class_weights.sum(axis=1)[0]
    per_sample = (per_element * class_weights).sum(axis=1) / weight_total
    return per_sample.mean(axis=0)

# Ref: https://discuss.pytorch.org/t/solved-class-weight-for-bceloss/3114/23

    ##time
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC


# Hyperparameter for Linear SVM

# Candidate values of the soft-margin parameter C that are tried during validation.
C_list = [0.1,1, 10, 100, 1000] # 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5
# Gamma_list = [1, 0.1, 0.01, 0.001, 0.0001] Not a hyperparameter for linear SVM
# NOTE(review): this rebinds the name `class_weight`, which was imported from
# sklearn.utils earlier in this file; code that still expects the module under
# that name will see None after this line.
class_weight = None
# class_weight = 'balanced' # Performance is worse

def linear_svm_valid(train_label, train_feature, valid_label, valid_feature, C_list, class_weight):
    '''
    # linear_svm_valid(train_label, train_feature, valid_label, valid_feature, C_list, class_weight)
    # Fits one standardised linear SVM (hinge loss) per candidate C on the
    #   training set and scores it on the validation set
    # train_label, train_feature : labels / features used for fitting
    # valid_label, valid_feature : labels / features used for scoring
    # hyper-parametter : C (candidates given in C_list)
    # class_weight : passed through to LinearSVC (None or 'balanced')
    # output : Dataframe (loss, accuracy, c_list per candidate) and optimal C
    #          (the candidate with the smallest validation loss; first one on ties)
    '''
    c_list = []
    accuracy_list = []
    loss_list = []

    print(f' Hyperparameter space is: {C_list}')

    for c in C_list:
        linear_svm = Pipeline([
            ("scaler", StandardScaler()),
            ("linear_svc", LinearSVC(C=c, loss="hinge", class_weight=class_weight)),
        ])

        linear_svm.fit(train_feature, train_label)
        linear_svm_pred = linear_svm.predict(valid_feature)
        # Score against the validation labels passed to this function,
        # not against a global variable that happens to exist in the notebook.
        accuracy = accuracy_score(valid_label, linear_svm_pred)
        loss = 1 - accuracy

        c_list.append(c)
        accuracy_list.append(accuracy)
        loss_list.append(loss)

    valid_result_df = pd.DataFrame({'loss': loss_list, 'accuracy': accuracy_list, 'c_list': c_list})

    # Position of the smallest validation loss; computed once and used with
    # .iloc because argmin() returns a position, not an index label.
    best_row = valid_result_df.loss.argmin()
    optimal_linear_svm_c = valid_result_df.c_list.iloc[best_row]

    print(f' Shape of valid result dataframe: {valid_result_df.shape}')
    # The printed label says "Argmax", but the value is the argmin of the loss.
    print(f' Argmax row for loss: {best_row}')

    print('---- Best hyperparameter ---- ' )
    print(f' Minimum validation loss: {valid_result_df.loss.iloc[best_row]}')
    print(f' Maximum accuracy: {valid_result_df.accuracy.iloc[best_row]}')
    print(f' Optimal hyperparamer C: {optimal_linear_svm_c}')

    return valid_result_df, optimal_linear_svm_c



Data Upload
* `binary` is the dependent variable: whether the consumer is happy (1, i.e. `overall` > 3) or not (0)

In [2]:
# Load the review data; the first CSV column is used as the index.
df = pd.read_csv('mkt_wine_example.csv', index_col=[0])
# NOTE(review): reindex() without arguments does not appear to change the
# index - confirm whether reset_index() was intended.
df = df.reindex()
print(f' The dimension of dataset is {df.shape}')
print(f'the number sample size : {len(df)}')
# Column names such as 2004 or 12 are used as strings in the feature list.
df.columns = df.columns.map(str)
# Binary label: 1 when the rating is above 3, otherwise 0. A single vectorised
# assignment replaces the chained assignment df['binary'][mask] = 1, which
# raises SettingWithCopyWarning and does not update df under copy-on-write.
df['binary'] = (df['overall'] > 3).astype('int64')

# Split by row position: first 1000 rows for training (800 sub-train + 200
# validation), the remaining rows for testing.
test = df.iloc[1000:]
print(f'The dimension of test dataset is {test.shape}')
total_train = df.iloc[:1000]
print(f'The dimension of total train dataset is {total_train.shape}')
train = total_train.iloc[:800]
print(f'The dimension of sub train dataset is {train.shape}')
valid = total_train.iloc[800:]
print(f'The dimension of valid dataset is {valid.shape}')

 The dimension of dataset is (1210, 58)
the number sample size : 1210
The dimension of test dataset is (210, 59)
The dimension of total train dataset is (1000, 59)
The dimension of sub train dataset is (800, 59)
The dimension of valid dataset is (200, 59)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


# Structured Data

---
### Variable Selection 
* 40 variables (1 dependent variable "binary" + 39 independent variables)
* feature size = the number of independent variables = 39
---

In [3]:
print(f' column names {df.columns}')
# mask test, feature_list requires in Full CNN
# Review attributes, weekday dummies, holiday flag, year dummies (2004-2018)
# and month dummies (1-12), in that order.
feature_list = [
    'verified', 'price', 'len_summary', 'len_review',
    'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
    'holiday',
    *(str(year) for year in range(2004, 2019)),
    *(str(month) for month in range(1, 13)),
]
print(f' full feature set: {len(feature_list)}')

 column names Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'vote', 'category',
       'description', 'title', 'brand', 'rank', 'main_cat', 'price', 'buy',
       'len_summary', 'len_review', 'date', 'day', 'mon', 'tue', 'wed', 'thu',
       'fri', 'sat', 'sun', 'year', 'month', 'holiday', '2004', '2005', '2006',
       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       '10', '11', '12', 'binary'],
      dtype='object')
 full feature set: 39


# 1. Feature List All: 39 variables

---
### Train total set
---

In [4]:
# Label ('binary') and feature columns of the full training set (total_train).
train_total_y, train_total_x = label_feature_df(total_train, feature_list, 'binary')
train_total_x.head(1)

Unnamed: 0,verified,price,len_summary,len_review,mon,tue,wed,thu,fri,sat,...,3,4,5,6,7,8,9,10,11,12
26,0,15.1,29.0,866,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


---
### Sub train set
---

In [5]:
# Label ('binary') and feature columns of the sub-train set (train).
train_y, train_x = label_feature_df(train, feature_list, 'binary')
train_x.head(1)

Unnamed: 0,verified,price,len_summary,len_review,mon,tue,wed,thu,fri,sat,...,3,4,5,6,7,8,9,10,11,12
26,0,15.1,29.0,866,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Valid set

In [6]:
# Label ('binary') and feature columns of the validation set (valid).
valid_y, valid_x  = label_feature_df(valid, feature_list, 'binary')
valid_x.head(1)

Unnamed: 0,verified,price,len_summary,len_review,mon,tue,wed,thu,fri,sat,...,3,4,5,6,7,8,9,10,11,12
1577,0,17.08,10.0,8,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


---
### Test Set
---

In [7]:
# Label ('binary') and feature columns of the test set (test).
test_y, test_x =  label_feature_df(test, feature_list, 'binary')
test_x.head(1)

Unnamed: 0,verified,price,len_summary,len_review,mon,tue,wed,thu,fri,sat,...,3,4,5,6,7,8,9,10,11,12
2986,0,29.95,17.0,310,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---
# 1. Ex-Ante Prediction + 39 variables
---
* 1. SVM
---

---
# SVM
---
* Linear 
* Kernel 
* Training and Validation: train_x, train_y, valid_x, valid_y
* Test set: train_total_x, train_total_y, test_x, test_y
* 'kernel' : rbf
* 'gamma'  : hyperparameter
* 'C' : soft-margin penalty; a smaller C gives a larger margin with more margin violations
* Ref: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html (official)
* Ref: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html (official, Linear)
----

---
### Linear SVM
* hyperparameter is C
---


In [9]:
# Validate every C in C_list: fit on the sub-train set, score on the validation set.
svm_valid_result, optimal_linear_svm_c = linear_svm_valid(train_y, train_x, valid_y, valid_x, C_list, class_weight)

 Hyperparameter space is: [0.1, 1, 10, 100, 1000]




 Shape of valid result dataframe: (5, 3)
 Argmax row for loss: 0
---- Best hyperparameter ---- 
 Minimum validation loss: 0.20999999999999996
 Maximum accuracy: 0.79
 Optimal hyperparamer C: 0.1




---
#### Test Step
* with tuned hyperparameters
----

In [10]:
%%time
# Linear SVM Test Prediction
# Refit on the full training set with the C selected on the validation set,
# then report on the held-out test set.
# Previous C=1 and acc = 0.6303630363036303

print(f' Optimal Linear SVM C hyperparameter: {optimal_linear_svm_c}')

optimal_C = optimal_linear_svm_c

# NOTE(review): the same pipeline object is also bound to `rbf_kernel_svm`,
# although it is a linear SVM - confirm whether that alias is still needed.
linear_svm_test = rbf_kernel_svm  = Pipeline([
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=optimal_C, loss ="hinge", class_weight=class_weight))
])

linear_svm_test.fit(train_total_x, train_total_y)
linear_svm_pred_test = linear_svm_test.predict(test_x)
# ranking=2 because the label is binary (two classes in the report).
report(test_y, linear_svm_pred_test,2 )

 Optimal Linear SVM C hyperparameter: 0.1
Overall accuracy : 0.719047619047619
Overall loss : 0.28095238095238095
              precision    recall  f1-score   support

      rate 1       0.67      0.03      0.06        60
      rate 2       0.72      0.99      0.83       150

    accuracy                           0.72       210
   macro avg       0.69      0.51      0.45       210
weighted avg       0.70      0.72      0.61       210

[[  2  58]
 [  1 149]]
CPU times: user 28.9 ms, sys: 0 ns, total: 28.9 ms
Wall time: 31.7 ms




-----
* The label is a binary indicator of whether consumers are happy (y=1) or not (y=0) after purchase based on user-generated data (i.e., online product review).

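* The linear SVM is about 72% accurate (0.719) on the test set at predicting whether a customer is happy. Note that it predicts "happy" for almost every review (only 3 of the 210 predictions are "not happy"), so this is barely above the 71.4% (150/210) obtained by always predicting "happy".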
----