In [1]:
%load_ext pycodestyle_magic

In [2]:
# %%pycodestyle

# Required libraries
import datetime as dt
import matplotlib.pyplot as plt

import os
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier 

start = time.time()

# Load data

In [3]:
# Load data into Pandas dataframe
datafile = "datazon_customer_data.csv"
full_path = os.path.join("data", datafile)
df_customers = pd.read_csv(full_path)
print(df_customers.shape)

(4323, 9)


In [4]:
# Sample data to get a certain knowledge of the dataframe content
df_customers.head()

Unnamed: 0,CustomerID,InvoiceNumberYear,LastInvoiceMonth,LastInvoiceMonthNb,MeanLinesPerInvoice,MeanQuantityPerInvoice,MeanAmountPerInvoice,TotalAmountSpent,cluster_k
0,12347,7,201112,0,26.0,351.142857,615.714286,4310.0,9
1,12348,4,201109,3,6.75,583.0,359.31,1437.24,2
2,12349,1,201111,1,72.0,630.0,1457.55,1457.55,4
3,12350,1,201102,10,16.0,196.0,294.4,294.4,0
4,12352,7,201111,1,10.0,66.142857,180.772857,1265.41,2


# Define classification functions

## Logistic regression

In [5]:
def LogisticRegressionCl(X_train, X_test, y_train, y_test):
    
    # First run of LR is without regularization
    lr = LogisticRegression()
    acc_scorer = make_scorer(accuracy_score)
    
    lr.fit(X_train,y_train)

    y_pred = lr.predict(X_test)

    print("Non regularized Logistic Regression Accuracy Score on test set {0:.2f}".format(accuracy_score(y_test, y_pred)))

    # second run of LR is with regularization
    lr = LogisticRegression()
    params = {'C': np.logspace(-3, 3, 7) , 'penalty':['l1','l2'] }
    acc_scorer = make_scorer(accuracy_score)

    gs_lr = GridSearchCV(lr, params)

    gs_lr.fit(X_train, y_train)

    y_pred = gs_lr.predict(X_test)
    
    print("Regularized Logistic Regression best params {0}".format(gs_lr.best_params_))
    print("Regularized Logistic Regression Accuracy Score on test set {0:.2f}".format(accuracy_score(y_test, y_pred)))

## Linear Support Vector Classification

In [6]:
def SVMCl(X_train, X_test, y_train, y_test):

    svm = LinearSVC()

    params = { 'C': np.logspace(-4, 4, 9) }

    gs_svm = GridSearchCV(svm, params)
    gs_svm.fit(X_train, y_train)

    y_pred = gs_svm.predict(X_test)
    
    print("SVM classification best params {0}".format(gs_svm.best_params_))
    print("SVM classification Accuracy Score on test set {0:.2f}".format(accuracy_score(y_test, y_pred)))

## Random Forest

In [7]:
def RandomForestCl(X_train, X_test, y_train, y_test):

    rfc = RandomForestClassifier()
    params = { 'n_estimators': [50, 100, 200, 500, 1000] }
    acc_scorer = make_scorer(accuracy_score)
    
    gs_rfc = GridSearchCV(rfc, params, scoring=acc_scorer)
    
    model = gs_rfc.fit(X_train, y_train)
    
    y_pred = gs_rfc.predict(X_test)

    print("Random Forest best params {0}".format(gs_rfc.best_params_))
    print("Random Forest Accuracy Score on test set {0:.2f}".format(accuracy_score(y_test, y_pred)))

## AdaBoost classifier

In [8]:
def AdaBoostCl(X_train, X_test, y_train, y_test):
    
    adaboost = AdaBoostClassifier()
    params = { 'n_estimators': [50, 100, 150, 200], 'learning_rate': np.logspace(-4, 0, 9) }
    acc_scorer = make_scorer(accuracy_score) #Pas nécessaire avec AdBosst car sa fonction score est de base accuracy

    gs_adaboost = GridSearchCV(adaboost, params, scoring=acc_scorer)

    gs_adaboost.fit(X_train, y_train)

    y_pred = gs_adaboost.predict(X_test)
    
    print("AdaaBoost best params {0}".format(gs_adaboost.best_params_))
    print("AdaBoost Accuracy Score on test set {0:.2f}".format(accuracy_score(y_test, y_pred)))

# Run classifiers on several train sets

## 1. Test set included in initial clustering

This run takes a dataset which was clustered as a whole: train and test samples together. This is likely to cause data leakage.

In [9]:
# Separate features and labels
X = df_customers[['InvoiceNumberYear', 'LastInvoiceMonthNb',
                                      'MeanLinesPerInvoice', 'MeanAmountPerInvoice',
                                      'TotalAmountSpent']]
y = df_customers['cluster_k']

# Random sampling for train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

## Run all algorythms on this set

In [10]:
LogisticRegressionCl(X_train, X_test, y_train, y_test)
print()
SVMCl(X_train, X_test, y_train, y_test)
print()
RandomForestCl(X_train, X_test, y_train, y_test)
print()
AdaBoostCl(X_train, X_test, y_train, y_test)

Non regularized Logistic Regression Accuracy Score on test set 0.90
Regularized Logistic Regression best params {'C': 100.0, 'penalty': 'l1'}
Regularized Logistic Regression Accuracy Score on test set 0.91

SVM classification best params {'C': 0.0001}
SVM classification Accuracy Score on test set 0.76

Random Forest best params {'n_estimators': 200}
Random Forest Accuracy Score on test set 0.98

AdaaBoost best params {'learning_rate': 0.31622776601683794, 'n_estimators': 200}
AdaBoost Accuracy Score on test set 0.90


## 2. Clustering done only on train sets, test set labelled in a second step

In this approach, the customer base was spplit in train and test set, then the train set was classified by a k-means clustering, and finally the test set was labelled using the classifier predict function.

In [11]:
# Load data into Pandas dataframe
datafile = "datazon_customer_data_2_train.csv"
full_path = os.path.join("data", datafile)
df_customers_train = pd.read_csv(full_path)
print(df_customers_train.shape)

# Load data into Pandas dataframe
datafile = "datazon_customer_data_2_test.csv"
full_path = os.path.join("data", datafile)
df_customers_test = pd.read_csv(full_path)
print(df_customers_test.shape)

X_train = df_customers_train[['InvoiceNumberYear', 'LastInvoiceMonthNb',
                              'MeanLinesPerInvoice', 'MeanAmountPerInvoice',
                              'TotalAmountSpent']]
y_train = df_customers_train['cluster_k']

X_test = df_customers_test[['InvoiceNumberYear', 'LastInvoiceMonthNb',
                            'MeanLinesPerInvoice', 'MeanAmountPerInvoice',
                            'TotalAmountSpent']]
y_test = df_customers_test['cluster_k']

(2896, 9)
(1427, 9)


## Run all algorythms on this set

In [12]:
LogisticRegressionCl(X_train, X_test, y_train, y_test)
print()
SVMCl(X_train, X_test, y_train, y_test)
print()
RandomForestCl(X_train, X_test, y_train, y_test)
print()
AdaBoostCl(X_train, X_test, y_train, y_test)

Non regularized Logistic Regression Accuracy Score on test set 0.96
Regularized Logistic Regression best params {'C': 1000.0, 'penalty': 'l1'}
Regularized Logistic Regression Accuracy Score on test set 0.95

SVM classification best params {'C': 0.10000000000000001}
SVM classification Accuracy Score on test set 0.54

Random Forest best params {'n_estimators': 100}
Random Forest Accuracy Score on test set 0.99

AdaaBoost best params {'learning_rate': 0.31622776601683794, 'n_estimators': 50}
AdaBoost Accuracy Score on test set 0.97


## 3. Time biased train and test sets

In [13]:
# Load data into Pandas dataframe
datafile = "datazon_customer_data_3_train.csv"
full_path = os.path.join("data", datafile)
df_customers_train = pd.read_csv(full_path)
print(df_customers_train.shape)

# Load data into Pandas dataframe
datafile = "datazon_customer_data_3_test.csv"
full_path = os.path.join("data", datafile)
df_customers_test = pd.read_csv(full_path)
print(df_customers_test.shape)

X_train = df_customers_train[['InvoiceNumberYear', 'LastInvoiceMonthNb',
                              'MeanLinesPerInvoice', 'MeanAmountPerInvoice',
                              'TotalAmountSpent']]
y_train = df_customers_train['cluster_k']

X_test = df_customers_test[['InvoiceNumberYear', 'LastInvoiceMonthNb',
                            'MeanLinesPerInvoice', 'MeanAmountPerInvoice',
                            'TotalAmountSpent']]
y_test = df_customers_test['cluster_k']

(1151, 9)
(3172, 9)


## Run all algorythms on this set

In [14]:
LogisticRegressionCl(X_train, X_test, y_train, y_test)
print()
SVMCl(X_train, X_test, y_train, y_test)
print()
RandomForestCl(X_train, X_test, y_train, y_test)
print()
AdaBoostCl(X_train, X_test, y_train, y_test)

Non regularized Logistic Regression Accuracy Score on test set 0.94
Regularized Logistic Regression best params {'C': 10.0, 'penalty': 'l2'}
Regularized Logistic Regression Accuracy Score on test set 0.92

SVM classification best params {'C': 1.0}
SVM classification Accuracy Score on test set 0.82

Random Forest best params {'n_estimators': 50}
Random Forest Accuracy Score on test set 0.95

AdaaBoost best params {'learning_rate': 0.031622776601683791, 'n_estimators': 150}
AdaBoost Accuracy Score on test set 0.87


## Prepare expense value biased train and test sets

In [15]:
# Load data into Pandas dataframe
datafile = "datazon_customer_data_4_train.csv"
full_path = os.path.join("data", datafile)
df_customers_train = pd.read_csv(full_path)
print(df_customers_train.shape)

# Load data into Pandas dataframe
datafile = "datazon_customer_data_4_test.csv"
full_path = os.path.join("data", datafile)
df_customers_test = pd.read_csv(full_path)
print(df_customers.shape)

X_train = df_customers_train[['InvoiceNumberYear', 'LastInvoiceMonthNb',
                              'MeanLinesPerInvoice', 'MeanAmountPerInvoice',
                              'TotalAmountSpent']]
y_train = df_customers_train['cluster_k']

X_test = df_customers_test[['InvoiceNumberYear', 'LastInvoiceMonthNb',
                            'MeanLinesPerInvoice', 'MeanAmountPerInvoice',
                            'TotalAmountSpent']]
y_test = df_customers_test['cluster_k']


(3242, 9)
(4323, 9)


## Run all algorythms on this set

In [16]:
LogisticRegressionCl(X_train, X_test, y_train, y_test)
print()
SVMCl(X_train, X_test, y_train, y_test)
print()
RandomForestCl(X_train, X_test, y_train, y_test)
print()
AdaBoostCl(X_train, X_test, y_train, y_test)

Non regularized Logistic Regression Accuracy Score on test set 0.84
Regularized Logistic Regression best params {'C': 100.0, 'penalty': 'l1'}
Regularized Logistic Regression Accuracy Score on test set 0.84

SVM classification best params {'C': 0.001}
SVM classification Accuracy Score on test set 0.84

Random Forest best params {'n_estimators': 1000}
Random Forest Accuracy Score on test set 0.85

AdaaBoost best params {'learning_rate': 0.01, 'n_estimators': 200}
AdaBoost Accuracy Score on test set 0.24
