In [None]:
%load_ext pycodestyle_magic

In [None]:
# %%pycodestyle

# Required libraries
import datetime as dt
import matplotlib.pyplot as plt

import os
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier 

# Wall-clock start time (seconds since the epoch), captured right after the imports
start = time.time()

In [None]:
# Constants
# Relative path of the directory holding the input CSV file(s)
my_data = "../data"

# Load data

In [None]:
# Load the full customer dataset into a Pandas dataframe
datafile = "datazon_customer_data.csv"
# my_data is the data directory constant set in the "Constants" cell
full_path = os.path.join(my_data, datafile)
df_customers = pd.read_csv(full_path)
# Print (rows, columns) as a quick sanity check on the load
print(df_customers.shape)

In [None]:
# Preview the first rows to get a feel for the dataframe content
df_customers.head()

# Define classification functions

## Logistic regression

In [None]:
def LogisticRegressionCl(X_train, X_test, y_train, y_test):
    """Fit logistic regression twice and print the test-set accuracy of each run.

    The first run fits a ``LogisticRegression`` with its default settings.
    The second run grid-searches the inverse regularization strength ``C``
    and the penalty type (L1 / L2) by cross-validation on the training set,
    then scores the selected model on the test set.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for evaluation.
    y_train, y_test : array-like
        Labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    None
        All results are printed.
    """
    acc_scorer = make_scorer(accuracy_score)

    # First run: estimator with default hyper-parameters.
    # NOTE(review): scikit-learn's default LogisticRegression still applies an
    # L2 penalty (C=1.0), so this baseline is not strictly unregularized -
    # confirm whether a penalty-free fit was intended.
    baseline = LogisticRegression()
    baseline.fit(X_train, y_train)

    y_pred = baseline.predict(X_test)

    print("Non regularized Logistic Regression Accuracy Score on test set {0:.2f}".format(accuracy_score(y_test, y_pred)))

    # Second run: cross-validated search over C and the penalty type.
    # The solver is set explicitly because the library default (lbfgs in
    # current scikit-learn) rejects penalty='l1': every l1 candidate would
    # fail to fit and L1 would never really be evaluated. 'saga' accepts both
    # penalties and also handles multiclass targets.
    # max_iter is raised from the default (100) because saga converges slowly
    # on features that are not standardized.
    lr = LogisticRegression(solver='saga', max_iter=1000)
    params = {'C': np.logspace(-3, 3, 7), 'penalty': ['l1', 'l2']}

    gs_lr = GridSearchCV(lr, params, scoring=acc_scorer)
    gs_lr.fit(X_train, y_train)

    y_pred = gs_lr.predict(X_test)

    print("Regularized Logistic Regression best params {0}".format(gs_lr.best_params_))
    print("Regularized Logistic Regression Accuracy Score on test set {0:.2f}".format(accuracy_score(y_test, y_pred)))

## Linear Support Vector Classification

In [None]:
def SVMCl(X_train, X_test, y_train, y_test):
    """Grid-search a linear SVM over ``C`` and print the outcome.

    Nine values of ``C`` spaced logarithmically between 1e-4 and 1e4 are
    compared by cross-validation on the training data; the selected
    parameters and the accuracy on the test data are then printed.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for evaluation.
    y_train, y_test : array-like
        Labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    None
        All results are printed.
    """
    c_grid = {'C': np.logspace(-4, 4, 9)}

    search = GridSearchCV(LinearSVC(), c_grid)
    search.fit(X_train, y_train)

    test_predictions = search.predict(X_test)

    print("SVM classification best params {0}".format(search.best_params_))

    test_accuracy = accuracy_score(y_test, test_predictions)
    print("SVM classification Accuracy Score on test set {0:.2f}".format(test_accuracy))

## Random Forest

In [None]:
def RandomForestCl(X_train, X_test, y_train, y_test):
    """Grid-search a random forest over ``n_estimators`` and print the outcome.

    Candidate forest sizes are compared by cross-validated accuracy on the
    training data; the selected parameters and the accuracy on the test data
    are then printed.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for evaluation.
    y_train, y_test : array-like
        Labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    None
        All results are printed.
    """
    rfc = RandomForestClassifier()
    params = {'n_estimators': [50, 100, 200, 500, 1000]}
    acc_scorer = make_scorer(accuracy_score)

    gs_rfc = GridSearchCV(rfc, params, scoring=acc_scorer)

    # The return value of fit() is not needed: predictions are taken from
    # the search object itself.
    gs_rfc.fit(X_train, y_train)

    y_pred = gs_rfc.predict(X_test)

    print("Random Forest best params {0}".format(gs_rfc.best_params_))
    print("Random Forest Accuracy Score on test set {0:.2f}".format(accuracy_score(y_test, y_pred)))

## AdaBoost classifier

In [None]:
def AdaBoostCl(X_train, X_test, y_train, y_test):
    """Grid-search AdaBoost over ``n_estimators`` and ``learning_rate``.

    Candidates are compared by cross-validated accuracy on the training
    data; the selected parameters and the accuracy on the test data are then
    printed.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for evaluation.
    y_train, y_test : array-like
        Labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    None
        All results are printed.
    """
    adaboost = AdaBoostClassifier()
    params = {'n_estimators': [50, 100, 150, 200],
              'learning_rate': np.logspace(-4, 0, 9)}
    # Not strictly necessary with AdaBoost, whose default score function is
    # already accuracy; kept so the metric is explicit.
    acc_scorer = make_scorer(accuracy_score)

    gs_adaboost = GridSearchCV(adaboost, params, scoring=acc_scorer)

    gs_adaboost.fit(X_train, y_train)

    y_pred = gs_adaboost.predict(X_test)

    # Label typo fixed here ("AdaaBoost" -> "AdaBoost") so both lines match.
    print("AdaBoost best params {0}".format(gs_adaboost.best_params_))
    print("AdaBoost Accuracy Score on test set {0:.2f}".format(accuracy_score(y_test, y_pred)))

# Run classifiers on several train sets

## 1. Test set included in initial clustering

This run takes a dataset which was clustered as a whole: train and test samples together. This is likely to cause data leakage.

In [None]:
# Separate features and labels
X = df_customers[['InvoiceNumberYear', 'LastInvoiceMonthNb',
                  'MeanLinesPerInvoice', 'MeanAmountPerInvoice',
                  'TotalAmountSpent']]
y = df_customers['cluster_k']

# Random sampling for train and test set (33% of the rows held out for testing).
# A fixed random_state makes the split identical on every Restart & Run All;
# without it each run evaluates the classifiers on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

## Run all algorithms on this set

In [None]:
# Run every classifier on the current split, with a blank line between reports
classifier_runs = (LogisticRegressionCl, SVMCl, RandomForestCl, AdaBoostCl)
for run_index, run_classifier in enumerate(classifier_runs):
    if run_index > 0:
        print()
    run_classifier(X_train, X_test, y_train, y_test)

## 2. Clustering done only on train sets, test set labelled in a second step

In this approach, the customer base was split into train and test sets, then the train set was clustered with k-means, and finally the test set was labelled using the classifier's predict function.

In [None]:
# Feature columns shared by the train and test dataframes
feature_columns = ['InvoiceNumberYear', 'LastInvoiceMonthNb',
                   'MeanLinesPerInvoice', 'MeanAmountPerInvoice',
                   'TotalAmountSpent']

# NOTE(review): these files are read from "data" whereas the first dataset is
# read from my_data ("../data") - confirm which directory is intended.

# Train part of dataset 2
datafile = "datazon_customer_data_2_train.csv"
full_path = os.path.join("data", datafile)
df_customers_train = pd.read_csv(full_path)
print(df_customers_train.shape)

# Test part of dataset 2
datafile = "datazon_customer_data_2_test.csv"
full_path = os.path.join("data", datafile)
df_customers_test = pd.read_csv(full_path)
print(df_customers_test.shape)

# Features / labels for each part
X_train = df_customers_train[feature_columns]
y_train = df_customers_train['cluster_k']

X_test = df_customers_test[feature_columns]
y_test = df_customers_test['cluster_k']

## Run all algorithms on this set

In [None]:
# Run every classifier on the current split, with a blank line between reports
classifier_runs = (LogisticRegressionCl, SVMCl, RandomForestCl, AdaBoostCl)
for run_index, run_classifier in enumerate(classifier_runs):
    if run_index > 0:
        print()
    run_classifier(X_train, X_test, y_train, y_test)

## 3. Time biased train and test sets

In [None]:
# Feature columns shared by the train and test dataframes
feature_columns = ['InvoiceNumberYear', 'LastInvoiceMonthNb',
                   'MeanLinesPerInvoice', 'MeanAmountPerInvoice',
                   'TotalAmountSpent']

# NOTE(review): these files are read from "data" whereas the first dataset is
# read from my_data ("../data") - confirm which directory is intended.

# Train part of dataset 3
datafile = "datazon_customer_data_3_train.csv"
full_path = os.path.join("data", datafile)
df_customers_train = pd.read_csv(full_path)
print(df_customers_train.shape)

# Test part of dataset 3
datafile = "datazon_customer_data_3_test.csv"
full_path = os.path.join("data", datafile)
df_customers_test = pd.read_csv(full_path)
print(df_customers_test.shape)

# Features / labels for each part
X_train = df_customers_train[feature_columns]
y_train = df_customers_train['cluster_k']

X_test = df_customers_test[feature_columns]
y_test = df_customers_test['cluster_k']

## Run all algorithms on this set

In [None]:
# Run every classifier on the current split, with a blank line between reports
classifier_runs = (LogisticRegressionCl, SVMCl, RandomForestCl, AdaBoostCl)
for run_index, run_classifier in enumerate(classifier_runs):
    if run_index > 0:
        print()
    run_classifier(X_train, X_test, y_train, y_test)

## 4. Expense value biased train and test sets

In [None]:
# NOTE(review): these files are read from "data" whereas the first dataset is
# read from my_data ("../data") - confirm which directory is intended.

# Load the train part of dataset 4 into a Pandas dataframe
datafile = "datazon_customer_data_4_train.csv"
full_path = os.path.join("data", datafile)
df_customers_train = pd.read_csv(full_path)
print(df_customers_train.shape)

# Load the test part of dataset 4 into a Pandas dataframe
datafile = "datazon_customer_data_4_test.csv"
full_path = os.path.join("data", datafile)
df_customers_test = pd.read_csv(full_path)
# Report the shape of the frame just loaded (previously printed the shape of
# the unrelated df_customers frame)
print(df_customers_test.shape)

# Features / labels for the train part
X_train = df_customers_train[['InvoiceNumberYear', 'LastInvoiceMonthNb',
                              'MeanLinesPerInvoice', 'MeanAmountPerInvoice',
                              'TotalAmountSpent']]
y_train = df_customers_train['cluster_k']

# Features / labels for the test part
X_test = df_customers_test[['InvoiceNumberYear', 'LastInvoiceMonthNb',
                            'MeanLinesPerInvoice', 'MeanAmountPerInvoice',
                            'TotalAmountSpent']]
y_test = df_customers_test['cluster_k']


## Run all algorithms on this set

In [None]:
# Run every classifier on the current split, with a blank line between reports
classifier_runs = (LogisticRegressionCl, SVMCl, RandomForestCl, AdaBoostCl)
for run_index, run_classifier in enumerate(classifier_runs):
    if run_index > 0:
        print()
    run_classifier(X_train, X_test, y_train, y_test)