In [1]:
# import the required libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 


#sklearn modules for Model Selection--------------------------------------

from sklearn import svm, tree, linear_model, neighbors
from sklearn import naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


#sklearn modules for Model Evaluation & Improvement---------------------------
    
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score
# from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold

from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.metrics import make_scorer, recall_score, log_loss
from sklearn.metrics import average_precision_score
  

#Standard libraries for data visualization---------------------

import numpy as np
import pandas as pd
import seaborn as sn
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib 
%matplotlib inline
color = sn.color_palette()
import matplotlib.ticker as mtick
from IPython.display import display
pd.options.display.max_columns = None
from pandas.plotting import scatter_matrix
from sklearn.metrics import roc_curve
import datetime


#Miscellaneous Utilitiy Libraries--------------------------------------
    
import random
import os
import re
import sys
import timeit
import string
import time
from datetime import datetime
from time import time
from dateutil.parser import parse
import joblib

import datetime



In [2]:
# create functions to categorize data

# helper functions
from sklearn.base import BaseEstimator, TransformerMixin
# from average_precision import apk

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)

# 1. Data Preprocessing

In [3]:
# import data files

article = pd.read_csv('dataset/articles.csv')
customer = pd.read_csv('dataset/customers.csv')
transaction = pd.read_csv('dataset/transactions_train.csv')

# 1. Customer

In [4]:
# replace N/A values with 0 in FN (fashion_news) column

customer['FN'] = customer['FN'].replace(np.nan, 0)

customer['FN'].unique()

array([0., 1.])

In [5]:
# replace N/A values with 0 in Active status column

customer['Active'] = customer['Active'].replace(np.nan, 0)

customer['Active'].unique()

array([0., 1.])

In [6]:
# drop postal_code column because it does not bring interpretation
customer = customer.drop(columns='postal_code')

In [7]:
# create encoding values for columns containing text
customer.club_member_status = Categorize().fit_transform(customer[['club_member_status']]).club_member_status
customer.fashion_news_frequency = Categorize().fit_transform(customer[['fashion_news_frequency']]).fashion_news_frequency

In [8]:
# check the columns of customer data
customer.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 6 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1371980 non-null  object 
 1   FN                      1371980 non-null  float64
 2   Active                  1371980 non-null  float64
 3   club_member_status      1371980 non-null  int8   
 4   fashion_news_frequency  1371980 non-null  int8   
 5   age                     1356119 non-null  float64
dtypes: float64(3), int8(2), object(1)
memory usage: 192.3 MB


# 2. Transaction 3M

In [10]:
# convert date column into date data type
transaction.t_dat = pd.to_datetime(transaction.t_dat) #, format='%y%m%d')

In [11]:
# sort dataset by ascending order of date and customer_id
transaction.sort_values(['t_dat', 'customer_id'], inplace=True)

# reset index of dataframe
transaction = transaction.reset_index(drop=True)

### 2.1 Transaction - Training - 3M model

In [13]:
# create function to transform date into date-range
def create_time_period_3M_training(apply_df):

    max_date = max(apply_df.t_dat) 

    l18m = max_date - datetime.timedelta(days=(360+180))
    l15m = max_date - datetime.timedelta(days=360+90)
    l12m = max_date - datetime.timedelta(days=360)
    l9m = max_date - datetime.timedelta(days=270)
    l6m = max_date - datetime.timedelta(days=180)
    l3m = max_date - datetime.timedelta(days=90)

    transaction_conditions = [
                            (apply_df['t_dat'] <= l18m),
                            (apply_df['t_dat'] > l18m) & (apply_df['t_dat'] <= l15m),
                            (apply_df['t_dat'] > l15m) & (apply_df['t_dat'] <= l12m),
                            (apply_df['t_dat'] > l12m) & (apply_df['t_dat'] <= l9m),
                            (apply_df['t_dat'] > l9m) & (apply_df['t_dat'] <= l6m),
                            (apply_df['t_dat'] > l6m) & (apply_df['t_dat'] <= l3m),
                            (apply_df['t_dat'] > l3m)
                        ] 

    values = ['l2y','l18m','l15m','l12m','l9m','l6m','l3m']

    apply_df['time_period'] = np.select(transaction_conditions, values)

    return apply_df

In [14]:
# apply the function into dataframe
transaction_3M_training = create_time_period_3M_training(transaction)

# testing first value of customer_id
transaction_3M_training['customer_id'][0]

'000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318'

In [15]:
# create the pivot table - group by customer id for transaction dataframe
transaction_3M_training_master = pd.pivot_table(transaction_3M_training, index=['customer_id','time_period'], values=['price'], aggfunc=[np.sum, np.count_nonzero])

transaction_3M_training_master = pd.DataFrame(transaction_3M_training_master)

transaction_3M_training_master = transaction_3M_training_master.stack()

transaction_3M_training_master = transaction_3M_training_master.reset_index()

transaction_3M_training_master = transaction_3M_training_master.rename(columns={'level_2':'value','sum':'order_value','count_nonzero':'order_number'})

# creating the extra ABS column (value per order) 
transaction_3M_training_master['abs'] = transaction_3M_training_master['order_value']/transaction_3M_training_master['order_number']

transaction_3M_training_master

Unnamed: 0,customer_id,time_period,value,order_value,order_number,abs
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l12m,price,0.096220,4,0.024055
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l15m,price,0.186356,5,0.037271
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l18m,price,0.111814,3,0.037271
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l2y,price,0.110119,3,0.036706
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l3m,price,0.050831,1,0.050831
...,...,...,...,...,...,...
3842524,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l2y,price,0.416068,24,0.017336
3842525,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l3m,price,0.142203,10,0.014220
3842526,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l9m,price,0.033881,1,0.033881
3842527,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,l6m,price,0.209203,7,0.029886


In [16]:
# drop "value" column
transaction_3M_training_master = transaction_3M_training_master.drop('value', axis=1)

transaction_3M_training_master

Unnamed: 0,customer_id,time_period,order_value,order_number,abs
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l12m,0.096220,4,0.024055
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l15m,0.186356,5,0.037271
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l18m,0.111814,3,0.037271
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l2y,0.110119,3,0.036706
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l3m,0.050831,1,0.050831
...,...,...,...,...,...
3842524,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l2y,0.416068,24,0.017336
3842525,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l3m,0.142203,10,0.014220
3842526,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l9m,0.033881,1,0.033881
3842527,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,l6m,0.209203,7,0.029886


In [17]:
# create fact columns for each data range
transaction_3M_training_master = pd.pivot_table(data=transaction_3M_training_master,index='customer_id',values=['order_value','order_number','abs'],columns='time_period')

transaction_3M_training_master = transaction_3M_training_master.replace(np.nan,0)

transaction_3M_training_master = transaction_3M_training_master.reset_index()

transaction_3M_training_master

Unnamed: 0_level_0,customer_id,abs,abs,abs,abs,abs,abs,abs,order_number,order_number,order_number,order_number,order_number,order_number,order_number,order_value,order_value,order_value,order_value,order_value,order_value,order_value
time_period,Unnamed: 1_level_1,l12m,l15m,l18m,l2y,l3m,l6m,l9m,l12m,l15m,l18m,l2y,l3m,l6m,l9m,l12m,l15m,l18m,l2y,l3m,l6m,l9m
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.024055,0.037271,0.037271,0.036706,0.050831,0.000000,0.018729,4.0,5.0,3.0,3.0,1.0,0.0,5.0,0.096220,0.186356,0.111814,0.110119,0.050831,0.000000,0.093644
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.050831,0.028702,0.036353,0.036141,0.027102,0.024075,0.024872,1.0,18.0,24.0,9.0,1.0,14.0,19.0,0.050831,0.516644,0.872475,0.325271,0.027102,0.337051,0.472559
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.000000,0.000000,0.000000,0.036424,0.061000,0.042864,0.023147,0.0,0.0,0.0,4.0,1.0,10.0,3.0,0.000000,0.000000,0.000000,0.145695,0.061000,0.428644,0.069441
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.000000,0.000000,0.030492,0.000000,0.000000,0.000000,0.000000,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.060983,0.000000,0.000000,0.000000,0.000000
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.051890,0.000000,0.000000,0.027525,0.032186,0.042356,0.000000,2.0,0.0,0.0,4.0,4.0,3.0,0.0,0.103780,0.000000,0.000000,0.110102,0.128746,0.127068,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.007893,0.000000,0.028576,0.000000,0.024418,0.022479,0.035576,3.0,0.0,23.0,0.0,12.0,11.0,2.0,0.023678,0.000000,0.657237,0.000000,0.293017,0.247271,0.071153
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.019588,0.012907,0.026331,0.023900,0.013119,0.022375,0.025365,12.0,8.0,11.0,9.0,8.0,18.0,18.0,0.235051,0.103254,0.289644,0.215102,0.104949,0.402746,0.456576
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.017610,0.020564,0.000000,0.017336,0.014220,0.000000,0.033881,3.0,7.0,0.0,24.0,10.0,0.0,1.0,0.052831,0.143949,0.000000,0.416068,0.142203,0.000000,0.033881
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.000000,0.000000,0.000000,0.000000,0.000000,0.029886,0.000000,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.209203,0.000000


In [18]:
# create column to identify customers who have first order in the last 3 months (churn period) of the dataset
transaction_3M_training_master.columns = [f'{j}_{i}' for i,j in transaction_3M_training_master.columns]

p21m_cols = ['l12m_order_number','l15m_order_number', 'l18m_order_number', 'l2y_order_number', 'l6m_order_number', 'l9m_order_number']

transaction_3M_training_master['p21m_order_number'] = transaction_3M_training_master[list(transaction_3M_training_master[p21m_cols])].sum(axis=1)

# transaction_master[transaction_master.customer_id == 0].iloc[:,:16]
transaction_3M_training_master['new_mb'] = np.where(transaction_3M_training_master['p21m_order_number']==0,1,0)

transaction_3M_training_master['new_mb'].value_counts()

0    1291147
1      71134
Name: new_mb, dtype: int64

In [19]:
# create the churn status column: customers who did not have any order in the latest 3 months

transaction_3M_training_master['churn_status'] = np.where(transaction_3M_training_master['l3m_order_number']==0,1,0)

transaction_3M_training_master = transaction_3M_training_master.rename(columns={'_customer_id':'customer_id'})

transaction_3M_training_master

Unnamed: 0,customer_id,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l3m_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l3m_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l3m_order_value,l6m_order_value,l9m_order_value,p21m_order_number,new_mb,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.024055,0.037271,0.037271,0.036706,0.050831,0.000000,0.018729,4.0,5.0,3.0,3.0,1.0,0.0,5.0,0.096220,0.186356,0.111814,0.110119,0.050831,0.000000,0.093644,20.0,0,0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.050831,0.028702,0.036353,0.036141,0.027102,0.024075,0.024872,1.0,18.0,24.0,9.0,1.0,14.0,19.0,0.050831,0.516644,0.872475,0.325271,0.027102,0.337051,0.472559,85.0,0,0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.000000,0.000000,0.000000,0.036424,0.061000,0.042864,0.023147,0.0,0.0,0.0,4.0,1.0,10.0,3.0,0.000000,0.000000,0.000000,0.145695,0.061000,0.428644,0.069441,17.0,0,0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.000000,0.000000,0.030492,0.000000,0.000000,0.000000,0.000000,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.060983,0.000000,0.000000,0.000000,0.000000,2.0,0,1
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.051890,0.000000,0.000000,0.027525,0.032186,0.042356,0.000000,2.0,0.0,0.0,4.0,4.0,3.0,0.0,0.103780,0.000000,0.000000,0.110102,0.128746,0.127068,0.000000,9.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.007893,0.000000,0.028576,0.000000,0.024418,0.022479,0.035576,3.0,0.0,23.0,0.0,12.0,11.0,2.0,0.023678,0.000000,0.657237,0.000000,0.293017,0.247271,0.071153,39.0,0,0
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.019588,0.012907,0.026331,0.023900,0.013119,0.022375,0.025365,12.0,8.0,11.0,9.0,8.0,18.0,18.0,0.235051,0.103254,0.289644,0.215102,0.104949,0.402746,0.456576,76.0,0,0
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.017610,0.020564,0.000000,0.017336,0.014220,0.000000,0.033881,3.0,7.0,0.0,24.0,10.0,0.0,1.0,0.052831,0.143949,0.000000,0.416068,0.142203,0.000000,0.033881,35.0,0,0
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.000000,0.000000,0.000000,0.000000,0.000000,0.029886,0.000000,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.209203,0.000000,7.0,0,1


In [21]:
# merge transaction df with customer df on customer_id and new_mb columns

transaction_3M_training_master = transaction_3M_training_master.reset_index(drop=True)

customer_3M_training = customer.merge(transaction_3M_training_master[['customer_id','new_mb']], on='customer_id', how='left')

# drop new customer from customer df 
customer_3M_training = customer_3M_training[customer_3M_training['new_mb'] == 0]

customer_3M_training

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,new_mb
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,49.0,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,25.0,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,24.0,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,54.0,0.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,52.0,0.0
...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,24.0,0.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,21.0,0.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,21.0,0.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,18.0,0.0


In [23]:
# merge customer df with master transaction df in full to get all the columns for data training
dataset_3M_training = customer_3M_training.merge(transaction_3M_training_master, on='customer_id', how='left')

dataset_3M_training = dataset_3M_training[dataset_3M_training.new_mb_x == 0] 

dataset_3M_training = dataset_3M_training.reset_index(drop=True)

dataset_3M_training

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,new_mb_x,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l3m_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l3m_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l3m_order_value,l6m_order_value,l9m_order_value,p21m_order_number,new_mb_y,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,49.0,0.0,0.024055,0.037271,0.037271,0.036706,0.050831,0.000000,0.018729,4.0,5.0,3.0,3.0,1.0,0.0,5.0,0.096220,0.186356,0.111814,0.110119,0.050831,0.000000,0.093644,20.0,0,0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,25.0,0.0,0.050831,0.028702,0.036353,0.036141,0.027102,0.024075,0.024872,1.0,18.0,24.0,9.0,1.0,14.0,19.0,0.050831,0.516644,0.872475,0.325271,0.027102,0.337051,0.472559,85.0,0,0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,24.0,0.0,0.000000,0.000000,0.000000,0.036424,0.061000,0.042864,0.023147,0.0,0.0,0.0,4.0,1.0,10.0,3.0,0.000000,0.000000,0.000000,0.145695,0.061000,0.428644,0.069441,17.0,0,0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,54.0,0.0,0.000000,0.000000,0.030492,0.000000,0.000000,0.000000,0.000000,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.060983,0.000000,0.000000,0.000000,0.000000,2.0,0,1
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,52.0,0.0,0.051890,0.000000,0.000000,0.027525,0.032186,0.042356,0.000000,2.0,0.0,0.0,4.0,4.0,3.0,0.0,0.103780,0.000000,0.000000,0.110102,0.128746,0.127068,0.000000,9.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291142,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,24.0,0.0,0.007893,0.000000,0.028576,0.000000,0.024418,0.022479,0.035576,3.0,0.0,23.0,0.0,12.0,11.0,2.0,0.023678,0.000000,0.657237,0.000000,0.293017,0.247271,0.071153,39.0,0,0
1291143,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,21.0,0.0,0.019588,0.012907,0.026331,0.023900,0.013119,0.022375,0.025365,12.0,8.0,11.0,9.0,8.0,18.0,18.0,0.235051,0.103254,0.289644,0.215102,0.104949,0.402746,0.456576,76.0,0,0
1291144,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,21.0,0.0,0.017610,0.020564,0.000000,0.017336,0.014220,0.000000,0.033881,3.0,7.0,0.0,24.0,10.0,0.0,1.0,0.052831,0.143949,0.000000,0.416068,0.142203,0.000000,0.033881,35.0,0,0
1291145,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,18.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.029886,0.000000,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.209203,0.000000,7.0,0,1


In [24]:
# drop L3M data (as these are used for churn prediction)
dataset_3M_training = dataset_3M_training.drop(['l3m_order_number','l3m_order_value','l3m_abs','new_mb_y'], axis=1)

dataset_3M_training = dataset_3M_training.rename(columns={'new_mb_x':'new_mb'})

dataset_3M_training

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,new_mb,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l6m_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,49.0,0.0,0.024055,0.037271,0.037271,0.036706,0.000000,0.018729,4.0,5.0,3.0,3.0,0.0,5.0,0.096220,0.186356,0.111814,0.110119,0.000000,0.093644,20.0,0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,25.0,0.0,0.050831,0.028702,0.036353,0.036141,0.024075,0.024872,1.0,18.0,24.0,9.0,14.0,19.0,0.050831,0.516644,0.872475,0.325271,0.337051,0.472559,85.0,0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,24.0,0.0,0.000000,0.000000,0.000000,0.036424,0.042864,0.023147,0.0,0.0,0.0,4.0,10.0,3.0,0.000000,0.000000,0.000000,0.145695,0.428644,0.069441,17.0,0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,54.0,0.0,0.000000,0.000000,0.030492,0.000000,0.000000,0.000000,0.0,0.0,2.0,0.0,0.0,0.0,0.000000,0.000000,0.060983,0.000000,0.000000,0.000000,2.0,1
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,52.0,0.0,0.051890,0.000000,0.000000,0.027525,0.042356,0.000000,2.0,0.0,0.0,4.0,3.0,0.0,0.103780,0.000000,0.000000,0.110102,0.127068,0.000000,9.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291142,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,24.0,0.0,0.007893,0.000000,0.028576,0.000000,0.022479,0.035576,3.0,0.0,23.0,0.0,11.0,2.0,0.023678,0.000000,0.657237,0.000000,0.247271,0.071153,39.0,0
1291143,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,21.0,0.0,0.019588,0.012907,0.026331,0.023900,0.022375,0.025365,12.0,8.0,11.0,9.0,18.0,18.0,0.235051,0.103254,0.289644,0.215102,0.402746,0.456576,76.0,0
1291144,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,21.0,0.0,0.017610,0.020564,0.000000,0.017336,0.000000,0.033881,3.0,7.0,0.0,24.0,0.0,1.0,0.052831,0.143949,0.000000,0.416068,0.000000,0.033881,35.0,0
1291145,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,18.0,0.0,0.000000,0.000000,0.000000,0.000000,0.029886,0.000000,0.0,0.0,0.0,0.0,7.0,0.0,0.000000,0.000000,0.000000,0.000000,0.209203,0.000000,7.0,1


In [25]:
# import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

In [26]:
# apply MinMaxScaler on columns that need to be normalized
normalize_cols = ['age', 'l12m_abs', 'l15m_abs', 'l18m_abs', 'l2y_abs', 'l6m_abs',
       'l9m_abs', 'l12m_order_number', 'l15m_order_number',
       'l18m_order_number', 'l2y_order_number', 'l6m_order_number',
       'l9m_order_number', 'l12m_order_value', 'l15m_order_value',
       'l18m_order_value', 'l2y_order_value', 'l6m_order_value',
       'l9m_order_value', 'p21m_order_number']

# for col in normalize_cols:
#     scaled = scaler.fit_transform(dataset[col])

dataset_3M_training[normalize_cols] = scaler.fit_transform(dataset_3M_training[normalize_cols])

In [28]:
# Final test on churn status values
dataset_3M_training.churn_status.value_counts(normalize=True)

1    0.653187
0    0.346813
Name: churn_status, dtype: float64

### 2.2 Transaction - Testing - 3M model

In [29]:
# create function to transform date into date-range
def create_time_period_3M_testing(apply_df):

    max_date = max(apply_df.t_dat) 

    l18m = max_date - datetime.timedelta(days=(360+180))
    l15m = max_date - datetime.timedelta(days=360+90)
    l12m = max_date - datetime.timedelta(days=360)
    l9m = max_date - datetime.timedelta(days=270)
    l6m = max_date - datetime.timedelta(days=180)
    l3m = max_date - datetime.timedelta(days=90)


    transaction_conditions = [
                                (apply_df['t_dat'] <= l18m),
                                (apply_df['t_dat'] > l18m) & (apply_df['t_dat'] <= l15m),
                                (apply_df['t_dat'] > l15m) & (apply_df['t_dat'] <= l12m),
                                (apply_df['t_dat'] > l12m) & (apply_df['t_dat'] <= l9m),
                                (apply_df['t_dat'] > l9m) & (apply_df['t_dat'] <= l6m),
                                (apply_df['t_dat'] > l6m) & (apply_df['t_dat'] <= l3m),
                                (apply_df['t_dat'] > l3m)
                            ] 

    values = ['l3m','l2y','l18m','l15m','l12m','l9m','l6m']

    apply_df['time_period'] = np.select(transaction_conditions, values)

    return apply_df

In [31]:
# transaction = pd.read_csv('dataset/transactions_train.csv')

# transaction.t_dat = pd.to_datetime(transaction.t_dat) #, format='%y%m%d')

# apply the function into dataframe
transaction_3M_testing = create_time_period_3M_testing(transaction)

transaction_3M_testing

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,time_period
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,l3m
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,l3m
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,l3m
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,l3m
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,l3m
...,...,...,...,...,...,...
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2,l6m
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2,l6m
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1,l6m
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1,l6m


In [33]:
# create the pivot table - group by customer id for transaction dataframe

transaction_3M_testing_master = pd.pivot_table(transaction_3M_testing, index=['customer_id','time_period'], values=['price'], aggfunc=[np.sum, np.count_nonzero])

transaction_3M_testing_master = pd.DataFrame(transaction_3M_testing_master)

transaction_3M_testing_master = transaction_3M_testing_master.stack()

transaction_3M_testing_master = transaction_3M_testing_master.reset_index()

transaction_3M_testing_master = transaction_3M_testing_master.rename(columns={'level_2':'value','sum':'order_value','count_nonzero':'order_number'})

transaction_3M_testing_master['abs'] = transaction_3M_testing_master['order_value']/transaction_3M_testing_master['order_number']

transaction_3M_testing_master

Unnamed: 0,customer_id,time_period,value,order_value,order_number,abs
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l12m,price,0.093644,5,0.018729
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l15m,price,0.096220,4,0.024055
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l18m,price,0.186356,5,0.037271
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l2y,price,0.111814,3,0.037271
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l3m,price,0.110119,3,0.036706
...,...,...,...,...,...,...
3842524,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l18m,price,0.143949,7,0.020564
3842525,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l3m,price,0.416068,24,0.017336
3842526,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l6m,price,0.142203,10,0.014220
3842527,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,l9m,price,0.209203,7,0.029886


In [34]:
# create fact columns for each data range
transaction_3M_testing_master = pd.pivot_table(data=transaction_3M_testing_master,index='customer_id',values=['order_value','order_number','abs'],columns='time_period')

transaction_3M_testing_master = transaction_3M_testing_master.replace(np.nan,0)

transaction_3M_testing_master = transaction_3M_testing_master.reset_index()

transaction_3M_testing_master

Unnamed: 0_level_0,customer_id,abs,abs,abs,abs,abs,abs,abs,order_number,order_number,order_number,order_number,order_number,order_number,order_number,order_value,order_value,order_value,order_value,order_value,order_value,order_value
time_period,Unnamed: 1_level_1,l12m,l15m,l18m,l2y,l3m,l6m,l9m,l12m,l15m,l18m,l2y,l3m,l6m,l9m,l12m,l15m,l18m,l2y,l3m,l6m,l9m
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.018729,0.024055,0.037271,0.037271,0.036706,0.050831,0.000000,5.0,4.0,5.0,3.0,3.0,1.0,0.0,0.093644,0.096220,0.186356,0.111814,0.110119,0.050831,0.000000
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.024872,0.050831,0.028702,0.036353,0.036141,0.027102,0.024075,19.0,1.0,18.0,24.0,9.0,1.0,14.0,0.472559,0.050831,0.516644,0.872475,0.325271,0.027102,0.337051
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.023147,0.000000,0.000000,0.000000,0.036424,0.061000,0.042864,3.0,0.0,0.0,0.0,4.0,1.0,10.0,0.069441,0.000000,0.000000,0.000000,0.145695,0.061000,0.428644
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.000000,0.000000,0.000000,0.030492,0.000000,0.000000,0.000000,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.060983,0.000000,0.000000,0.000000
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.000000,0.051890,0.000000,0.000000,0.027525,0.032186,0.042356,0.0,2.0,0.0,0.0,4.0,4.0,3.0,0.000000,0.103780,0.000000,0.000000,0.110102,0.128746,0.127068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.035576,0.007893,0.000000,0.028576,0.000000,0.024418,0.022479,2.0,3.0,0.0,23.0,0.0,12.0,11.0,0.071153,0.023678,0.000000,0.657237,0.000000,0.293017,0.247271
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.025365,0.019588,0.012907,0.026331,0.023900,0.013119,0.022375,18.0,12.0,8.0,11.0,9.0,8.0,18.0,0.456576,0.235051,0.103254,0.289644,0.215102,0.104949,0.402746
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.033881,0.017610,0.020564,0.000000,0.017336,0.014220,0.000000,1.0,3.0,7.0,0.0,24.0,10.0,0.0,0.033881,0.052831,0.143949,0.000000,0.416068,0.142203,0.000000
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.029886,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.209203


In [35]:
# rename columns and stack dataset
transaction_3M_testing_master.columns = [f'{j}_{i}' for i,j in transaction_3M_testing_master.columns]

transaction_3M_testing_master

Unnamed: 0,_customer_id,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l3m_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l3m_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l3m_order_value,l6m_order_value,l9m_order_value
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.018729,0.024055,0.037271,0.037271,0.036706,0.050831,0.000000,5.0,4.0,5.0,3.0,3.0,1.0,0.0,0.093644,0.096220,0.186356,0.111814,0.110119,0.050831,0.000000
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.024872,0.050831,0.028702,0.036353,0.036141,0.027102,0.024075,19.0,1.0,18.0,24.0,9.0,1.0,14.0,0.472559,0.050831,0.516644,0.872475,0.325271,0.027102,0.337051
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.023147,0.000000,0.000000,0.000000,0.036424,0.061000,0.042864,3.0,0.0,0.0,0.0,4.0,1.0,10.0,0.069441,0.000000,0.000000,0.000000,0.145695,0.061000,0.428644
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.000000,0.000000,0.000000,0.030492,0.000000,0.000000,0.000000,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.060983,0.000000,0.000000,0.000000
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.000000,0.051890,0.000000,0.000000,0.027525,0.032186,0.042356,0.0,2.0,0.0,0.0,4.0,4.0,3.0,0.000000,0.103780,0.000000,0.000000,0.110102,0.128746,0.127068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.035576,0.007893,0.000000,0.028576,0.000000,0.024418,0.022479,2.0,3.0,0.0,23.0,0.0,12.0,11.0,0.071153,0.023678,0.000000,0.657237,0.000000,0.293017,0.247271
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.025365,0.019588,0.012907,0.026331,0.023900,0.013119,0.022375,18.0,12.0,8.0,11.0,9.0,8.0,18.0,0.456576,0.235051,0.103254,0.289644,0.215102,0.104949,0.402746
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.033881,0.017610,0.020564,0.000000,0.017336,0.014220,0.000000,1.0,3.0,7.0,0.0,24.0,10.0,0.0,0.033881,0.052831,0.143949,0.000000,0.416068,0.142203,0.000000
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.029886,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.209203


In [36]:
# apply fucntion on columns and create churn status colum
p21m_cols = ['l12m_order_number','l15m_order_number', 'l18m_order_number', 'l2y_order_number', 'l6m_order_number', 'l9m_order_number']

transaction_3M_testing_master['p21m_order_number'] = transaction_3M_testing_master[list(transaction_3M_testing_master[p21m_cols])].sum(axis=1)

transaction_3M_testing_master.loc[transaction_3M_testing_master['l6m_order_number']==0,'churn_status'] = 1

transaction_3M_testing_master['churn_status'] = transaction_3M_testing_master['churn_status'].replace(np.nan,0)

transaction_3M_testing_master

Unnamed: 0,_customer_id,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l3m_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l3m_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l3m_order_value,l6m_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.018729,0.024055,0.037271,0.037271,0.036706,0.050831,0.000000,5.0,4.0,5.0,3.0,3.0,1.0,0.0,0.093644,0.096220,0.186356,0.111814,0.110119,0.050831,0.000000,18.0,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.024872,0.050831,0.028702,0.036353,0.036141,0.027102,0.024075,19.0,1.0,18.0,24.0,9.0,1.0,14.0,0.472559,0.050831,0.516644,0.872475,0.325271,0.027102,0.337051,77.0,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.023147,0.000000,0.000000,0.000000,0.036424,0.061000,0.042864,3.0,0.0,0.0,0.0,4.0,1.0,10.0,0.069441,0.000000,0.000000,0.000000,0.145695,0.061000,0.428644,14.0,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.000000,0.000000,0.000000,0.030492,0.000000,0.000000,0.000000,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.060983,0.000000,0.000000,0.000000,2.0,1.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.000000,0.051890,0.000000,0.000000,0.027525,0.032186,0.042356,0.0,2.0,0.0,0.0,4.0,4.0,3.0,0.000000,0.103780,0.000000,0.000000,0.110102,0.128746,0.127068,9.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.035576,0.007893,0.000000,0.028576,0.000000,0.024418,0.022479,2.0,3.0,0.0,23.0,0.0,12.0,11.0,0.071153,0.023678,0.000000,0.657237,0.000000,0.293017,0.247271,51.0,0.0
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.025365,0.019588,0.012907,0.026331,0.023900,0.013119,0.022375,18.0,12.0,8.0,11.0,9.0,8.0,18.0,0.456576,0.235051,0.103254,0.289644,0.215102,0.104949,0.402746,75.0,0.0
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.033881,0.017610,0.020564,0.000000,0.017336,0.014220,0.000000,1.0,3.0,7.0,0.0,24.0,10.0,0.0,0.033881,0.052831,0.143949,0.000000,0.416068,0.142203,0.000000,21.0,0.0
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.029886,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.209203,7.0,1.0


In [3]:
# Final test on churn status values
transaction_3M_testing_master.churn_status.value_counts()

In [38]:
# rename column and reset index of dataframe
transaction_3M_testing_master = transaction_3M_testing_master.rename(columns={'_customer_id':'customer_id'})

transaction_3M_testing_master = transaction_3M_testing_master.reset_index(drop=True)

transaction_3M_testing_master

Unnamed: 0,customer_id,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l3m_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l3m_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l3m_order_value,l6m_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.018729,0.024055,0.037271,0.037271,0.036706,0.050831,0.000000,5.0,4.0,5.0,3.0,3.0,1.0,0.0,0.093644,0.096220,0.186356,0.111814,0.110119,0.050831,0.000000,18.0,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.024872,0.050831,0.028702,0.036353,0.036141,0.027102,0.024075,19.0,1.0,18.0,24.0,9.0,1.0,14.0,0.472559,0.050831,0.516644,0.872475,0.325271,0.027102,0.337051,77.0,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.023147,0.000000,0.000000,0.000000,0.036424,0.061000,0.042864,3.0,0.0,0.0,0.0,4.0,1.0,10.0,0.069441,0.000000,0.000000,0.000000,0.145695,0.061000,0.428644,14.0,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.000000,0.000000,0.000000,0.030492,0.000000,0.000000,0.000000,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.060983,0.000000,0.000000,0.000000,2.0,1.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.000000,0.051890,0.000000,0.000000,0.027525,0.032186,0.042356,0.0,2.0,0.0,0.0,4.0,4.0,3.0,0.000000,0.103780,0.000000,0.000000,0.110102,0.128746,0.127068,9.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.035576,0.007893,0.000000,0.028576,0.000000,0.024418,0.022479,2.0,3.0,0.0,23.0,0.0,12.0,11.0,0.071153,0.023678,0.000000,0.657237,0.000000,0.293017,0.247271,51.0,0.0
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.025365,0.019588,0.012907,0.026331,0.023900,0.013119,0.022375,18.0,12.0,8.0,11.0,9.0,8.0,18.0,0.456576,0.235051,0.103254,0.289644,0.215102,0.104949,0.402746,75.0,0.0
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.033881,0.017610,0.020564,0.000000,0.017336,0.014220,0.000000,1.0,3.0,7.0,0.0,24.0,10.0,0.0,0.033881,0.052831,0.143949,0.000000,0.416068,0.142203,0.000000,21.0,0.0
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.029886,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.209203,7.0,1.0


In [39]:
# merge customer and transaction dataframe
customer_3M_testing = customer.merge(transaction_3M_testing_master[['customer_id']], on='customer_id', how='left')

# customer_3M_testing = customer_3M_testing[customer_3M_testing['new_mb'] == 0]

customer_3M_testing = customer_3M_testing.reset_index(drop=True)

In [40]:
customer_3M_testing

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,49.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,25.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,24.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,54.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,52.0
...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,24.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,21.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,21.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,18.0


In [41]:
# full merge customer and transaction dataframe
dataset_3M_testing = customer_3M_testing.merge(transaction_3M_testing_master, on='customer_id', how='left')

dataset_3M_testing = dataset_3M_testing.reset_index(drop=True)

dataset_3M_testing

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l3m_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l3m_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l3m_order_value,l6m_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,49.0,0.018729,0.024055,0.037271,0.037271,0.036706,0.050831,0.000000,5.0,4.0,5.0,3.0,3.0,1.0,0.0,0.093644,0.096220,0.186356,0.111814,0.110119,0.050831,0.000000,18.0,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,25.0,0.024872,0.050831,0.028702,0.036353,0.036141,0.027102,0.024075,19.0,1.0,18.0,24.0,9.0,1.0,14.0,0.472559,0.050831,0.516644,0.872475,0.325271,0.027102,0.337051,77.0,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,24.0,0.023147,0.000000,0.000000,0.000000,0.036424,0.061000,0.042864,3.0,0.0,0.0,0.0,4.0,1.0,10.0,0.069441,0.000000,0.000000,0.000000,0.145695,0.061000,0.428644,14.0,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,54.0,0.000000,0.000000,0.000000,0.030492,0.000000,0.000000,0.000000,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.060983,0.000000,0.000000,0.000000,2.0,1.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,52.0,0.000000,0.051890,0.000000,0.000000,0.027525,0.032186,0.042356,0.0,2.0,0.0,0.0,4.0,4.0,3.0,0.000000,0.103780,0.000000,0.000000,0.110102,0.128746,0.127068,9.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,24.0,0.035576,0.007893,0.000000,0.028576,0.000000,0.024418,0.022479,2.0,3.0,0.0,23.0,0.0,12.0,11.0,0.071153,0.023678,0.000000,0.657237,0.000000,0.293017,0.247271,51.0,0.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,21.0,0.025365,0.019588,0.012907,0.026331,0.023900,0.013119,0.022375,18.0,12.0,8.0,11.0,9.0,8.0,18.0,0.456576,0.235051,0.103254,0.289644,0.215102,0.104949,0.402746,75.0,0.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,21.0,0.033881,0.017610,0.020564,0.000000,0.017336,0.014220,0.000000,1.0,3.0,7.0,0.0,24.0,10.0,0.0,0.033881,0.052831,0.143949,0.000000,0.416068,0.142203,0.000000,21.0,0.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,18.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.029886,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.209203,7.0,1.0


In [42]:
# normalize dataset 
dataset_3M_testing = dataset_3M_testing.drop(['l3m_order_number','l3m_order_value','l3m_abs'], axis=1)

dataset_3M_testing[normalize_cols] = scaler.fit_transform(dataset_3M_testing[normalize_cols])

dataset_3M_testing


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l6m_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,49.0,0.018729,0.024055,0.037271,0.037271,0.050831,0.000000,5.0,4.0,5.0,3.0,1.0,0.0,0.093644,0.096220,0.186356,0.111814,0.050831,0.000000,18.0,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,25.0,0.024872,0.050831,0.028702,0.036353,0.027102,0.024075,19.0,1.0,18.0,24.0,1.0,14.0,0.472559,0.050831,0.516644,0.872475,0.027102,0.337051,77.0,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,24.0,0.023147,0.000000,0.000000,0.000000,0.061000,0.042864,3.0,0.0,0.0,0.0,1.0,10.0,0.069441,0.000000,0.000000,0.000000,0.061000,0.428644,14.0,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,54.0,0.000000,0.000000,0.000000,0.030492,0.000000,0.000000,0.0,0.0,0.0,2.0,0.0,0.0,0.000000,0.000000,0.000000,0.060983,0.000000,0.000000,2.0,1.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,52.0,0.000000,0.051890,0.000000,0.000000,0.032186,0.042356,0.0,2.0,0.0,0.0,4.0,3.0,0.000000,0.103780,0.000000,0.000000,0.128746,0.127068,9.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,24.0,0.035576,0.007893,0.000000,0.028576,0.024418,0.022479,2.0,3.0,0.0,23.0,12.0,11.0,0.071153,0.023678,0.000000,0.657237,0.293017,0.247271,51.0,0.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,21.0,0.025365,0.019588,0.012907,0.026331,0.013119,0.022375,18.0,12.0,8.0,11.0,8.0,18.0,0.456576,0.235051,0.103254,0.289644,0.104949,0.402746,75.0,0.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,21.0,0.033881,0.017610,0.020564,0.000000,0.014220,0.000000,1.0,3.0,7.0,0.0,10.0,0.0,0.033881,0.052831,0.143949,0.000000,0.142203,0.000000,21.0,0.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,18.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.029886,0.0,0.0,0.0,0.0,0.0,7.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.209203,7.0,1.0


In [45]:
dataset_3M_testing.churn_status.value_counts(normalize=True)

1.0    0.61908
0.0    0.38092
Name: churn_status, dtype: float64

### 2.3 3M Modeling

In [46]:
# Replace N/A values with 0

dataset_3M_training = dataset_3M_training.replace(np.nan,0)

dataset_3M_training = dataset_3M_training.drop(columns='new_mb')

In [47]:
# create X and y for training
response = dataset_3M_training["churn_status"]
cus_id = dataset_3M_training["customer_id"]
traindata = dataset_3M_training.drop(columns=["churn_status","customer_id"],axis=1)

In [48]:
# create ratio for train test set 
ratio = int(round(traindata.shape[0]*0.8,0))

ratio

1032918

In [49]:
# create train test dataset 
X_train = traindata.iloc[:ratio,:]
y_train = response.iloc[:ratio]
cus_id_test = cus_id.iloc[:ratio]

X_test = traindata.iloc[ratio:,:]
y_test = response.iloc[ratio:]
cus_id_test = cus_id.iloc[ratio:]


#to resolve any class imbalance - use stratify parameter.

print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

Number transactions X_train dataset:  (1032918, 24)
Number transactions y_train dataset:  (1032918,)
Number transactions X_test dataset:  (258229, 24)
Number transactions y_test dataset:  (258229,)


#### Using Logistic Regression after testing over many algorithms
Please check "churn_prediction_v2.ipynb" file in draft folder to see model selection part

In [50]:
# Fitting Logistic Regression to the Training set 
classifier = LogisticRegression(random_state = 0, penalty = 'l2', C = 100)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

#Evaluate results
acc = accuracy_score(y_test, y_pred )
prec = precision_score(y_test, y_pred )
rec = recall_score(y_test, y_pred )
f1 = f1_score(y_test, y_pred )
f2 = fbeta_score(y_test, y_pred, beta=2.0)

#probability score
y_pred_probs = classifier.predict_proba(X_test)
y_pred_probs  = y_pred_probs [:, 1]

results = pd.DataFrame([['Logistic Regression', acc, prec, rec, f1, f2]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])

results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score
0,Logistic Regression,0.769809,0.783665,0.894703,0.835511,0.870048


In [51]:
#Revalidate final results with Confusion Matrix:
cm = confusion_matrix(y_test, y_pred) 

pd.crosstab(y_test,y_pred,rownames=['ACTUAL'],colnames=['PRED'])

PRED,0,1
ACTUAL,Unnamed: 1_level_1,Unnamed: 2_level_1
0,47821,41675
1,17767,150966


In [52]:
# create summary for each customer

training_3M = pd.concat([cus_id_test, y_test], axis = 1).dropna()

training_3M['customer_id'] = cus_id_test

training_3M['actual'] = y_test

training_3M['predictions'] = y_pred 

training_3M["churn_prob(%)"] = y_pred_probs 

training_3M["churn_prob(%)"] = training_3M["churn_prob(%)"]*100

training_3M["churn_prob(%)"]=training_3M["churn_prob(%)"].round(2)

training_3M = training_3M[['customer_id', 'actual', 'predictions', 'churn_prob(%)']]

# final_results ['Ranking'] = pd.qcut(final_results['churn_prob(%)'].rank(method = 'first'),10,labels=range(10,0,-1))

training_3M.head(30)

Unnamed: 0,customer_id,actual,predictions,churn_prob(%)
1032918,ccd3b4cbd41990737eeceeacbd3b741b648d9fdb6903fb...,1,1,62.21
1032919,ccd3cb667c480d10255cfe91357b59b78763dcfa7191c3...,1,1,95.27
1032920,ccd3cbb6a9feb94ec5e7ab0b5e424627db5a57d171b258...,1,1,82.89
1032921,ccd3cddfa077d339e08b85a7c4204d12ce7d4010a75d04...,1,1,81.15
1032922,ccd3d2d078601ffe0472232c2d75f4b703c0b51f4097e9...,0,1,57.75
1032923,ccd3e09a7c4b1962b345cebbd70cc28ec16d76e65ac717...,1,1,82.17
1032924,ccd3e529fbe23574d12e560a90bb64d265c5a11fd64d11...,0,1,59.35
1032925,ccd3ecf9786068597a40ec98b3a4ffbfc49eede16994d0...,1,1,73.4
1032926,ccd3f640fbdac62269e4df6513210ffb095f91e7a3a3c3...,1,1,70.7
1032927,ccd4000b383d861f59ee6b6dbc2b4a1ace8de9d8485470...,0,0,48.89


In [53]:
# save trained model to local folder
import pickle as pkl

saved_model = pkl.dumps(classifier)
 
lr_classifier_from_pickle = pkl.loads(saved_model) 

In [54]:
# prepare testing dataset
dataset_3M_testing = dataset_3M_testing.replace(np.nan,0)

dataset_3M_testing

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l6m_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,0.397590,0.036957,0.056998,0.073545,0.088313,0.100301,0.000000,0.020161,0.010336,0.013774,0.008357,0.002994,0.000000,0.009174,0.008549,0.008290,0.010641,0.003047,0.000000,0.012097,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,0.108434,0.049078,0.120442,0.056637,0.086138,0.053478,0.071378,0.076613,0.002584,0.049587,0.066852,0.002994,0.042945,0.046293,0.004516,0.022983,0.083032,0.001625,0.032264,0.051747,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,0.096386,0.045674,0.000000,0.000000,0.000000,0.120368,0.127085,0.012097,0.000000,0.000000,0.000000,0.002994,0.030675,0.006803,0.000000,0.000000,0.000000,0.003657,0.041032,0.009409,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,0.457831,0.000000,0.000000,0.000000,0.072249,0.000000,0.000000,0.000000,0.000000,0.000000,0.005571,0.000000,0.000000,0.000000,0.000000,0.000000,0.005804,0.000000,0.000000,0.001344,1.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,0.433735,0.000000,0.122952,0.000000,0.000000,0.063512,0.125578,0.000000,0.005168,0.000000,0.000000,0.011976,0.009202,0.000000,0.009221,0.000000,0.000000,0.007719,0.012164,0.006048,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,0.096386,0.070201,0.018701,0.000000,0.067709,0.048183,0.066647,0.008065,0.007752,0.000000,0.064067,0.035928,0.033742,0.006970,0.002104,0.000000,0.062548,0.017567,0.023670,0.034274,0.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,0.060241,0.050052,0.046412,0.025468,0.062391,0.025886,0.066337,0.072581,0.031008,0.022039,0.030641,0.023952,0.055215,0.044728,0.020884,0.004593,0.027565,0.006292,0.038553,0.050403,0.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,0.060241,0.066856,0.041727,0.040578,0.000000,0.028060,0.000000,0.004032,0.007752,0.019284,0.000000,0.029940,0.000000,0.003319,0.004694,0.006403,0.000000,0.008525,0.000000,0.014113,0.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,0.024096,0.000000,0.000000,0.000000,0.000000,0.000000,0.088607,0.000000,0.000000,0.000000,0.000000,0.000000,0.021472,0.000000,0.000000,0.000000,0.000000,0.000000,0.020026,0.004704,1.0


In [56]:
X_new = dataset_3M_testing.iloc[:,1:25]

X_new

Unnamed: 0,FN,Active,club_member_status,fashion_news_frequency,age,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l6m_order_value,l9m_order_value,p21m_order_number
0,0.0,0.0,0,0,0.397590,0.036957,0.056998,0.073545,0.088313,0.100301,0.000000,0.020161,0.010336,0.013774,0.008357,0.002994,0.000000,0.009174,0.008549,0.008290,0.010641,0.003047,0.000000,0.012097
1,0.0,0.0,0,0,0.108434,0.049078,0.120442,0.056637,0.086138,0.053478,0.071378,0.076613,0.002584,0.049587,0.066852,0.002994,0.042945,0.046293,0.004516,0.022983,0.083032,0.001625,0.032264,0.051747
2,0.0,0.0,0,0,0.096386,0.045674,0.000000,0.000000,0.000000,0.120368,0.127085,0.012097,0.000000,0.000000,0.000000,0.002994,0.030675,0.006803,0.000000,0.000000,0.000000,0.003657,0.041032,0.009409
3,0.0,0.0,0,0,0.457831,0.000000,0.000000,0.000000,0.072249,0.000000,0.000000,0.000000,0.000000,0.000000,0.005571,0.000000,0.000000,0.000000,0.000000,0.000000,0.005804,0.000000,0.000000,0.001344
4,1.0,1.0,0,1,0.433735,0.000000,0.122952,0.000000,0.000000,0.063512,0.125578,0.000000,0.005168,0.000000,0.000000,0.011976,0.009202,0.000000,0.009221,0.000000,0.000000,0.007719,0.012164,0.006048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,0.0,0.0,0,0,0.096386,0.070201,0.018701,0.000000,0.067709,0.048183,0.066647,0.008065,0.007752,0.000000,0.064067,0.035928,0.033742,0.006970,0.002104,0.000000,0.062548,0.017567,0.023670,0.034274
1371976,0.0,0.0,0,0,0.060241,0.050052,0.046412,0.025468,0.062391,0.025886,0.066337,0.072581,0.031008,0.022039,0.030641,0.023952,0.055215,0.044728,0.020884,0.004593,0.027565,0.006292,0.038553,0.050403
1371977,1.0,1.0,0,1,0.060241,0.066856,0.041727,0.040578,0.000000,0.028060,0.000000,0.004032,0.007752,0.019284,0.000000,0.029940,0.000000,0.003319,0.004694,0.006403,0.000000,0.008525,0.000000,0.014113
1371978,1.0,1.0,0,1,0.024096,0.000000,0.000000,0.000000,0.000000,0.000000,0.088607,0.000000,0.000000,0.000000,0.000000,0.000000,0.021472,0.000000,0.000000,0.000000,0.000000,0.000000,0.020026,0.004704


In [57]:
# apply trained model on test dataset
new_churn_prediction = lr_classifier_from_pickle.predict(X_new)

new_churn_prediction

array([0, 0, 0, ..., 0, 1, 1])

In [58]:
# predict churn probability for each customer in the nrew period
new_churn_proba = lr_classifier_from_pickle.predict_proba(X_new)
new_churn_proba = new_churn_proba[:,1]
new_churn_proba

array([0.45467205, 0.07384569, 0.33129326, ..., 0.38203632, 0.65374343,
       0.9090894 ])

In [59]:
# add churn prediction column to original dataframe
dataset_3M_testing['new_churn_prediction'] = new_churn_prediction
dataset_3M_testing['new_churn_proba'] = new_churn_proba

dataset_3M_testing

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l6m_order_value,l9m_order_value,p21m_order_number,churn_status,new_churn_prediction,new_churn_proba
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,0.397590,0.036957,0.056998,0.073545,0.088313,0.100301,0.000000,0.020161,0.010336,0.013774,0.008357,0.002994,0.000000,0.009174,0.008549,0.008290,0.010641,0.003047,0.000000,0.012097,0.0,0,0.454672
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,0.108434,0.049078,0.120442,0.056637,0.086138,0.053478,0.071378,0.076613,0.002584,0.049587,0.066852,0.002994,0.042945,0.046293,0.004516,0.022983,0.083032,0.001625,0.032264,0.051747,0.0,0,0.073846
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,0.096386,0.045674,0.000000,0.000000,0.000000,0.120368,0.127085,0.012097,0.000000,0.000000,0.000000,0.002994,0.030675,0.006803,0.000000,0.000000,0.000000,0.003657,0.041032,0.009409,0.0,0,0.331293
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,0.457831,0.000000,0.000000,0.000000,0.072249,0.000000,0.000000,0.000000,0.000000,0.000000,0.005571,0.000000,0.000000,0.000000,0.000000,0.000000,0.005804,0.000000,0.000000,0.001344,1.0,1,0.906447
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,0.433735,0.000000,0.122952,0.000000,0.000000,0.063512,0.125578,0.000000,0.005168,0.000000,0.000000,0.011976,0.009202,0.000000,0.009221,0.000000,0.000000,0.007719,0.012164,0.006048,0.0,0,0.315325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,0.096386,0.070201,0.018701,0.000000,0.067709,0.048183,0.066647,0.008065,0.007752,0.000000,0.064067,0.035928,0.033742,0.006970,0.002104,0.000000,0.062548,0.017567,0.023670,0.034274,0.0,0,0.272188
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,0.060241,0.050052,0.046412,0.025468,0.062391,0.025886,0.066337,0.072581,0.031008,0.022039,0.030641,0.023952,0.055215,0.044728,0.020884,0.004593,0.027565,0.006292,0.038553,0.050403,0.0,0,0.078723
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,0.060241,0.066856,0.041727,0.040578,0.000000,0.028060,0.000000,0.004032,0.007752,0.019284,0.000000,0.029940,0.000000,0.003319,0.004694,0.006403,0.000000,0.008525,0.000000,0.014113,0.0,0,0.382036
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,0.024096,0.000000,0.000000,0.000000,0.000000,0.000000,0.088607,0.000000,0.000000,0.000000,0.000000,0.000000,0.021472,0.000000,0.000000,0.000000,0.000000,0.000000,0.020026,0.004704,1.0,1,0.653743


In [60]:
# create churn probability for each customer
results_3M = dataset_3M_testing[['customer_id','new_churn_proba']]

results_3M = results_3M.rename(columns={'new_churn_proba':'churn_proba_3M'})

results_3M = results_3M.reset_index(drop=True)

results_3M

Unnamed: 0,customer_id,churn_proba_3M
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.454672
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.073846
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.331293
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.906447
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.315325
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.272188
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.078723
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.382036
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.653743


### End of notebook

# Testing -  Transaction 6M 
We tried to change the churn definition to 6 months however it does not have as high accuracy as the 3-month model

==========> This part is not used for the customer segmentation model

In [62]:

# transaction.t_dat = pd.to_datetime(transaction.t_dat) #, format='%y%m%d')

# transaction.sort_values(['t_dat', 'customer_id'], inplace=True)

# transaction = transaction.reset_index(drop=True)

### 3.1 Transaction - Training - 6M model

In [63]:
def create_time_period_6M_training(apply_df):

    max_date = max(apply_df.t_dat) 

    l18m = max_date - datetime.timedelta(days=(360+180))
    l15m = max_date - datetime.timedelta(days=360+90)
    l12m = max_date - datetime.timedelta(days=360)
    l9m = max_date - datetime.timedelta(days=270)
    l6m = max_date - datetime.timedelta(days=180)
    # l3m = max_date - datetime.timedelta(days=90)

    transaction_conditions = [
                            (apply_df['t_dat'] <= l18m),
                            (apply_df['t_dat'] > l18m) & (apply_df['t_dat'] <= l15m),
                            (apply_df['t_dat'] > l15m) & (apply_df['t_dat'] <= l12m),
                            (apply_df['t_dat'] > l12m) & (apply_df['t_dat'] <= l9m),
                            (apply_df['t_dat'] > l9m) & (apply_df['t_dat'] <= l6m),
                            # (apply_df['t_dat'] > l6m) & (apply_df['t_dat'] <= l3m),
                            (apply_df['t_dat'] > l6m)
                        ] 

    values = ['l2y','l18m','l15m','l12m','l9m','l6m']

    apply_df['time_period'] = np.select(transaction_conditions, values)

    return apply_df

In [64]:
transaction_6M_training = create_time_period_6M_training(transaction)

transaction_6M_training['customer_id'][0]


'000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318'

In [65]:
transaction_6M_training_master = pd.pivot_table(transaction_6M_training, index=['customer_id','time_period'], values=['price'], aggfunc=[np.sum, np.count_nonzero])

transaction_6M_training_master = pd.DataFrame(transaction_6M_training_master)

transaction_6M_training_master = transaction_6M_training_master.stack()

transaction_6M_training_master = transaction_6M_training_master.reset_index()

transaction_6M_training_master = transaction_6M_training_master.rename(columns={'level_2':'value','sum':'order_value','count_nonzero':'order_number'})

transaction_6M_training_master['abs'] = transaction_6M_training_master['order_value']/transaction_6M_training_master['order_number']

transaction_6M_training_master

Unnamed: 0,customer_id,time_period,value,order_value,order_number,abs
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l12m,price,0.096220,4,0.024055
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l15m,price,0.186356,5,0.037271
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l18m,price,0.111814,3,0.037271
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l2y,price,0.110119,3,0.036706
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l6m,price,0.050831,1,0.050831
...,...,...,...,...,...,...
3543560,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l2y,price,0.416068,24,0.017336
3543561,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l6m,price,0.142203,10,0.014220
3543562,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l9m,price,0.033881,1,0.033881
3543563,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,l6m,price,0.209203,7,0.029886


In [66]:
transaction_6M_training_master = transaction_6M_training_master.drop('value', axis=1)

transaction_6M_training_master

Unnamed: 0,customer_id,time_period,order_value,order_number,abs
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l12m,0.096220,4,0.024055
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l15m,0.186356,5,0.037271
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l18m,0.111814,3,0.037271
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l2y,0.110119,3,0.036706
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l6m,0.050831,1,0.050831
...,...,...,...,...,...
3543560,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l2y,0.416068,24,0.017336
3543561,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l6m,0.142203,10,0.014220
3543562,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l9m,0.033881,1,0.033881
3543563,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,l6m,0.209203,7,0.029886


In [67]:
transaction_6M_training_master = pd.pivot_table(data=transaction_6M_training_master,index='customer_id',values=['order_value','order_number','abs'],columns='time_period')

transaction_6M_training_master = transaction_6M_training_master.replace(np.nan,0)

transaction_6M_training_master = transaction_6M_training_master.reset_index()

transaction_6M_training_master

Unnamed: 0_level_0,customer_id,abs,abs,abs,abs,abs,abs,order_number,order_number,order_number,order_number,order_number,order_number,order_value,order_value,order_value,order_value,order_value,order_value
time_period,Unnamed: 1_level_1,l12m,l15m,l18m,l2y,l6m,l9m,l12m,l15m,l18m,l2y,l6m,l9m,l12m,l15m,l18m,l2y,l6m,l9m
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.024055,0.037271,0.037271,0.036706,0.050831,0.018729,4.0,5.0,3.0,3.0,1.0,5.0,0.096220,0.186356,0.111814,0.110119,0.050831,0.093644
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.050831,0.028702,0.036353,0.036141,0.024277,0.024872,1.0,18.0,24.0,9.0,15.0,19.0,0.050831,0.516644,0.872475,0.325271,0.364153,0.472559
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.000000,0.000000,0.000000,0.036424,0.044513,0.023147,0.0,0.0,0.0,4.0,11.0,3.0,0.000000,0.000000,0.000000,0.145695,0.489644,0.069441
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.000000,0.000000,0.030492,0.000000,0.000000,0.000000,0.0,0.0,2.0,0.0,0.0,0.0,0.000000,0.000000,0.060983,0.000000,0.000000,0.000000
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.051890,0.000000,0.000000,0.027525,0.036545,0.000000,2.0,0.0,0.0,4.0,7.0,0.0,0.103780,0.000000,0.000000,0.110102,0.255814,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.007893,0.000000,0.028576,0.000000,0.023491,0.035576,3.0,0.0,23.0,0.0,23.0,2.0,0.023678,0.000000,0.657237,0.000000,0.540288,0.071153
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.019588,0.012907,0.026331,0.023900,0.019527,0.025365,12.0,8.0,11.0,9.0,26.0,18.0,0.235051,0.103254,0.289644,0.215102,0.507695,0.456576
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.017610,0.020564,0.000000,0.017336,0.014220,0.033881,3.0,7.0,0.0,24.0,10.0,1.0,0.052831,0.143949,0.000000,0.416068,0.142203,0.033881
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.000000,0.000000,0.000000,0.000000,0.029886,0.000000,0.0,0.0,0.0,0.0,7.0,0.0,0.000000,0.000000,0.000000,0.000000,0.209203,0.000000


In [68]:
transaction_6M_training_master.columns = [f'{j}_{i}' for i,j in transaction_6M_training_master.columns]

p21m_cols = ['l12m_order_number','l15m_order_number', 'l18m_order_number', 'l2y_order_number', 'l9m_order_number']

transaction_6M_training_master['p21m_order_number'] = transaction_6M_training_master[list(transaction_6M_training_master[p21m_cols])].sum(axis=1)

# transaction_master[transaction_master.customer_id == 0].iloc[:,:16]
transaction_6M_training_master['new_mb'] = np.where(transaction_6M_training_master['p21m_order_number']==0,1,0)

transaction_6M_training_master['new_mb'].value_counts()

0    1189917
1     172364
Name: new_mb, dtype: int64

In [69]:
# transaction_master = transaction_master[transaction_master['new_mb'] == 0]

transaction_6M_training_master['churn_status'] = np.where(transaction_6M_training_master['l6m_order_number']==0,1,0)

transaction_6M_training_master = transaction_6M_training_master.rename(columns={'_customer_id':'customer_id'})

transaction_6M_training_master

Unnamed: 0,customer_id,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l6m_order_value,l9m_order_value,p21m_order_number,new_mb,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.024055,0.037271,0.037271,0.036706,0.050831,0.018729,4.0,5.0,3.0,3.0,1.0,5.0,0.096220,0.186356,0.111814,0.110119,0.050831,0.093644,20.0,0,0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.050831,0.028702,0.036353,0.036141,0.024277,0.024872,1.0,18.0,24.0,9.0,15.0,19.0,0.050831,0.516644,0.872475,0.325271,0.364153,0.472559,71.0,0,0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.000000,0.000000,0.000000,0.036424,0.044513,0.023147,0.0,0.0,0.0,4.0,11.0,3.0,0.000000,0.000000,0.000000,0.145695,0.489644,0.069441,7.0,0,0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.000000,0.000000,0.030492,0.000000,0.000000,0.000000,0.0,0.0,2.0,0.0,0.0,0.0,0.000000,0.000000,0.060983,0.000000,0.000000,0.000000,2.0,0,1
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.051890,0.000000,0.000000,0.027525,0.036545,0.000000,2.0,0.0,0.0,4.0,7.0,0.0,0.103780,0.000000,0.000000,0.110102,0.255814,0.000000,6.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.007893,0.000000,0.028576,0.000000,0.023491,0.035576,3.0,0.0,23.0,0.0,23.0,2.0,0.023678,0.000000,0.657237,0.000000,0.540288,0.071153,28.0,0,0
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.019588,0.012907,0.026331,0.023900,0.019527,0.025365,12.0,8.0,11.0,9.0,26.0,18.0,0.235051,0.103254,0.289644,0.215102,0.507695,0.456576,58.0,0,0
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.017610,0.020564,0.000000,0.017336,0.014220,0.033881,3.0,7.0,0.0,24.0,10.0,1.0,0.052831,0.143949,0.000000,0.416068,0.142203,0.033881,35.0,0,0
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.000000,0.000000,0.000000,0.000000,0.029886,0.000000,0.0,0.0,0.0,0.0,7.0,0.0,0.000000,0.000000,0.000000,0.000000,0.209203,0.000000,0.0,1,0


In [70]:
transaction_6M_training_master = transaction_6M_training_master.reset_index(drop=True)

In [71]:
customer_6M_training = customer.merge(transaction_6M_training_master[['customer_id','new_mb']], on='customer_id', how='left')

customer_6M_training = customer_6M_training[customer_6M_training['new_mb'] == 0]

customer_6M_training

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,new_mb
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,49.0,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,25.0,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,24.0,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,54.0,0.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,52.0,0.0
...,...,...,...,...,...,...,...
1371974,ffffaff3905b803d1c7e153a1378a5151e1f34f236ba54...,1.0,1.0,0,1,21.0,0.0
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,24.0,0.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,21.0,0.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,21.0,0.0


In [72]:
customer_6M_training = customer_6M_training.reset_index(drop=True)

In [73]:
dataset_6M_training = customer_6M_training.merge(transaction_6M_training_master, on='customer_id', how='left')

dataset_6M_training = dataset_6M_training[dataset_6M_training.new_mb_x == 0] 

dataset_6M_training = dataset_6M_training.reset_index(drop=True)

dataset_6M_training

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,new_mb_x,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l6m_order_value,l9m_order_value,p21m_order_number,new_mb_y,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,49.0,0.0,0.024055,0.037271,0.037271,0.036706,0.050831,0.018729,4.0,5.0,3.0,3.0,1.0,5.0,0.096220,0.186356,0.111814,0.110119,0.050831,0.093644,20.0,0,0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,25.0,0.0,0.050831,0.028702,0.036353,0.036141,0.024277,0.024872,1.0,18.0,24.0,9.0,15.0,19.0,0.050831,0.516644,0.872475,0.325271,0.364153,0.472559,71.0,0,0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,24.0,0.0,0.000000,0.000000,0.000000,0.036424,0.044513,0.023147,0.0,0.0,0.0,4.0,11.0,3.0,0.000000,0.000000,0.000000,0.145695,0.489644,0.069441,7.0,0,0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,54.0,0.0,0.000000,0.000000,0.030492,0.000000,0.000000,0.000000,0.0,0.0,2.0,0.0,0.0,0.0,0.000000,0.000000,0.060983,0.000000,0.000000,0.000000,2.0,0,1
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,52.0,0.0,0.051890,0.000000,0.000000,0.027525,0.036545,0.000000,2.0,0.0,0.0,4.0,7.0,0.0,0.103780,0.000000,0.000000,0.110102,0.255814,0.000000,6.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1189912,ffffaff3905b803d1c7e153a1378a5151e1f34f236ba54...,1.0,1.0,0,1,21.0,0.0,0.000000,0.000000,0.000000,0.122017,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.122017,0.000000,0.000000,1.0,0,1
1189913,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,24.0,0.0,0.007893,0.000000,0.028576,0.000000,0.023491,0.035576,3.0,0.0,23.0,0.0,23.0,2.0,0.023678,0.000000,0.657237,0.000000,0.540288,0.071153,28.0,0,0
1189914,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,21.0,0.0,0.019588,0.012907,0.026331,0.023900,0.019527,0.025365,12.0,8.0,11.0,9.0,26.0,18.0,0.235051,0.103254,0.289644,0.215102,0.507695,0.456576,58.0,0,0
1189915,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,21.0,0.0,0.017610,0.020564,0.000000,0.017336,0.014220,0.033881,3.0,7.0,0.0,24.0,10.0,1.0,0.052831,0.143949,0.000000,0.416068,0.142203,0.033881,35.0,0,0


In [74]:
dataset_6M_training = dataset_6M_training.drop(['l6m_order_number','l6m_order_value','l6m_abs','new_mb_y'], axis=1)

dataset_6M_training = dataset_6M_training.rename(columns={'new_mb_x':'new_mb'})

dataset_6M_training

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,new_mb,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,49.0,0.0,0.024055,0.037271,0.037271,0.036706,0.018729,4.0,5.0,3.0,3.0,5.0,0.096220,0.186356,0.111814,0.110119,0.093644,20.0,0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,25.0,0.0,0.050831,0.028702,0.036353,0.036141,0.024872,1.0,18.0,24.0,9.0,19.0,0.050831,0.516644,0.872475,0.325271,0.472559,71.0,0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,24.0,0.0,0.000000,0.000000,0.000000,0.036424,0.023147,0.0,0.0,0.0,4.0,3.0,0.000000,0.000000,0.000000,0.145695,0.069441,7.0,0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,54.0,0.0,0.000000,0.000000,0.030492,0.000000,0.000000,0.0,0.0,2.0,0.0,0.0,0.000000,0.000000,0.060983,0.000000,0.000000,2.0,1
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,52.0,0.0,0.051890,0.000000,0.000000,0.027525,0.000000,2.0,0.0,0.0,4.0,0.0,0.103780,0.000000,0.000000,0.110102,0.000000,6.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1189912,ffffaff3905b803d1c7e153a1378a5151e1f34f236ba54...,1.0,1.0,0,1,21.0,0.0,0.000000,0.000000,0.000000,0.122017,0.000000,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0.122017,0.000000,1.0,1
1189913,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,24.0,0.0,0.007893,0.000000,0.028576,0.000000,0.035576,3.0,0.0,23.0,0.0,2.0,0.023678,0.000000,0.657237,0.000000,0.071153,28.0,0
1189914,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,21.0,0.0,0.019588,0.012907,0.026331,0.023900,0.025365,12.0,8.0,11.0,9.0,18.0,0.235051,0.103254,0.289644,0.215102,0.456576,58.0,0
1189915,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,21.0,0.0,0.017610,0.020564,0.000000,0.017336,0.033881,3.0,7.0,0.0,24.0,1.0,0.052831,0.143949,0.000000,0.416068,0.033881,35.0,0


In [76]:
normalize_cols = ['age', 'l12m_abs', 'l15m_abs', 'l18m_abs', 'l2y_abs',
       'l9m_abs', 'l12m_order_number', 'l15m_order_number','l18m_order_number', 
       'l2y_order_number', 'l9m_order_number', 'l12m_order_value', 'l15m_order_value',
       'l18m_order_value', 'l2y_order_value', 'l9m_order_value', 'p21m_order_number']

# for col in normalize_cols:
#     scaled = scaler.fit_transform(dataset[col])

dataset_6M_training[normalize_cols] = scaler.fit_transform(dataset_6M_training[normalize_cols])

In [77]:
dataset_6M_training.churn_status.value_counts(normalize=True)

1    0.522276
0    0.477724
Name: churn_status, dtype: float64

In [78]:
dataset_6M_training =  dataset_6M_training.drop(columns='new_mb')

dataset_6M_training.shape

(1189917, 23)

In [79]:
360+180

540

### 3.2 Transaction - Testing - 6M model

In [80]:
def create_time_period_6M_testing(apply_df):

    max_date = max(apply_df.t_dat) 

    l18m = max_date - datetime.timedelta(days=(360+180))
    # l15m = max_date - datetime.timedelta(days=360+90)
    l12m = max_date - datetime.timedelta(days=360)
    l9m = max_date - datetime.timedelta(days=270)
    l6m = max_date - datetime.timedelta(days=180)
    l3m = max_date - datetime.timedelta(days=90)

    transaction_conditions = [
                            (apply_df['t_dat'] <= l18m),
                            (apply_df['t_dat'] > l18m) & (apply_df['t_dat'] <= l12m),
                            # (apply_df['t_dat'] > l15m) & (apply_df['t_dat'] <= l12m),
                            (apply_df['t_dat'] > l12m) & (apply_df['t_dat'] <= l9m),
                            (apply_df['t_dat'] > l9m) & (apply_df['t_dat'] <= l6m),
                            (apply_df['t_dat'] > l6m) & (apply_df['t_dat'] <= l3m),
                            (apply_df['t_dat'] > l3m)
                        ] 

    values = ['l6m','l2y','l18m','l15m','l12m','l9m']

    apply_df['time_period'] = np.select(transaction_conditions, values)

    return apply_df

In [81]:
transaction_6M_testing = create_time_period_6M_testing(transaction)

transaction_6M_testing

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,time_period
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,l6m
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,l6m
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,l6m
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,l6m
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,l6m
...,...,...,...,...,...,...
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2,l9m
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2,l9m
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1,l9m
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1,l9m


In [82]:
transaction_6M_testing_master = pd.pivot_table(transaction_6M_testing, index=['customer_id','time_period'], values=['price'], aggfunc=[np.sum, np.count_nonzero])

In [83]:
transaction_6M_testing_master = pd.DataFrame(transaction_6M_testing_master)

transaction_6M_testing_master = transaction_6M_testing_master.stack()

transaction_6M_testing_master = transaction_6M_testing_master.reset_index()

transaction_6M_testing_master = transaction_6M_testing_master.rename(columns={'level_2':'value','sum':'order_value','count_nonzero':'order_number'})

transaction_6M_testing_master['abs'] = transaction_6M_testing_master['order_value']/transaction_6M_testing_master['order_number']

transaction_6M_testing_master

Unnamed: 0,customer_id,time_period,value,order_value,order_number,abs
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l15m,price,0.093644,5,0.018729
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l18m,price,0.096220,4,0.024055
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l2y,price,0.298169,8,0.037271
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l6m,price,0.110119,3,0.036706
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,l9m,price,0.050831,1,0.050831
...,...,...,...,...,...,...
3516796,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l2y,price,0.143949,7,0.020564
3516797,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l6m,price,0.416068,24,0.017336
3516798,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,l9m,price,0.142203,10,0.014220
3516799,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,l12m,price,0.209203,7,0.029886


In [84]:
transaction_6M_testing_master = pd.pivot_table(data=transaction_6M_testing_master,index='customer_id',values=['order_value','order_number','abs'],columns='time_period')

transaction_6M_testing_master = transaction_6M_testing_master.replace(np.nan,0)

transaction_6M_testing_master = transaction_6M_testing_master.reset_index()

transaction_6M_testing_master

Unnamed: 0_level_0,customer_id,abs,abs,abs,abs,abs,abs,order_number,order_number,order_number,order_number,order_number,order_number,order_value,order_value,order_value,order_value,order_value,order_value
time_period,Unnamed: 1_level_1,l12m,l15m,l18m,l2y,l6m,l9m,l12m,l15m,l18m,l2y,l6m,l9m,l12m,l15m,l18m,l2y,l6m,l9m
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.000000,0.018729,0.024055,0.037271,0.036706,0.050831,0.0,5.0,4.0,8.0,3.0,1.0,0.000000,0.093644,0.096220,0.298169,0.110119,0.050831
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.024075,0.024872,0.050831,0.033074,0.036141,0.027102,14.0,19.0,1.0,42.0,9.0,1.0,0.337051,0.472559,0.050831,1.389119,0.325271,0.027102
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.042864,0.023147,0.000000,0.000000,0.036424,0.061000,10.0,3.0,0.0,0.0,4.0,1.0,0.428644,0.069441,0.000000,0.000000,0.145695,0.061000
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.000000,0.000000,0.000000,0.030492,0.000000,0.000000,0.0,0.0,0.0,2.0,0.0,0.0,0.000000,0.000000,0.000000,0.060983,0.000000,0.000000
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.042356,0.000000,0.051890,0.000000,0.027525,0.032186,3.0,0.0,2.0,0.0,4.0,4.0,0.127068,0.000000,0.103780,0.000000,0.110102,0.128746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.022479,0.035576,0.007893,0.028576,0.000000,0.024418,11.0,2.0,3.0,23.0,0.0,12.0,0.247271,0.071153,0.023678,0.657237,0.000000,0.293017
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.022375,0.025365,0.019588,0.020679,0.023900,0.013119,18.0,18.0,12.0,19.0,9.0,8.0,0.402746,0.456576,0.235051,0.392898,0.215102,0.104949
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.000000,0.033881,0.017610,0.020564,0.017336,0.014220,0.0,1.0,3.0,7.0,24.0,10.0,0.000000,0.033881,0.052831,0.143949,0.416068,0.142203
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.029886,0.000000,0.000000,0.000000,0.000000,0.000000,7.0,0.0,0.0,0.0,0.0,0.0,0.209203,0.000000,0.000000,0.000000,0.000000,0.000000


In [85]:
transaction_6M_testing_master.columns = [f'{j}_{i}' for i,j in transaction_6M_testing_master.columns]

transaction_6M_testing_master

Unnamed: 0,_customer_id,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l6m_order_value,l9m_order_value
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.000000,0.018729,0.024055,0.037271,0.036706,0.050831,0.0,5.0,4.0,8.0,3.0,1.0,0.000000,0.093644,0.096220,0.298169,0.110119,0.050831
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.024075,0.024872,0.050831,0.033074,0.036141,0.027102,14.0,19.0,1.0,42.0,9.0,1.0,0.337051,0.472559,0.050831,1.389119,0.325271,0.027102
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.042864,0.023147,0.000000,0.000000,0.036424,0.061000,10.0,3.0,0.0,0.0,4.0,1.0,0.428644,0.069441,0.000000,0.000000,0.145695,0.061000
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.000000,0.000000,0.000000,0.030492,0.000000,0.000000,0.0,0.0,0.0,2.0,0.0,0.0,0.000000,0.000000,0.000000,0.060983,0.000000,0.000000
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.042356,0.000000,0.051890,0.000000,0.027525,0.032186,3.0,0.0,2.0,0.0,4.0,4.0,0.127068,0.000000,0.103780,0.000000,0.110102,0.128746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.022479,0.035576,0.007893,0.028576,0.000000,0.024418,11.0,2.0,3.0,23.0,0.0,12.0,0.247271,0.071153,0.023678,0.657237,0.000000,0.293017
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.022375,0.025365,0.019588,0.020679,0.023900,0.013119,18.0,18.0,12.0,19.0,9.0,8.0,0.402746,0.456576,0.235051,0.392898,0.215102,0.104949
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.000000,0.033881,0.017610,0.020564,0.017336,0.014220,0.0,1.0,3.0,7.0,24.0,10.0,0.000000,0.033881,0.052831,0.143949,0.416068,0.142203
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.029886,0.000000,0.000000,0.000000,0.000000,0.000000,7.0,0.0,0.0,0.0,0.0,0.0,0.209203,0.000000,0.000000,0.000000,0.000000,0.000000


In [86]:
p21m_cols = ['l12m_order_number','l15m_order_number', 'l18m_order_number', 'l2y_order_number', 'l6m_order_number', 'l9m_order_number']

transaction_6M_testing_master['p21m_order_number'] = transaction_6M_testing_master[list(transaction_6M_testing_master[p21m_cols])].sum(axis=1)

transaction_6M_testing_master.loc[(transaction_6M_testing_master['l12m_order_number']==0) & (transaction_6M_testing_master['l9m_order_number']==0) ,'churn_status'] = 1

transaction_6M_testing_master['churn_status'] = transaction_6M_testing_master['churn_status'].replace(np.nan,0)

transaction_6M_testing_master

Unnamed: 0,_customer_id,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l6m_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.000000,0.018729,0.024055,0.037271,0.036706,0.050831,0.0,5.0,4.0,8.0,3.0,1.0,0.000000,0.093644,0.096220,0.298169,0.110119,0.050831,21.0,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.024075,0.024872,0.050831,0.033074,0.036141,0.027102,14.0,19.0,1.0,42.0,9.0,1.0,0.337051,0.472559,0.050831,1.389119,0.325271,0.027102,86.0,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.042864,0.023147,0.000000,0.000000,0.036424,0.061000,10.0,3.0,0.0,0.0,4.0,1.0,0.428644,0.069441,0.000000,0.000000,0.145695,0.061000,18.0,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.000000,0.000000,0.000000,0.030492,0.000000,0.000000,0.0,0.0,0.0,2.0,0.0,0.0,0.000000,0.000000,0.000000,0.060983,0.000000,0.000000,2.0,1.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.042356,0.000000,0.051890,0.000000,0.027525,0.032186,3.0,0.0,2.0,0.0,4.0,4.0,0.127068,0.000000,0.103780,0.000000,0.110102,0.128746,13.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.022479,0.035576,0.007893,0.028576,0.000000,0.024418,11.0,2.0,3.0,23.0,0.0,12.0,0.247271,0.071153,0.023678,0.657237,0.000000,0.293017,51.0,0.0
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.022375,0.025365,0.019588,0.020679,0.023900,0.013119,18.0,18.0,12.0,19.0,9.0,8.0,0.402746,0.456576,0.235051,0.392898,0.215102,0.104949,84.0,0.0
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.000000,0.033881,0.017610,0.020564,0.017336,0.014220,0.0,1.0,3.0,7.0,24.0,10.0,0.000000,0.033881,0.052831,0.143949,0.416068,0.142203,45.0,0.0
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.029886,0.000000,0.000000,0.000000,0.000000,0.000000,7.0,0.0,0.0,0.0,0.0,0.0,0.209203,0.000000,0.000000,0.000000,0.000000,0.000000,7.0,0.0


In [87]:
transaction_6M_testing_master.churn_status.value_counts()

0.0    740816
1.0    621465
Name: churn_status, dtype: int64

In [88]:
transaction_6M_testing_master = transaction_6M_testing_master.rename(columns={'_customer_id':'customer_id'})

transaction_6M_testing_master = transaction_6M_testing_master.reset_index(drop=True)

transaction_6M_testing_master

Unnamed: 0,customer_id,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l6m_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.000000,0.018729,0.024055,0.037271,0.036706,0.050831,0.0,5.0,4.0,8.0,3.0,1.0,0.000000,0.093644,0.096220,0.298169,0.110119,0.050831,21.0,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.024075,0.024872,0.050831,0.033074,0.036141,0.027102,14.0,19.0,1.0,42.0,9.0,1.0,0.337051,0.472559,0.050831,1.389119,0.325271,0.027102,86.0,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.042864,0.023147,0.000000,0.000000,0.036424,0.061000,10.0,3.0,0.0,0.0,4.0,1.0,0.428644,0.069441,0.000000,0.000000,0.145695,0.061000,18.0,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.000000,0.000000,0.000000,0.030492,0.000000,0.000000,0.0,0.0,0.0,2.0,0.0,0.0,0.000000,0.000000,0.000000,0.060983,0.000000,0.000000,2.0,1.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.042356,0.000000,0.051890,0.000000,0.027525,0.032186,3.0,0.0,2.0,0.0,4.0,4.0,0.127068,0.000000,0.103780,0.000000,0.110102,0.128746,13.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.022479,0.035576,0.007893,0.028576,0.000000,0.024418,11.0,2.0,3.0,23.0,0.0,12.0,0.247271,0.071153,0.023678,0.657237,0.000000,0.293017,51.0,0.0
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.022375,0.025365,0.019588,0.020679,0.023900,0.013119,18.0,18.0,12.0,19.0,9.0,8.0,0.402746,0.456576,0.235051,0.392898,0.215102,0.104949,84.0,0.0
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.000000,0.033881,0.017610,0.020564,0.017336,0.014220,0.0,1.0,3.0,7.0,24.0,10.0,0.000000,0.033881,0.052831,0.143949,0.416068,0.142203,45.0,0.0
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.029886,0.000000,0.000000,0.000000,0.000000,0.000000,7.0,0.0,0.0,0.0,0.0,0.0,0.209203,0.000000,0.000000,0.000000,0.000000,0.000000,7.0,0.0


In [89]:
customer_6M_testing = customer.merge(transaction_6M_testing_master[['customer_id']], on='customer_id', how='left')

# customer_3M_testing = customer_3M_testing[customer_3M_testing['new_mb'] == 0]

customer_6M_testing = customer_6M_testing.reset_index(drop=True)

In [90]:
dataset_6M_testing = customer_6M_testing.merge(transaction_6M_testing_master, on='customer_id', how='left')

# dataset_3M_testing = dataset_3M_testing[dataset_3M_testing.new_mb_x == 0] 

dataset_6M_testing = dataset_6M_testing.reset_index(drop=True)

dataset_6M_testing

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l6m_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l6m_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l6m_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,49.0,0.000000,0.018729,0.024055,0.037271,0.036706,0.050831,0.0,5.0,4.0,8.0,3.0,1.0,0.000000,0.093644,0.096220,0.298169,0.110119,0.050831,21.0,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,25.0,0.024075,0.024872,0.050831,0.033074,0.036141,0.027102,14.0,19.0,1.0,42.0,9.0,1.0,0.337051,0.472559,0.050831,1.389119,0.325271,0.027102,86.0,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,24.0,0.042864,0.023147,0.000000,0.000000,0.036424,0.061000,10.0,3.0,0.0,0.0,4.0,1.0,0.428644,0.069441,0.000000,0.000000,0.145695,0.061000,18.0,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,54.0,0.000000,0.000000,0.000000,0.030492,0.000000,0.000000,0.0,0.0,0.0,2.0,0.0,0.0,0.000000,0.000000,0.000000,0.060983,0.000000,0.000000,2.0,1.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,52.0,0.042356,0.000000,0.051890,0.000000,0.027525,0.032186,3.0,0.0,2.0,0.0,4.0,4.0,0.127068,0.000000,0.103780,0.000000,0.110102,0.128746,13.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,24.0,0.022479,0.035576,0.007893,0.028576,0.000000,0.024418,11.0,2.0,3.0,23.0,0.0,12.0,0.247271,0.071153,0.023678,0.657237,0.000000,0.293017,51.0,0.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,21.0,0.022375,0.025365,0.019588,0.020679,0.023900,0.013119,18.0,18.0,12.0,19.0,9.0,8.0,0.402746,0.456576,0.235051,0.392898,0.215102,0.104949,84.0,0.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,21.0,0.000000,0.033881,0.017610,0.020564,0.017336,0.014220,0.0,1.0,3.0,7.0,24.0,10.0,0.000000,0.033881,0.052831,0.143949,0.416068,0.142203,45.0,0.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,18.0,0.029886,0.000000,0.000000,0.000000,0.000000,0.000000,7.0,0.0,0.0,0.0,0.0,0.0,0.209203,0.000000,0.000000,0.000000,0.000000,0.000000,7.0,0.0


In [91]:
dataset_6M_testing = dataset_6M_testing.drop(['l6m_order_number','l6m_order_value','l6m_abs'], axis=1)

# dataset_3M_testing = dataset_3M_testing.rename(columns={'new_mb_x':'new_mb'})

dataset_6M_testing

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,49.0,0.000000,0.018729,0.024055,0.037271,0.050831,0.0,5.0,4.0,8.0,1.0,0.000000,0.093644,0.096220,0.298169,0.050831,21.0,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,25.0,0.024075,0.024872,0.050831,0.033074,0.027102,14.0,19.0,1.0,42.0,1.0,0.337051,0.472559,0.050831,1.389119,0.027102,86.0,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,24.0,0.042864,0.023147,0.000000,0.000000,0.061000,10.0,3.0,0.0,0.0,1.0,0.428644,0.069441,0.000000,0.000000,0.061000,18.0,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,54.0,0.000000,0.000000,0.000000,0.030492,0.000000,0.0,0.0,0.0,2.0,0.0,0.000000,0.000000,0.000000,0.060983,0.000000,2.0,1.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,52.0,0.042356,0.000000,0.051890,0.000000,0.032186,3.0,0.0,2.0,0.0,4.0,0.127068,0.000000,0.103780,0.000000,0.128746,13.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,24.0,0.022479,0.035576,0.007893,0.028576,0.024418,11.0,2.0,3.0,23.0,12.0,0.247271,0.071153,0.023678,0.657237,0.293017,51.0,0.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,21.0,0.022375,0.025365,0.019588,0.020679,0.013119,18.0,18.0,12.0,19.0,8.0,0.402746,0.456576,0.235051,0.392898,0.104949,84.0,0.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,21.0,0.000000,0.033881,0.017610,0.020564,0.014220,0.0,1.0,3.0,7.0,10.0,0.000000,0.033881,0.052831,0.143949,0.142203,45.0,0.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,18.0,0.029886,0.000000,0.000000,0.000000,0.000000,7.0,0.0,0.0,0.0,0.0,0.209203,0.000000,0.000000,0.000000,0.000000,7.0,0.0


In [92]:
dataset_6M_testing[normalize_cols] = scaler.fit_transform(dataset_3M_testing[normalize_cols])

In [93]:
dataset_6M_testing

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,0.397590,0.036957,0.056998,0.073545,0.088313,0.000000,0.020161,0.010336,0.013774,0.008357,0.000000,0.009174,0.008549,0.008290,0.010641,0.000000,0.012097,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,0.108434,0.049078,0.120442,0.056637,0.086138,0.071378,0.076613,0.002584,0.049587,0.066852,0.042945,0.046293,0.004516,0.022983,0.083032,0.032264,0.051747,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,0.096386,0.045674,0.000000,0.000000,0.000000,0.127085,0.012097,0.000000,0.000000,0.000000,0.030675,0.006803,0.000000,0.000000,0.000000,0.041032,0.009409,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,0.457831,0.000000,0.000000,0.000000,0.072249,0.000000,0.000000,0.000000,0.000000,0.005571,0.000000,0.000000,0.000000,0.000000,0.005804,0.000000,0.001344,1.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,0.433735,0.000000,0.122952,0.000000,0.000000,0.125578,0.000000,0.005168,0.000000,0.000000,0.009202,0.000000,0.009221,0.000000,0.000000,0.012164,0.006048,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,0.096386,0.070201,0.018701,0.000000,0.067709,0.066647,0.008065,0.007752,0.000000,0.064067,0.033742,0.006970,0.002104,0.000000,0.062548,0.023670,0.034274,0.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,0.060241,0.050052,0.046412,0.025468,0.062391,0.066337,0.072581,0.031008,0.022039,0.030641,0.055215,0.044728,0.020884,0.004593,0.027565,0.038553,0.050403,0.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,0.060241,0.066856,0.041727,0.040578,0.000000,0.000000,0.004032,0.007752,0.019284,0.000000,0.000000,0.003319,0.004694,0.006403,0.000000,0.000000,0.014113,0.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,0.024096,0.000000,0.000000,0.000000,0.000000,0.088607,0.000000,0.000000,0.000000,0.000000,0.021472,0.000000,0.000000,0.000000,0.000000,0.020026,0.004704,0.0


In [94]:
dataset_6M_testing.churn_status.value_counts(normalize=True)

0.0    0.543806
1.0    0.456194
Name: churn_status, dtype: float64

### 3.3 6M Modelling

In [95]:
dataset_6M_training = dataset_6M_training.replace(np.nan,0)

dataset_6M_testing = dataset_6M_testing.replace(np.nan,0)

In [96]:
response = dataset_6M_training["churn_status"]
cus_id = dataset_6M_training["customer_id"]
traindata = dataset_6M_training.drop(columns=["churn_status","customer_id"],axis=1)

In [97]:
ratio = int(round(traindata.shape[0]*0.8,0))

ratio

951934

In [98]:
X_train = traindata.iloc[:ratio,:]
y_train = response.iloc[:ratio]
cus_id_test = cus_id.iloc[:ratio]

X_test = traindata.iloc[ratio:,:]
y_test = response.iloc[ratio:]
cus_id_test = cus_id.iloc[ratio:]


#to resolve any class imbalance - use stratify parameter.

print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

Number transactions X_train dataset:  (951934, 21)
Number transactions y_train dataset:  (951934,)
Number transactions X_test dataset:  (237983, 21)
Number transactions y_test dataset:  (237983,)


In [99]:
#--Step 15.4.1. Logistic Regression-----------------
# Fitting Logistic Regression to the Training set 
classifier_6M = LogisticRegression(random_state = 0, penalty = 'l2', C = 100)
classifier_6M.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier_6M.predict(X_test)

#Evaluate results

acc = accuracy_score(y_test, y_pred )
prec = precision_score(y_test, y_pred )
rec = recall_score(y_test, y_pred )
f1 = f1_score(y_test, y_pred )
f2 = fbeta_score(y_test, y_pred, beta=2.0)

#probability score
y_pred_probs = classifier_6M.predict_proba(X_test)
y_pred_probs  = y_pred_probs [:, 1]

results = pd.DataFrame([['Logistic Regression', acc, prec, rec, f1, f2]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])

results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score
0,Logistic Regression,0.745457,0.723388,0.831347,0.773619,0.807252


In [100]:
# Step 19: Compare predictions against test set -------------------------------------------------------
#Revalidate final results with Confusion Matrix:
cm = confusion_matrix(y_test, y_pred) 

pd.crosstab(y_test,y_pred,rownames=['ACTUAL'],colnames=['PRED'])

PRED,0,1
ACTUAL,Unnamed: 1_level_1,Unnamed: 2_level_1
0,73900,39579
1,20998,103506


In [101]:
# Step 20: Format Final Results:-------------------------------------------------------

training_6M = pd.concat([cus_id_test, y_test], axis = 1).dropna()

training_6M['customer_id'] = cus_id_test

training_6M['actual'] = y_test

training_6M['predictions'] = y_pred 

training_6M["churn_prob(%)"] = y_pred_probs 

training_6M["churn_prob(%)"] = training_6M["churn_prob(%)"]*100

training_6M["churn_prob(%)"]=training_6M["churn_prob(%)"].round(2)

training_6M = training_6M[['customer_id', 'actual', 'predictions', 'churn_prob(%)']]

# final_results ['Ranking'] = pd.qcut(final_results['churn_prob(%)'].rank(method = 'first'),10,labels=range(10,0,-1))

training_6M.head(10)

Unnamed: 0,customer_id,actual,predictions,churn_prob(%)
951934,cccdc9bb4c7de9aed84190a32b0e6a87c76789af96d297...,1,1,88.11
951935,cccdce063de2198e76d99b7da00ff6f5e24d8639cc6545...,1,0,43.72
951936,cccde5bc3b8db4a2d0d60668d03b25551810ceba8877f3...,1,1,80.21
951937,cccdef54874e4540410e46e8facd994365558023fa7266...,1,1,76.12
951938,cccdf1179e723ead91fbb2cdd0e16496af95c60b977a50...,0,0,48.06
951939,cccdf471b663084cc9f4c4c40b520d87d8e962e88dec60...,0,1,66.05
951940,cccdf4922ee8bd88350d0c11abb5c433fe676f267e4326...,1,1,75.05
951941,ccce0534e5b9847ac3b9e57a7f41a68bee260ce3f9f401...,0,1,69.59
951942,ccce11485e823a181579b63b05266076edaa42ea5a2510...,0,0,3.52
951943,ccce168343e82fbb2b391f558775faaf71d6b55b09a2e2...,1,1,57.14


In [102]:
import pickle as pkl

saved_model = pkl.dumps(classifier_6M)
 
lr_classifier_6M_from_pickle = pkl.loads(saved_model) 

In [103]:
dataset_6M_testing = dataset_6M_testing.replace(np.nan,0)

dataset_6M_testing

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l9m_order_value,p21m_order_number,churn_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,0.397590,0.036957,0.056998,0.073545,0.088313,0.000000,0.020161,0.010336,0.013774,0.008357,0.000000,0.009174,0.008549,0.008290,0.010641,0.000000,0.012097,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,0.108434,0.049078,0.120442,0.056637,0.086138,0.071378,0.076613,0.002584,0.049587,0.066852,0.042945,0.046293,0.004516,0.022983,0.083032,0.032264,0.051747,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,0.096386,0.045674,0.000000,0.000000,0.000000,0.127085,0.012097,0.000000,0.000000,0.000000,0.030675,0.006803,0.000000,0.000000,0.000000,0.041032,0.009409,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,0.457831,0.000000,0.000000,0.000000,0.072249,0.000000,0.000000,0.000000,0.000000,0.005571,0.000000,0.000000,0.000000,0.000000,0.005804,0.000000,0.001344,1.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,0.433735,0.000000,0.122952,0.000000,0.000000,0.125578,0.000000,0.005168,0.000000,0.000000,0.009202,0.000000,0.009221,0.000000,0.000000,0.012164,0.006048,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,0.096386,0.070201,0.018701,0.000000,0.067709,0.066647,0.008065,0.007752,0.000000,0.064067,0.033742,0.006970,0.002104,0.000000,0.062548,0.023670,0.034274,0.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,0.060241,0.050052,0.046412,0.025468,0.062391,0.066337,0.072581,0.031008,0.022039,0.030641,0.055215,0.044728,0.020884,0.004593,0.027565,0.038553,0.050403,0.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,0.060241,0.066856,0.041727,0.040578,0.000000,0.000000,0.004032,0.007752,0.019284,0.000000,0.000000,0.003319,0.004694,0.006403,0.000000,0.000000,0.014113,0.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,0.024096,0.000000,0.000000,0.000000,0.000000,0.088607,0.000000,0.000000,0.000000,0.000000,0.021472,0.000000,0.000000,0.000000,0.000000,0.020026,0.004704,0.0


In [107]:
X_new = dataset_6M_testing.iloc[:,1:22]

X_new

Unnamed: 0,FN,Active,club_member_status,fashion_news_frequency,age,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l9m_order_value,p21m_order_number
0,0.0,0.0,0,0,0.397590,0.036957,0.056998,0.073545,0.088313,0.000000,0.020161,0.010336,0.013774,0.008357,0.000000,0.009174,0.008549,0.008290,0.010641,0.000000,0.012097
1,0.0,0.0,0,0,0.108434,0.049078,0.120442,0.056637,0.086138,0.071378,0.076613,0.002584,0.049587,0.066852,0.042945,0.046293,0.004516,0.022983,0.083032,0.032264,0.051747
2,0.0,0.0,0,0,0.096386,0.045674,0.000000,0.000000,0.000000,0.127085,0.012097,0.000000,0.000000,0.000000,0.030675,0.006803,0.000000,0.000000,0.000000,0.041032,0.009409
3,0.0,0.0,0,0,0.457831,0.000000,0.000000,0.000000,0.072249,0.000000,0.000000,0.000000,0.000000,0.005571,0.000000,0.000000,0.000000,0.000000,0.005804,0.000000,0.001344
4,1.0,1.0,0,1,0.433735,0.000000,0.122952,0.000000,0.000000,0.125578,0.000000,0.005168,0.000000,0.000000,0.009202,0.000000,0.009221,0.000000,0.000000,0.012164,0.006048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,0.0,0.0,0,0,0.096386,0.070201,0.018701,0.000000,0.067709,0.066647,0.008065,0.007752,0.000000,0.064067,0.033742,0.006970,0.002104,0.000000,0.062548,0.023670,0.034274
1371976,0.0,0.0,0,0,0.060241,0.050052,0.046412,0.025468,0.062391,0.066337,0.072581,0.031008,0.022039,0.030641,0.055215,0.044728,0.020884,0.004593,0.027565,0.038553,0.050403
1371977,1.0,1.0,0,1,0.060241,0.066856,0.041727,0.040578,0.000000,0.000000,0.004032,0.007752,0.019284,0.000000,0.000000,0.003319,0.004694,0.006403,0.000000,0.000000,0.014113
1371978,1.0,1.0,0,1,0.024096,0.000000,0.000000,0.000000,0.000000,0.088607,0.000000,0.000000,0.000000,0.000000,0.021472,0.000000,0.000000,0.000000,0.000000,0.020026,0.004704


In [108]:
new_churn_prediction = lr_classifier_6M_from_pickle.predict(X_new)

new_churn_prediction

array([0, 0, 0, ..., 0, 0, 1])

In [110]:
new_churn_proba = lr_classifier_6M_from_pickle.predict_proba(X_new)
new_churn_proba = new_churn_proba[:,1]
new_churn_proba

array([0.36380319, 0.00794796, 0.21435267, ..., 0.37406372, 0.37497368,
       0.73945575])

In [111]:
dataset_6M_testing['new_churn_prediction'] = new_churn_prediction
dataset_6M_testing['new_churn_proba'] = new_churn_proba


dataset_6M_testing

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,l12m_abs,l15m_abs,l18m_abs,l2y_abs,l9m_abs,l12m_order_number,l15m_order_number,l18m_order_number,l2y_order_number,l9m_order_number,l12m_order_value,l15m_order_value,l18m_order_value,l2y_order_value,l9m_order_value,p21m_order_number,churn_status,new_churn_prediction,new_churn_proba
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,0,0,0.397590,0.036957,0.056998,0.073545,0.088313,0.000000,0.020161,0.010336,0.013774,0.008357,0.000000,0.009174,0.008549,0.008290,0.010641,0.000000,0.012097,0.0,0,0.363803
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,0,0,0.108434,0.049078,0.120442,0.056637,0.086138,0.071378,0.076613,0.002584,0.049587,0.066852,0.042945,0.046293,0.004516,0.022983,0.083032,0.032264,0.051747,0.0,0,0.007948
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,0,0,0.096386,0.045674,0.000000,0.000000,0.000000,0.127085,0.012097,0.000000,0.000000,0.000000,0.030675,0.006803,0.000000,0.000000,0.000000,0.041032,0.009409,0.0,0,0.214353
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,0,0,0.457831,0.000000,0.000000,0.000000,0.072249,0.000000,0.000000,0.000000,0.000000,0.005571,0.000000,0.000000,0.000000,0.000000,0.005804,0.000000,0.001344,1.0,1,0.838944
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,0,1,0.433735,0.000000,0.122952,0.000000,0.000000,0.125578,0.000000,0.005168,0.000000,0.000000,0.009202,0.000000,0.009221,0.000000,0.000000,0.012164,0.006048,0.0,0,0.195659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,0,0,0.096386,0.070201,0.018701,0.000000,0.067709,0.066647,0.008065,0.007752,0.000000,0.064067,0.033742,0.006970,0.002104,0.000000,0.062548,0.023670,0.034274,0.0,0,0.193949
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,0,0,0.060241,0.050052,0.046412,0.025468,0.062391,0.066337,0.072581,0.031008,0.022039,0.030641,0.055215,0.044728,0.020884,0.004593,0.027565,0.038553,0.050403,0.0,0,0.011111
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,0,1,0.060241,0.066856,0.041727,0.040578,0.000000,0.000000,0.004032,0.007752,0.019284,0.000000,0.000000,0.003319,0.004694,0.006403,0.000000,0.000000,0.014113,0.0,0,0.374064
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,0,1,0.024096,0.000000,0.000000,0.000000,0.000000,0.088607,0.000000,0.000000,0.000000,0.000000,0.021472,0.000000,0.000000,0.000000,0.000000,0.020026,0.004704,0.0,0,0.374974


In [112]:
new_results = results_3M.merge(dataset_6M_testing[['customer_id','new_churn_proba']], on='customer_id', how='left')

new_results = new_results.rename(columns={'new_churn_proba':'churn_proba_6M'})

new_results

Unnamed: 0,customer_id,churn_proba_3M,churn_proba_6M
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.454672,0.363803
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.073846,0.007948
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.331293,0.214353
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.906447,0.838944
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0.315325,0.195659
...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.272188,0.193949
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.078723,0.011111
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0.382036,0.374064
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0.653743,0.374974


In [None]:
# new_results.to_csv('final_results_v3.csv')