In [56]:
import numpy as np
import pandas as pd
import datetime
from datetime import date
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
#import plotly.graph_objects as go
from sklearn.preprocessing import normalize ,MinMaxScaler
from sklearn import metrics
from sklearn.mixture import GaussianMixture
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = "{:.3f}".format
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 999)

## Load data

In [57]:
df=pd.read_csv('Book1.csv',header=0,sep=';')

In [58]:
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0


In [59]:
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
      dtype='object')

## Feature Engineering

In [60]:
df['Age']=2021-df['Year_Birth']

In [61]:
df['Spending']=df['MntWines']+df['MntFruits']+df['MntMeatProducts']+df['MntFishProducts']+df['MntSweetProducts']+df['MntGoldProds']

In [62]:
df=df.rename(columns={'NumWebPurchases': "Web",'NumCatalogPurchases':'Catalog','NumStorePurchases':'Store'})
df['Marital_Status']=df['Marital_Status'].replace({'Divorced':'Alone','Single':'Alone','Married':'In couple','Together':'In couple','Absurd':'Alone','Widow':'Alone','YOLO':'Alone'})
df['Education']=df['Education'].replace({'Basic':'Undergraduate','2n Cycle':'Undergraduate','Graduation':'Postgraduate','Master':'Postgraduate','PhD':'Postgraduate'})

In [63]:
df['Children']=df['Kidhome']+df['Teenhome']
df['Has_child'] = np.where(df.Children> 0, 'Has child', 'No child')
df['Children'].replace({3: "3 children",2:'2 children',1:'1 child',0:"No child"},inplace=True)
df=df.rename(columns={'MntWines': "Wines",'MntFruits':'Fruits','MntMeatProducts':'Meat','MntFishProducts':'Fish','MntSweetProducts':'Sweets','MntGoldProds':'Gold'})

In [64]:
last_date = date(2021,12,5)
df['Seniority']=pd.to_datetime(df['Dt_Customer'], dayfirst=True,format = '%Y-%m-%d')
df['Seniority'] = pd.to_numeric(df['Seniority'].dt.date.apply(lambda x: (last_date - x)).dt.days, downcast='integer')/30

In [65]:
df=df[['Age','Education','Marital_Status','Income','Seniority','Spending','Has_child','Children','Wines','Fruits','Meat','Fish','Sweets','Gold']]
df.head()

Unnamed: 0,Age,Education,Marital_Status,Income,Seniority,Spending,Has_child,Children,Wines,Fruits,Meat,Fish,Sweets,Gold
0,64,Postgraduate,Alone,58138.0,112.633,1617,No child,No child,635,88,546,172,88,88
1,67,Postgraduate,Alone,46344.0,94.3,27,Has child,2 children,11,1,6,2,1,6
2,56,Postgraduate,In couple,71613.0,100.933,776,No child,No child,426,49,127,111,21,42
3,37,Postgraduate,In couple,26646.0,95.167,53,Has child,1 child,11,4,20,10,3,5
4,40,Postgraduate,In couple,58293.0,95.9,422,Has child,1 child,173,43,118,46,27,15


## Drop missing values and outliers

In [66]:
df.isna().sum()

Age                0
Education          0
Marital_Status     0
Income            24
Seniority          0
Spending           0
Has_child          0
Children           0
Wines              0
Fruits             0
Meat               0
Fish               0
Sweets             0
Gold               0
dtype: int64

In [67]:
df['Income'].sort_values( ascending=False)

2233   666666.000
617    162397.000
687    160803.000
1300   157733.000
164    157243.000
1653   157146.000
2132   156924.000
655    153924.000
1898   113734.000
646    105471.000
252    102692.000
203    102160.000
124    101970.000
1113    98777.000
650     98777.000
2012    96876.000
1958    96843.000
1995    96547.000
914     95529.000
1001    95169.000
1473    94871.000
1982    94642.000
1993    94472.000
943     94384.000
1301    94384.000
1458    93790.000
826     93404.000
424     93027.000
966     92955.000
426     92910.000
140     92859.000
1910    92556.000
1432    92533.000
1721    92491.000
1385    92344.000
591     92163.000
814     91820.000
1992    91712.000
1031    91700.000
1922    91249.000
2167    91172.000
418     91065.000
1897    90933.000
1333    90842.000
116     90765.000
1261    90687.000
1572    90638.000
1179    90638.000
1808    90369.000
390     90300.000
1722    90273.000
703     90247.000
1601    90226.000
2168    90000.000
729     89891.000
734     89

In [68]:
df=df[df['Income']<600000]
df=df.dropna(subset=['Income'])
df.isna().sum()

Age               0
Education         0
Marital_Status    0
Income            0
Seniority         0
Spending          0
Has_child         0
Children          0
Wines             0
Fruits            0
Meat              0
Fish              0
Sweets            0
Gold              0
dtype: int64

## Customer Segmentation

In [69]:
scaler=MinMaxScaler()
df_temp=df[['Income','Seniority','Spending']]
X_std=scaler.fit_transform(df_temp)
X = normalize(X_std,norm='l2')

In [70]:
gmm=GaussianMixture(n_components=4, covariance_type='spherical',max_iter=2000, random_state=0).fit(X)
labels = gmm.predict(X)

In [71]:
labels

array([2, 3, 2, ..., 1, 1, 0], dtype=int64)

In [72]:
## The 4 categories we segment our customers are:

# Rising Star: New customers with high income high spending.
# Star: Old customers with high income and spending.
# Not good: Old customers with low income and low spending.
# Attention: New customers with low income and low spending. 

In [73]:
df_temp['Cluster'] = labels
df_temp=df_temp.replace({0:'Not good',1:'Rising Star',2:'Star',3:'Attention'})
df = df.merge(df_temp.Cluster, left_index=True, right_index=True)

In [74]:
summary=df[['Income','Spending','Seniority','Cluster']]
summary.set_index("Cluster", inplace = True)
summary=summary.groupby('Cluster').describe().T
summary

Unnamed: 0,Cluster,Attention,Not good,Rising Star,Star
Income,count,431.0,781.0,356.0,647.0
Income,mean,44729.313,33893.306,75325.654,65762.448
Income,std,17784.856,12679.763,11796.825,13631.36
Income,min,4023.0,1730.0,41443.0,2447.0
Income,25%,34209.0,24570.0,68046.75,56962.0
Income,50%,43142.0,33762.0,75507.0,65569.0
Income,75%,53649.0,42160.0,81667.25,74881.0
Income,max,162397.0,86836.0,157243.0,160803.0
Spending,count,431.0,781.0,356.0,647.0
Spending,mean,134.042,163.309,1265.435,1096.454


In [75]:
df.head()

Unnamed: 0,Age,Education,Marital_Status,Income,Seniority,Spending,Has_child,Children,Wines,Fruits,Meat,Fish,Sweets,Gold,Cluster
0,64,Postgraduate,Alone,58138.0,112.633,1617,No child,No child,635,88,546,172,88,88,Star
1,67,Postgraduate,Alone,46344.0,94.3,27,Has child,2 children,11,1,6,2,1,6,Attention
2,56,Postgraduate,In couple,71613.0,100.933,776,No child,No child,426,49,127,111,21,42,Star
3,37,Postgraduate,In couple,26646.0,95.167,53,Has child,1 child,11,4,20,10,3,5,Attention
4,40,Postgraduate,In couple,58293.0,95.9,422,Has child,1 child,173,43,118,46,27,15,Attention


## More Feature Engineering

In [77]:
#Age segment

cut_labels_Age = ['Young', 'Adult', 'Mature', 'Senior']
cut_bins = [0, 30, 45, 65, 130]
df['Age_group'] = pd.cut(df['Age'], bins=cut_bins, labels=cut_labels_Age)

#Income segment

cut_labels_Income = ['Low income', 'Low to medium income', 'Medium to high income', 'High income']
df['Income_group'] = pd.qcut(df['Income'], q=4, labels=cut_labels_Income)

#Seniority segment

cut_labels_Seniority = ['New customers', 'Discovering customers', 'Experienced customers', 'Old customers']
df['Seniority_group'] = pd.qcut(df['Seniority'], q=4, labels=cut_labels_Seniority)
df=df.drop(columns=['Age','Income','Seniority'])

In [78]:
cut_labels = ['Low consumer', 'Medium consumer', 'Top consumer']
df['Wines_segment'] = pd.qcut(df['Wines'][df['Wines']>0],q=[0, .25, .75, 1], labels=cut_labels).astype("object")
df['Fruits_segment'] = pd.qcut(df['Fruits'][df['Fruits']>0],q=[0, .25, .75, 1], labels=cut_labels).astype("object")
df['Meat_segment'] = pd.qcut(df['Meat'][df['Meat']>0],q=[0, .25, .75, 1], labels=cut_labels).astype("object")
df['Fish_segment'] = pd.qcut(df['Fish'][df['Fish']>0],q=[0, .25, .75, 1], labels=cut_labels).astype("object")
df['Sweets_segment'] = pd.qcut(df['Sweets'][df['Sweets']>0],q=[0, .25, .75, 1], labels=cut_labels).astype("object")
df['Gold_segment'] = pd.qcut(df['Gold'][df['Gold']>0],q=[0, .25, .75, 1], labels=cut_labels).astype("object")
df.drop(columns=['Spending','Wines','Fruits','Meat','Fish','Sweets','Gold'],inplace=True)
df = df.astype(object)

In [52]:
df.head()

Unnamed: 0,Education,Marital_Status,Has_child,Children,Cluster,Age_group,Income_group,Seniority_group,Wines_segment,Fruits_segment,Meat_segment,Fish_segment,Sweets_segment,Gold_segment
0,Postgraduate,Alone,No child,No child,Star,Mature,Medium to high income,Old customers,Top consumer,Top consumer,Top consumer,Top consumer,Top consumer,Top consumer
1,Postgraduate,Alone,Has child,2 children,Attention,Senior,Low to medium income,New customers,Low consumer,Low consumer,Low consumer,Low consumer,Low consumer,Low consumer
2,Postgraduate,In couple,No child,No child,Star,Mature,High income,Discovering customers,Medium consumer,Top consumer,Medium consumer,Top consumer,Medium consumer,Medium consumer
3,Postgraduate,In couple,Has child,1 child,Attention,Adult,Low income,New customers,Low consumer,Low consumer,Medium consumer,Medium consumer,Low consumer,Low consumer
4,Postgraduate,In couple,Has child,1 child,Attention,Adult,Medium to high income,New customers,Medium consumer,Medium consumer,Medium consumer,Medium consumer,Medium consumer,Medium consumer


In [79]:
df.isna().sum()

Education            0
Marital_Status       0
Has_child            0
Children             0
Cluster              0
Age_group            0
Income_group         0
Seniority_group      0
Wines_segment       13
Fruits_segment     395
Meat_segment         1
Fish_segment       379
Sweets_segment     413
Gold_segment        61
dtype: int64

In [80]:
df.replace(np.nan, "Non consumer", inplace=True)
df.isna().sum()

Education          0
Marital_Status     0
Has_child          0
Children           0
Cluster            0
Age_group          0
Income_group       0
Seniority_group    0
Wines_segment      0
Fruits_segment     0
Meat_segment       0
Fish_segment       0
Sweets_segment     0
Gold_segment       0
dtype: int64

## Association rules for wines top consumers

In [81]:
copy=df.copy() 
df = pd.get_dummies(copy)
frequent_items = apriori(df, use_colnames=True, min_support=0.08, max_len=10)
rules = association_rules(frequent_items, metric='lift', min_threshold=1)
product='Wines'
segment='Top consumer'
target = '{\'%s_segment_%s\'}' %(product,segment)
results_personnal_care = rules[rules['consequents'].astype(str).str.contains(target, na=False)].sort_values(by='confidence', ascending=False)
results_personnal_care.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4892,"(Cluster_Star, Seniority_group_Old customers)",(Wines_segment_Top consumer),0.112,0.249,0.084,0.746,2.999,0.056,2.957
9949,"(Meat_segment_Top consumer, Income_group_High ...",(Wines_segment_Top consumer),0.177,0.249,0.112,0.634,2.55,0.068,2.054
5056,"(Meat_segment_Top consumer, Income_group_High ...",(Wines_segment_Top consumer),0.19,0.249,0.119,0.626,2.517,0.072,2.01
6321,"(Income_group_High income, Education_Postgradu...",(Wines_segment_Top consumer),0.147,0.249,0.092,0.626,2.516,0.055,2.007
19337,"(Meat_segment_Top consumer, Income_group_High ...",(Wines_segment_Top consumer),0.138,0.249,0.085,0.618,2.483,0.051,1.965


In [27]:
# The top customers of wine are:

# 1) Old customers who belong in the Star cluster.
# 2) People with graduate degree, who are also top consumers for meat and have high income.

## Association rules for fruits top consumers

In [83]:
copy=df.copy() 
df = pd.get_dummies(copy)
frequent_items = apriori(df, use_colnames=True, min_support=0.08, max_len=10)
rules = association_rules(frequent_items, metric='lift', min_threshold=1)
product='Fruits'
segment='Top consumer'
target = '{\'%s_segment_%s\'}' %(product,segment)
results_personnal_care = rules[rules['consequents'].astype(str).str.contains(target, na=False)].sort_values(by='confidence', ascending=False)
results_personnal_care.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
5353,"(Sweets_segment_Top consumer, Fish_segment_Top consumer)",(Fruits_segment_Top consumer),0.126,0.203,0.084,0.664,3.27,0.058,2.374
10430,"(Meat_segment_Top consumer, Education_Postgraduate, Fish_segment_Top consumer)",(Fruits_segment_Top consumer),0.132,0.203,0.085,0.645,3.175,0.058,2.245
5341,"(Meat_segment_Top consumer, Fish_segment_Top consumer)",(Fruits_segment_Top consumer),0.146,0.203,0.093,0.639,3.145,0.064,2.207
5347,"(Sweets_segment_Top consumer, Meat_segment_Top consumer)",(Fruits_segment_Top consumer),0.137,0.203,0.087,0.634,3.119,0.059,2.175
5079,"(Sweets_segment_Top consumer, Income_group_High income)",(Fruits_segment_Top consumer),0.129,0.203,0.081,0.629,3.098,0.055,2.15


In [29]:
# The top customers of fruits are:

# 1) Customers who are also top consumers of fish and sweets.
# 2) People with graduate degree, who are also top consumers for meat and fish.