# Segment and Descriptor Differences
This notebook aims to derive population differences using segmentation and descriptor variables.

In [1]:
# Mount Google Drive at /content/drive so the project folder used below is reachable.
# force_remount=True is passed to request a fresh mount on every run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install -qqq Xlsxwriter

[?25l[K     |██▏                             | 10 kB 21.8 MB/s eta 0:00:01[K     |████▍                           | 20 kB 28.8 MB/s eta 0:00:01[K     |██████▌                         | 30 kB 28.7 MB/s eta 0:00:01[K     |████████▊                       | 40 kB 15.8 MB/s eta 0:00:01[K     |███████████                     | 51 kB 5.7 MB/s eta 0:00:01[K     |█████████████                   | 61 kB 6.7 MB/s eta 0:00:01[K     |███████████████▎                | 71 kB 5.8 MB/s eta 0:00:01[K     |█████████████████▌              | 81 kB 4.7 MB/s eta 0:00:01[K     |███████████████████▋            | 92 kB 5.3 MB/s eta 0:00:01[K     |█████████████████████▉          | 102 kB 5.7 MB/s eta 0:00:01[K     |████████████████████████        | 112 kB 5.7 MB/s eta 0:00:01[K     |██████████████████████████▏     | 122 kB 5.7 MB/s eta 0:00:01[K     |████████████████████████████▍   | 133 kB 5.7 MB/s eta 0:00:01[K     |██████████████████████████████▋ | 143 kB 5.7 MB/s eta 0:00:01[K 

In [3]:
import os
import pandas as pd
import numpy as np
import xlsxwriter
from scipy import stats

In [4]:
# Change to your own directory
try:
    os.chdir("/content/drive/MyDrive/BT4211 Data-driven Marketing")
except OSError:
    # Report the failure but let the notebook keep running
    print("Error: Can't change the Current Working Directory")
else:
    print("Directory changed")

Directory changed


## Load datasets

### Segment and RFM data

In [5]:
# Load the 2-segment RFM file (later cells rely on its 'Name' and 'Cluster' columns)
# and display its (rows, columns).
rfm_seg2_df = pd.read_csv('Data/RFM_2_segment.csv')
rfm_seg2_df.shape

(293, 5)

In [6]:
# Load the 3-segment RFM file (later cells rely on its 'Name' and 'Cluster' columns)
# and display its (rows, columns).
rfm_seg3_df = pd.read_csv('Data/RFM_3_segment.csv')
rfm_seg3_df.shape

(293, 5)

In [7]:
# Load the 4-segment RFM file (later cells rely on its 'Name' and 'Cluster' columns)
# and display its (rows, columns).
rfm_seg4_df = pd.read_csv('Data/RFM_4_segment.csv')
rfm_seg4_df.shape

(293, 5)

In [8]:
# Load the 5-segment RFM file (later cells rely on its 'Name' and 'Cluster' columns)
# and display its (rows, columns).
rfm_seg5_df = pd.read_csv('Data/RFM_5_segment.csv')
rfm_seg5_df.shape

(293, 5)

### Psychometric and Demographic Numerical data

In [9]:
# Load the numeric psychometric/demographic descriptors (joined to the RFM
# frames on 'Name' further down) and display the frame's (rows, columns).
psycho_demo_num_df = pd.read_csv('Data/psycho_demo_num.csv')
psycho_demo_num_df.shape

(303, 64)

### Psychometric Aggregated data

In [10]:
# Load the aggregated psychometric descriptors (joined to the RFM frames on
# 'Name' further down) and display the frame's (rows, columns).
psycho_demo_agg_df = pd.read_csv('Data/psycho_demo_agg.csv')
psycho_demo_agg_df.shape

(303, 9)

### Factorised data

In [11]:
# Load the factorised features (joined to the RFM frames on 'Name' further
# down) and display the frame's (rows, columns).
factor_df = pd.read_csv('Data/factor_features.csv')
factor_df.shape

(303, 19)

## Define functions

In [12]:
def get_segment_diff_pval(segments_dict, population_mean, diff_decimal=2, pval_decimal=4):
  """Get the mean and one-sample t-test p-value of every variable for each segment.

  For each variable in ``population_mean`` and each segment, the segment's
  values are tested against the population mean with
  ``scipy.stats.ttest_1samp``.

  Args:
    segments_dict: Mapping of segment label -> DataFrame with that segment's
      rows. Each DataFrame must contain every variable named in
      ``population_mean``.
    population_mean: pandas Series mapping variable name -> population mean.
    diff_decimal: Number of decimals kept for the means.
    pval_decimal: Number of decimals kept for the p-values.

  Returns:
    A tuple ``(segment_diff_df, segment_pvalue_df)``. Both have one row per
    variable and one column per segment (in ``segments_dict`` order);
    ``segment_diff_df`` holds the rounded segment means plus a
    'Population Mean' column, ``segment_pvalue_df`` the rounded p-values.
  """
  segment_names = list(segments_dict.keys())
  diff_columns = ['Descriptor Variable', 'Population Mean'] + segment_names
  pvalue_columns = ['Descriptor Variable'] + segment_names

  # Rows are collected in plain lists and turned into frames once at the end:
  # DataFrame.append() was removed in pandas 2.0 and was quadratic in a loop.
  diff_rows = []
  pvalue_rows = []

  # Series.items() works on every pandas version; iteritems() was removed in 2.0
  for column, pop_mean in population_mean.items():
    diff_row = {'Descriptor Variable': column, 'Population Mean': round(pop_mean, diff_decimal)}
    pvalue_row = {'Descriptor Variable': column}

    # Calculate mean and pvalue for each segment
    for segment, segment_df in segments_dict.items():
      _, pvalue = stats.ttest_1samp(segment_df[column], popmean=pop_mean)
      diff_row[segment] = round(segment_df[column].mean(), diff_decimal)
      pvalue_row[segment] = round(pvalue, pval_decimal)

    diff_rows.append(diff_row)
    pvalue_rows.append(pvalue_row)

  # Passing `columns` keeps the expected layout even when there are no rows
  segment_diff_df = pd.DataFrame(diff_rows, columns=diff_columns)
  segment_pvalue_df = pd.DataFrame(pvalue_rows, columns=pvalue_columns)

  return segment_diff_df, segment_pvalue_df

In [13]:
def get_sig_diff(segment_dict, segment_diff_df, segment_pval_df):
  """List every (segment, variable) pair whose mean differs significantly.

  A pair is reported when its p-value is below 0.10. It is labelled
  "Higher" when the segment mean is >= the population mean and "Lower"
  otherwise, with "(p-val < 0.05)" for p < 0.05 and "(p-val < 0.10)" for
  0.05 <= p < 0.10. Pairs with a NaN p-value or NaN mean are skipped.

  Args:
    segment_dict: Mapping whose keys are the segment labels to inspect; the
      values are not used here.
    segment_diff_df: Means table as returned by ``get_segment_diff_pval``
      ('Descriptor Variable', 'Population Mean', one column per segment).
    segment_pval_df: P-value table as returned by ``get_segment_diff_pval``
      ('Descriptor Variable', one column per segment).

  Returns:
    DataFrame with columns "Customer", "Variable", "Type", "Segment Mean",
    "Population Mean" and "P-value", one row per significant pair.
  """
  result_columns = ["Customer", "Variable", "Type", "Segment Mean", "Population Mean", "P-value"]

  # Index p-values by variable once instead of re-filtering the frame for every
  # (variable, segment) pair; keep the first row per variable, as before.
  pval_lookup = (segment_pval_df
                 .drop_duplicates(subset='Descriptor Variable')
                 .set_index('Descriptor Variable'))

  # Collect rows and build the frame once (DataFrame.append was removed in pandas 2.0)
  records = []

  for _, row in segment_diff_df.iterrows():
    column = row['Descriptor Variable']
    pop_mean = row['Population Mean']

    for segment in segment_dict.keys():
      segment_mean = row[segment]
      segment_pvalue = pval_lookup.at[column, segment]

      # Significance band. The original "Lower" branch used `> 0.05`, which
      # silently dropped p == 0.05; both directions now share 0.05 <= p < 0.10.
      if segment_pvalue < 0.05:
        level = "0.05"
      elif segment_pvalue < 0.10:
        level = "0.10"
      else:
        continue  # not significant (NaN p-values also end up here)

      # Direction; ties are reported as "Higher", matching the original order of checks
      if segment_mean >= pop_mean:
        direction = "Higher"
      elif segment_mean < pop_mean:
        direction = "Lower"
      else:
        continue  # NaN mean: no direction can be stated

      records.append({"Customer": segment,
                      "Variable": column,
                      "Type": f"{direction} (p-val < {level})",
                      "Segment Mean": segment_mean,
                      "Population Mean": pop_mean,
                      "P-value": segment_pvalue})

  return pd.DataFrame(records, columns=result_columns)

## Numeric Data

### Merge datasets

In [14]:
# Join every RFM segmentation with the numeric descriptors on the shared 'Name' key
merged_seg2_df = pd.merge(rfm_seg2_df, psycho_demo_num_df, on='Name')
merged_seg3_df = pd.merge(rfm_seg3_df, psycho_demo_num_df, on='Name')
merged_seg4_df = pd.merge(rfm_seg4_df, psycho_demo_num_df, on='Name')
merged_seg5_df = pd.merge(rfm_seg5_df, psycho_demo_num_df, on='Name')

### Load segments

In [15]:
# One sub-frame per cluster label of the 2-segment solution
high_segment_df = merged_seg2_df.loc[merged_seg2_df['Cluster'].eq("High Value Customer")]
low_segment_df = merged_seg2_df.loc[merged_seg2_df['Cluster'].eq("Low Value Customer")]

# Key order determines the segment column order of the result tables
seg2_dict = {
    'High Value Customer': high_segment_df,
    'Low Value Customer': low_segment_df,
}

In [16]:
# One sub-frame per cluster label of the 3-segment solution
high_segment_df = merged_seg3_df.loc[merged_seg3_df['Cluster'].eq("High Value Customer")]
medium_segment_df = merged_seg3_df.loc[merged_seg3_df['Cluster'].eq("Medium Value Customer")]
low_segment_df = merged_seg3_df.loc[merged_seg3_df['Cluster'].eq("Low Value Customer")]

# Key order determines the segment column order of the result tables
seg3_dict = {
    'High Value Customer': high_segment_df,
    'Medium Value Customer': medium_segment_df,
    'Low Value Customer': low_segment_df,
}

In [17]:
# One sub-frame per cluster label of the 4-segment solution
high_segment_df = merged_seg4_df.loc[merged_seg4_df['Cluster'].eq("High Value Customer")]
medium_segment_df = merged_seg4_df.loc[merged_seg4_df['Cluster'].eq("Medium Value Customer")]
low_segment_df = merged_seg4_df.loc[merged_seg4_df['Cluster'].eq("Low Value Customer")]
lost_segment_df = merged_seg4_df.loc[merged_seg4_df['Cluster'].eq("Lost Customer")]

# Key order determines the segment column order of the result tables
seg4_dict = {
    'High Value Customer': high_segment_df,
    'Medium Value Customer': medium_segment_df,
    'Low Value Customer': low_segment_df,
    'Lost Customer': lost_segment_df,
}

In [18]:
# One sub-frame per cluster label of the 5-segment solution
top_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("Top Customer")]
high_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("High Value Customer")]
medium_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("Medium Value Customer")]
low_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("Low Value Customer")]
lost_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("Lost Customer")]

# Key order determines the segment column order of the result tables
seg5_dict = {
    'Top Customer': top_segment_df,
    'High Value Customer': high_segment_df,
    'Medium Value Customer': medium_segment_df,
    'Low Value Customer': low_segment_df,
    'Lost Customer': lost_segment_df,
}

### Segment Differences Calculation

#### Segmentation Variables

In [19]:
# Population mean of every numeric column, one Series per segmentation file
seg2_pop_mean, seg3_pop_mean, seg4_pop_mean, seg5_pop_mean = (
    rfm_df.mean(numeric_only=True)
    for rfm_df in (rfm_seg2_df, rfm_seg3_df, rfm_seg4_df, rfm_seg5_df)
)

In [20]:
# Segment means and t-test p-values for the segmentation variables
seg2_seg_diff_df, seg2_seg_pval_df = get_segment_diff_pval(
    segments_dict=seg2_dict, population_mean=seg2_pop_mean)
seg3_seg_diff_df, seg3_seg_pval_df = get_segment_diff_pval(
    segments_dict=seg3_dict, population_mean=seg3_pop_mean)
seg4_seg_diff_df, seg4_seg_pval_df = get_segment_diff_pval(
    segments_dict=seg4_dict, population_mean=seg4_pop_mean)
seg5_seg_diff_df, seg5_seg_pval_df = get_segment_diff_pval(
    segments_dict=seg5_dict, population_mean=seg5_pop_mean)

In [21]:
# Write to excel
# XlsxWriter workbook options: store strings as plain text instead of
# converting them to formulas or hyperlinks.
options = {}
options['strings_to_formulas'] = False
options['strings_to_urls'] = False

# The `options=` keyword of pd.ExcelWriter was removed in pandas 2.0;
# engine-specific settings go through `engine_kwargs`.
with pd.ExcelWriter('Data/segmentation_num_diff_pval.xlsx', engine='xlsxwriter',
                    engine_kwargs={'options': options}) as writer:
  seg2_seg_diff_df.to_excel(writer, sheet_name='2 Mean', index=False)
  seg2_seg_pval_df.to_excel(writer, sheet_name='2 P-val', index=False)

  seg3_seg_diff_df.to_excel(writer, sheet_name='3 Mean', index=False)
  seg3_seg_pval_df.to_excel(writer, sheet_name='3 P-val', index=False)

  seg4_seg_diff_df.to_excel(writer, sheet_name='4 Mean', index=False)
  seg4_seg_pval_df.to_excel(writer, sheet_name='4 P-val', index=False)

  seg5_seg_diff_df.to_excel(writer, sheet_name='5 Mean', index=False)
  seg5_seg_pval_df.to_excel(writer, sheet_name='5 P-val', index=False)

  


#### Descriptor Variables

In [22]:
# Get descriptor variables population mean (numeric columns only).
# NOTE(review): the mean is taken over every row of psycho_demo_num_df (303 in
# the recorded output), while the RFM files hold 293 rows, so the "population"
# may include customers that belong to no segment - confirm this is intended.
desc_pop_mean = psycho_demo_num_df.mean(numeric_only=True)

In [23]:
# Segment means and t-test p-values for the descriptor variables
seg2_desc_diff_df, seg2_desc_pval_df = get_segment_diff_pval(
    segments_dict=seg2_dict, population_mean=desc_pop_mean)
seg3_desc_diff_df, seg3_desc_pval_df = get_segment_diff_pval(
    segments_dict=seg3_dict, population_mean=desc_pop_mean)
seg4_desc_diff_df, seg4_desc_pval_df = get_segment_diff_pval(
    segments_dict=seg4_dict, population_mean=desc_pop_mean)
seg5_desc_diff_df, seg5_desc_pval_df = get_segment_diff_pval(
    segments_dict=seg5_dict, population_mean=desc_pop_mean)

In [70]:
# Write to excel
# XlsxWriter workbook options: store strings as plain text instead of
# converting them to formulas or hyperlinks.
options = {}
options['strings_to_formulas'] = False
options['strings_to_urls'] = False

# The `options=` keyword of pd.ExcelWriter was removed in pandas 2.0;
# engine-specific settings go through `engine_kwargs`.
with pd.ExcelWriter('Data/descriptor_num_diff_pval.xlsx', engine='xlsxwriter',
                    engine_kwargs={'options': options}) as writer:
  seg2_desc_diff_df.to_excel(writer, sheet_name='2 Mean', index=False)
  seg2_desc_pval_df.to_excel(writer, sheet_name='2 P-val', index=False)

  seg3_desc_diff_df.to_excel(writer, sheet_name='3 Mean', index=False)
  seg3_desc_pval_df.to_excel(writer, sheet_name='3 P-val', index=False)

  seg4_desc_diff_df.to_excel(writer, sheet_name='4 Mean', index=False)
  seg4_desc_pval_df.to_excel(writer, sheet_name='4 P-val', index=False)

  seg5_desc_diff_df.to_excel(writer, sheet_name='5 Mean', index=False)
  seg5_desc_pval_df.to_excel(writer, sheet_name='5 P-val', index=False)

  


### Analyse Significant Differences

#### Segmentation Variables

In [24]:
# Significant segmentation-variable differences, 2 segments, grouped by segment
seg2_seg_sig_diff = get_sig_diff(
    segment_dict=seg2_dict,
    segment_diff_df=seg2_seg_diff_df,
    segment_pval_df=seg2_seg_pval_df,
)
seg2_seg_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,High Value Customer,recency,Lower (p-val < 0.05),112.35,232.44,0.0
2,High Value Customer,frequency,Higher (p-val < 0.05),17.13,10.13,0.0
4,High Value Customer,monetary_value,Higher (p-val < 0.05),31382.92,20070.56,0.002
1,Low Value Customer,recency,Higher (p-val < 0.05),341.8,232.44,0.0
3,Low Value Customer,frequency,Lower (p-val < 0.05),3.7,10.13,0.0
5,Low Value Customer,monetary_value,Lower (p-val < 0.05),10128.95,20070.56,0.0


In [25]:
# Significant segmentation-variable differences, 3 segments, grouped by segment
seg3_seg_sig_diff = get_sig_diff(
    segment_dict=seg3_dict,
    segment_diff_df=seg3_seg_diff_df,
    segment_pval_df=seg3_seg_pval_df,
)
seg3_seg_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,High Value Customer,recency,Lower (p-val < 0.05),64.98,232.44,0.0
3,High Value Customer,frequency,Higher (p-val < 0.05),26.26,10.13,0.0
5,High Value Customer,monetary_value,Higher (p-val < 0.05),41731.24,20070.56,0.0025
2,Low Value Customer,recency,Higher (p-val < 0.05),341.8,232.44,0.0
4,Low Value Customer,frequency,Lower (p-val < 0.05),3.7,10.13,0.0
6,Low Value Customer,monetary_value,Lower (p-val < 0.05),10128.95,20070.56,0.0
1,Medium Value Customer,recency,Lower (p-val < 0.05),147.13,232.44,0.0


In [26]:
# Significant segmentation-variable differences, 4 segments, grouped by segment
seg4_seg_sig_diff = get_sig_diff(
    segment_dict=seg4_dict,
    segment_diff_df=seg4_seg_diff_df,
    segment_pval_df=seg4_seg_pval_df,
)
seg4_seg_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,High Value Customer,recency,Lower (p-val < 0.05),64.98,232.44,0.0
3,High Value Customer,frequency,Higher (p-val < 0.05),26.26,10.13,0.0
6,High Value Customer,monetary_value,Higher (p-val < 0.05),41731.24,20070.56,0.0025
2,Lost Customer,recency,Higher (p-val < 0.05),505.38,232.44,0.0
5,Lost Customer,frequency,Lower (p-val < 0.05),2.12,10.13,0.0
8,Lost Customer,monetary_value,Lower (p-val < 0.05),7805.17,20070.56,0.0
4,Low Value Customer,frequency,Lower (p-val < 0.05),4.55,10.13,0.0
7,Low Value Customer,monetary_value,Lower (p-val < 0.05),11387.67,20070.56,0.0
1,Medium Value Customer,recency,Lower (p-val < 0.05),147.13,232.44,0.0


In [27]:
# Significant segmentation-variable differences, 5 segments, grouped by segment
seg5_seg_sig_diff = get_sig_diff(
    segment_dict=seg5_dict,
    segment_diff_df=seg5_seg_diff_df,
    segment_pval_df=seg5_seg_pval_df,
)
seg5_seg_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
1,High Value Customer,recency,Lower (p-val < 0.05),111.08,232.44,0.0
6,High Value Customer,frequency,Higher (p-val < 0.05),13.77,10.13,0.0028
11,High Value Customer,monetary_value,Higher (p-val < 0.05),29561.48,20070.56,0.0249
4,Lost Customer,recency,Higher (p-val < 0.05),534.12,232.44,0.0
9,Lost Customer,frequency,Lower (p-val < 0.05),1.78,10.13,0.0
14,Lost Customer,monetary_value,Lower (p-val < 0.05),7955.61,20070.56,0.0
3,Low Value Customer,recency,Higher (p-val < 0.05),296.2,232.44,0.002
8,Low Value Customer,frequency,Lower (p-val < 0.05),3.83,10.13,0.0
13,Low Value Customer,monetary_value,Lower (p-val < 0.05),9915.65,20070.56,0.0
2,Medium Value Customer,recency,Lower (p-val < 0.05),199.12,232.44,0.0309


#### Descriptor Variables

In [28]:
# Significant descriptor-variable differences, 2 segments, grouped by segment
seg2_desc_sig_diff = get_sig_diff(
    segment_dict=seg2_dict,
    segment_diff_df=seg2_desc_diff_df,
    segment_pval_df=seg2_desc_pval_df,
)
seg2_desc_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,High Value Customer,"When making a decision, I take other people's ...",Higher (p-val < 0.10),5.79,5.62,0.051
1,High Value Customer,I walk/cycle/use public transport to save fuel,Lower (p-val < 0.05),4.69,5.0,0.0334
3,High Value Customer,Has Child,Lower (p-val < 0.10),0.09,0.14,0.051
5,High Value Customer,Preferred Mode of Payment_Net Banking,Lower (p-val < 0.05),0.01,0.03,0.0099
6,High Value Customer,Preferred Mode of Payment_Online Wallets,Lower (p-val < 0.05),0.01,0.02,0.0321
7,High Value Customer,Care for Chronic Illness,Lower (p-val < 0.05),0.07,0.14,0.006
2,Low Value Customer,I walk/cycle/use public transport to save fuel,Higher (p-val < 0.10),5.23,5.0,0.098
4,Low Value Customer,Meditation,Higher (p-val < 0.10),0.53,0.45,0.0596
8,Low Value Customer,Care for Chronic Illness,Higher (p-val < 0.10),0.19,0.14,0.0975


In [29]:
# Significant descriptor-variable differences, 3 segments, grouped by segment
seg3_desc_sig_diff = get_sig_diff(
    segment_dict=seg3_dict,
    segment_diff_df=seg3_desc_diff_df,
    segment_pval_df=seg3_desc_pval_df,
)
seg3_desc_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,High Value Customer,"If I could live my life over, I would change a...",Higher (p-val < 0.05),4.64,4.21,0.0465
7,High Value Customer,Preferred Mode of Payment_Credit Card,Higher (p-val < 0.10),0.17,0.07,0.0509
9,High Value Customer,Preferred Mode of Payment_Online Wallets,Lower (p-val < 0.05),0.0,0.02,0.0
4,Low Value Customer,I walk/cycle/use public transport to save fuel,Higher (p-val < 0.10),5.23,5.0,0.098
6,Low Value Customer,Meditation,Higher (p-val < 0.10),0.53,0.45,0.0596
11,Low Value Customer,Care for Chronic Illness,Higher (p-val < 0.10),0.19,0.14,0.0975
1,Medium Value Customer,I feel good when I co-operate with others,Higher (p-val < 0.10),6.2,6.02,0.052
2,Medium Value Customer,"When making a decision, I take other people's ...",Higher (p-val < 0.10),5.82,5.62,0.067
3,Medium Value Customer,"It is my duty to take care of my family, even ...",Higher (p-val < 0.05),6.19,5.96,0.0298
5,Medium Value Customer,Has Child,Lower (p-val < 0.05),0.06,0.14,0.0108


In [30]:
# Significant descriptor-variable differences, 4 segments, grouped by segment
seg4_desc_sig_diff = get_sig_diff(
    segment_dict=seg4_dict,
    segment_diff_df=seg4_desc_diff_df,
    segment_pval_df=seg4_desc_pval_df,
)
seg4_desc_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,High Value Customer,"If I could live my life over, I would change a...",Higher (p-val < 0.05),4.64,4.21,0.0465
12,High Value Customer,Preferred Mode of Payment_Credit Card,Higher (p-val < 0.10),0.17,0.07,0.0509
15,High Value Customer,Preferred Mode of Payment_Online Wallets,Lower (p-val < 0.05),0.0,0.02,0.0
13,Lost Customer,Preferred Mode of Payment_Credit Card,Lower (p-val < 0.05),0.0,0.07,0.0
6,Lost Customer,I believe success in life does not mean becomi...,Lower (p-val < 0.10),5.19,5.56,0.0579
8,Lost Customer,Age,Lower (p-val < 0.05),24.38,26.32,0.0009
4,Lost Customer,It upsets me when my work is not recognized by...,Higher (p-val < 0.10),5.46,5.11,0.0542
7,Low Value Customer,I walk/cycle/use public transport to save fuel,Higher (p-val < 0.05),5.33,5.0,0.0466
10,Low Value Customer,Has Child,Higher (p-val < 0.10),0.21,0.14,0.0829
11,Low Value Customer,Meditation,Higher (p-val < 0.05),0.55,0.45,0.0458


In [31]:
# Significant descriptor-variable differences, 5 segments, grouped by segment
seg5_desc_sig_diff = get_sig_diff(
    segment_dict=seg5_dict,
    segment_diff_df=seg5_desc_diff_df,
    segment_pval_df=seg5_desc_pval_df,
)
seg5_desc_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
10,High Value Customer,Meditation,Lower (p-val < 0.05),0.32,0.45,0.0352
1,High Value Customer,I feel good when I co-operate with others,Higher (p-val < 0.05),6.25,6.02,0.0403
3,High Value Customer,It is important that I do my job better than o...,Lower (p-val < 0.10),5.03,5.37,0.0958
17,High Value Customer,Preferred Mode of Payment_Net Banking,Lower (p-val < 0.05),0.0,0.03,0.0
15,High Value Customer,Preferred Mode of Payment_Debit Card,Lower (p-val < 0.10),0.03,0.07,0.057
12,High Value Customer,Meditation Frequency,Lower (p-val < 0.10),0.69,0.92,0.0826
9,High Value Customer,Has Child,Lower (p-val < 0.10),0.08,0.14,0.0844
19,High Value Customer,Care for Chronic Illness,Lower (p-val < 0.05),0.03,0.14,0.0
2,Lost Customer,I feel good when I co-operate with others,Higher (p-val < 0.10),6.27,6.02,0.0613
14,Lost Customer,Preferred Mode of Payment_Credit Card,Lower (p-val < 0.05),0.0,0.07,0.0


## Aggregated Data

### Merge datasets

In [32]:
# Join every RFM segmentation with the aggregated descriptors on the shared 'Name' key
merged_seg2_df = pd.merge(rfm_seg2_df, psycho_demo_agg_df, on='Name')
merged_seg3_df = pd.merge(rfm_seg3_df, psycho_demo_agg_df, on='Name')
merged_seg4_df = pd.merge(rfm_seg4_df, psycho_demo_agg_df, on='Name')
merged_seg5_df = pd.merge(rfm_seg5_df, psycho_demo_agg_df, on='Name')

### Load segments

In [33]:
# One sub-frame per cluster label of the 2-segment solution (aggregated data)
high_segment_df = merged_seg2_df.loc[merged_seg2_df['Cluster'].eq("High Value Customer")]
low_segment_df = merged_seg2_df.loc[merged_seg2_df['Cluster'].eq("Low Value Customer")]

# Key order determines the segment column order of the result tables
seg2_dict = {
    'High Value Customer': high_segment_df,
    'Low Value Customer': low_segment_df,
}

In [34]:
# One sub-frame per cluster label of the 3-segment solution (aggregated data)
high_segment_df = merged_seg3_df.loc[merged_seg3_df['Cluster'].eq("High Value Customer")]
medium_segment_df = merged_seg3_df.loc[merged_seg3_df['Cluster'].eq("Medium Value Customer")]
low_segment_df = merged_seg3_df.loc[merged_seg3_df['Cluster'].eq("Low Value Customer")]

# Key order determines the segment column order of the result tables
seg3_dict = {
    'High Value Customer': high_segment_df,
    'Medium Value Customer': medium_segment_df,
    'Low Value Customer': low_segment_df,
}

In [35]:
# One sub-frame per cluster label of the 4-segment solution (aggregated data)
high_segment_df = merged_seg4_df.loc[merged_seg4_df['Cluster'].eq("High Value Customer")]
medium_segment_df = merged_seg4_df.loc[merged_seg4_df['Cluster'].eq("Medium Value Customer")]
low_segment_df = merged_seg4_df.loc[merged_seg4_df['Cluster'].eq("Low Value Customer")]
lost_segment_df = merged_seg4_df.loc[merged_seg4_df['Cluster'].eq("Lost Customer")]

# Key order determines the segment column order of the result tables
seg4_dict = {
    'High Value Customer': high_segment_df,
    'Medium Value Customer': medium_segment_df,
    'Low Value Customer': low_segment_df,
    'Lost Customer': lost_segment_df,
}

In [36]:
# One sub-frame per cluster label of the 5-segment solution (aggregated data)
top_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("Top Customer")]
high_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("High Value Customer")]
medium_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("Medium Value Customer")]
low_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("Low Value Customer")]
lost_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("Lost Customer")]

# Key order determines the segment column order of the result tables
seg5_dict = {
    'Top Customer': top_segment_df,
    'High Value Customer': high_segment_df,
    'Medium Value Customer': medium_segment_df,
    'Low Value Customer': low_segment_df,
    'Lost Customer': lost_segment_df,
}

### Segment Differences Calculation

#### Segmentation Variables

In [37]:
# Population mean of every numeric column, one Series per segmentation file
seg2_pop_mean, seg3_pop_mean, seg4_pop_mean, seg5_pop_mean = (
    rfm_df.mean(numeric_only=True)
    for rfm_df in (rfm_seg2_df, rfm_seg3_df, rfm_seg4_df, rfm_seg5_df)
)

In [38]:
# Segment means and t-test p-values for the segmentation variables
seg2_seg_diff_df, seg2_seg_pval_df = get_segment_diff_pval(
    segments_dict=seg2_dict, population_mean=seg2_pop_mean)
seg3_seg_diff_df, seg3_seg_pval_df = get_segment_diff_pval(
    segments_dict=seg3_dict, population_mean=seg3_pop_mean)
seg4_seg_diff_df, seg4_seg_pval_df = get_segment_diff_pval(
    segments_dict=seg4_dict, population_mean=seg4_pop_mean)
seg5_seg_diff_df, seg5_seg_pval_df = get_segment_diff_pval(
    segments_dict=seg5_dict, population_mean=seg5_pop_mean)

In [39]:
# Write to excel
# XlsxWriter workbook options: store strings as plain text instead of
# converting them to formulas or hyperlinks.
options = {}
options['strings_to_formulas'] = False
options['strings_to_urls'] = False

# The `options=` keyword of pd.ExcelWriter was removed in pandas 2.0;
# engine-specific settings go through `engine_kwargs`.
with pd.ExcelWriter('Data/segmentation_agg_diff_pval.xlsx', engine='xlsxwriter',
                    engine_kwargs={'options': options}) as writer:
  seg2_seg_diff_df.to_excel(writer, sheet_name='2 Mean', index=False)
  seg2_seg_pval_df.to_excel(writer, sheet_name='2 P-val', index=False)

  seg3_seg_diff_df.to_excel(writer, sheet_name='3 Mean', index=False)
  seg3_seg_pval_df.to_excel(writer, sheet_name='3 P-val', index=False)

  seg4_seg_diff_df.to_excel(writer, sheet_name='4 Mean', index=False)
  seg4_seg_pval_df.to_excel(writer, sheet_name='4 P-val', index=False)

  seg5_seg_diff_df.to_excel(writer, sheet_name='5 Mean', index=False)
  seg5_seg_pval_df.to_excel(writer, sheet_name='5 P-val', index=False)

  


#### Descriptor Variables

In [40]:
# Get descriptor variables population mean (numeric columns only).
# NOTE(review): the mean is taken over every row of psycho_demo_agg_df (303 in
# the recorded output), while the RFM files hold 293 rows, so the "population"
# may include customers that belong to no segment - confirm this is intended.
desc_pop_mean = psycho_demo_agg_df.mean(numeric_only=True)

In [41]:
# Segment means and t-test p-values for the descriptor variables
seg2_desc_diff_df, seg2_desc_pval_df = get_segment_diff_pval(
    segments_dict=seg2_dict, population_mean=desc_pop_mean)
seg3_desc_diff_df, seg3_desc_pval_df = get_segment_diff_pval(
    segments_dict=seg3_dict, population_mean=desc_pop_mean)
seg4_desc_diff_df, seg4_desc_pval_df = get_segment_diff_pval(
    segments_dict=seg4_dict, population_mean=desc_pop_mean)
seg5_desc_diff_df, seg5_desc_pval_df = get_segment_diff_pval(
    segments_dict=seg5_dict, population_mean=desc_pop_mean)

In [42]:
# Write to excel
# XlsxWriter workbook options: store strings as plain text instead of
# converting them to formulas or hyperlinks.
options = {}
options['strings_to_formulas'] = False
options['strings_to_urls'] = False

# The `options=` keyword of pd.ExcelWriter was removed in pandas 2.0;
# engine-specific settings go through `engine_kwargs`.
with pd.ExcelWriter('Data/descriptor_agg_diff_pval.xlsx', engine='xlsxwriter',
                    engine_kwargs={'options': options}) as writer:
  seg2_desc_diff_df.to_excel(writer, sheet_name='2 Mean', index=False)
  seg2_desc_pval_df.to_excel(writer, sheet_name='2 P-val', index=False)

  seg3_desc_diff_df.to_excel(writer, sheet_name='3 Mean', index=False)
  seg3_desc_pval_df.to_excel(writer, sheet_name='3 P-val', index=False)

  seg4_desc_diff_df.to_excel(writer, sheet_name='4 Mean', index=False)
  seg4_desc_pval_df.to_excel(writer, sheet_name='4 P-val', index=False)

  seg5_desc_diff_df.to_excel(writer, sheet_name='5 Mean', index=False)
  seg5_desc_pval_df.to_excel(writer, sheet_name='5 P-val', index=False)

  


### Analyse Significant Differences

#### Segmentation Variables

In [43]:
# Significant segmentation-variable differences, 2 segments, grouped by segment
seg2_seg_sig_diff = get_sig_diff(
    segment_dict=seg2_dict,
    segment_diff_df=seg2_seg_diff_df,
    segment_pval_df=seg2_seg_pval_df,
)
seg2_seg_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,High Value Customer,recency,Lower (p-val < 0.05),112.35,232.44,0.0
2,High Value Customer,frequency,Higher (p-val < 0.05),17.13,10.13,0.0
4,High Value Customer,monetary_value,Higher (p-val < 0.05),31382.92,20070.56,0.002
1,Low Value Customer,recency,Higher (p-val < 0.05),341.8,232.44,0.0
3,Low Value Customer,frequency,Lower (p-val < 0.05),3.7,10.13,0.0
5,Low Value Customer,monetary_value,Lower (p-val < 0.05),10128.95,20070.56,0.0


In [44]:
# Significant segmentation-variable differences, 3 segments, grouped by segment
seg3_seg_sig_diff = get_sig_diff(
    segment_dict=seg3_dict,
    segment_diff_df=seg3_seg_diff_df,
    segment_pval_df=seg3_seg_pval_df,
)
seg3_seg_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,High Value Customer,recency,Lower (p-val < 0.05),64.98,232.44,0.0
3,High Value Customer,frequency,Higher (p-val < 0.05),26.26,10.13,0.0
5,High Value Customer,monetary_value,Higher (p-val < 0.05),41731.24,20070.56,0.0025
2,Low Value Customer,recency,Higher (p-val < 0.05),341.8,232.44,0.0
4,Low Value Customer,frequency,Lower (p-val < 0.05),3.7,10.13,0.0
6,Low Value Customer,monetary_value,Lower (p-val < 0.05),10128.95,20070.56,0.0
1,Medium Value Customer,recency,Lower (p-val < 0.05),147.13,232.44,0.0


In [45]:
# Significant segmentation-variable differences, 4 segments, grouped by segment
seg4_seg_sig_diff = get_sig_diff(
    segment_dict=seg4_dict,
    segment_diff_df=seg4_seg_diff_df,
    segment_pval_df=seg4_seg_pval_df,
)
seg4_seg_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,High Value Customer,recency,Lower (p-val < 0.05),64.98,232.44,0.0
3,High Value Customer,frequency,Higher (p-val < 0.05),26.26,10.13,0.0
6,High Value Customer,monetary_value,Higher (p-val < 0.05),41731.24,20070.56,0.0025
2,Lost Customer,recency,Higher (p-val < 0.05),505.38,232.44,0.0
5,Lost Customer,frequency,Lower (p-val < 0.05),2.12,10.13,0.0
8,Lost Customer,monetary_value,Lower (p-val < 0.05),7805.17,20070.56,0.0
4,Low Value Customer,frequency,Lower (p-val < 0.05),4.55,10.13,0.0
7,Low Value Customer,monetary_value,Lower (p-val < 0.05),11387.67,20070.56,0.0
1,Medium Value Customer,recency,Lower (p-val < 0.05),147.13,232.44,0.0


In [46]:
# Significant segmentation-variable differences, 5 segments, grouped by segment
seg5_seg_sig_diff = get_sig_diff(
    segment_dict=seg5_dict,
    segment_diff_df=seg5_seg_diff_df,
    segment_pval_df=seg5_seg_pval_df,
)
seg5_seg_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
1,High Value Customer,recency,Lower (p-val < 0.05),111.08,232.44,0.0
6,High Value Customer,frequency,Higher (p-val < 0.05),13.77,10.13,0.0028
11,High Value Customer,monetary_value,Higher (p-val < 0.05),29561.48,20070.56,0.0249
4,Lost Customer,recency,Higher (p-val < 0.05),534.12,232.44,0.0
9,Lost Customer,frequency,Lower (p-val < 0.05),1.78,10.13,0.0
14,Lost Customer,monetary_value,Lower (p-val < 0.05),7955.61,20070.56,0.0
3,Low Value Customer,recency,Higher (p-val < 0.05),296.2,232.44,0.002
8,Low Value Customer,frequency,Lower (p-val < 0.05),3.83,10.13,0.0
13,Low Value Customer,monetary_value,Lower (p-val < 0.05),9915.65,20070.56,0.0
2,Medium Value Customer,recency,Lower (p-val < 0.05),199.12,232.44,0.0309


#### Descriptor Variables

In [47]:
# Significant descriptor-variable differences, 2 segments, grouped by segment
seg2_desc_sig_diff = get_sig_diff(
    segment_dict=seg2_dict,
    segment_diff_df=seg2_desc_diff_df,
    segment_pval_df=seg2_desc_pval_df,
)
seg2_desc_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value


In [48]:
# Significant descriptor-variable differences, 3 segments, grouped by segment
seg3_desc_sig_diff = get_sig_diff(
    segment_dict=seg3_dict,
    segment_diff_df=seg3_desc_diff_df,
    segment_pval_df=seg3_desc_pval_df,
)
seg3_desc_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,Medium Value Customer,Collectivism/Allocentrism,Higher (p-val < 0.05),5.89,5.73,0.0082


In [49]:
# Significant descriptor-variable differences, 4 segments, grouped by segment
seg4_desc_sig_diff = get_sig_diff(
    segment_dict=seg4_dict,
    segment_diff_df=seg4_desc_diff_df,
    segment_pval_df=seg4_desc_pval_df,
)
seg4_desc_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,Medium Value Customer,Collectivism/Allocentrism,Higher (p-val < 0.05),5.89,5.73,0.0082


In [50]:
# Significant descriptor-variable differences, 5 segments, grouped by segment
seg5_desc_sig_diff = get_sig_diff(
    segment_dict=seg5_dict,
    segment_diff_df=seg5_desc_diff_df,
    segment_pval_df=seg5_desc_pval_df,
)
seg5_desc_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,Low Value Customer,Environmental Behaviour,Higher (p-val < 0.05),5.59,5.36,0.033


## Factorised Data

### Merge datasets

In [51]:
# Join every RFM segmentation with the factorised features on the shared 'Name' key
merged_seg2_df = pd.merge(rfm_seg2_df, factor_df, on='Name')
merged_seg3_df = pd.merge(rfm_seg3_df, factor_df, on='Name')
merged_seg4_df = pd.merge(rfm_seg4_df, factor_df, on='Name')
merged_seg5_df = pd.merge(rfm_seg5_df, factor_df, on='Name')

### Load segments

In [52]:
# One sub-frame per cluster label of the 2-segment solution (factorised data)
high_segment_df = merged_seg2_df.loc[merged_seg2_df['Cluster'].eq("High Value Customer")]
low_segment_df = merged_seg2_df.loc[merged_seg2_df['Cluster'].eq("Low Value Customer")]

# Key order determines the segment column order of the result tables
seg2_dict = {
    'High Value Customer': high_segment_df,
    'Low Value Customer': low_segment_df,
}

In [53]:
# One sub-frame per cluster label of the 3-segment solution (factorised data)
high_segment_df = merged_seg3_df.loc[merged_seg3_df['Cluster'].eq("High Value Customer")]
medium_segment_df = merged_seg3_df.loc[merged_seg3_df['Cluster'].eq("Medium Value Customer")]
low_segment_df = merged_seg3_df.loc[merged_seg3_df['Cluster'].eq("Low Value Customer")]

# Key order determines the segment column order of the result tables
seg3_dict = {
    'High Value Customer': high_segment_df,
    'Medium Value Customer': medium_segment_df,
    'Low Value Customer': low_segment_df,
}

In [54]:
# One sub-frame per cluster label of the 4-segment solution (factorised data)
high_segment_df = merged_seg4_df.loc[merged_seg4_df['Cluster'].eq("High Value Customer")]
medium_segment_df = merged_seg4_df.loc[merged_seg4_df['Cluster'].eq("Medium Value Customer")]
low_segment_df = merged_seg4_df.loc[merged_seg4_df['Cluster'].eq("Low Value Customer")]
lost_segment_df = merged_seg4_df.loc[merged_seg4_df['Cluster'].eq("Lost Customer")]

# Key order determines the segment column order of the result tables
seg4_dict = {
    'High Value Customer': high_segment_df,
    'Medium Value Customer': medium_segment_df,
    'Low Value Customer': low_segment_df,
    'Lost Customer': lost_segment_df,
}

In [55]:
# One sub-frame per cluster label of the 5-segment solution (factorised data)
top_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("Top Customer")]
high_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("High Value Customer")]
medium_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("Medium Value Customer")]
low_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("Low Value Customer")]
lost_segment_df = merged_seg5_df.loc[merged_seg5_df['Cluster'].eq("Lost Customer")]

# Key order determines the segment column order of the result tables
seg5_dict = {
    'Top Customer': top_segment_df,
    'High Value Customer': high_segment_df,
    'Medium Value Customer': medium_segment_df,
    'Low Value Customer': low_segment_df,
    'Lost Customer': lost_segment_df,
}

### Segment Differences Calculation

#### Segmentation Variables

In [56]:
# Population mean of every numeric column, one Series per segmentation file
seg2_pop_mean, seg3_pop_mean, seg4_pop_mean, seg5_pop_mean = (
    rfm_df.mean(numeric_only=True)
    for rfm_df in (rfm_seg2_df, rfm_seg3_df, rfm_seg4_df, rfm_seg5_df)
)

In [57]:
# Segment means and t-test p-values for the segmentation variables
seg2_seg_diff_df, seg2_seg_pval_df = get_segment_diff_pval(
    segments_dict=seg2_dict, population_mean=seg2_pop_mean)
seg3_seg_diff_df, seg3_seg_pval_df = get_segment_diff_pval(
    segments_dict=seg3_dict, population_mean=seg3_pop_mean)
seg4_seg_diff_df, seg4_seg_pval_df = get_segment_diff_pval(
    segments_dict=seg4_dict, population_mean=seg4_pop_mean)
seg5_seg_diff_df, seg5_seg_pval_df = get_segment_diff_pval(
    segments_dict=seg5_dict, population_mean=seg5_pop_mean)

In [58]:
# Write to excel
# XlsxWriter workbook options: store strings as plain text instead of
# converting them to formulas or hyperlinks.
options = {}
options['strings_to_formulas'] = False
options['strings_to_urls'] = False

# The `options=` keyword of pd.ExcelWriter was removed in pandas 2.0;
# engine-specific settings go through `engine_kwargs`.
with pd.ExcelWriter('Data/segmentation_factorised_diff_pval.xlsx', engine='xlsxwriter',
                    engine_kwargs={'options': options}) as writer:
  seg2_seg_diff_df.to_excel(writer, sheet_name='2 Mean', index=False)
  seg2_seg_pval_df.to_excel(writer, sheet_name='2 P-val', index=False)

  seg3_seg_diff_df.to_excel(writer, sheet_name='3 Mean', index=False)
  seg3_seg_pval_df.to_excel(writer, sheet_name='3 P-val', index=False)

  seg4_seg_diff_df.to_excel(writer, sheet_name='4 Mean', index=False)
  seg4_seg_pval_df.to_excel(writer, sheet_name='4 P-val', index=False)

  seg5_seg_diff_df.to_excel(writer, sheet_name='5 Mean', index=False)
  seg5_seg_pval_df.to_excel(writer, sheet_name='5 P-val', index=False)

  


#### Descriptor Variables

In [59]:
# Get descriptor variables population mean (numeric columns only).
# NOTE(review): the mean is taken over every row of factor_df (303 in the
# recorded output), while the RFM files hold 293 rows, so the "population"
# may include customers that belong to no segment - confirm this is intended.
desc_pop_mean = factor_df.mean(numeric_only=True)

In [60]:
# Segment means and t-test p-values for the descriptor variables
seg2_desc_diff_df, seg2_desc_pval_df = get_segment_diff_pval(
    segments_dict=seg2_dict, population_mean=desc_pop_mean)
seg3_desc_diff_df, seg3_desc_pval_df = get_segment_diff_pval(
    segments_dict=seg3_dict, population_mean=desc_pop_mean)
seg4_desc_diff_df, seg4_desc_pval_df = get_segment_diff_pval(
    segments_dict=seg4_dict, population_mean=desc_pop_mean)
seg5_desc_diff_df, seg5_desc_pval_df = get_segment_diff_pval(
    segments_dict=seg5_dict, population_mean=desc_pop_mean)

In [61]:
# Write the descriptor-variable differences and p-values to a single workbook,
# two sheets per segment solution: '<k> Mean' and '<k> P-val'.
#
# `strings_to_formulas` / `strings_to_urls` are xlsxwriter Workbook options, so
# the engine is pinned explicitly and the options are forwarded through
# `engine_kwargs`. The bare `options=` keyword of pd.ExcelWriter was deprecated
# in pandas 1.3 and is no longer accepted by pandas 2.x.
options = {}
options['strings_to_formulas'] = False
options['strings_to_urls'] = False

# Sheet-name prefix -> (difference table, p-value table); dict order = sheet order
desc_sheet_tables = {
    '2': (seg2_desc_diff_df, seg2_desc_pval_df),
    '3': (seg3_desc_diff_df, seg3_desc_pval_df),
    '4': (seg4_desc_diff_df, seg4_desc_pval_df),
    '5': (seg5_desc_diff_df, seg5_desc_pval_df),
}

with pd.ExcelWriter('Data/descriptor_factorised_diff_pval.xlsx',
                    engine='xlsxwriter',
                    engine_kwargs={'options': options}) as writer:
  for sheet_prefix, (mean_sheet_df, pval_sheet_df) in desc_sheet_tables.items():
    mean_sheet_df.to_excel(writer, sheet_name=f'{sheet_prefix} Mean', index=False)
    pval_sheet_df.to_excel(writer, sheet_name=f'{sheet_prefix} P-val', index=False)

  


### Analyse Significant Differences

#### Segmentation Variables

In [62]:
# Significant differences on the segmentation variables, 2-segment solution;
# the displayed copy is ordered by the "Customer" column
seg2_seg_sig_diff = get_sig_diff(
    seg2_dict,
    seg2_seg_diff_df,
    seg2_seg_pval_df,
)
seg2_seg_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,High Value Customer,recency,Lower (p-val < 0.05),112.35,232.44,0.0
2,High Value Customer,frequency,Higher (p-val < 0.05),17.13,10.13,0.0
4,High Value Customer,monetary_value,Higher (p-val < 0.05),31382.92,20070.56,0.002
1,Low Value Customer,recency,Higher (p-val < 0.05),341.8,232.44,0.0
3,Low Value Customer,frequency,Lower (p-val < 0.05),3.7,10.13,0.0
5,Low Value Customer,monetary_value,Lower (p-val < 0.05),10128.95,20070.56,0.0


In [63]:
# Significant differences on the segmentation variables, 3-segment solution;
# the displayed copy is ordered by the "Customer" column
seg3_seg_sig_diff = get_sig_diff(
    seg3_dict,
    seg3_seg_diff_df,
    seg3_seg_pval_df,
)
seg3_seg_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,High Value Customer,recency,Lower (p-val < 0.05),64.98,232.44,0.0
3,High Value Customer,frequency,Higher (p-val < 0.05),26.26,10.13,0.0
5,High Value Customer,monetary_value,Higher (p-val < 0.05),41731.24,20070.56,0.0025
2,Low Value Customer,recency,Higher (p-val < 0.05),341.8,232.44,0.0
4,Low Value Customer,frequency,Lower (p-val < 0.05),3.7,10.13,0.0
6,Low Value Customer,monetary_value,Lower (p-val < 0.05),10128.95,20070.56,0.0
1,Medium Value Customer,recency,Lower (p-val < 0.05),147.13,232.44,0.0


In [64]:
# Significant differences on the segmentation variables, 4-segment solution;
# the displayed copy is ordered by the "Customer" column
seg4_seg_sig_diff = get_sig_diff(
    seg4_dict,
    seg4_seg_diff_df,
    seg4_seg_pval_df,
)
seg4_seg_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
0,High Value Customer,recency,Lower (p-val < 0.05),64.98,232.44,0.0
3,High Value Customer,frequency,Higher (p-val < 0.05),26.26,10.13,0.0
6,High Value Customer,monetary_value,Higher (p-val < 0.05),41731.24,20070.56,0.0025
2,Lost Customer,recency,Higher (p-val < 0.05),505.38,232.44,0.0
5,Lost Customer,frequency,Lower (p-val < 0.05),2.12,10.13,0.0
8,Lost Customer,monetary_value,Lower (p-val < 0.05),7805.17,20070.56,0.0
4,Low Value Customer,frequency,Lower (p-val < 0.05),4.55,10.13,0.0
7,Low Value Customer,monetary_value,Lower (p-val < 0.05),11387.67,20070.56,0.0
1,Medium Value Customer,recency,Lower (p-val < 0.05),147.13,232.44,0.0


In [65]:
# Significant differences on the segmentation variables, 5-segment solution;
# the displayed copy is ordered by the "Customer" column
seg5_seg_sig_diff = get_sig_diff(
    seg5_dict,
    seg5_seg_diff_df,
    seg5_seg_pval_df,
)
seg5_seg_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
1,High Value Customer,recency,Lower (p-val < 0.05),111.08,232.44,0.0
6,High Value Customer,frequency,Higher (p-val < 0.05),13.77,10.13,0.0028
11,High Value Customer,monetary_value,Higher (p-val < 0.05),29561.48,20070.56,0.0249
4,Lost Customer,recency,Higher (p-val < 0.05),534.12,232.44,0.0
9,Lost Customer,frequency,Lower (p-val < 0.05),1.78,10.13,0.0
14,Lost Customer,monetary_value,Lower (p-val < 0.05),7955.61,20070.56,0.0
3,Low Value Customer,recency,Higher (p-val < 0.05),296.2,232.44,0.002
8,Low Value Customer,frequency,Lower (p-val < 0.05),3.83,10.13,0.0
13,Low Value Customer,monetary_value,Lower (p-val < 0.05),9915.65,20070.56,0.0
2,Medium Value Customer,recency,Lower (p-val < 0.05),199.12,232.44,0.0309


#### Descriptor Variables

In [66]:
# Significant differences on the descriptor variables, 2-segment solution;
# the displayed copy is ordered by the "Customer" column
seg2_desc_sig_diff = get_sig_diff(
    seg2_dict,
    seg2_desc_diff_df,
    seg2_desc_pval_df,
)
seg2_desc_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
1,High Value Customer,factor_8,Higher (p-val < 0.05),9.86,9.63,0.0447
2,High Value Customer,factor_17,Higher (p-val < 0.05),-0.0,-0.01,0.0099
0,Low Value Customer,factor_7,Higher (p-val < 0.10),1.3,1.13,0.0901


In [67]:
# Significant differences on the descriptor variables, 3-segment solution;
# the displayed copy is ordered by the "Customer" column
seg3_desc_sig_diff = get_sig_diff(
    seg3_dict,
    seg3_desc_diff_df,
    seg3_desc_pval_df,
)
seg3_desc_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
2,High Value Customer,factor_14,Higher (p-val < 0.10),0.13,0.05,0.0509
0,Low Value Customer,factor_7,Higher (p-val < 0.10),1.3,1.13,0.0901
1,Medium Value Customer,factor_8,Higher (p-val < 0.05),9.98,9.63,0.0077
3,Medium Value Customer,factor_17,Higher (p-val < 0.05),0.0,-0.01,0.0


In [68]:
# Significant differences on the descriptor variables, 4-segment solution;
# the displayed copy is ordered by the "Customer" column
seg4_desc_sig_diff = get_sig_diff(
    seg4_dict,
    seg4_desc_diff_df,
    seg4_desc_pval_df,
)
seg4_desc_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
3,High Value Customer,factor_14,Higher (p-val < 0.10),0.13,0.05,0.0509
0,Lost Customer,factor_2,Lower (p-val < 0.05),20.06,21.75,0.001
4,Lost Customer,factor_14,Lower (p-val < 0.05),0.0,0.05,0.0
2,Low Value Customer,factor_8,Lower (p-val < 0.10),9.32,9.63,0.0776
1,Medium Value Customer,factor_8,Higher (p-val < 0.05),9.98,9.63,0.0077
5,Medium Value Customer,factor_17,Higher (p-val < 0.05),0.0,-0.01,0.0


In [69]:
# Significant differences on the descriptor variables, 5-segment solution;
# the displayed copy is ordered by the "Customer" column
seg5_desc_sig_diff = get_sig_diff(
    seg5_dict,
    seg5_desc_diff_df,
    seg5_desc_pval_df,
)
seg5_desc_sig_diff.sort_values(by="Customer")

Unnamed: 0,Customer,Variable,Type,Segment Mean,Population Mean,P-value
3,High Value Customer,factor_7,Lower (p-val < 0.10),0.83,1.13,0.0553
5,High Value Customer,factor_8,Higher (p-val < 0.05),9.95,9.63,0.0472
6,High Value Customer,factor_12,Lower (p-val < 0.10),0.03,0.07,0.057
10,High Value Customer,factor_17,Higher (p-val < 0.05),0.0,-0.01,0.0
2,Lost Customer,factor_2,Lower (p-val < 0.05),19.82,21.75,0.0018
9,Lost Customer,factor_14,Lower (p-val < 0.05),0.0,0.05,0.0
0,Low Value Customer,factor_0,Higher (p-val < 0.10),18.78,17.75,0.0917
1,Low Value Customer,factor_1,Higher (p-val < 0.05),20.05,19.24,0.0377
4,Low Value Customer,factor_7,Higher (p-val < 0.10),1.38,1.13,0.0858
7,Medium Value Customer,factor_12,Higher (p-val < 0.10),0.15,0.07,0.0516
