In [9]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [10]:
# Load the combined demographic / offer / transaction table.
# Forward slashes are used so the relative path resolves on Windows and POSIX alike
# (a backslash-separated path is treated as a single file name outside Windows).
# NOTE: unpickling can execute arbitrary code — only load this file from a trusted pipeline.
data_new = pd.read_pickle('data/04_fct/fct_demographic_offers_and_transactions.pkl')

In [11]:
# Demographic columns used as clustering inputs (re-used for the preview at the end of the cell)
demographic_cols = ['age', 'income', 'days_as_member', 'gender_F', 'gender_M']
demographic_features = data_new[demographic_cols]

# Standardize each column to zero mean / unit variance so that large-valued
# columns such as income do not dominate the distances K-means works with
scaler = StandardScaler()
demographic_features_scaled = scaler.fit_transform(demographic_features)

# Apply K-means with 3 clusters. n_init is pinned explicitly: its default differs
# between scikit-learn releases, so relying on it makes the labels depend on the
# installed version even with a fixed random_state.
# NOTE(review): the preview below shows repeated customer_id rows, so customers with
# more rows carry more weight in the fit — confirm that this weighting is intended.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
data_new['segment'] = kmeans.fit_predict(demographic_features_scaled)

# Display the first few rows with the segment labels
data_new[demographic_cols + ['segment']].head()


Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,segment
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,0


In [12]:
# Rows flagged with viewed_before_completion == 1.
# NOTE(review): this frame is not used by the aggregation below, which runs on the
# full `data_new`; it is kept in case other cells rely on it — confirm the intent.
data_new_filtered = data_new[data_new['viewed_before_completion'] == 1]

# Columns that together identify one offer configuration
offer_cols = ['is_bogo', 'is_discount', 'reward', 'difficulty', 'duration_hrs']

# Mean of offer_viewed / offer_completed per (segment, offer configuration),
# exposed under the clearer names viewed_rate / completion_rate
response_data = (
    data_new
    .groupby(['segment'] + offer_cols)
    .agg({'offer_viewed': 'mean', 'offer_completed': 'mean'})
    .rename(columns={'offer_viewed': 'viewed_rate', 'offer_completed': 'completion_rate'})
    .reset_index()
)

# Identify the top segment for each offer configuration: highest completion_rate first.
# viewed_rate is the secondary key so that ties on completion_rate (e.g. rows whose
# completion_rate is 0.0) are resolved deterministically instead of by sort internals.
top_segments = (
    response_data
    .sort_values(by=['completion_rate', 'viewed_rate'], ascending=False)
    .groupby(offer_cols)
    .head(1)
)

top_segments

Unnamed: 0,segment,is_bogo,is_discount,reward,difficulty,duration_hrs,viewed_rate,completion_rate
3,0,0,1,2,10,240,0.973333,0.861961
4,0,0,1,3,7,168,0.969817,0.837967
9,0,1,0,10,10,168,0.821319,0.776074
7,0,1,0,5,5,168,0.606583,0.774295
6,0,1,0,5,5,120,0.973258,0.766613
2,0,0,1,2,10,168,0.618267,0.753318
8,0,1,0,10,10,120,0.968069,0.729751
5,0,0,1,5,20,240,0.420155,0.720155
21,2,0,0,0,0,96,0.562092,0.0
20,2,0,0,0,0,72,0.955869,0.0


Conclusion
* Segment 0: Highly responsive to discount offers with lower rewards and higher difficulty over longer durations.
* Characteristics: Segment 0 consists predominantly of older, higher-income females (about 98% female), with an average age of 65.98, a mean income of $86,378, and a mean membership duration of 527.98 days.
* Segment 1: Highly responsive to high-reward BOGO offers with higher difficulty.
* Characteristics: Segment 1 consists predominantly of males (about 99% male), with an average age of 52.19, a mean income of $61,268, and a mean membership duration of 529.31 days.
* Segment 2: Highly responsive to both high-reward BOGO offers and high-difficulty discount offers over various durations.
* Characteristics: Segment 2 consists predominantly of younger, lower-income females (about 97% female), with an average age of 46.80, a mean income of $52,419, and a mean membership duration of 488.88 days.

In [13]:
# Per-segment profile: mean / median / std for the numeric demographics,
# plus the mean of each gender indicator column
numeric_stats = ['mean', 'median', 'std']
summary_spec = {col: numeric_stats for col in ('age', 'income', 'days_as_member')}
summary_spec.update({'gender_F': 'mean', 'gender_M': 'mean'})

segment_summary = data_new.groupby('segment').agg(summary_spec).reset_index()
segment_summary

Unnamed: 0_level_0,segment,age,age,age,income,income,income,days_as_member,days_as_member,days_as_member,gender_F,gender_M
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,std,mean,median,std,mean,median,std,mean,mean
0,0,65.983807,64.0,12.523886,86377.767347,86000.0,16129.494655,527.976062,451.0,359.505667,0.980521,0.0
1,1,52.187145,53.0,17.399321,61268.483955,59000.0,20062.424574,529.314533,343.0,450.559347,0.0,0.99266
2,2,46.796307,47.0,15.593529,52418.939394,53000.0,12768.946329,488.877936,347.0,386.006367,0.972917,0.0
