In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [2]:
# Import data
df = pd.read_pickle(r'data\04_fct\fct_demographic_offers_and_transactions.pkl')

# Drop rows where bogo = 0 and discount = 0 because we're only interested in offers that can be completed
df = df[(df['is_bogo'] != 0) | (df['is_discount'] != 0)]

# Drop is_discount to avoid collinearity
df.drop(columns=['is_discount'], inplace=True)

df.head()

Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,offer_viewed,offer_completed,viewed_before_completion,difficulty,reward,duration_hrs,mobile,social,web,is_bogo,total_transactions,total_transaction_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0,1,0,5,5,168,1,0,1,1,1,23.22
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,5,5,168,1,0,1,1,1,19.89
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,10,10,168,1,1,0,1,1,21.72
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,0,5,5,120,1,1,1,1,1,21.72
e2127556f4f64592b11af22de27a7932,68,70000,91,0,1,1,0,0,10,2,168,1,0,1,0,0,0.0


In [3]:
### Segment Customers ###
# Extract demographic features for clustering
demographic_features = df[['age', 'income', 'days_as_member', 'gender_F', 'gender_M']]

# Standardize the features
scaler = StandardScaler()
demographic_features_scaled = scaler.fit_transform(demographic_features)
demographic_features_scaled = pd.DataFrame(demographic_features_scaled, columns=demographic_features.columns)
demographic_features_scaled.head(n=1)

Unnamed: 0,age,income,days_as_member,gender_F,gender_M
0,0.040212,2.153978,-0.346657,1.194401,-1.161219


In [4]:
# Calculate the silhouette score for different number of clusters
silhouette_scores = []

for n_clusters in range(2, 6):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(demographic_features_scaled)
    silhouette_avg = silhouette_score(demographic_features_scaled, cluster_labels)
    silhouette_scores.append({'n_clusters': n_clusters, 'silhouette_score': silhouette_avg})

# Convert list of dicts to DataFrame
silhouette_scores_df = pd.DataFrame(silhouette_scores)
silhouette_scores_df

Unnamed: 0,n_clusters,silhouette_score
0,2,0.399852
1,3,0.325241
2,4,0.297881
3,5,0.284108


In [5]:
# Apply K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
df['segment'] = kmeans.fit_predict(demographic_features_scaled)

# Display the first few rows with the segment labels
df[['age', 'income', 'days_as_member', 'gender_F', 'gender_M', 'segment']].head()

Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,segment
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,2
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,2
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,2
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,2
e2127556f4f64592b11af22de27a7932,68,70000,91,0,1,0


In [14]:
# Calculate mean values of features for each segment
cluster_characteristics = df.groupby('segment')[['age', 'income', 'days_as_member', 'gender_F', 'gender_M']].mean()
cluster_characteristics['num_cust'] = df.groupby('segment').size()
cluster_characteristics['perc_cust'] = (cluster_characteristics['num_cust'] / df.shape[0]) * 100

# Display the characteristics of each cluster
cluster_characteristics = round(cluster_characteristics,2)
cluster_characteristics.to_csv('data/04_fct/fct_offer_response_cluster_characteristics.csv')
cluster_characteristics

Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,num_cust,perc_cust
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,52.11,61289.98,528.41,0.0,0.99,25557,57.85
1,46.63,52251.11,492.83,0.97,0.0,8351,18.9
2,66.0,86228.07,527.06,0.98,0.0,10273,23.25


In [7]:
# Reset the index to include 'customer_id' as a column
df_reset = df.reset_index()

response_data = df_reset.groupby(['segment', 'is_bogo', 'reward', 'difficulty', 'duration_hrs']).agg(
    {
    'customer_id': 'nunique',
    'offer_viewed': 'mean',
    'viewed_before_completion': 'mean',
    'offer_completed': ['mean', 'sum'],
    'total_transactions': ['sum', 'median'],
    'total_transaction_amount': ['sum', 'median']
    }).reset_index()

# Flatten the MultiIndex columns
response_data.columns = ['_'.join(col).strip('_') for col in response_data.columns.values]

# Rename columns for clarity, including the new customer count and median total transaction amount columns
response_data.rename(columns={
    'customer_id_nunique': 'num_customers',
    'offer_viewed_mean': 'viewed_rate', 
    'viewed_before_completion_mean': 'viewed_before_completion_rate',
    'offer_completed_mean': 'completion_rate',
    'offer_completed_sum': 'offers_completed',
    'total_transactions_sum': 'total_transactions',
    'total_transactions_median': 'median_total_transactions',
    'total_transaction_amount_sum': 'total_transaction_amount',
    'total_transaction_amount_median': 'median_total_transaction_amount',
    }, inplace=True)

rates = ['viewed_rate','viewed_before_completion_rate', 'completion_rate','total_transaction_amount']
response_data[rates] = round(response_data[rates] * 100, 2)

response_data.to_csv(r'data\04_fct\fct_segmented_offer_responses.csv')
response_data.to_pickle(r'data\04_fct\fct_segmented_offer_responses.pkl')
response_data.head()

Unnamed: 0,segment,is_bogo,reward,difficulty,duration_hrs,num_customers,viewed_rate,viewed_before_completion_rate,completion_rate,offers_completed,total_transactions,median_total_transactions,total_transaction_amount,median_total_transaction_amount
0,0,0,2,10,168,3163,52.73,30.98,55.45,1754,3034,1.0,4283473.0,11.0
1,0,0,2,10,240,3234,96.66,63.98,72.17,2334,4579,1.0,4882342.0,13.025
2,0,0,3,7,168,3191,96.68,62.08,70.76,2258,3824,1.0,3919319.0,9.81
3,0,0,5,20,240,3203,33.84,19.2,44.86,1437,3050,0.0,4831934.0,0.0
4,0,1,5,5,120,3148,96.73,49.75,59.78,1882,2745,1.0,3539947.0,8.03


In [8]:
def calculate_score(row, medians):
    score = 0
    # Criteria scoring
    score += row['num_customers'] > medians['num_customers']
    score += row['viewed_rate'] > medians['viewed_rate']
    score += row['viewed_before_completion_rate'] > medians['viewed_before_completion_rate']
    score += row['completion_rate'] > medians['completion_rate']
    score += row['median_total_transactions'] < medians['median_total_transactions']
    score += row['median_total_transaction_amount'] > medians['median_total_transaction_amount']
    return score

def get_optimal_rows(df, segment, top_n=None):
    seg_df = df[df['segment'] == segment].copy()
    medians = seg_df.median()
    
    # Apply score calculation for each row
    seg_df.loc[:, 'score'] = seg_df.apply(lambda row: calculate_score(row, medians), axis=1)
    
    # Sort by score in descending order to get rows with the highest scores at the top
    if top_n is None:
        optimal_rows = seg_df.sort_values(by='score', ascending=False)
    else:
        optimal_rows = seg_df.sort_values(by='score', ascending=False).head(top_n)
    
    return optimal_rows


# Concatenate top rows for each segment
response_scores = pd.concat([get_optimal_rows(response_data, i, top_n=None) for i in range(3)])

response_scores['overspend'] = round(response_scores['median_total_transaction_amount'] - (response_scores['difficulty'] + response_scores['reward']),2)
response_scores['median_total_transaction_amount'] = round(response_scores['median_total_transaction_amount'],2)

response_scores.to_csv(r'data\04_fct\fct_segmented_offer_response_scores.csv')
response_scores.to_pickle(r'data\04_fct\fct_segmented_offer_response_scores.pkl')
response_scores.head()

Unnamed: 0,segment,is_bogo,reward,difficulty,duration_hrs,num_customers,viewed_rate,viewed_before_completion_rate,completion_rate,offers_completed,total_transactions,median_total_transactions,total_transaction_amount,median_total_transaction_amount,score,overspend
1,0,0,2,10,240,3234,96.66,63.98,72.17,2334,4579,1.0,4882342.0,13.02,5,1.02
2,0,0,3,7,168,3191,96.68,62.08,70.76,2258,3824,1.0,3919319.0,9.81,4,-0.19
4,0,1,5,5,120,3148,96.73,49.75,59.78,1882,2745,1.0,3539947.0,8.03,4,-1.97
6,0,1,10,10,120,3202,96.78,35.17,42.22,1352,1922,0.0,3701215.0,0.0,4,-20.0
3,0,0,5,20,240,3203,33.84,19.2,44.86,1437,3050,0.0,4831934.0,0.0,2,-25.0


In [12]:
# Get top 2 offers for each segment
top_2 = pd.concat([get_optimal_rows(response_scores, i, top_n=2) for i in range(3)])

# Simplify the output
cols = ['segment', 'is_bogo', 'reward', 'difficulty', 'duration_hrs', 'median_total_transaction_amount', 'overspend']
top_2 = top_2[cols]
\
top_2.to_csv(r'data\04_fct\fct_segmented_offer_response_scores_top_2.csv')
top_2

Unnamed: 0,segment,is_bogo,reward,difficulty,duration_hrs,median_total_transaction_amount,overspend
1,0,0,2,10,240,13.02,1.02
2,0,0,3,7,168,9.81,-0.19
9,1,0,2,10,240,13.35,1.35
10,1,0,3,7,168,11.25,1.25
17,2,0,2,10,240,20.99,8.99
18,2,0,3,7,168,20.47,10.47
