In [78]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [79]:
# Import data
df = pd.read_pickle(r'data\04_fct\fct_demographic_offers_and_transactions.pkl')
df.head()

Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,offer_viewed,offer_completed,viewed_before_completion,difficulty,reward,duration_hrs,mobile,social,web,is_bogo,is_discount,total_transactions,total_transaction_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0,1,0,5,5,168,1,0,1,1,0,1,23.22
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0,0,0,0,0,96,1,0,1,0,0,0,0.0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,5,5,168,1,0,1,1,0,1,19.89
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,0,0,0,0,72,1,1,0,0,0,0,0.0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,10,10,168,1,1,0,1,0,1,21.72


In [80]:
### Segment Customers ###
# Extract demographic features for clustering
demographic_features = df[['age', 'income', 'days_as_member', 'gender_F', 'gender_M']]

# Standardize the features
scaler = StandardScaler()
demographic_features_scaled = scaler.fit_transform(demographic_features)
demographic_features_scaled = pd.DataFrame(demographic_features_scaled, columns=demographic_features.columns)
demographic_features_scaled.head(n=1)

Unnamed: 0,age,income,days_as_member,gender_F,gender_M
0,0.037374,2.155323,-0.346434,1.192128,-1.158533


In [81]:
# Calculate the silhouette score for different number of clusters
silhouette_scores = []

for n_clusters in range(2, 6):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(demographic_features_scaled)
    silhouette_avg = silhouette_score(demographic_features_scaled, cluster_labels)
    silhouette_scores.append({'n_clusters': n_clusters, 'silhouette_score': silhouette_avg})

# Convert list of dicts to DataFrame
silhouette_scores_df = pd.DataFrame(silhouette_scores)
silhouette_scores_df

Unnamed: 0,n_clusters,silhouette_score
0,2,0.399612
1,3,0.324318
2,4,0.299282
3,5,0.292753


In [82]:
# Apply K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
df['segment'] = kmeans.fit_predict(demographic_features_scaled)

# Display the first few rows with the segment labels
df[['age', 'income', 'days_as_member', 'gender_F', 'gender_M', 'segment']].head()

Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,segment
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,0


In [83]:
# Calculate mean values of features for each segment
cluster_characteristics = df.groupby('segment')[['age', 'income', 'days_as_member', 'gender_F', 'gender_M']].mean()
cluster_characteristics['num_cust'] = df.groupby('segment').size()
cluster_characteristics['perc_cust'] = (cluster_characteristics['num_cust'] / df.shape[0]) * 100

# Display the characteristics of each cluster
cluster_characteristics = round(cluster_characteristics,2)
cluster_characteristics

Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,num_cust,perc_cust
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,65.98,86377.77,527.98,0.98,0.0,12783,23.15
1,52.19,61268.48,529.31,0.0,0.99,31879,57.73
2,46.8,52418.94,488.88,0.97,0.0,10560,19.12


In [84]:
# Reset the index to include 'customer_id' as a column
df_reset = df.reset_index()

response_data = df_reset.groupby(['segment', 'is_bogo', 'is_discount', 'reward', 'difficulty', 'duration_hrs']).agg(
    {
    'customer_id': 'nunique',
    'offer_viewed': 'mean',
    'viewed_before_completion': 'mean',
    'offer_completed': ['mean', 'sum'],
    'total_transactions': ['sum', 'median'],
    'total_transaction_amount': ['sum', 'median']
    }).reset_index()

# Flatten the MultiIndex columns
response_data.columns = ['_'.join(col).strip('_') for col in response_data.columns.values]

# Rename columns for clarity, including the new customer count and median total transaction amount columns
response_data.rename(columns={
    'customer_id_nunique': 'num_customers',
    'offer_viewed_mean': 'viewed_rate', 
    'viewed_before_completion_mean': 'viewed_before_completion_rate',
    'offer_completed_mean': 'completion_rate',
    'offer_completed_sum': 'offers_completed',
    'total_transactions_sum': 'total_transactions',
    'total_transactions_median': 'median_total_transactions',
    'total_transaction_amount_sum': 'total_transaction_amount',
    'total_transaction_amount_median': 'median_total_transaction_amount',
    }, inplace=True)

rates = ['viewed_rate','viewed_before_completion_rate', 'completion_rate','total_transaction_amount']
response_data[rates] = round(response_data[rates] * 100, 2)

response_data.to_csv(r'data\04_fct\fct_segmented_offer_responses.csv')
response_data.to_pickle(r'data\04_fct\fct_segmented_offer_responses.pkl')
response_data.head()

Unnamed: 0,segment,is_bogo,is_discount,reward,difficulty,duration_hrs,num_customers,viewed_rate,viewed_before_completion_rate,completion_rate,offers_completed,total_transactions,median_total_transactions,total_transaction_amount,median_total_transaction_amount
0,0,0,0,0,0,72,1309,82.73,0.0,0.0,0,0,0.0,0.0,0.0
1,0,0,0,0,0,96,1271,60.82,0.0,0.0,0,0,0.0,0.0,0.0
2,0,0,1,2,10,168,1281,61.83,37.0,75.33,965,1036,1.0,2305404.0,19.21
3,0,0,1,2,10,240,1275,97.33,75.22,86.2,1099,1189,1.0,2778563.0,21.02
4,0,0,1,3,7,168,1259,96.98,70.85,83.8,1055,1124,1.0,2883622.0,20.51


In [91]:
def calculate_score(row, medians):
    score = 0
    # Criteria scoring
    score += row['num_customers'] > medians['num_customers']
    score += row['viewed_rate'] > medians['viewed_rate']
    score += row['viewed_before_completion_rate'] > medians['viewed_before_completion_rate']
    score += row['completion_rate'] > medians['completion_rate']
    score += row['median_total_transactions'] < medians['median_total_transactions']
    score += row['median_total_transaction_amount'] > medians['median_total_transaction_amount']
    return score

def get_optimal_rows(df, segment, top_n=None):
    seg_df = df[df['segment'] == segment].copy()
    medians = seg_df.median()
    
    # Apply score calculation for each row
    seg_df.loc[:, 'score'] = seg_df.apply(lambda row: calculate_score(row, medians), axis=1)
    
    # Sort by score in descending order to get rows with the highest scores at the top
    if top_n is None:
        optimal_rows = seg_df.sort_values(by='score', ascending=False)
    else:
        optimal_rows = seg_df.sort_values(by='score', ascending=False).head(top_n)
    
    return optimal_rows


# Concatenate top rows for each segment
response_scores = pd.concat([get_optimal_rows(response_data, i, top_n=None) for i in range(3)])

response_scores['overspend'] = round(response_scores['median_total_transaction_amount'] - (response_scores['difficulty'] + response_scores['reward']),2)
response_scores['median_total_transaction_amount'] = round(response_scores['median_total_transaction_amount'],2)

response_scores.to_csv(r'data\04_fct\fct_segmented_offer_response_scores.csv')
response_scores.to_pickle(r'data\04_fct\fct_segmented_offer_response_scores.pkl')
response_scores.head()

Unnamed: 0,segment,is_bogo,is_discount,reward,difficulty,duration_hrs,num_customers,viewed_rate,viewed_before_completion_rate,completion_rate,offers_completed,total_transactions,median_total_transactions,total_transaction_amount,median_total_transaction_amount,score,overspend
3,0,0,1,2,10,240,1275,97.33,75.22,86.2,1099,1189,1.0,2778563.0,21.02,4,9.02
4,0,0,1,3,7,168,1259,96.98,70.85,83.8,1055,1124,1.0,2883622.0,20.51,4,10.51
9,0,1,0,10,10,168,1304,82.13,51.61,77.61,1012,1089,1.0,2671263.0,19.45,4,-0.55
0,0,0,0,0,0,72,1309,82.73,0.0,0.0,0,0,0.0,0.0,0.0,3,0.0
6,0,1,0,5,5,120,1234,97.33,64.1,76.66,946,1015,1.0,2653148.0,19.33,3,9.33


In [92]:
top_2 = pd.concat([get_optimal_rows(response_data, i, top_n=2) for i in range(3)])
top_2.to_csv(r'data\04_fct\fct_segmented_offer_response_scores_top_2.csv')
top_2

Unnamed: 0,segment,is_bogo,is_discount,reward,difficulty,duration_hrs,num_customers,viewed_rate,viewed_before_completion_rate,completion_rate,offers_completed,total_transactions,median_total_transactions,total_transaction_amount,median_total_transaction_amount,score
3,0,0,1,2,10,240,1275,97.33,75.22,86.2,1099,1189,1.0,2778563.0,21.02,4
4,0,0,1,3,7,168,1259,96.98,70.85,83.8,1055,1124,1.0,2883622.0,20.51,4
13,1,0,1,2,10,240,3234,96.66,63.98,72.17,2334,4579,1.0,4882342.0,13.025,5
14,1,0,1,3,7,168,3190,96.68,62.1,70.78,2258,3824,1.0,3919319.0,9.815,5
23,2,0,1,2,10,240,1035,97.58,70.43,78.74,815,1491,1.0,1469670.0,13.38,4
26,2,1,0,5,5,120,1064,96.52,57.8,69.08,735,961,1.0,1247571.0,9.645,4
