In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np

projectRoot = Path().resolve().parent
sys.path.append(str(projectRoot))

dataPath = projectRoot / 'data' / 'raw' / 'e_commerce_shopper_behaviour_and_lifestyle.csv'
df_raw = pd.read_csv(dataPath)
df = df_raw.copy()

In [2]:
# separating numerical and categorical features
numerical_cols = df.select_dtypes(include='number').columns.tolist()
categorical_cols = df.select_dtypes(include=('object')).columns.tolist()

# target variable
target = 'monthly_spend'

# numeric features excluding the target variable
numeric_features = [c for c in numerical_cols if c != target]

# datetime conversion
df['last_purchase_date'] = pd.to_datetime(
    df['last_purchase_date'], 
    format = '%Y-%m-%d'
)

In [3]:
# one-hot encoding -> only applied to categorical columns with low cardinality
low_cardinality_cat_columns = [
    c for c in categorical_cols
    if df[c].nunique() <= 6
]
print(low_cardinality_cat_columns)

df = pd.get_dummies(
    df,
    drop_first = True,
    columns = low_cardinality_cat_columns
)


['gender', 'urban_rural', 'employment_status', 'education_level', 'relationship_status', 'ethnicity', 'language_preference', 'device_type', 'preferred_payment_method', 'shopping_time_of_day', 'budgeting_style']


In [4]:
### Composite Score columns
# customer wellbeing columns
wellbeing_cols = [
    c for c in df.columns
    if 'stress' in c.lower() or 'mental' in c.lower() or 'emotional' in c.lower() or 'health' in c.lower() or 'physical' in c.lower() or 'sleep' in c.lower()
]
print(f'Wellbeing Columns: {wellbeing_cols}')
df['wellbeing_score'] = df[wellbeing_cols].mean(axis=1)

# shopping engagement columns
shopping_engagement_cols = [
    c for c in df.columns
    if 'time' in c.lower() or 'brows' in c.lower()
]
print(f'Shopping Engagement Columns: {shopping_engagement_cols}')
df['shopping_engagement_score'] = df[shopping_engagement_cols].mean(axis=1)

# price awareness columns
price_awareness_cols = [
    c for c in df.columns
    if 'coupon' in c.lower() or 'notification' in c.lower() or 'discount' in c.lower() or 'ad_' in c.lower()
]
print(f'Price Awareness Columns: {price_awareness_cols}')
df['price_awareness_score'] = df[price_awareness_cols].mean(axis=1)

# impulse purchase -> Impulse vs Planned shopping behavior
impulse_purchase_cols = [
    c for c in df.columns
    if 'impulse' in c.lower()
]
print(f'Impulse Purchase Columns: {impulse_purchase_cols}')
df['impulse_purchase_score'] = df[impulse_purchase_cols].mean(axis=1)
df['planned_purchase_score'] = 10 - df['impulse_purchase_score']

# review influence
review_influence_cols = [
    c for c in df.columns
    if 'review' in c.lower() or 'social' in c.lower() or 'reading' in c.lower()
]
print(f'Review Influence Columns: {review_influence_cols}')
df['review_influence_score'] = df[review_influence_cols].mean(axis=1)

Wellbeing Columns: ['environmental_consciousness', 'health_conscious_shopping', 'stress_from_financial_decisions', 'overall_stress_level', 'sleep_quality', 'physical_activity_level', 'mental_health_score']
Shopping Engagement Columns: ['browse_to_buy_ratio', 'daily_session_time_minutes', 'shopping_time_of_day_Evening', 'shopping_time_of_day_Morning', 'shopping_time_of_day_Night']
Price Awareness Columns: ['coupon_usage_frequency', 'ad_views_per_day', 'ad_clicks_per_day', 'notification_response_rate']
Impulse Purchase Columns: ['impulse_purchases_per_month', 'impulse_buying_score']
Review Influence Columns: ['review_writing_frequency', 'social_media_influence_score', 'reading_habits', 'social_sharing_frequency']
