**Load the data**

In [3]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os

# set visulization style
sns.set_style(style="whitegrid")
plt.style.use('seaborn-v0_8-whitegrid')

In [25]:
# --- Ensure consistent working directory for data loading ---
# This block dynamically sets the current working directory to the Git repository root.
# This makes data paths reliable for all collaborators, regardless of where they open the notebook.

current_dir = os.getcwd()
repo_root = current_dir
while not os.path.exists(os.path.join(repo_root, '.git')):
    # Move up one directory
    parent_dir = os.path.dirname(repo_root)
    if parent_dir == repo_root: # Reached filesystem root, .git not found
        raise FileNotFoundError(
            "Could not find the .git directory. "
            "Please ensure you are running this code from within a Git repository."
        )
    repo_root = parent_dir

if os.getcwd() != repo_root:
    os.chdir(repo_root)
    print(f"Working directory set to: {os.getcwd()}") # Informative print for users


# --- Data Loading ---
# Path to the data file, relative to the repository root.
data_file_name = 'Customer Purchasing Behaviors.csv'
data_file_path = os.path.join('src', 'data', data_file_name)

try:
    df = pd.read_csv(data_file_path)
    print(f"Successfully loaded '{data_file_name}'.")
    #print(df.head())
except FileNotFoundError:
    print(f"Error: The file '{data_file_name}' was not found at '{data_file_path}'.")
    print("Please ensure it exists in the 'src/data/' folder relative to the repository root.")
except Exception as e:
    print(f"An error occurred during data loading: {e}")

Successfully loaded 'Customer Purchasing Behaviors.csv'.


**0. Load and Prepare Data**

In [23]:
# Load the dataset
# Ensure 'customer_data.csv' is in the same directory or provide the full path.
try:
    df = pd.read_csv('customer_data.csv')
except FileNotFoundError:
    print("Error: 'customer_data.csv' not found. Please check the file path.")
    # Create a dummy dataframe to allow the rest of the script to run for demonstration
    data = {'user_id': range(238), 'age': range(18, 88, 1), 'annual_income': range(30000, 100000, 295), 
            'purchase_amount': range(150, 650, 2), 'loyalty_score': np.linspace(3, 9.5, 238),
            'region': ['North', 'South', 'West', 'East'] * 59 + ['North', 'South'],
            'purchase_frequency': range(10, 29, 1)[0:238]}
    df = pd.DataFrame(data)
    print("A dummy dataframe has been created for demonstration purposes.")


# Create a copy for feature engineering to keep the original data safe
df_eng = df.copy()
print("Original DataFrame shape:", df_eng.shape)


Error: 'customer_data.csv' not found. Please check the file path.


ValueError: All arrays must be of the same length

**1. Handling Categorical Features**

In [26]:
# Rationale: Group the underrepresented 'East' region to ensure model stability and prevent learning from statistical noise.
df_eng['region_grouped'] = df_eng['region'].replace({'East': 'North'})

# Convert categorical data into numerical format using One-Hot Encoding
region_dummies = pd.get_dummies(df_eng['region_grouped'], prefix='region', drop_first=True)
df_eng = pd.concat([df_eng, region_dummies], axis=1)

print("\n--- DataFrame after handling 'region' ---")
print(df_eng[['user_id', 'region', 'region_grouped', 'region_South', 'region_West']].head())


NameError: name 'df_eng' is not defined

**2. Creating Ratio-Based Features (Behavioral Insights)**

In [27]:
# Rationale: Ratios normalize for effects like purchase frequency and provide deeper behavioral context.
df_eng['spend_per_purchase'] = df_eng['purchase_amount'] / df_eng['purchase_frequency']
df_eng['income_to_spend_ratio'] = df_eng['purchase_amount'] / df_eng['annual_income']

print("\n--- Newly created ratio features ---")
print(df_eng[['user_id', 'spend_per_purchase', 'income_to_spend_ratio']].head())


NameError: name 'df_eng' is not defined

**3. Creating Demographic Tiers (Binning)**

In [28]:
# Rationale: Converts continuous variables into interpretable categories for business analysis and segmentation.
age_bins = [18, 30, 45, 60, 100]
age_labels = ['Young Adult', 'Adult', 'Middle-Aged', 'Senior']
df_eng['age_group'] = pd.cut(df_eng['age'], bins=age_bins, labels=age_labels, right=False)

income_bins = [0, 45000, 65000, 150000]
income_labels = ['Low Income', 'Medium Income', 'High Income']
df_eng['income_bracket'] = pd.cut(df_eng['annual_income'], bins=income_bins, labels=income_labels, right=False)

print("\n--- Newly created demographic tiers ---")
print(df_eng[['user_id', 'age', 'age_group', 'annual_income', 'income_bracket']].head())

NameError: name 'df_eng' is not defined

**4. Creating Composite Scores**

In [29]:
# Rationale: Combines multiple collinear features into single, powerful, and interpretable scores for value and risk.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df_eng[['purchase_amount', 'purchase_frequency', 'loyalty_score']])
df_scaled = pd.DataFrame(scaled_features, columns=['purchase_scaled', 'frequency_scaled', 'loyalty_scaled'])

# Customer Value Score (weighted sum of key metrics)
weights = {'monetary': 0.5, 'frequency': 0.25, 'loyalty': 0.25}
df_eng['customer_value_score'] = (weights['monetary'] * df_scaled['purchase_scaled'] +
                                  weights['frequency'] * df_scaled['frequency_scaled'] +
                                  weights['loyalty'] * df_scaled['loyalty_scaled'])

# Churn Risk Score (high for low loyalty and frequency)
df_eng['churn_risk_score'] = (0.5 * (1 - df_scaled['loyalty_scaled']) +
                              0.5 * (1 - df_scaled['frequency_scaled']))

print("\n--- Newly created composite scores ---")
print(df_eng[['user_id', 'customer_value_score', 'churn_risk_score']].head())


NameError: name 'MinMaxScaler' is not defined

**5. Creating Interaction and Segmentation Features**

In [30]:
# Rationale: Create binary flags for easy filtering and to identify high-value customer segments like 'Champions'.

# Binary Segmentation Flags (based on top 25% percentile)
high_value_threshold = df_eng['purchase_amount'].quantile(0.75)
high_loyalty_threshold = df_eng['loyalty_score'].quantile(0.75)
high_frequency_threshold = df_eng['purchase_frequency'].quantile(0.75)

df_eng['is_high_value'] = (df_eng['purchase_amount'] > high_value_threshold).astype(int)
df_eng['is_loyal'] = (df_eng['loyalty_score'] > high_loyalty_threshold).astype(int)
df_eng['is_frequent'] = (df_eng['purchase_frequency'] > high_frequency_threshold).astype(int)
df_eng['is_champion'] = (df_eng['is_high_value'] * df_eng['is_loyal'] * df_eng['is_frequent']).astype(int)

print("\n--- Binary Segmentation Flags ---")
print(df_eng[['user_id', 'is_high_value', 'is_loyal', 'is_frequent', 'is_champion']].head())
print(f"Number of Champion Customers: {df_eng['is_champion'].sum()}")


NameError: name 'df_eng' is not defined

**6. Creating Statistical and Business-Savvy Features**

In [31]:
# Rationale: Create normalized ranks and business-oriented scores like 'Growth Potential'.

# Percentile Ranks
df_eng['income_percentile'] = df_eng['annual_income'].rank(pct=True)
df_eng['spending_percentile'] = df_eng['purchase_amount'].rank(pct=True)

# Growth Potential Score (High Income, Relatively Low Spending)
df_eng['growth_potential_score'] = df_eng['income_percentile'] - df_eng['spending_percentile']

print("\n--- Growth Potential & Percentile Scores ---")
print(df_eng.sort_values('growth_potential_score', ascending=False)[['user_id', 'annual_income', 'purchase_amount', 'growth_potential_score']].head())

NameError: name 'df_eng' is not defined

**7. Finalizing the Model-Ready DataFrame**

In [32]:
# Rationale: Create a final, clean DataFrame containing only the identifier and the best engineered features for modeling.
# This prevents data leakage and removes redundant columns.

features_for_modeling = [
    'user_id',
    # --- Core Scores ---
    'customer_value_score',
    'churn_risk_score',
    'growth_potential_score',
    # --- Behavioral Ratios ---
    'spend_per_purchase',
    'income_to_spend_ratio',
    # --- Key Segments/Flags ---
    'is_champion',
    # --- Raw Demographics (for direct use) ---
    'age',
    'annual_income'
]

# Dynamically add the one-hot encoded region columns to the list
final_feature_list = features_for_modeling + list(region_dummies.columns)

df_model_ready = df_eng[final_feature_list].copy()

print("\n--- FINAL MODEL-READY DATAFRAME ---")
print("Shape:", df_model_ready.shape)
print("Columns:", df_model_ready.columns.tolist())
print(df_model_ready.head())


NameError: name 'region_dummies' is not defined

**7. Segmentation features - (is_high_value, is_loyal, is_frequent, customer_tier)**

In [18]:
# is_high_value
df['spender'] = ['is_high_value' if x >= df['purchase_amount'].quantile(0.75) else 'high_value_in_progress' 
                 for x in df['purchase_amount']]
#df['spender'].head()

In [20]:
# is_loyal
df['loyal'] = ['is_loyal' if x >= df['loyalty_score'].quantile(0.75) else 'loyalty_in_progress' 
                 for x in df['loyalty_score']]
#df['loyal'].head()

In [22]:
# is_frequent
df['frequent'] = ['is_frequent' if x >= df['purchase_frequency'].quantile(0.75) else 'frequency_in_progress' 
                 for x in df['purchase_frequency']]
#df['frequent'].head()

In [15]:
# customer_tier. Would this be market segmentations? Would it come out from clustering?

**8. Demographic behavioural interaction - (young_high_spender, senior_loyal, income_age_segment, etc..)**

In [None]:
# Demographic behavioural interaction. Would this be market segmentations? Would it come out from clustering?

**9. Statistical Features - (frequency_percentile, is_outlier_spender, loyalty_deviation, etc..)**

In [None]:
# frequency_percentile
df['frequency_percentile'] = ['is_frequent' if x >= df['purchase_frequency'].quantile(0.75) else 'frequency_in_progress' 
                 for x in df['purchase_frequency']]
df['frequency_percentile'].head()


In [None]:
import pandas as pd

# Sample DataFrame
df = pd.DataFrame({'purchase_amount': [120, 250, 75, 300, 180, 220, 90, 310]})

# Define labels for each quantile
labels = ['0-25%', '25-50%', '50-75%', '75-100%']

# Create 'spender_group' column with labels
df['spender_group'] = pd.qcut(df['purchase_amount'], q=4, labels=labels)