# Business Objective

Can we predict a customer's spending based on their age, annual income, region, and loyalty score?

In [47]:
# Libraries and Packages
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import SGDRegressor


***Load Data***

In [48]:
# --- Ensure consistent working directory for data loading ---
# This block dynamically sets the current working directory to the Git repository root.
# This makes data paths reliable for all collaborators, regardless of where they open the notebook.

current_dir = os.getcwd()
repo_root = current_dir
while not os.path.exists(os.path.join(repo_root, '.git')):
    # Move up one directory
    parent_dir = os.path.dirname(repo_root)
    if parent_dir == repo_root: # Reached filesystem root, .git not found
        raise FileNotFoundError(
            "Could not find the .git directory. "
            "Please ensure you are running this code from within a Git repository."
        )
    repo_root = parent_dir

if os.getcwd() != repo_root:
    os.chdir(repo_root)
    print(f"Working directory set to: {os.getcwd()}") # Informative print for users


# --- Data Loading ---
# Path to the data file, relative to the repository root.
data_file_name = 'df_eng_customer_purchasing_features.csv'
data_file_path = os.path.join( 'data', 'processed', data_file_name)

try:
    df = pd.read_csv(data_file_path)
    print(f"Successfully loaded '{data_file_name}'.")
    #print(df.head())
except FileNotFoundError:
    print(f"Error: The file '{data_file_name}' was not found at '{data_file_path}'.")
    print("Please ensure it exists in the 'src/data/' folder relative to the repository root.")
except Exception as e:
    print(f"An error occurred during data loading: {e}")

# Create a copy for feature engineering to keep the original data safe
df_LR = df.copy()
print("Original DataFrame shape:", df_LR.shape)

Successfully loaded 'df_eng_customer_purchasing_features.csv'.
Original DataFrame shape: (238, 29)


In [49]:
# Confirm the DataFrame is loaded correctly
df_LR

Unnamed: 0,user_id,age,annual_income,purchase_amount,loyalty_score,region,purchase_frequency,region_grouped,region_North,region_South,...,is_frequent,is_champion,income_percentile,spending_percentile,growth_potential_score,age_adjusted_percentile,frequency_percentile,log_purchase_amount,log_annual_income,log_purchase_frequency
0,1,25,45000,200,4.5,North,12,North,True,False,...,False,False,0.1408,0.0735,74,0.3333,0-25%,5.2983,10.7144,2.4849
1,2,34,55000,350,7.0,South,18,South,False,True,...,False,False,0.4181,0.2920,100,0.2613,25-50%,5.8579,10.9151,2.8904
2,3,45,65000,500,8.0,West,22,West,False,False,...,False,False,0.6996,0.6702,57,0.1118,50-75%,6.2146,11.0821,3.0910
3,4,22,30000,150,3.0,East,10,North,True,False,...,False,False,0.0042,0.0042,44,0.0196,0-25%,5.0106,10.3090,2.3026
4,5,29,47000,220,4.8,North,13,North,True,False,...,False,False,0.2080,0.0819,100,0.3824,0-25%,5.3936,10.7579,2.5649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,234,40,60000,450,7.2,West,20,West,False,False,...,False,False,0.5441,0.5441,44,0.5586,25-50%,6.1092,11.0021,2.9957
234,235,38,59000,430,6.9,North,20,North,True,False,...,False,False,0.5105,0.4769,59,0.5586,25-50%,6.0638,10.9853,2.9957
235,236,54,74000,630,9.4,South,27,South,False,True,...,True,True,0.9580,0.9580,44,0.8618,75-100%,6.4457,11.2118,3.2958
236,237,32,52000,360,5.8,West,18,West,False,False,...,False,False,0.3046,0.3130,40,0.2613,25-50%,5.8861,10.8590,2.8904


***Verify Data***

In [50]:
# Dataframe Information
print("DataFrame Information:")
df_LR.info()

DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238 entries, 0 to 237
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   user_id                  238 non-null    int64  
 1   age                      238 non-null    int64  
 2   annual_income            238 non-null    int64  
 3   purchase_amount          238 non-null    int64  
 4   loyalty_score            238 non-null    float64
 5   region                   238 non-null    object 
 6   purchase_frequency       238 non-null    int64  
 7   region_grouped           238 non-null    object 
 8   region_North             238 non-null    bool   
 9   region_South             238 non-null    bool   
 10  region_West              238 non-null    bool   
 11  spend_per_purchase       238 non-null    float64
 12  spend_to_income_ratio    238 non-null    float64
 13  age_group                238 non-null    object 
 14  inc

RMSPE (Root Mean Squared Prediction error)

In [51]:
def RMSPE(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    # Account for divide by zero = DNE
    Is_not_DNE = y_true != 0
    if not np.any(Is_not_DNE):
        return None
    rmspe = mean_squared_error(y_true[Is_not_DNE], y_pred[Is_not_DNE]) ** (0.5)
    return rmspe

# Linear Regression Model 

#### Model Objectives:

Experiment with using the Linear Regression Model, using L1 and L2 Regularization and different optimizers. 

In [52]:
# define the X and Y variables

# headers
X_headers = ['age', 'annual_income', 'loyalty_score',"customer_value_score", "churn_risk_score", "growth_potential_score", "spend_per_purchase"
             ,"spend_to_income_ratio","is_champion", "age", "annual_income", "region_South"]

X = df_LR[X_headers]

y = df_LR['purchase_amount']

In [53]:
# Spliting the data into training and testing datasets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=38)

# setting up the linear regression model and fitting it with the training data
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [54]:
# Prediction
y_pred = model.predict(X_test)

#print(y_test)
#print(y_pred)

print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSPE:", RMSPE(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

MSE: 0.0006407499081423751
RMSPE: 0.02531303830326133
R² Score: 0.9999999574221855


### L1 - Lasso

In [55]:
# Spliting the data into training and testing datasets

X_train_Lasso, X_test_Lasso, y_train_Lasso, y_test_Lasso = train_test_split(X, y, test_size=0.2, random_state=38)

# setting up the linear regression model and fitting it with the training data
model_Lasso = LinearRegression()
lasso = Lasso(alpha=0.1)
model_Lasso.fit(X_train_Lasso, y_train_Lasso)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [56]:
# Prediction
y_pred_Lasso = model_Lasso.predict(X_test_Lasso)

print("MSE:", mean_squared_error(y_test_Lasso, y_pred_Lasso))
print("RMSPE:", RMSPE(y_test_Lasso, y_pred))
print("R² Score:", r2_score(y_test_Lasso, y_pred_Lasso))

MSE: 0.0006407499081423751
RMSPE: 0.02531303830326133
R² Score: 0.9999999574221855


### L2 - Ridge

In [57]:
# Spliting the data into training and testing datasets

X_train_Ridge, X_test_Ridge, y_train_Ridge, y_test_Ridge = train_test_split(X, y, test_size=0.2, random_state=38)

# L2 Ridge Model

Ridge_Model = LinearRegression()
Ridge_Model = Ridge(alpha= 1.0)

Ridge_Model.fit(X_train_Ridge, y_train_Ridge)
Ridge_predictions = Ridge_Model.predict(X_test_Ridge)

In [61]:
# Prediction
y_pred_Ridge = model_Lasso.predict(X_test_Ridge)

print("MSE:", mean_squared_error(y_test_Ridge, y_pred_Ridge))
print("RMSPE:", RMSPE(y_test_Ridge, y_pred_Ridge))
print("R² Score:", r2_score(y_test_Ridge, y_pred_Ridge))

MSE: 0.0006407499081423751
RMSPE: 0.02531303830326133
R² Score: 0.9999999574221855


### Optimizers

In [59]:
# Optimizer - SGD
# Split data
X_train_Optimzer, X_test_Optimzer, y_train_Optimzer, y_test_Optimzer = train_test_split(X, y, test_size=0.2, random_state=38)

# Initialize and fit SGDRegressor
sgd = SGDRegressor(loss='squared_error', eta0=0.01, max_iter=1000, random_state=38)
sgd.fit(X_train_Optimzer, y_train_Optimzer)

0,1,2
,loss,'squared_error'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [62]:
# Prediction
y_pred_Optimizer = sgd.predict(X_test_Optimzer)

print("MSE:", mean_squared_error(y_test_Optimzer, y_pred_Optimizer))
print("RMSPE:", RMSPE(y_test_Optimzer, y_pred_Optimizer))
print("R² Score:", r2_score(y_test_Optimzer, y_pred_Optimizer))

MSE: 2.9364732935313877e+36
RMSPE: 1.7136141028631235e+18
R² Score: -1.9512857286269363e+32
