# Business Objective

Can we predict a customer's spending based on their age, annual income, region, and loyalty score?

In [36]:
# Libraries and Packages
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso


In [37]:
# --- Ensure consistent working directory for data loading ---
# This block dynamically sets the current working directory to the Git repository root.
# This makes data paths reliable for all collaborators, regardless of where they open the notebook.

current_dir = os.getcwd()
repo_root = current_dir
while not os.path.exists(os.path.join(repo_root, '.git')):
    # Move up one directory
    parent_dir = os.path.dirname(repo_root)
    if parent_dir == repo_root: # Reached filesystem root, .git not found
        raise FileNotFoundError(
            "Could not find the .git directory. "
            "Please ensure you are running this code from within a Git repository."
        )
    repo_root = parent_dir

if os.getcwd() != repo_root:
    os.chdir(repo_root)
    print(f"Working directory set to: {os.getcwd()}") # Informative print for users


# --- Data Loading ---
# Path to the data file, relative to the repository root.
data_file_name = 'df_eng_customer_purchasing_features.csv'
data_file_path = os.path.join('src', 'data', data_file_name)

try:
    df = pd.read_csv(data_file_path)
    print(f"Successfully loaded '{data_file_name}'.")
    #print(df.head())
except FileNotFoundError:
    print(f"Error: The file '{data_file_name}' was not found at '{data_file_path}'.")
    print("Please ensure it exists in the 'src/data/' folder relative to the repository root.")
except Exception as e:
    print(f"An error occurred during data loading: {e}")

# Create a copy for feature engineering to keep the original data safe
df_LR = df.copy()
print("Original DataFrame shape:", df_LR.shape)

Successfully loaded 'df_eng_customer_purchasing_features.csv'.
Original DataFrame shape: (238, 14)


In [38]:
# Confirm the DataFrame is loaded correctly
df_LR

Unnamed: 0,user_id,customer_value_score,churn_risk_score,growth_potential_score,loyalty_score,spend_per_purchase,spend_to_income_ratio,is_champion,age,annual_income,purchase_amount,region_North,region_South,region_West
0,1,0.136490,0.829060,74,4.5,16.666667,0.004444,False,25,45000,200,True,False,False
1,2,0.469039,0.470085,100,7.0,19.444444,0.006364,False,34,55000,350,False,True,False
2,3,0.716117,0.282051,57,8.0,22.727273,0.007692,False,45,65000,500,False,False,True
3,4,0.000000,1.000000,44,3.0,15.000000,0.005000,False,22,30000,150,True,False,False
4,5,0.182326,0.778205,100,4.8,16.923077,0.004681,False,29,47000,220,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,234,0.606550,0.399145,44,7.2,22.500000,0.007500,False,40,60000,450,False,False,True
234,235,0.574603,0.422222,59,6.9,21.500000,0.007288,False,38,59000,430,True,False,False
235,236,0.972061,0.035470,44,9.4,23.333333,0.008514,True,54,74000,630,False,True,False
236,237,0.433089,0.562393,40,5.8,20.000000,0.006923,False,32,52000,360,False,False,True


In [39]:
# Dataframe Information
print("DataFrame Information:")
df_LR.info()

DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238 entries, 0 to 237
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 238 non-null    int64  
 1   customer_value_score    238 non-null    float64
 2   churn_risk_score        238 non-null    float64
 3   growth_potential_score  238 non-null    int64  
 4   loyalty_score           238 non-null    float64
 5   spend_per_purchase      238 non-null    float64
 6   spend_to_income_ratio   238 non-null    float64
 7   is_champion             238 non-null    bool   
 8   age                     238 non-null    int64  
 9   annual_income           238 non-null    int64  
 10  purchase_amount         238 non-null    int64  
 11  region_North            238 non-null    bool   
 12  region_South            238 non-null    bool   
 13  region_West             238 non-null    bool   
dtypes: bool(4), float64

# Linear Regression Model 

#### Model Objectives:

Experiment with using the Linear Regression Model, using L1 and L2 Regularization and different optimizers. 

In [40]:
# define the X and Y variables

# headers
X_headers = ['age', 'annual_income', 'loyalty_score',"customer_value_score", "churn_risk_score", "growth_potential_score", "spend_per_purchase"
             ,"spend_to_income_ratio","is_champion", "age", "annual_income", "region_South"]

X = df_LR[X_headers]

y = df_LR['purchase_amount']

In [41]:
# Spliting the data into training and testing datasets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=38)

# setting up the linear regression model and fitting it with the training data
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [42]:
# Prediction
y_pred = model.predict(X_test)

print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

MSE: 6.815533834007612e-25
R² Score: 1.0


### L1 

In [43]:
# Spliting the data into training and testing datasets

X_train_Lasso, X_test_Lasso, y_train_Lasso, y_test_Lasso = train_test_split(X, y, test_size=0.3, random_state=38)

# setting up the linear regression model and fitting it with the training data
model_Lasso = LinearRegression()
lasso = Lasso(alpha=0.1)
model_Lasso.fit(X_train_Lasso, y_train_Lasso)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [44]:
# Prediction
y_pred_Lasso = model_Lasso.predict(X_test_Lasso)

print("MSE:", mean_squared_error(y_test_Lasso, y_pred_Lasso))
print("R² Score:", r2_score(y_test_Lasso, y_pred_Lasso))

MSE: 6.815533834007612e-25
R² Score: 1.0


### L2