In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score, classification_report
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

# Locate the processed feature table relative to the project root, which is
# taken to be the parent of the current working directory.
projectRoot = Path.cwd().resolve().parent
dataPath = projectRoot.joinpath('data', 'processed', 'features_with_composites.parquet')

# Read the parquet file; the trailing expression displays (rows, columns).
df = pd.read_parquet(dataPath)
df.shape

(1000000, 90)

In [2]:
### defining target variables
# Regression target: the continuous spend column as-is.
y_reg = df['monthly_spend']

# Classification target: equal-frequency bins of the same column, labelled in
# ascending order of spend. The number of bins follows the number of labels.
tier_labels = ['low', 'medium', 'high']
df['spend_tier'] = pd.qcut(
    df['monthly_spend'],
    q=len(tier_labels),
    labels=tier_labels,
)

# Sanity check: print the share of rows that landed in each tier.
print(df['spend_tier'].value_counts(normalize=True))

# Feature matrix: every column except the two target columns.
target_columns = ['monthly_spend', 'spend_tier']
X = df.drop(columns=target_columns)
features = X.columns.tolist()

spend_tier
low       0.333429
medium    0.333310
high      0.333261
Name: proportion, dtype: float64


In [3]:
# Settings shared by both splits: 80% train, shuffled, fixed seed.
split_options = dict(train_size=0.8, shuffle=True, random_state=7)

# Regression split: continuous target, no stratification.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, **split_options
)

# Classification split: stratified on the tier label.
tier_target = df['spend_tier']
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X, tier_target, stratify=tier_target, **split_options
)

### Regression Model Baselines

In [4]:
# Mean predictor: a constant forecast equal to the training-set mean,
# repeated once per test row.
y_pred_mean = np.full(len(y_test_reg), y_train_reg.mean())

# Linear Regression (fit returns the estimator, so it can be chained).
lr = LinearRegression().fit(X_train_reg, y_train_reg)
y_pred_lr = lr.predict(X_test_reg)

# Decision Tree Regressor
tree_reg = DecisionTreeRegressor(
    max_depth=5, min_samples_leaf=50, random_state=7
).fit(X_train_reg, y_train_reg)
y_pred_tree_reg = tree_reg.predict(X_test_reg)

# Model evaluation: score every baseline on the held-out set, then report.
r2_mean = r2_score(y_test_reg, y_pred_mean)
r2_lr = r2_score(y_test_reg, y_pred_lr)
r2_tree = r2_score(y_test_reg, y_pred_tree_reg)

print(f'Mean Predictor r2: {r2_mean:.6f}')
print(f'LR Predictor r2: {r2_lr:.6f}')
print(f'Tree Predictor r2: {r2_tree:.6f}')

Mean Predictor r2: -0.000004
LR Predictor r2: -0.000125
Tree Predictor r2: -0.000100


As shown, the regression models exhibit no predictive signal: every baseline achieves an $R^{2}$ score that is approximately zero or slightly negative.
A negative $R^{2}$ indicates that a model performs worse on the test set than a constant predictor equal to the test-set mean; the mean predictor itself is slightly negative because it uses the training-set mean rather than the test-set mean.

We can conclude that monthly spend cannot be meaningfully predicted from the available features, which is consistent with prior EDA and composite score analysis. This result motivates reformulating the problem as a classification problem on spend segmentation.

### Classification Model Baselines

In [5]:
# Majority class predictor: always predict the most frequent training tier.
y_pred_major = y_train_class.mode()[0]
# The accuracy of a constant prediction is the share of test rows whose true
# label equals that constant. Keep the value so it can be reported below
# instead of being computed and discarded mid-cell.
acc_major = float((y_test_class == y_pred_major).mean())

# Logistic Regression
clf_lr = LogisticRegression(
    max_iter=1000,
    random_state=7
)
clf_lr.fit(X_train_class, y_train_class)

y_pred_logisticReg = clf_lr.predict(X_test_class)

# Decision Tree Classifier
clf_tree = DecisionTreeClassifier(
    max_depth=8,
    min_samples_leaf=50,
    random_state=7
)
clf_tree.fit(X_train_class, y_train_class)

y_pred_tree_clf = clf_tree.predict(X_test_class)

# Model evaluation
# Tier labels are converted to integer codes in ascending spend order, so the
# classification report lists classes as 0 (low), 1 (medium), 2 (high).
class_map = {
    'low': 0,
    'medium': 1,
    'high': 2
}

# Encode each label vector once and reuse it for every metric.
y_true_codes = y_test_class.map(class_map)
y_lr_codes = pd.Series(y_pred_logisticReg).map(class_map)
y_tree_codes = pd.Series(y_pred_tree_clf).map(class_map)

print(f'Majority Class Predictor - accuracy: {acc_major:.6f}')
print(f'Logistic Regression Classifier - accuracy: {accuracy_score(y_true_codes, y_lr_codes):.6f}')
print(f'Decision Tree Classifier - accuracy: {accuracy_score(y_true_codes, y_tree_codes):.6f}')

print(classification_report(y_true_codes, y_tree_codes))

Majority Class Predictor - accuracy: 0.333430
Logistic Regression Classifier - accuracy: 0.333015
Decision Tree Classifier - accuracy: 0.333875
              precision    recall  f1-score   support

           0       0.34      0.23      0.28     66686
           1       0.33      0.25      0.29     66662
           2       0.33      0.51      0.41     66652

    accuracy                           0.33    200000
   macro avg       0.33      0.33      0.32    200000
weighted avg       0.33      0.33      0.32    200000



Accuracy scores show that no model is learning anything beyond class balance: every classifier scores about 33%, which is chance level for three balanced classes. The target 'monthly_spend' is noisy and is most probably influenced by external factors that are not present in the feature set. When we discretize the target into spend tiers, the noise remains and the behavioral features do not strongly separate spend levels.

Since the spending cannot be predicted, we proceed with segmenting customers before further analyzing behavioral profiles.