In [None]:
import pandas as pd

In [4]:
df = pd.read_csv("../data/e_commerce_shopper_behaviour_and_lifestyle.csv")

In [5]:
df

Unnamed: 0,user_id,age,gender,country,urban_rural,income_level,employment_status,education_level,relationship_status,has_children,...,cart_items_average,checkout_abandonments_per_month,purchase_conversion_rate,app_usage_frequency,notification_response_rate,account_age_months,last_purchase_date,social_sharing_frequency,premium_subscription,return_rate
0,1,56,Female,Germany,Suburban,90860,Self-employed,Associate Degree,Single,0,...,10,2,62,7,74,19,2025-06-22,6,1,50
1,2,69,Male,Japan,Suburban,35423,Unemployed,Bachelor,Single,1,...,5,7,54,5,23,8,2026-07-25,3,0,37
2,3,46,Female,India,Urban,21467,Self-employed,Associate Degree,Married,1,...,3,3,33,7,12,13,2026-02-26,6,0,53
3,4,32,Male,Canada,Urban,41770,Self-employed,Bachelor,Widowed,0,...,5,9,26,4,19,9,2026-10-27,7,0,98
4,5,60,Female,Japan,Urban,183882,Employed,Associate Degree,Widowed,1,...,8,0,18,7,30,3,2026-06-23,3,0,86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,999996,36,Female,France,Urban,192880,Student,Master,In a relationship,0,...,4,3,26,2,50,21,2025-06-27,2,1,93
999996,999997,37,Male,Canada,Urban,197111,Unemployed,Master,Divorced,0,...,10,9,31,5,78,15,2025-06-05,7,1,97
999997,999998,71,Male,UK,Urban,171275,Unemployed,Bachelor,Married,1,...,6,5,19,7,8,16,2026-09-25,5,0,96
999998,999999,30,Female,Germany,Urban,124983,Employed,High School,Single,0,...,4,2,56,5,8,11,2026-10-25,2,0,46


In [6]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Créer le pipeline
log_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("log", FunctionTransformer(np.log1p, feature_names_out="one-to-one")),
    ("scaler", StandardScaler()),
])

In [8]:
# Sélectionner les colonnes numériques
numeric_df = df.select_dtypes(include=[np.number])
print(f"Colonnes numériques: {list(numeric_df.columns)}")
print(f"Shape: {numeric_df.shape}")
print(f"\nInfos sur les données:")
print(numeric_df.describe())

Colonnes numériques: ['user_id', 'age', 'income_level', 'has_children', 'household_size', 'weekly_purchases', 'monthly_spend', 'cart_abandonment_rate', 'review_writing_frequency', 'average_order_value', 'coupon_usage_frequency', 'loyalty_program_member', 'referral_count', 'weekend_shopper', 'impulse_purchases_per_month', 'browse_to_buy_ratio', 'return_frequency', 'brand_loyalty_score', 'impulse_buying_score', 'environmental_consciousness', 'health_conscious_shopping', 'travel_frequency', 'hobby_count', 'social_media_influence_score', 'reading_habits', 'exercise_frequency', 'stress_from_financial_decisions', 'overall_stress_level', 'sleep_quality', 'physical_activity_level', 'mental_health_score', 'daily_session_time_minutes', 'product_views_per_day', 'ad_views_per_day', 'ad_clicks_per_day', 'wishlist_items_count', 'cart_items_average', 'checkout_abandonments_per_month', 'purchase_conversion_rate', 'app_usage_frequency', 'notification_response_rate', 'account_age_months', 'social_sharin

In [9]:
# RÉGRESSION LINÉAIRE avec le pipeline
# Utiliser la première colonne numérique comme cible (à adapter selon vos besoins)
X = numeric_df.iloc[:, 1:]  # Tous les features sauf le premier
y = numeric_df.iloc[:, 0]    # Première colonne comme cible

# Diviser en train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Créer le modèle avec le pipeline + régression linéaire
regression_model = Pipeline([
    ('preprocessing', log_transformer),
    ('regression', LinearRegression())
])

# Entraîner le modèle
regression_model.fit(X_train, y_train)

# Prédictions
y_pred = regression_model.predict(X_test)

# Évaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("=== RÉGRESSION LINÉAIRE ===")
print(f"MSE: {mse:.4f}")
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {np.sqrt(mse):.4f}")

=== RÉGRESSION LINÉAIRE ===
MSE: 83350195110.7741
R² Score: -0.0001
RMSE: 288704.3386


In [10]:
# CLUSTERING avec le pipeline
# Appliquer le pipeline de transformation sur les données
X_transformed = log_transformer.fit_transform(X)

# Clustering K-Means
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_transformed)

print("\n=== CLUSTERING K-MEANS ===")
print(f"Nombre de clusters: {len(np.unique(clusters))}")
print(f"Distribution des clusters: {np.bincount(clusters)}")
print(f"Inertie: {kmeans.inertia_:.4f}")

# Ajouter les clusters au DataFrame
df_with_clusters = numeric_df.copy()
df_with_clusters['cluster'] = clusters
print(f"\nDataFrame avec clusters:")
print(df_with_clusters.head())


=== CLUSTERING K-MEANS ===
Nombre de clusters: 3
Distribution des clusters: [263993 280135 455872]
Inertie: 40079137.8977

DataFrame avec clusters:
   user_id  age  income_level  has_children  household_size  weekly_purchases  \
0        1   56         90860             0               5                 4   
1        2   69         35423             1               2                13   
2        3   46         21467             1               6                10   
3        4   32         41770             0               4                16   
4        5   60        183882             1               9                17   

   monthly_spend  cart_abandonment_rate  review_writing_frequency  \
0           2405                      0                         3   
1           3651                     28                         6   
2           2045                     14                         1   
3           1611                     11                         2   
4           3476   