<a href="https://colab.research.google.com/github/chasslayy/Chasslayy-Luxe-AutoAI-project/blob/main/chasslayy_luxe_autoai_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np
import pandas as pd

np.random.seed(42)


In [10]:
N = 800

engagement_levels = ["low", "medium", "high"]
categories = ["wig", "beauty", "digital"]
sources = ["instagram", "tiktok", "pinterest", "direct", "other"]
aov_bucket = ["low", "medium", "high"]


In [11]:
df = pd.DataFrame({
    "sessions_last_30d": np.random.poisson(lam=6, size=N).clip(0, 40),
    "engagement_level": np.random.choice(engagement_levels, size=N, p=[0.45, 0.40, 0.15]),
    "product_category_viewed": np.random.choice(categories, size=N, p=[0.50, 0.30, 0.20]),
    "discount_used": np.random.binomial(1, 0.35, size=N),
    "days_since_last_interaction": np.random.randint(0, 61, size=N),
    "prior_purchases": np.random.poisson(lam=1.2, size=N).clip(0, 12),
    "avg_order_value_bucket": np.random.choice(aov_bucket, size=N, p=[0.40, 0.45, 0.15]),
    "traffic_source": np.random.choice(sources, size=N, p=[0.45, 0.25, 0.12, 0.10, 0.08]),
})


In [12]:
bucket_base = df["avg_order_value_bucket"].map({
    "low": 35,
    "medium": 75,
    "high": 140
}).astype(float)

category_boost = df["product_category_viewed"].map({
    "digital": -10,
    "beauty": 0,
    "wig": 40
}).astype(float)

noise = np.random.normal(0, 12, size=len(df))

df["order_value_est"] = (bucket_base + category_boost + noise).clip(10, 300).round(2)


In [13]:
base = 0.08
p = (
    base
    + 0.03 * (df["sessions_last_30d"] / 10)
    + 0.08 * (df["prior_purchases"] / 6)
    + 0.10 * df["engagement_level"].map({"low": 0.0, "medium": 0.5, "high": 1.0})
    - 0.06 * (df["days_since_last_interaction"] / 60)
    + 0.04 * df["discount_used"]
    + 0.03 * df["traffic_source"].isin(["instagram", "tiktok"]).astype(int)
).clip(0.01, 0.95)

df["purchase_made"] = np.random.binomial(1, p)


In [14]:
HIGH_VALUE_THRESHOLD = 120

df["high_value_purchase"] = (
    (df["purchase_made"] == 1) &
    (df["order_value_est"] >= HIGH_VALUE_THRESHOLD)
).astype(int)


In [15]:
print(df.shape)
print(df[["purchase_made", "high_value_purchase"]].mean())
df.head()


(800, 11)
purchase_made          0.16625
high_value_purchase    0.03750
dtype: float64


Unnamed: 0,sessions_last_30d,engagement_level,product_category_viewed,discount_used,days_since_last_interaction,prior_purchases,avg_order_value_bucket,traffic_source,order_value_est,purchase_made,high_value_purchase
0,6,medium,beauty,1,44,0,high,instagram,135.06,0,0
1,6,medium,beauty,0,59,0,high,instagram,133.43,0,0
2,4,low,wig,0,55,1,medium,tiktok,106.58,0,0
3,5,low,beauty,1,14,0,medium,tiktok,62.62,0,0
4,4,low,wig,1,5,1,low,instagram,82.99,0,0


In [16]:
df.to_csv("chasslayy_luxe_autoai_dataset.csv", index=False)
print("✅ chasslayy_luxe_autoai_dataset.csv created successfully")


✅ chasslayy_luxe_autoai_dataset.csv created successfully


In [17]:
from google.colab import files
files.download("chasslayy_luxe_autoai_dataset.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
df_encoded = pd.get_dummies(df, drop_first=True)


In [19]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop("purchase_made", axis=1)
y = df_encoded["purchase_made"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(f"\n{name}")
    print(confusion_matrix(y_test, preds))
    print(classification_report(y_test, preds))



Logistic Regression
[[133   0]
 [ 20   7]]
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       133
           1       1.00      0.26      0.41        27

    accuracy                           0.88       160
   macro avg       0.93      0.63      0.67       160
weighted avg       0.89      0.88      0.84       160


Random Forest
[[133   0]
 [ 21   6]]
              precision    recall  f1-score   support

           0       0.86      1.00      0.93       133
           1       1.00      0.22      0.36        27

    accuracy                           0.87       160
   macro avg       0.93      0.61      0.65       160
weighted avg       0.89      0.87      0.83       160

