In [6]:
import pandas as pd

df = pd.read_csv("2019-Oct.csv")
print(df.head())

                event_time event_type  product_id   category_id  \
0  2019-10-01 00:00:00 UTC       view  44600062.0  2.103807e+18   
1  2019-10-01 00:00:00 UTC       view   3900821.0  2.053014e+18   
2  2019-10-01 00:00:01 UTC       view  17200506.0  2.053014e+18   
3  2019-10-01 00:00:01 UTC       view   1307067.0  2.053014e+18   
4  2019-10-01 00:00:04 UTC       view   1004237.0  2.053014e+18   

                         category_code     brand    price      user_id  \
0                                  NaN  shiseido    35.79  541312140.0   
1  appliances.environment.water_heater      aqua    33.20  554748717.0   
2           furniture.living_room.sofa       NaN   543.10  519107250.0   
3                   computers.notebook    lenovo   251.74  550050854.0   
4               electronics.smartphone     apple  1081.98  535871217.0   

                           user_session  
0  72d76fde-8bb3-4e00-8c23-a032dfed738c  
1  9333dfbd-b87a-4708-9857-6336556b0fcc  
2  566511c2-e2e3-422b-b695

In [7]:
df["is_purchase"] = (df["event_type"] == "purchase").astype(int)
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,is_purchase
0,2019-10-01 00:00:00 UTC,view,44600062.0,2.103807e+18,,shiseido,35.79,541312140.0,72d76fde-8bb3-4e00-8c23-a032dfed738c,0
1,2019-10-01 00:00:00 UTC,view,3900821.0,2.053014e+18,appliances.environment.water_heater,aqua,33.2,554748717.0,9333dfbd-b87a-4708-9857-6336556b0fcc,0
2,2019-10-01 00:00:01 UTC,view,17200506.0,2.053014e+18,furniture.living_room.sofa,,543.1,519107250.0,566511c2-e2e3-422b-b695-cf8e6e792ca8,0
3,2019-10-01 00:00:01 UTC,view,1307067.0,2.053014e+18,computers.notebook,lenovo,251.74,550050854.0,7c90fc70-0e80-4590-96f3-13c02c18c713,0
4,2019-10-01 00:00:04 UTC,view,1004237.0,2.053014e+18,electronics.smartphone,apple,1081.98,535871217.0,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,0


In [8]:
import numpy as np

df = df.copy()
df = df.dropna(subset=["price"])

df["event_time"] = pd.to_datetime(df["event_time"])
df = df.sort_values(["user_session", "event_time"])

df["log_price"] = np.log1p(df["price"])
df["price_bin"] = pd.qcut(df["price"], q=4, labels=False).astype(int)

cat_mean = df.groupby("category_code")["price"].transform("mean")
df["price_per_category_mean"] = (df["price"] / cat_mean).replace([np.inf, -np.inf], np.nan).fillna(1.0)

cat_median = df.groupby("category_code")["price"].transform("median")
df["price_above_median"] = (df["price"] > cat_median).astype("Int8").fillna(0).astype(int)

df["hour_of_day"] = df["event_time"].dt.hour
df["day_of_week"] = df["event_time"].dt.dayofweek
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)
df["is_evening"] = df["hour_of_day"].between(18, 23).astype(int)

df["category_code_encoded"] = df["category_code"].astype("category").cat.codes
df["brand_encoded"] = df["brand"].astype("category").cat.codes

category_counts = df["category_code"].value_counts()
df["category_popularity"] = df["category_code"].map(category_counts).fillna(0)

session_counts = df["user_session"].value_counts()
df["session_event_count"] = df["user_session"].map(session_counts)

df["event_position_in_session"] = df.groupby("user_session").cumcount() + 1
df["is_last_event_in_session"] = (df["event_position_in_session"] == df["session_event_count"]).astype(int)

unique_cats = df.groupby("user_session")["category_code"].nunique()
df["unique_categories_session"] = df["user_session"].map(unique_cats)

sess_start_time = df.groupby("user_session")["event_time"].transform("min")
df["time_since_session_start"] = (df["event_time"] - sess_start_time).dt.total_seconds().clip(lower=0)

session_group = df.groupby("user_session")
df["product_view_count"] = session_group["event_type"].transform(lambda x: (x == "view").sum())
df["cart_add_count"] = session_group["event_type"].transform(lambda x: (x == "cart").sum())
df["purchase_event_count"] = session_group["event_type"].transform(lambda x: (x == "purchase").sum())
df["view_to_cart_ratio"] = (df["cart_add_count"] / df["product_view_count"].replace(0, np.nan)).fillna(0)

last_event = session_group["event_type"].transform("last")
df["last_event_type_encoded"] = last_event.astype("category").cat.codes

df["viewed_same_category_count"] = session_group["category_code"].transform(lambda x: x.value_counts().max())

df["session_start_time"] = session_group["event_time"].transform("min")
df["session_end_time"] = session_group["event_time"].transform("max")
df["session_duration"] = (df["session_end_time"] - df["session_start_time"]).dt.total_seconds()

df["time_since_last_event"] = session_group["event_time"].diff().dt.total_seconds().fillna(0)
df["avg_time_per_event"] = (df["session_duration"] / df["session_event_count"].replace(0, np.nan)).fillna(0)
df["evening_session"] = df["event_time"].dt.hour.isin(range(18, 24)).astype(int)

df["session_avg_price"] = session_group["price"].transform("mean")
df["session_max_price"] = session_group["price"].transform("max")
df["session_min_price"] = session_group["price"].transform("min")
df["price_range"] = df["session_max_price"] - df["session_min_price"]

category_avg_price2 = df.groupby("category_code")["price"].transform("mean")
df["relative_price_to_category"] = ((df["price"] - category_avg_price2) / category_avg_price2).replace([np.inf, -np.inf], np.nan).fillna(0)

df["unique_brands_session"] = session_group["brand"].transform(lambda x: x.nunique())
df["unique_products_session"] = session_group["product_id"].transform(lambda x: x.nunique())
df["brand_switches"] = session_group["brand"].transform(lambda x: (x != x.shift()).sum())
df["category_switches"] = session_group["category_code"].transform(lambda x: (x != x.shift()).sum())

df.drop(["session_start_time", "session_end_time"], axis=1, inplace=True)

features = [
    "log_price",
    "price_bin",
    "price_per_category_mean",
    "price_above_median",
    "hour_of_day",
    "day_of_week",
    "is_weekend",
    "is_evening",
    "category_code_encoded",
    "brand_encoded",
    "category_popularity",
    "session_event_count",
    "event_position_in_session",
    "is_last_event_in_session",
    "unique_categories_session",
    "time_since_session_start",
    "product_view_count",
    "cart_add_count",
    "purchase_event_count",
    "view_to_cart_ratio",
    "last_event_type_encoded",
    "viewed_same_category_count",
    "session_duration",
    "time_since_last_event",
    "avg_time_per_event",
    "session_avg_price",
    "session_max_price",
    "session_min_price",
    "price_range",
    "relative_price_to_category",
    "unique_brands_session",
    "unique_products_session",
    "brand_switches",
    "category_switches"
]

df = df.dropna(subset=features).copy()
X = df[features]
y = df["is_purchase"].astype(int)


In [9]:

df["purchased_in_session"] = (df["event_type"] == "purchase").groupby(df["user_session"]).transform("max").astype(int)

g = df.sort_values(["user_session", "event_time"]).groupby("user_session")

sess = pd.DataFrame({
    "label": g["purchased_in_session"].max(),
    "events": g.size(),
    "unique_products": g["product_id"].nunique(),
    "unique_categories": g["category_code"].nunique(),
    "unique_brands": g["brand"].nunique(),
    "avg_price": g["price"].mean(),
    "max_price": g["price"].max(),
    "min_price": g["price"].min(),
    "last_event_type": g["event_type"].last(),
    "first_ts": g["event_time"].min(),
    "last_ts": g["event_time"].max(),
})

sess["duration_sec"] = (sess["last_ts"] - sess["first_ts"]).dt.total_seconds().clip(lower=0)
sess["views_per_product"] = (sess["events"] / sess["unique_products"].replace(0, 1)).clip(upper=50)

sess["last_event_type"] = sess["last_event_type"].astype("category").cat.codes
sess["hour_start"] = sess["first_ts"].dt.hour
sess["dow_start"] = sess["first_ts"].dt.dayofweek
sess["is_weekend"] = sess["dow_start"].isin([5, 6]).astype(int)

features_sess = [
    "events","unique_products","unique_categories","unique_brands",
    "avg_price","max_price","min_price","duration_sec","views_per_product",
    "last_event_type","hour_start","dow_start","is_weekend"
]

sess = sess.dropna(subset=features_sess+["label"]).copy()

X = sess[features_sess]
y = sess["label"].astype(int)


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, classification_report, confusion_matrix
from xgboost import XGBClassifier

X = df[features]
y = df["is_purchase"].astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)


X_tr2, X_va, y_tr2, y_va = train_test_split(
    X_tr, y_tr, test_size=0.2, random_state=42, stratify=y_tr
)

In [12]:
def fit_and_pick_threshold(spw):
    model = XGBClassifier(
        tree_method="hist",
        n_estimators=2000,
        learning_rate=0.08,
        max_depth=8,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=5.0,
        reg_alpha=0.2,
        gamma=1.0,
        max_delta_step=1,     
        scale_pos_weight=spw,
        eval_metric="aucpr",
        n_jobs=-1,
        random_state=42,
    )
    model.fit(X_tr2, y_tr2)
    va_proba = model.predict_proba(X_va)[:, 1]
    
    prec, rec, thr = precision_recall_curve(y_va, va_proba)
    prec, rec = prec[:-1], rec[:-1]           
    f1 = 2*(prec*rec)/(prec+rec+1e-12)
    i = int(np.argmax(f1))
    best_thr = float(thr[i])
    
    return model, best_thr, float(f1[i]), float(prec[i]), float(rec[i])


In [15]:
base_ratio = (y_tr2 == 0).sum() / max(1, (y_tr2 == 1).sum())

candidates = [0.75, 1.0, 1.25, 1.5, 2.0]
results = []
for m in candidates:
    mdl, thr, f1, p, r = fit_and_pick_threshold(base_ratio*m)
    results.append((m, thr, f1, p, r, mdl))

best = max(results, key=lambda t: t[2])
best_mult, best_thr, best_f1, best_p, best_r, best_model = best
print(f"Chosen scale_pos_weight = ratio*{best_mult:.2f}")
print(f"Valid best F1={best_f1:.4f} at thr={best_thr:.4f} (P={best_p:.3f}, R={best_r:.3f})")
te_proba = best_model.predict_proba(X_te)[:, 1]
y_pred = (te_proba >= best_thr).astype(int)

print("\nTest set performance at chosen threshold")
print(classification_report(y_te, y_pred, digits=3))
print(confusion_matrix(y_te, y_pred))


Chosen scale_pos_weight = ratio*0.75
Valid best F1=0.7446 at thr=0.8701 (P=0.799, R=0.697)

Test set performance at chosen threshold
              precision    recall  f1-score   support

           0      0.995     0.997     0.996    281046
           1      0.798     0.697     0.744      4778

    accuracy                          0.992    285824
   macro avg      0.897     0.847     0.870    285824
weighted avg      0.992     0.992     0.992    285824

[[280204    842]
 [  1446   3332]]
