In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Load the two input tables, in this order:
#   train - rows with price_item, Covariate1-3 and the item_bought outcome
#   test  - users (user_index + Covariate1-3) for whom a price is chosen later
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/train_prices_decisions_2025.csv",
        "../../data/test_user_info_2025.csv",
    )
)

In [2]:
# ---- Segment definition: median split on each covariate ----
# The cut points come from the training data only; segment_row reads them
# from module scope, so the same thresholds apply to any frame it is used on.
med1, med2, med3 = (
    train[column].median()
    for column in ('Covariate1', 'Covariate2', 'Covariate3')
)


def segment_row(row):
    """Return the segment key of one row as a 3-tuple of 0/1 flags.

    Flag i is 1 when ``row['Covariate<i>']`` is greater than or equal to the
    corresponding training median (``med1``, ``med2``, ``med3``) and 0
    otherwise. A NaN covariate compares False and therefore yields 0.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide 'Covariate1', 'Covariate2' and 'Covariate3'.

    Returns
    -------
    tuple of int
        ``(flag1, flag2, flag3)``, usable as a dictionary key.
    """
    cut_points = (
        ('Covariate1', med1),
        ('Covariate2', med2),
        ('Covariate3', med3),
    )
    return tuple(int(row[name] >= cut) for name, cut in cut_points)


# Tag every training row with its segment key (one of 2**3 = 8 tuples).
train['segment'] = train.apply(segment_row, axis=1)

In [3]:
# Show the three training medians that serve as the segment cut points.
print(med1, med2, med3)

2.7193025761078644 2.7215555543935457 7.262601783583493


In [4]:
# ---- Fit one purchase-probability classifier per segment ----
FEATURES = ['price_item', 'Covariate1', 'Covariate2', 'Covariate3']
TARGET = "item_bought"

# Hyperparameters shared by every segment model.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=350,
    learning_rate=0.2,
    random_state=42,
    tree_method='hist',
)

# segment key (3-tuple of 0/1 flags) -> fitted XGBClassifier
segment_to_model = {}

# All eight segment keys, (0, 0, 0) through (1, 1, 1).
all_segments = [(a, b, c) for a in (0, 1) for b in (0, 1) for c in (0, 1)]

for seg in all_segments:
    seg_rows = train.loc[train['segment'] == seg]
    n_rows = len(seg_rows)
    print(f"Training segment {seg}, size = {n_rows}")

    if n_rows >= 50:
        model = XGBClassifier(**xgb_params)
        model.fit(seg_rows[FEATURES], seg_rows[TARGET])
        segment_to_model[seg] = model
    else:
        # Segments with fewer than 50 rows get no model and are therefore
        # absent from segment_to_model.
        print(f"Segment {seg} too small, skipping")

print("Finished training all segment models!")


Training segment (0, 0, 0), size = 6313
Training segment (0, 0, 1), size = 6155
Training segment (0, 1, 0), size = 6242
Training segment (0, 1, 1), size = 6290
Training segment (1, 0, 0), size = 6187
Training segment (1, 0, 1), size = 6345
Training segment (1, 1, 0), size = 6258
Training segment (1, 1, 1), size = 6210
Finished training all segment models!


In [5]:
# ---- Choose the revenue-maximising price for every test user ----
# Tag test users with the same median-split segment keys used for training.
test['segment'] = test.apply(segment_row, axis=1)

# Candidate prices; every user is scored at each point of this grid.
# NOTE(review): the 0.01-500 range is hard-coded. Tree ensembles do not
# extrapolate, so if the grid extends past the prices seen in training the
# predicted purchase probability stays flat while price keeps growing, which
# pushes the argmax towards the top of the grid - confirm the grid lies inside
# the observed price_item range.
price_grid = np.linspace(0.01, 500, 100)
P = len(price_grid)

COVARIATE_COLS = ['Covariate1', 'Covariate2', 'Covariate3']
MODEL_COLS = ['price_item'] + COVARIATE_COLS  # same column order as the training features
USERS_PER_BATCH = 2000  # caps each predict_proba call at USERS_PER_BATCH * P rows

n_users = len(test)
covariates = test[COVARIATE_COLS].to_numpy(dtype=float)
user_index = test['user_index'].to_numpy()

# Group row positions by segment key in a single pass, so each segment model
# is called on whole batches of users instead of once per row.
positions_by_segment = {}
for pos, seg in enumerate(test['segment']):
    positions_by_segment.setdefault(seg, []).append(pos)

best_price = np.full(n_users, np.nan)
best_revenue = np.full(n_users, np.nan)
scored = np.zeros(n_users, dtype=bool)
unscored_segments = {}  # segment key -> number of users left without a price

for seg, positions in positions_by_segment.items():
    model = segment_to_model.get(seg)
    if model is None:
        # No model was trained for this segment (it was skipped as too small),
        # so these users cannot be priced; record them instead of dropping
        # them silently.
        unscored_segments[seg] = len(positions)
        continue

    for start in range(0, len(positions), USERS_PER_BATCH):
        batch = positions[start:start + USERS_PER_BATCH]
        n_batch = len(batch)

        # Cross every user in the batch with every candidate price. Rows are
        # user-major (all P prices of user 0, then user 1, ...), so reshaping
        # the predictions to (n_batch, P) yields one user per row.
        X_batch = pd.DataFrame(
            np.column_stack([
                np.tile(price_grid, n_batch),
                np.repeat(covariates[batch], P, axis=0),
            ]),
            columns=MODEL_COLS,
        )
        probs = model.predict_proba(X_batch)[:, 1].reshape(n_batch, P)
        revenue = probs * price_grid  # expected revenue = P(buy | price) * price

        best_idx = revenue.argmax(axis=1)  # first maximum per user
        best_price[batch] = price_grid[best_idx]
        best_revenue[batch] = revenue[np.arange(n_batch), best_idx]
        scored[batch] = True

if unscored_segments:
    print(
        f"WARNING: {int((~scored).sum())} test users have no segment model and are "
        f"missing from result_df (users per segment: {unscored_segments})"
    )

# One row per priced user, in the original order of `test`. The three columns
# always exist, even when no user could be priced.
result_df = pd.DataFrame({
    "user_index": user_index[scored],
    "price_item": best_price[scored],
    "expected_revenue": best_revenue[scored],
})

In [6]:
# Write the per-user results (user_index, price_item, expected_revenue) to CSV,
# omitting the DataFrame row index.
result_df.to_csv("static_prices_submission.csv", index=False)

In [7]:
# Total model-predicted revenue over all priced users: the sum of each user's
# best (predicted purchase probability * price) value. This is an estimate
# from the fitted models, not realised revenue.
result_df['expected_revenue'].sum()

3665014.778904567

In [8]:
# Recomputes the same total as the previous cell (duplicate cell).
result_df['expected_revenue'].sum()

3665014.778904567

In [9]:
# Persist the {segment key: fitted XGBClassifier} dictionary for later reuse.
# `pickle` is imported with the other imports at the top of the notebook.
# NOTE(review): unpickling executes code - only load this file from a trusted
# source. Pickled XGBoost models may also be tied to the library version that
# wrote them - TODO confirm against the XGBoost model IO docs whether
# save_model() is preferable for long-term storage.
with open("8_xgb.pkl", "wb") as f:
    pickle.dump(segment_to_model, f)