In [1]:
!pip install xgboost pandas scikit-learn matplotlib


Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/72.0 MB 6.7 MB/s eta 0:00:11
   -- ------------------------------------- 4.2/72.0 MB 14.0 MB/s eta 0:00:05
   ---- ----------------------------------- 8.1/72.0 MB 15.7 MB/s eta 0:00:05
   ------ --------------------------------- 12.1/72.0 MB 18.0 MB/s eta 0:00:04
   ---------- ----------------------------- 18.6/72.0 MB 19.6 MB/s eta 0:00:03
   ------------- -------------------------- 23.6/72.0 MB 20.2 MB/s eta 0:00:03
   ---------------- ----------------------- 28.8/72.0 MB 21.0 MB/s eta 0:00:03
   ------------------ --------------------- 34.1/72.0 MB 21.6 MB/s eta 0:00:02
   --------------------- ------------------ 39.3/72.0 MB 21.9 MB/s eta 0:00:02
   ---

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

import matplotlib.pyplot as plt

# Sanity check: reaching this point means the imports above resolved; also report
# which XGBoost version is installed.
print("✅ Imports successful")
print("XGBoost version:", xgb.__version__)


✅ Imports successful
XGBoost version: 3.1.2


In [16]:
# Seed NumPy's global RNG so the random draws in the following cells are repeatable.
np.random.seed(42)

# Size of the synthetic dataset: every job gets the same number of MSP rows.
num_jobs, msp_per_job = 50, 10
total_rows = msp_per_job * num_jobs


In [17]:
# Column dictionary for the synthetic table: one row per (job, MSP) pair.
# The two id columns enumerate every pair; each remaining column is an independent
# uniform draw over its (low, high) range. The ranges are consumed in the order
# listed, which fixes the sequence of RNG draws for a given seed.
data = {
    "job_id": np.repeat(np.arange(num_jobs), msp_per_job),
    "msp_id": np.tile(np.arange(msp_per_job), num_jobs),
    **{
        feature: np.random.uniform(low, high, total_rows)
        for feature, (low, high) in {
            "distance_km": (1, 30),
            "price_quote": (500, 2000),
            "past_accept_rate": (0.3, 0.9),
            "completion_rate": (0.7, 1.0),
            "rating": (3.0, 5.0),
        }.items()
    },
}


In [18]:
# Turn the column dictionary into a DataFrame (one column per key).
df = pd.DataFrame(data=data)



In [19]:
# Continuous relevance score: a weighted sum of inverse distance (closer is better),
# past acceptance rate, completion rate, and rating divided by 5.
# Note that price_quote does not enter the score.
df["relevance"] = (
    (1 / df["distance_km"]) * 0.3
    + df["past_accept_rate"] * 0.3
    + df["completion_rate"] * 0.3
    + (df["rating"] / 5) * 0.1
)


In [20]:
# Replace the continuous score with its quartile index (0 = lowest quarter, 3 = highest),
# computed over the whole table rather than per job.
# This overwrites the column in place: re-run the scoring cell to get the raw score back.
df["relevance"] = pd.qcut(
    df["relevance"],
    q=4,
    labels=[0, 1, 2, 3],
).astype(int)


In [21]:
# Announce that the synthetic table is ready, then preview its first rows
# (the trailing expression is rendered as the cell output).
print("Data created")
df.head(30)

Data created


Unnamed: 0,job_id,msp_id,distance_km,price_quote,past_accept_rate,completion_rate,rating,relevance
0,0,0,11.861663,1547.242571,0.41108,0.855725,3.523411,0
1,0,1,28.570715,1304.14455,0.625141,0.843755,3.493958,1
2,0,2,22.227824,964.291424,0.823768,0.707693,4.812509,2
3,0,3,18.361096,1720.69253,0.739335,0.802374,3.499092,1
4,0,4,5.524541,1527.096759,0.783937,0.814059,3.543899,2
5,0,5,5.523841,743.925409,0.69527,0.819647,4.518797,2
6,0,6,2.684425,1866.390777,0.715366,0.874052,3.89948,3
7,0,7,26.119108,1733.805864,0.809517,0.860081,4.553421,2
8,0,8,18.432335,1924.69987,0.449801,0.882372,3.130732,0
9,0,9,21.534105,1588.579263,0.593655,0.929465,3.975142,1


In [22]:
# Split at the job (query) level rather than the row level, so all rows of a job
# end up on the same side. 20% of the jobs are held out for testing.
unique_jobs = df["job_id"].unique()
train_jobs, test_jobs = train_test_split(
    unique_jobs,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Select the rows belonging to each set of jobs. `.copy()` makes each split an
# independent frame, so later column assignments on a split neither raise
# SettingWithCopyWarning nor depend on how pandas shares memory with `df`.
train_df = df[df["job_id"].isin(train_jobs)].copy()
test_df = df[df["job_id"].isin(test_jobs)].copy()

# Invariants of a job-level split: every row lands in exactly one split, and no job
# appears on both sides (which would leak test queries into training).
assert len(train_df) + len(test_df) == len(df), "train/test split lost or duplicated rows"
assert set(train_df["job_id"]).isdisjoint(test_df["job_id"]), "job_id present in both splits"

In [25]:
# Preview the first rows of the training split.
train_df.head(30)

Unnamed: 0,job_id,msp_id,distance_km,price_quote,past_accept_rate,completion_rate,rating,relevance
0,0,0,11.861663,1547.242571,0.41108,0.855725,3.523411,0
1,0,1,28.570715,1304.14455,0.625141,0.843755,3.493958,1
2,0,2,22.227824,964.291424,0.823768,0.707693,4.812509,2
3,0,3,18.361096,1720.69253,0.739335,0.802374,3.499092,1
4,0,4,5.524541,1527.096759,0.783937,0.814059,3.543899,2
5,0,5,5.523841,743.925409,0.69527,0.819647,4.518797,2
6,0,6,2.684425,1866.390777,0.715366,0.874052,3.89948,3
7,0,7,26.119108,1733.805864,0.809517,0.860081,4.553421,2
8,0,8,18.432335,1924.69987,0.449801,0.882372,3.130732,0
9,0,9,21.534105,1588.579263,0.593655,0.929465,3.975142,1


In [28]:
# Preview the first rows of the held-out test split.
test_df.head(30)

Unnamed: 0,job_id,msp_id,distance_km,price_quote,past_accept_rate,completion_rate,rating,relevance
130,13,0,13.104919,1570.024379,0.734052,0.926849,3.324467,2
131,13,1,7.441127,1842.810257,0.597525,0.804271,3.025215,1
132,13,2,4.476096,1267.516163,0.348628,0.899474,4.119511,1
133,13,3,10.79084,1298.170228,0.43211,0.938635,4.054799,1
134,13,4,28.344381,660.758017,0.709955,0.978153,4.438707,3
135,13,5,10.372885,1171.11855,0.345679,0.770393,4.780516,0
136,13,6,16.044928,1298.9259,0.810724,0.819795,3.158813,2
137,13,7,21.38755,863.705755,0.597088,0.745725,4.462993,1
138,13,8,11.545258,903.864846,0.588352,0.997745,3.374824,2
139,13,9,29.18168,1065.926245,0.655445,0.9781,4.716354,2


In [None]:
# Model inputs (feature columns) and targets (the binned relevance label) for each split.
# The id columns job_id / msp_id are deliberately left out of the feature set.
feature_cols = [
    "distance_km",
    "price_quote",
    "past_accept_rate",
    "completion_rate",
    "rating",
]
X_train, y_train = train_df[feature_cols], train_df["relevance"]
X_test, y_test = test_df[feature_cols], test_df["relevance"]

In [30]:
# Group sizes: how many MSP rows each job contributes, as a plain list per split.
# `sort=False` reports the sizes in the order the jobs first appear in the rows,
# instead of in sorted job_id order, so the list lines up with the row order of the
# frame even if it is not sorted by job_id.
# NOTE(review): presumably these lists are passed to a ranker as query-group sizes,
# which also requires each job's rows to be contiguous — confirm against the training cell.
group_train = train_df.groupby("job_id", sort=False).size().to_list()
group_test = test_df.groupby("job_id", sort=False).size().to_list()
