In [49]:
import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split

import xgboost as xgb

### This function does some preprocessing on our data: 
    1) Replacing feature: feature_value with feature_value
    2) Deleting useless column 138

In [50]:
def preprocess_data(df):
    df[1] = df[1].apply(lambda x: x[4:])
    for col in range(2, 138):
        df.iloc[:, col] = df.iloc[:, col].apply(lambda x: x.split(':')[1])
    df = df.drop(columns=[138])
    
    df.columns = ['feat_' + str(i-1) for i in df.columns]
    df.rename(columns={'feat_-1': 'y', 'feat_0': 'qid'}, inplace=True)
    
    return df

### Transforming whole DataFrame (df) into features dataframe (X) and target column (y)

In [51]:
def get_features_and_labels(df):
    X = df.drop(columns=['y'])
    y = df['y']
    
    return X, y

## What is DCG (Discounted Cumulative Gain) metric:
### $$ DCG = \sum_{i=1}^{n} G_q(d_q^{i}) * D(i)$$
#### where,
### $ G_q(d) $ - document benefit function  |  $G_q(d) = 2^{y(q, d) - 1}$
### $ D(i) $ - position discount function      |  $D(i) = \frac{1}{\log_2(i + 1)}$
### $ q $ - index of the query
### $ i $ - index of the document

### However, a low $DCG$ metric value does not always mean that ranking has worked poorly. It could be that there are simply no relevant documents for the query, or very few of them. To get rid of this problem, the $DCG$ value is normalized by this metric for perfect ranking, when documents are sorted by true relevance values:

## $nDCG(q) = \frac{DCG(q)}{max DCG(q)}$

### To obtain a metric for a set of queries, consider the average value of $nDCG$

### Function for model evaluation by NDCG metric

In [52]:
def evaluate_model(model, x_test, y_test, model_type):
        
    y_pred = model.predict(x_test)

    ndcg_scores = {
        "NDCG Score": ndcg_score([y_test], [y_pred]),
        "NDCG@5 Score": ndcg_score([y_test], [y_pred], k=5),
        "NDCG@10 Score": ndcg_score([y_test], [y_pred], k=10),
        "NDCG@20 Score": ndcg_score([y_test], [y_pred], k=20),
    }

    print("-" * 50)
    for score_name, score_value in ndcg_scores.items():
        print(f"{model_type} {score_name}: {score_value:.4f}")

In [53]:
def data_loader(train_path, valid_path, test_path):
    train_df = pd.read_csv(train_path, header=None, sep=" ")
    valid_df = pd.read_csv(valid_path, header=None, sep=" ")
    test_df = pd.read_csv(test_path, header=None, sep=" ")

    return train_df, valid_df, test_df

train_df, valid_df, test_df = data_loader("data/Fold1/train.txt", 
                                        "data/Fold1/vali.txt",
                                        "data/Fold1//test.txt")

In [54]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,129,130,131,132,133,134,135,136,137,138
0,2,qid:1,1:3,2:3,3:0,4:0,5:3,6:1,7:1,8:0,...,128:11089534,129:2,130:116,131:64034,132:13,133:3,134:0,135:0,136:0,
1,2,qid:1,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,128:11089534,129:2,130:124,131:64034,132:1,133:2,134:0,135:0,136:0,
2,0,qid:1,1:3,2:0,3:2,4:0,5:3,6:1,7:0,8:0.666667,...,128:3,129:1,130:124,131:3344,132:14,133:67,134:0,135:0,136:0,
3,2,qid:1,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,128:11089534,129:13,130:123,131:63933,132:1,133:3,134:0,135:0,136:0,
4,1,qid:1,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,128:5,129:7,130:256,131:49697,132:1,133:13,134:0,135:0,136:0,


## Preprocess the data

In [55]:
train_df = preprocess_data(train_df)
valid_df = preprocess_data(valid_df)
test_df = preprocess_data(test_df)
train_df.head()

Unnamed: 0,y,qid,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_127,feat_128,feat_129,feat_130,feat_131,feat_132,feat_133,feat_134,feat_135,feat_136
0,2,1,3,3,0,0,3,1,1,0.0,...,62,11089534,2,116,64034,13,3,0,0,0
1,2,1,3,0,3,0,3,1,0,1.0,...,54,11089534,2,124,64034,1,2,0,0,0
2,0,1,3,0,2,0,3,1,0,0.666667,...,45,3,1,124,3344,14,67,0,0,0
3,2,1,3,0,3,0,3,1,0,1.0,...,56,11089534,13,123,63933,1,3,0,0,0
4,1,1,3,0,3,0,3,1,0,1.0,...,64,5,7,256,49697,1,13,0,0,0


## Simple EDA

##### 723k documents in train sample

In [8]:
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 723412 entries, 0 to 723411
Columns: 138 entries, y to feat_136
dtypes: int64(1), object(137)
memory usage: 761.6+ MB
None


##### 241k doxuments in test sample

In [9]:
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241521 entries, 0 to 241520
Columns: 138 entries, y to feat_136
dtypes: int64(1), object(137)
memory usage: 254.3+ MB
None


In [10]:
num_queries_train = train_df['qid'].nunique()
num_queries_test = test_df['qid'].nunique()
print(f"Got {num_queries_train} train and {num_queries_test} test queries")

Got 6000 train and 2000 test queries


# Training (LinReg | XGBoost Ranker)

In [56]:
X_train, y_train = get_features_and_labels(train_df)
X_test, y_test = get_features_and_labels(test_df)

In [65]:
def train_model(df):
    query_col = "qid"

    queries = df[query_col].unique()
    train_queries, test_queries = train_test_split(queries, test_size=0.2, random_state=42)

    train_df = df[df[query_col].isin(train_queries)]
    test_df = df[df[query_col].isin(test_queries)]
    train_df = train_df.astype(np.float32)
    test_df = test_df.astype(np.float32)

    x_train, y_train = get_features_and_labels(train_df)
    x_test, y_test = get_features_and_labels(test_df)



    linreg = LinearRegression().fit(x_train, y_train)
    evaluate_model(linreg, x_test, y_test, "Linear Regression Model (pointwise)")
    

    train_df_sorted = train_df.sort_values(by=query_col)

    sorted_indices = train_df_sorted.index

    x_train_sorted = x_train.loc[sorted_indices].reset_index(drop=True)
    y_train_sorted = y_train.loc[sorted_indices].reset_index(drop=True)

    xgbranker = xgb.XGBRanker(
        tree_method="hist",
        objective="rank:pairwise"
    )

    xgbranker2 = xgb.XGBRanker(
        tree_method="hist",
        objective="rank:ndcg"
    )

    if query_col in x_train_sorted.columns:
        x_train_sorted = x_train_sorted.drop(columns=[query_col])

    xgbranker.fit(x_train_sorted, y_train_sorted, qid=train_df_sorted[query_col].values)

    evaluate_model(xgbranker, x_test, y_test, "XGBoost Pairwise Ranker Model (logit)")

    xgbranker2.fit(x_train_sorted, y_train_sorted, qid=train_df_sorted[query_col].values)

    evaluate_model(xgbranker2, x_test, y_test, "XGBoost Pairwise Ranker Model (lambdamart)")

    return xgbranker, xgbranker2


In [58]:
xgbranker, xgbranker2 = train_model(train_df)

--------------------------------------------------
Linear Regression Model (pointwise) NDCG Score: 0.9195
Linear Regression Model (pointwise) NDCG@5 Score: 0.3503
Linear Regression Model (pointwise) NDCG@10 Score: 0.2432
Linear Regression Model (pointwise) NDCG@20 Score: 0.2372
--------------------------------------------------
XGBoost Pairwise Ranker Model (lambdamart) NDCG Score: 0.9338
XGBoost Pairwise Ranker Model (lambdamart) NDCG@5 Score: 1.0000
XGBoost Pairwise Ranker Model (lambdamart) NDCG@10 Score: 0.9523
XGBoost Pairwise Ranker Model (lambdamart) NDCG@20 Score: 0.8675
--------------------------------------------------
XGBoost Pairwise Ranker Model (ndcg) NDCG Score: 0.9341
XGBoost Pairwise Ranker Model (ndcg) NDCG@5 Score: 0.9672
XGBoost Pairwise Ranker Model (ndcg) NDCG@10 Score: 0.9614
XGBoost Pairwise Ranker Model (ndcg) NDCG@20 Score: 0.9377


# Catboost

In [15]:
from catboost import CatBoost, Pool

In [23]:
def to_catboost_dataset(df):
    y = df['y'].to_numpy()                       
    q = df['qid'].to_numpy().astype('uint32')      
    X = df.drop(columns=['y', 'qid']).to_numpy() 
    return (X, y, q)

In [17]:
X_train, y_train, q_train = to_catboost_dataset(train_df)
X_valid, y_valid, q_valid = to_catboost_dataset(valid_df)
X_test, y_test, q_test = to_catboost_dataset(test_df)
        
pool_train = Pool(data=X_train, label=y_train, group_id=q_train)
pool_valid = Pool(data=X_valid, label=y_valid, group_id = q_valid)
pool_test = Pool(data=X_test, label=y_test, group_id=q_test)

### Pairwise metrics: PairLogit

In [24]:
params = {
    'n_estimators': 1000,            
    'eval_metric': 'NDCG',     
    'random_seed': 123,
    'verbose': 100,
    'eta': 0.1,
    'max_bin': 64,
    'max_depth': 4,
    'loss_function': 'PairLogit'
}

In [20]:
cb_pairlogit = CatBoost(params)
cb_pairlogit.fit(pool_train, eval_set=pool_valid, use_best_model=True)

0:	test: 0.6687723	best: 0.6687723 (0)	total: 10.4s	remaining: 2h 53m
100:	test: 0.7733300	best: 0.7733300 (100)	total: 18m 43s	remaining: 2h 46m 37s
200:	test: 0.7826945	best: 0.7826945 (200)	total: 38m 4s	remaining: 2h 31m 19s
300:	test: 0.7872120	best: 0.7872378 (297)	total: 56m 36s	remaining: 2h 11m 27s
400:	test: 0.7889815	best: 0.7890837 (391)	total: 1h 15m 1s	remaining: 1h 52m 3s
500:	test: 0.7895205	best: 0.7895537 (497)	total: 1h 36m 36s	remaining: 1h 36m 24s
600:	test: 0.7904014	best: 0.7904680 (597)	total: 2h 3m 13s	remaining: 1h 21m 56s
700:	test: 0.7915918	best: 0.7916955 (697)	total: 2h 24m 43s	remaining: 1h 1m 48s
800:	test: 0.7918725	best: 0.7920434 (750)	total: 2h 46m 1s	remaining: 41m 17s
900:	test: 0.7916781	best: 0.7920434 (750)	total: 3h 7m 3s	remaining: 20m 34s
999:	test: 0.7926596	best: 0.7926596 (999)	total: 3h 28m 4s	remaining: 0us

bestTest = 0.7926595694
bestIteration = 999



<catboost.core.CatBoost at 0x18552d270e0>

### Groupwise metrics: YetiRank

In [25]:
params = {
    'n_estimators': 1000,            
    'eval_metric': 'NDCG',      
    'random_seed': 123,
    'verbose': 100,
    'eta': 0.1,
    'max_bin': 64,
    'max_depth': 4,
    'loss_function': 'YetiRank'
}

In [22]:
cb_yetirank = CatBoost(params)
cb_yetirank.fit(pool_train, eval_set=pool_valid, use_best_model=True)

0:	test: 0.6488399	best: 0.6488399 (0)	total: 1.24s	remaining: 20m 39s
100:	test: 0.7862985	best: 0.7862985 (100)	total: 1m 22s	remaining: 12m 12s
200:	test: 0.7905463	best: 0.7905463 (200)	total: 2m 46s	remaining: 11m 1s
300:	test: 0.7925186	best: 0.7929063 (290)	total: 4m 11s	remaining: 9m 45s
400:	test: 0.7936103	best: 0.7939413 (399)	total: 5m 30s	remaining: 8m 13s
500:	test: 0.7944577	best: 0.7945601 (493)	total: 6m 46s	remaining: 6m 45s
600:	test: 0.7951157	best: 0.7954438 (594)	total: 8m 5s	remaining: 5m 22s
700:	test: 0.7958456	best: 0.7958656 (666)	total: 9m 25s	remaining: 4m 1s
800:	test: 0.7960214	best: 0.7962475 (785)	total: 10m 43s	remaining: 2m 39s
900:	test: 0.7961115	best: 0.7963754 (896)	total: 12m 1s	remaining: 1m 19s
999:	test: 0.7967209	best: 0.7967486 (982)	total: 13m 20s	remaining: 0us

bestTest = 0.7967485783
bestIteration = 982

Shrink model to first 983 iterations.


<catboost.core.CatBoost at 0x18538ab9fd0>

### Groupwise metrics: LambdaMART

In [30]:
params = {
    'n_estimators': 1000,            
    'eval_metric': 'NDCG',      
    'random_seed': 123,
    'verbose': 100,
    'eta': 0.1,
    'max_bin': 64,
    'max_depth': 4,
    'loss_function': 'LambdaMart'
}

In [31]:
cb_lm = CatBoost(params)
cb_lm.fit(pool_train, eval_set=pool_valid, use_best_model=True)

0:	test: 0.6577562	best: 0.6577562 (0)	total: 492ms	remaining: 8m 11s
100:	test: 0.7846900	best: 0.7846900 (100)	total: 55.6s	remaining: 8m 14s
200:	test: 0.7892624	best: 0.7894800 (197)	total: 1m 50s	remaining: 7m 17s
300:	test: 0.7910991	best: 0.7911250 (288)	total: 2m 45s	remaining: 6m 23s
400:	test: 0.7924486	best: 0.7925266 (398)	total: 3m 40s	remaining: 5m 28s
500:	test: 0.7932052	best: 0.7933017 (495)	total: 4m 35s	remaining: 4m 34s
600:	test: 0.7938951	best: 0.7940184 (593)	total: 5m 29s	remaining: 3m 39s
700:	test: 0.7947973	best: 0.7948647 (697)	total: 6m 24s	remaining: 2m 44s
800:	test: 0.7950165	best: 0.7950167 (799)	total: 7m 20s	remaining: 1m 49s
900:	test: 0.7952311	best: 0.7953538 (889)	total: 8m 15s	remaining: 54.5s
999:	test: 0.7956838	best: 0.7958061 (975)	total: 9m 12s	remaining: 0us

bestTest = 0.795806107
bestIteration = 975

Shrink model to first 976 iterations.


<catboost.core.CatBoost at 0x186bd59a2a0>

### Groupwise methods: QueryRMSE

In [46]:
params = {
    'n_estimators': 5000,            
    'eval_metric': 'NDCG',      
    'random_seed': 123,
    'verbose': 100,
    'eta': 0.1,
    'max_bin': 64,
    'max_depth': 4,
    'loss_function': 'QueryRMSE'
}

In [47]:
cb_qr = CatBoost(params)
cb_qr.fit(pool_train, eval_set=pool_valid, use_best_model=True)

0:	test: 0.6453704	best: 0.6453704 (0)	total: 125ms	remaining: 10m 22s
100:	test: 0.7836810	best: 0.7836810 (100)	total: 12s	remaining: 9m 43s
200:	test: 0.7888356	best: 0.7888356 (200)	total: 23.7s	remaining: 9m 26s
300:	test: 0.7901007	best: 0.7901194 (299)	total: 35.5s	remaining: 9m 14s
400:	test: 0.7919699	best: 0.7919699 (400)	total: 47.2s	remaining: 9m 1s
500:	test: 0.7925735	best: 0.7927511 (470)	total: 59.7s	remaining: 8m 56s
600:	test: 0.7930723	best: 0.7932972 (557)	total: 1m 11s	remaining: 8m 46s
700:	test: 0.7934804	best: 0.7936851 (683)	total: 1m 24s	remaining: 8m 38s
800:	test: 0.7938380	best: 0.7943202 (761)	total: 1m 36s	remaining: 8m 25s
900:	test: 0.7941259	best: 0.7943202 (761)	total: 1m 49s	remaining: 8m 16s
1000:	test: 0.7945486	best: 0.7948044 (953)	total: 2m 1s	remaining: 8m 6s
1100:	test: 0.7951710	best: 0.7951948 (1098)	total: 2m 14s	remaining: 7m 54s
1200:	test: 0.7953509	best: 0.7954017 (1194)	total: 2m 26s	remaining: 7m 42s
1300:	test: 0.7956386	best: 0.7956

<catboost.core.CatBoost at 0x186bd5acc20>

# Inference on Test sample

In [67]:
def predict(model, df, name):
    
    if df.isnull().any().any():
        raise ValueError("Input DataFrame contains NaN values")

    df = df.astype(float)
    
    x, y = get_features_and_labels(df)

    predictions = model.predict(x)

    df["predicted_rank"] = predictions
    df = df.sort_values(by="predicted_rank", ascending=False)

    evaluate_model(model, x, y, f"{name} test dataset Inference")

    return df

In [69]:
predict(xgbranker, test_df, "XGBoost pairwise logit:")
print("="*50)
predict(xgbranker2, test_df, "XGBoost lambdamart (ncdg optimizing):")
print("="*50)
predict(cb_pairlogit, test_df, "Catboost pairwise logit:")
print("="*50)
predict(cb_yetirank, test_df, "Catboost yetirank:")
print("="*50)
predict(cb_lm, test_df, "Catboost lambdamart:")
print("="*50)
predict(cb_qr, test_df, "Catboost queryRMSE loss:")
print("="*50)

--------------------------------------------------
XGBoost pairwise logit: test dataset Inference NDCG Score: 0.9364
XGBoost pairwise logit: test dataset Inference NDCG@5 Score: 0.9344
XGBoost pairwise logit: test dataset Inference NDCG@10 Score: 0.8835
XGBoost pairwise logit: test dataset Inference NDCG@20 Score: 0.8554
--------------------------------------------------
XGBoost lambdamart (ncdg optimizing): test dataset Inference NDCG Score: 0.9365
XGBoost lambdamart (ncdg optimizing): test dataset Inference NDCG@5 Score: 1.0000
XGBoost lambdamart (ncdg optimizing): test dataset Inference NDCG@10 Score: 0.9487
XGBoost lambdamart (ncdg optimizing): test dataset Inference NDCG@20 Score: 0.9049
--------------------------------------------------
Catboost pairwise logit: test dataset Inference NDCG Score: 0.9030
Catboost pairwise logit: test dataset Inference NDCG@5 Score: 0.4628
Catboost pairwise logit: test dataset Inference NDCG@10 Score: 0.4379
Catboost pairwise logit: test dataset Inf