In [1]:
import pandas as pd
from hnmchallenge.data_reader import DataReader
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from hnmchallenge.dataset import Dataset
from hnmchallenge.filtered_dataset import FilterdDataset
from hnmchallenge.stratified_dataset import StratifiedDataset
from hnmchallenge.models.top_pop import TopPop
from hnmchallenge.evaluation.python_evaluation import map_at_k, recall_at_k
from hnmchallenge.constant import *
from hnmchallenge.models.sgmc.sgmc import SGMC
from hnmchallenge.models.ease.ease import EASE
from hnmchallenge.models.itemknn.itemknn import ItemKNN 
from hnmchallenge.feature_manager import FeatureManager

In [2]:
dataset = StratifiedDataset()
dr = DataReader()

In [4]:
fm = FeatureManager(dataset, "train")

In [6]:
features_df= pd.read_feather(dr.get_preprocessed_data_path()/ "xgb_datasets" / "dataset_v4.feather")

In [7]:
import math
TRAIN_PERC = 0.8
VAL_PERC = 0.1
TEST_PERC = 0.1
unique_users = features_df[DEFAULT_USER_COL].unique()
train_len = math.ceil(len(unique_users)*TRAIN_PERC)
val_len =math.ceil(len(unique_users)*VAL_PERC)
test_len =math.ceil(len(unique_users)*TEST_PERC)

np.random.seed(RANDOM_SEED)
np.random.shuffle(unique_users)
train_users, val_users, test_users = unique_users[:train_len], unique_users[train_len:train_len+val_len], unique_users[train_len+val_len:]

In [8]:
train_df = features_df[features_df[DEFAULT_USER_COL].isin(train_users)]
val_df = features_df[features_df[DEFAULT_USER_COL].isin(val_users)]
test_df = features_df[features_df[DEFAULT_USER_COL].isin(test_users)]

In [9]:
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [22]:
model = xgb.XGBRanker(  
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:map',
    random_state=RANDOM_SEED, 
    learning_rate=1e-3,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=100, 
    subsample=0.75,
    n_gpu = -1,
    )

In [11]:
X_train = train_df.loc[:, ~train_df.columns.isin([DEFAULT_USER_COL, DEFAULT_ITEM_COL, "relevance"])]
Y_train = train_df["relevance"].copy().values
qid_train = train_df[DEFAULT_USER_COL].copy().values

X_val = val_df.loc[:, ~val_df.columns.isin([DEFAULT_USER_COL, DEFAULT_ITEM_COL, "relevance"])]
Y_val = val_df["relevance"].copy().values
qid_val = val_df[DEFAULT_USER_COL].copy().values

X_test = test_df.loc[:, ~test_df.columns.isin([DEFAULT_USER_COL, DEFAULT_ITEM_COL, "relevance"])]
Y_test = test_df["relevance"].copy().values
qid_test = test_df[DEFAULT_USER_COL].copy().values

In [13]:
X=pd.unique(train_df[DEFAULT_USER_COL])

In [14]:
X

array([     29,     129,     136, ..., 1128600, 1128602, 1128611])

In [12]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

In [26]:
score=[]
tree=[]
for train_user,test_user in kfold.split(features_df):
    train_df = features_df[features_df[DEFAULT_USER_COL].isin(train_user)]
    test_df = features_df[features_df[DEFAULT_USER_COL].isin(test_user)]

    X_train = train_df.loc[:, ~train_df.columns.isin([DEFAULT_USER_COL, DEFAULT_ITEM_COL, "relevance"])]
    Y_train = train_df["relevance"].copy().values
    qid_train = train_df[DEFAULT_USER_COL].copy().values

    X_test = test_df.loc[:, ~test_df.columns.isin([DEFAULT_USER_COL, DEFAULT_ITEM_COL, "relevance"])]
    Y_test = test_df["relevance"].copy().values
    qid_test = test_df[DEFAULT_USER_COL].copy().values  
    
    model.fit(X_train, Y_train, qid=qid_train, eval_set=[(X_test, Y_test)], eval_qid=[qid_test], eval_metric=["map@12"],
          verbose=True,  early_stopping_rounds=20)

    tree.append(model.best_ntree_limit)
    score.append(model.best_score)



  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "n_gpu" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-map@12:0.20345
[1]	validation_0-map@12:0.20434
[2]	validation_0-map@12:0.21022
[3]	validation_0-map@12:0.20832
[4]	validation_0-map@12:0.20794
[5]	validation_0-map@12:0.21070
[6]	validation_0-map@12:0.21135
[7]	validation_0-map@12:0.20963
[8]	validation_0-map@12:0.21166
[9]	validation_0-map@12:0.21082
[10]	validation_0-map@12:0.20971
[11]	validation_0-map@12:0.20995
[12]	validation_0-map@12:0.21015
[13]	validation_0-map@12:0.20917
[14]	validation_0-map@12:0.20836
[15]	validation_0-map@12:0.20858
[16]	validation_0-map@12:0.20962
[17]	validation_0-map@12:0.21007
[18]	validation_0-map@12:0.21052
[19]	validation_0-map@12:0.20997
[20]	validation_0-map@12:0.20981
[21]	

In [27]:
score

[0.218268,
 0.229106,
 0.207683,
 0.216236,
 0.219044,
 0.212351,
 0.224286,
 0.215131,
 0.22752,
 0.215142]

In [28]:
tree

[75, 75, 30, 56, 54, 72, 67, 51, 58, 7]