In [21]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_parquet('../data/train/train_sample.parquet')

In [3]:
train_df['inventory_id'] = train_df['inventory_id'].astype("category")
##train_df['inventory_id'].describe()

In [23]:
print(train_df.dtypes)

gender               str
age_group            str
inventory_id    category
day_of_week          str
hour                 str
                  ...   
seq_len            int64
unique_pages       int64
last_page          int64
first_page         int64
repeat_ratio     float64
Length: 123, dtype: object


In [5]:
def featurize(seq):
    pages = list(map(int, seq.split(",")))

    return {
        "seq_len": len(pages),
        "unique_pages": len(set(pages)),
        "last_page": pages[-1],
        "first_page": pages[0],
        #"has_page_75": int(75 in pages),
        "repeat_ratio": 1 - len(set(pages)) / len(pages)
    }

In [6]:
features = train_df["seq"].apply(featurize)

In [8]:
df_features = pd.DataFrame(features.tolist())

In [14]:
train_df.drop(columns=['seq'],inplace = True)

In [17]:
train_df = pd.concat([train_df, df_features], axis=1)

In [19]:
train_df.head()

Unnamed: 0,gender,age_group,inventory_id,day_of_week,hour,l_feat_1,l_feat_2,l_feat_3,l_feat_4,l_feat_5,...,history_b_27,history_b_28,history_b_29,history_b_30,clicked,seq_len,unique_pages,last_page,first_page,repeat_ratio
0,1.0,8.0,36,3,22,1.0,2.0,1.0,7.0,675.0,...,0.026163,0.008176,0.021258,0.053965,0,843,70,479,9,0.916963
1,1.0,7.0,2,1,11,2.0,2.0,2.0,10.0,439.0,...,1.10345,0.344828,0.896553,0.568966,0,35,14,479,57,0.6
2,1.0,7.0,2,4,14,2.0,2.0,3.0,7.0,728.0,...,0.074854,0.023392,0.060819,0.038597,0,454,54,479,9,0.881057
3,1.0,7.0,2,4,6,2.0,2.0,2.0,1.0,615.0,...,0.065843,0.020576,0.053498,0.067901,0,225,40,479,9,0.822222
4,2.0,7.0,37,7,20,2.0,2.0,3.0,5.0,280.0,...,0.046112,0.01441,0.037466,0.023776,0,726,84,35,144,0.884298


In [22]:
X = train_df.drop(columns=['clicked'])
y = train_df['clicked']

In [24]:
cat_cols = X.select_dtypes(include="object").columns

for col in cat_cols:
    X[col] = X[col].astype("category")

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include="object").columns


In [34]:
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Luego separas train y validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42
)

In [30]:
model = xgb.XGBClassifier(
    tree_method="hist",
    enable_categorical=True,
    max_depth=6,
    n_estimators=500,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="auc",
    early_stopping_rounds=50,  
    random_state=42
)

In [37]:
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)]
)

[0]	validation_0-auc:0.69514	validation_1-auc:0.69555
[1]	validation_0-auc:0.70352	validation_1-auc:0.70288
[2]	validation_0-auc:0.71071	validation_1-auc:0.70989
[3]	validation_0-auc:0.71389	validation_1-auc:0.71329
[4]	validation_0-auc:0.71590	validation_1-auc:0.71533
[5]	validation_0-auc:0.71762	validation_1-auc:0.71683
[6]	validation_0-auc:0.71828	validation_1-auc:0.71741
[7]	validation_0-auc:0.71927	validation_1-auc:0.71828
[8]	validation_0-auc:0.72000	validation_1-auc:0.71877
[9]	validation_0-auc:0.72099	validation_1-auc:0.71958
[10]	validation_0-auc:0.72148	validation_1-auc:0.71996
[11]	validation_0-auc:0.72237	validation_1-auc:0.72064
[12]	validation_0-auc:0.72283	validation_1-auc:0.72091
[13]	validation_0-auc:0.72324	validation_1-auc:0.72131
[14]	validation_0-auc:0.72381	validation_1-auc:0.72180
[15]	validation_0-auc:0.72420	validation_1-auc:0.72209
[16]	validation_0-auc:0.72476	validation_1-auc:0.72248
[17]	validation_0-auc:0.72538	validation_1-auc:0.72302
[18]	validation_0-au

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'binary:logistic'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.8
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",50
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,True


In [38]:
print(roc_auc_score(y_test, model.predict_proba(X_test)[:,1]))

0.7417952843252247


In [40]:
print(roc_auc_score(y_val, model.predict_proba(X_val)[:,1]))

0.7420259327077228
