In [14]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, classification_report, log_loss
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier

In [15]:
df = pd.read_parquet('../data.parquet')
print(df.shape)
df.head()

(8051, 93)


Unnamed: 0,is_title_fight,winner,no_of_rounds,event_status,weight_class_bantamweight,weight_class_catch_weight,weight_class_featherweight,weight_class_flyweight,weight_class_heavyweight,weight_class_light_heavyweight,...,b_wins_by_ko_tko,b_wins_by_sub,b_wins_by_m_dec,b_wins_by_s_dec,b_wins_by_u_dec,b_age,b_is_debut,delta_age,delta_height,delta_reach
0,False,Red,3.0,completed,False,False,False,False,False,False,...,0,0,0,0,0,25,1,5,2.0,3.0
1,False,Red,3.0,completed,False,False,False,False,False,False,...,0,0,0,0,0,32,1,-2,6.0,0.0
2,False,Red,3.0,completed,False,False,False,False,True,False,...,0,0,0,0,0,23,1,3,-2.0,0.0
3,True,Red,5.0,completed,False,False,False,False,False,False,...,0,0,0,0,0,25,1,1,0.0,0.0
4,False,Red,2.0,completed,False,False,False,False,False,False,...,0,0,0,0,0,27,1,-3,-4.0,-3.0


In [16]:
le = LabelEncoder()

TARGET = 'winner'

df[TARGET] = le.fit_transform(df[TARGET])

### Train/validation split

In [17]:
mask_completed = df["event_status"] == "completed"
mask_upcoming = df["event_status"] == "upcoming"

y = df.loc[mask_completed, TARGET]

X_completed = (
    df
    .loc[mask_completed]
    .drop(columns=[TARGET, 'event_status'])
)

X_upcoming = (
  df
    .loc[mask_upcoming]
    .drop(columns=[TARGET, 'event_status'])
)

In [18]:
n = len(df)
print(len(df))
split_idx = int(n * 0.85)
train_df = df.iloc[:split_idx]
test_df = df.iloc[split_idx:]

8051


In [19]:
X_train = X_completed.iloc[:split_idx]
X_test = X_completed.iloc[split_idx:]

y_train = y.iloc[:split_idx]
y_test = y.iloc[split_idx:]

In [20]:
print(f'X_train : {X_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'y_test : {y_test.shape}')

X_train : (6843, 91)
y_train : (6843,)
X_test : (1195, 91)
y_test : (1195,)


In [21]:
assert train_df.index.max() < test_df.index.min()
assert TARGET not in X_train.columns

print(y_train.unique())
print(y_train.mean(), y_test.mean())

[1 0]
0.6432851088703785 0.5631799163179916


In [22]:
# numeric_cols = X_train.select_dtypes(include=["number", "bool"]).columns
# categorical_cols = X_train.select_dtypes(exclude=["number", "bool"]).columns

# numeric_transformer = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="median")),
# ])

# categorical_transformer = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("onehot", OneHotEncoder(handle_unknown="ignore"))
# ])

# preprocess = ColumnTransformer(
#     transformers=[
#         ("num", numeric_transformer, numeric_cols),
#         ("cat", categorical_transformer, categorical_cols),
#     ],
#     remainder="drop"
# )

In [23]:
gb = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_depth=6,
    max_iter=500,
    min_samples_leaf=20,
    l2_regularization=1.0,
    random_state=42
)

In [24]:
clf = Pipeline(steps=[
    ("model", gb),
])

clf.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('model', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"loss  loss: {'log_loss'}, default='log_loss' The loss function to use in the boosting process. For binary classification problems, 'log_loss' is also known as logistic loss, binomial deviance or binary crossentropy. Internally, the model fits one tree per boosting iteration and uses the logistic sigmoid function (expit) as inverse link function to compute the predicted positive class probability. For multiclass classification problems, 'log_loss' is also known as multinomial deviance or categorical crossentropy. Internally, the model fits one tree per boosting iteration and per class and uses the softmax function as inverse link function to compute the predicted probabilities of the classes.",'log_loss'
,"learning_rate  learning_rate: float, default=0.1 The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no shrinkage.",0.05
,"max_iter  max_iter: int, default=100 The maximum number of iterations of the boosting process, i.e. the maximum number of trees for binary classification. For multiclass classification, `n_classes` trees per iteration are built.",500
,"max_leaf_nodes  max_leaf_nodes: int or None, default=31 The maximum number of leaves for each tree. Must be strictly greater than 1. If None, there is no maximum limit.",31
,"max_depth  max_depth: int or None, default=None The maximum depth of each tree. The depth of a tree is the number of edges to go from the root to the deepest leaf. Depth isn't constrained by default.",6
,"min_samples_leaf  min_samples_leaf: int, default=20 The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value since only very shallow trees would be built.",20
,"l2_regularization  l2_regularization: float, default=0 The L2 regularization parameter penalizing leaves with small hessians. Use ``0`` for no regularization (default).",1.0
,"max_features  max_features: float, default=1.0 Proportion of randomly chosen features in each and every node split. This is a form of regularization, smaller values make the trees weaker learners and might prevent overfitting. If interaction constraints from `interaction_cst` are present, only allowed features are taken into account for the subsampling. .. versionadded:: 1.4",1.0
,"max_bins  max_bins: int, default=255 The maximum number of bins to use for non-missing values. Before training, each feature of the input array `X` is binned into integer-valued bins, which allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin is always reserved for missing values. Must be no larger than 255.",255
,"categorical_features  categorical_features: array-like of {bool, int, str} of shape (n_features) or shape (n_categorical_features,), default='from_dtype' Indicates the categorical features. - None : no feature will be considered categorical. - boolean array-like : boolean mask indicating categorical features. - integer array-like : integer indices indicating categorical  features. - str array-like: names of categorical features (assuming the training  data has feature names). - `""from_dtype""`: dataframe columns with dtype ""category"" are  considered to be categorical features. The input must be an object  exposing a ``__dataframe__`` method such as pandas or polars  DataFrames to use this feature. For each categorical feature, there must be at most `max_bins` unique categories. Negative values for categorical features encoded as numeric dtypes are treated as missing values. All categorical values are converted to floating point numbers. This means that categorical values of 1.0 and 1 are treated as the same category. Read more in the :ref:`User Guide `. .. versionadded:: 0.24 .. versionchanged:: 1.2  Added support for feature names. .. versionchanged:: 1.4  Added `""from_dtype""` option. .. versionchanged:: 1.6  The default value changed from `None` to `""from_dtype""`.",'from_dtype'


In [25]:
proba = clf.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

print("AUC:", roc_auc_score(y_test, proba))
print("LogLoss:", log_loss(y_test, proba))
print(classification_report(y_test, pred, digits=4))

AUC: 0.6351243645141273
LogLoss: 0.6873486360022231
              precision    recall  f1-score   support

           0     0.5947    0.4272    0.4972       522
           1     0.6354    0.7741    0.6979       673

    accuracy                         0.6226      1195
   macro avg     0.6150    0.6007    0.5976      1195
weighted avg     0.6176    0.6226    0.6102      1195

