In [1]:
import joblib

import pandas as pd
from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## Train/save the preprocessings and the model

In [2]:
PATH_TO_DATA = "train_data/train.csv"

target_col = 'binary_target'
categorical_cols = ['частота_пополнения']
continuous_cols = ['сумма', 'секретный_скор', "pack_freq", 'частота', 'доход']
drop_cols = ['client_id', 'mrg_',
            'регион', 'использование', 'on_net',
            'зона_1', 'зона_2', 'pack',
            'сегмент_arpu', 'объем_данных', 'продукт_1', 'продукт_2']

In [3]:
df = pd.read_csv(PATH_TO_DATA)

# Drop unnecessary columns
df = df.drop(columns=drop_cols)

# Split the data into features and target
X = df.drop(columns=[target_col])
y = df[target_col]

# Preprocess the data with SimpleImputer and other transformers
# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
        ]), categorical_cols),
        ('cont', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), continuous_cols)
    ])

In [4]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', verbosity=3, n_estimators=200))
])

In [5]:
# Fit the model
pipeline.fit(X, y)

# Save the model
joblib.dump(pipeline, 'app/models/xgboost_pipeline.pkl')

[23:27:14] AllReduce: 0.001136s, 1 calls @ 1136us

[23:27:14] MakeCuts: 0.001175s, 1 calls @ 1175us

[23:27:14] DEBUG: /Users/runner/work/xgboost/xgboost/src/gbm/gbtree.cc:130: Using tree method: 0
[23:27:16] Configure: 0.00018s, 1 calls @ 180us

[23:27:16] EvalOneIter: 0.001436s, 200 calls @ 1436us

[23:27:16] GetGradient: 0.352849s, 200 calls @ 352849us

[23:27:16] PredictRaw: 0.000725s, 200 calls @ 725us

[23:27:16] UpdateOneIter: 2.43734s, 200 calls @ 2437344us

[23:27:16] BoostNewTrees: 2.07612s, 200 calls @ 2076123us

[23:27:16] CommitModel: 0.000197s, 200 calls @ 197us

[23:27:16] BuildHistogram: 0.291251s, 1000 calls @ 291251us

[23:27:16] EvaluateSplits: 0.058669s, 1200 calls @ 58669us

[23:27:16] InitData: 0.507766s, 200 calls @ 507766us

[23:27:16] InitRoot: 0.339359s, 200 calls @ 339359us

[23:27:16] LeafPartition: 2.3e-05s, 200 calls @ 23us

[23:27:16] UpdatePosition: 0.806134s, 1200 calls @ 806134us

[23:27:16] UpdatePredictionCache: 0.068006s, 200 calls @ 68006us

[23:27

['app/models/xgboost_pipeline.pkl']