In [1]:
import numpy as np
import pandas as pd
from hyperopt import hp, tpe, Trials, fmin, STATUS_OK
import time
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm
from sklearn.linear_model import Ridge
import joblib


In [2]:
train_df_end = np.load("train_df_end.npy")
val_df_end = np.load("val_df_end.npy")
test_df_end = np.load("test_df_end.npy")

In [3]:
train_df = pd.read_csv("train_df.csv")
val_df = pd.read_csv("val_df.csv")

In [4]:
y_train = train_df['log_salary_from']
y_val = val_df['log_salary_from']

## CatBoostRegressor

In [5]:
print("\nЗапуск Hyperopt...")
start_time_hyperopt = time.time()

space = {
    'iterations': hp.quniform('iterations', 200, 1000, 50),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.9),
    'depth': hp.quniform('depth', 3, 5, 1),
    'l2_leaf_reg': hp.quniform('l2_leaf_reg', 0.1, 10.1, 1),
    'bagging_temperature': hp.quniform('bagging_temperature', 0.1, 1, 0.4)
}

def objective(params):
    model_params = {
        'iterations': int(params['iterations']),
        'depth': int(params['depth']),
        'learning_rate': params['learning_rate'],
        'l2_leaf_reg': params['l2_leaf_reg'],
        'bagging_temperature': params['bagging_temperature'],
        'od_wait': 100,
        'loss_function':'RMSE'
    }

    model = CatBoostRegressor(**model_params)

    scores = cross_val_score(model, train_df_end, y_train, 
                             cv=5, 
                             scoring='r2',
                             n_jobs=-1)
    
    r2_mean = float(np.mean(scores))

    hyperopt_pbar.update(1)

    return {'loss': -r2_mean, 'status': STATUS_OK, 'params': model_params}

n_iter_hyperopt = 15
hyperopt_pbar = tqdm(total=n_iter_hyperopt, desc="Hyperopt")

trials = Trials()
best_hyperopt = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=n_iter_hyperopt,
    trials=trials,
    verbose=0
)

hyperopt_pbar.close()

best_trial_idx = np.argmin([t['result']['loss'] for t in trials.trials])
best_params_hyperopt1 = trials.trials[best_trial_idx]['result']['params']

best_score_hyperopt = min([t['result']['loss'] for t in trials.trials])
time_hyperopt = time.time() - start_time_hyperopt

best_model_hyperopt = CatBoostRegressor(**best_params_hyperopt1)
best_model_hyperopt.fit(train_df_end, y_train)

y_test_pred_hyperopt = best_model_hyperopt.predict(val_df_end)

r2_val = r2_score(y_val, y_test_pred_hyperopt)

print(f"\nHyperopt завершен за {time_hyperopt:.2f} секунд")
print(f"Лучшие параметры: {best_params_hyperopt1}")
print(f"Лучший CV R2: {best_score_hyperopt:.4f}")
print(f"R2 на тестовых данных: {r2_val:.4f}")



Запуск Hyperopt...


Hyperopt: 100%|████████████████████████████████████████████████████████████████████████| 15/15 [21:49<00:00, 87.27s/it]


0:	learn: 0.5514309	total: 177ms	remaining: 2m 39s
1:	learn: 0.4838886	total: 208ms	remaining: 1m 33s
2:	learn: 0.4262820	total: 236ms	remaining: 1m 10s
3:	learn: 0.3790233	total: 258ms	remaining: 57.7s
4:	learn: 0.3370728	total: 278ms	remaining: 49.7s
5:	learn: 0.3031033	total: 297ms	remaining: 44.3s
6:	learn: 0.2771254	total: 316ms	remaining: 40.4s
7:	learn: 0.2519764	total: 336ms	remaining: 37.5s
8:	learn: 0.2337778	total: 355ms	remaining: 35.2s
9:	learn: 0.2170206	total: 375ms	remaining: 33.3s
10:	learn: 0.2044452	total: 394ms	remaining: 31.8s
11:	learn: 0.1937020	total: 417ms	remaining: 30.8s
12:	learn: 0.1858400	total: 436ms	remaining: 29.8s
13:	learn: 0.1794867	total: 457ms	remaining: 28.9s
14:	learn: 0.1746612	total: 477ms	remaining: 28.2s
15:	learn: 0.1709748	total: 496ms	remaining: 27.4s
16:	learn: 0.1680654	total: 516ms	remaining: 26.8s
17:	learn: 0.1660355	total: 535ms	remaining: 26.2s
18:	learn: 0.1638965	total: 554ms	remaining: 25.7s
19:	learn: 0.1623776	total: 573ms	rema

In [11]:
best_model_hyperopt(y_val, best_model_hyperopt.predict(val_df_end))

TypeError: 'CatBoostRegressor' object is not callable

In [9]:
best_model_hyperopt.save_model("catboost_model.cbm")

In [5]:
# loaded_model = CatBoostRegressor()
# loaded_model.load_model("catboost_model.cbm")

<catboost.core.CatBoostRegressor at 0x2edd3f68ec0>

## XGBRegressor

In [12]:
from xgboost import XGBRegressor

In [None]:
space = {
    'n_estimators': hp.quniform('n_estimators', 200, 1000, 50),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.5),
    'max_depth': hp.quniform('max_depth', 2, 6, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
}

def objective(params):
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth'])
    params['min_child_weight'] = int(params['min_child_weight'])

    model = XGBRegressor(
        **params,
        verbosity=0,
        use_label_encoder=False,
        objective='reg:squarederror'  
    )

    scores = cross_val_score(
        model, 
        train_df_end, 
        y_train,
        cv=5,
        scoring='r2',
        n_jobs=-1
    )

    r2_mean = float(np.mean(scores))

    hyperopt_pbar.update(1)

    return {'loss': -r2_mean, 'status': STATUS_OK, 'params': params}

n_iter_hyperopt = 10
hyperopt_pbar = tqdm(total=n_iter_hyperopt, desc="Hyperopt")

trials = Trials()
best_hyperopt = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=n_iter_hyperopt,
    trials=trials,
    verbose=0
)

hyperopt_pbar.close()

best_trial_idx = np.argmin([t['result']['loss'] for t in trials.trials])
best_params_hyperopt3 = trials.trials[best_trial_idx]['result']['params']

best_score_hyperopt = min([t['result']['loss'] for t in trials.trials])
time_hyperopt = time.time() - start_time_hyperopt

best_model_hyperopt_xgb = XGBRegressor(
    **best_params_hyperopt3,
    verbosity=0,
    use_label_encoder=False,
    objective='reg:squarederror'
)
best_model_hyperopt_xgb.fit(train_df_end, y_train)

y_test_pred_hyperopt = best_model_hyperopt_xgb.predict(val_df_end)
r2_test = r2_score(y_val, y_test_pred_hyperopt)

print(f"\nHyperopt завершен за {time_hyperopt:.2f} секунд")
print(f"Лучшие параметры: {best_params_hyperopt3}")
print(f"Лучший CV R2: {r2_test:.4f}")
print(f"R2 на тестовых данных: {r2_test:.4f}")


In [18]:
best_model_hyperopt_xgb.save_model("xgb_model.json")

In [13]:
# loaded_model = XGBRegressor()
# loaded_model.load_model("xgb_model.json")
# y_pred = loaded_model.predict(test_df_end)

# Ridge

In [24]:
model = Ridge(alpha=1, random_state=42)
model.fit(train_df_end, y_train)
val_preds = model.predict(val_df_end)
r2_score(y_val, val_preds)

0.7656355513902842

In [25]:
joblib.dump(model, "ridge_model.joblib")

['ridge_model.joblib']

In [20]:
# loaded_model = joblib.load("ridge_model.joblib")
# y_pred = loaded_model.predict(test_df_end)

## Meta-model

In [22]:
# model = joblib.load("ridge_model.joblib")

In [23]:
# best_model_hyperopt_xgb = XGBRegressor()
# best_model_hyperopt_xgb.load_model("xgb_model.json")

In [24]:
# best_model_hyperopt = CatBoostRegressor()
# best_model_hyperopt.load_model("catboost_model.cbm")

<catboost.core.CatBoostRegressor at 0x2ee18db0b90>

In [25]:
cb_predictions_train = best_model_hyperopt.predict(train_df_end) 
cb_predictions_val = best_model_hyperopt.predict(val_df_end)
cb_predictions_test = best_model_hyperopt.predict(test_df_end)

In [26]:
xgb_predictions_train = best_model_hyperopt_xgb.predict(train_df_end)
xgb_predictions_val = best_model_hyperopt_xgb.predict(val_df_end)
xgb_predictions_test = best_model_hyperopt_xgb.predict(test_df_end)

In [27]:
ridge_predictions_train = model.predict(train_df_end)
ridge_predictions_val = model.predict(val_df_end)
ridge_predictions_test = model.predict(test_df_end)

In [28]:
bert_predictions_train = np.load("predictions_train.npy")
bert_predictions_val = np.load("predictions_val.npy")
bert_predictions_test = np.load("predictions_test.npy")

In [29]:
meta_train = np.column_stack((bert_predictions_train, cb_predictions_train, xgb_predictions_train, ridge_predictions_train))
meta_val = np.column_stack((bert_predictions_val, cb_predictions_val, xgb_predictions_val, ridge_predictions_val))
meta_test = np.column_stack((bert_predictions_test, cb_predictions_test, xgb_predictions_test, ridge_predictions_test))

In [30]:
meta_model = Ridge(alpha=0.8, random_state=42)
meta_model.fit(meta_train, y_train)
val_preds = meta_model.predict(meta_val)
r2_score(y_val, val_preds)

0.7651521498583389

In [35]:
joblib.dump(meta_model, "meta_model.joblib")

['meta_model.joblib']

In [31]:
sam_sub = pd.read_csv('sample_submition.csv')

In [32]:
sam_sub['prediction'] = meta_model.predict(meta_test)

In [33]:
sam_sub.to_csv("sub.csv", index=False)