In [39]:
import pandas as pd

In [41]:
import numpy as np
import pickle
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
from catboost import CatBoostClassifier

In [42]:
import plotly.graph_objects as go

In [43]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from tqdm import tqdm

# Загружаем ранее посчитанные модели

In [56]:
with open('baseline.pickle', 'rb') as f:
    results = pickle.load(f)
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Best Params,Validation r2,Test R²,Test MAE,Test RMSE,Best Estimator
0,KNN,"{'regressor__n_neighbors': 7, 'regressor__p': ...",0.858822,0.861995,19.347841,28.108901,"(ColumnTransformer(transformers=[('num',\n ..."
1,ElasticNet,"{'regressor__alpha': 0.07, 'regressor__l1_rati...",0.91107,0.91427,16.065511,22.154541,"(ColumnTransformer(transformers=[('num',\n ..."
2,RandomForest,"{'regressor__max_depth': 23, 'regressor__n_est...",0.907528,0.905332,16.31732,23.280833,"(ColumnTransformer(transformers=[('num',\n ..."


In [8]:
with open('classification.pickle', 'rb') as f:
    classification_df = pickle.load(f)

In [9]:
with open('regression.pickle', 'rb') as f:
    models = pickle.load(f)

In [17]:
with open('x_test.pickle', 'rb') as f:
    X_test = pickle.load(f)

In [22]:
with open('y_test.pickle', 'rb') as f:
    y_test = pickle.load(f)

In [23]:
base_model = results["Best Estimator"][1]
base_model

0,1,2
,steps,"[('preprocessing', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,0.07
,l1_ratio,0.85
,fit_intercept,True
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,


### Используем модель, найденную классификацией

In [24]:
classification = 1
clf = classification_df["Best Estimator"][classification]
clf

0,1,2
,steps,"[('preprocessing', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [25]:
models[0]["Best Estimator"][1]

0,1,2
,steps,"[('preprocessing', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,0.15
,l1_ratio,0.75
,fit_intercept,True
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,


### Гибридная модель

In [26]:
hybrid_results = []

for classification in range(3):
    
    clf = classification_df["Best Estimator"][classification]
    probs = clf.predict_proba(X_test)
    y_class = clf.predict(X_test)
    
    for way in ['Pipeline', 'Soft Voting']:
        
        for regression in range(1, 3):
            
            y_pred = np.zeros(len(X_test))
            
            if way == 'Pipeline':
                for i in range(len(X_test)):
                    x_row = X_test.iloc[[i]]
                    y_pred[i] = models[y_class[i].item()]["Best Estimator"][regression].predict(x_row)[0]
            else:
                for i in range(len(X_test)):
                    x_row = X_test.iloc[[i]]
                    pred0 = models[0]["Best Estimator"][regression].predict(x_row)[0]
                    pred1 = models[1]["Best Estimator"][regression].predict(x_row)[0]
                    pred2 = models[2]["Best Estimator"][regression].predict(x_row)[0]
                    pred3 = models[3]["Best Estimator"][regression].predict(x_row)[0]
                    y_pred[i] = probs[i, 0] * pred0 + probs[i, 1] * pred1 + probs[i, 2] * pred2 + probs[i, 3] * pred3
            
            test_r2, mae, rmse   = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred), root_mean_squared_error(y_test, y_pred)
            
            hybrid_results.append({
                "Type of hybridization" : way,
                "Classification": classification_df["Model"][classification],
                "Model": models[0]["Model"][regression],
                "Test R²": test_r2,
                "Test MAE": mae,
                "Test RMSE": rmse
            })

In [59]:
hybrid_results_df = pd.DataFrame(hybrid_results)
hybrid_results_df.sort_values(by=[f"Test RMSE"]) 

Unnamed: 0,Type of hybridization,Classification,Model,Test R²,Test MAE,Test RMSE
6,Soft Voting,CatBoost,ElasticNet,0.928056,14.026155,20.295159
2,Soft Voting,RandomForest,ElasticNet,0.927301,14.126699,20.401443
10,Soft Voting,XGBoost,ElasticNet,0.923959,14.56128,20.865075
0,Pipeline,RandomForest,ElasticNet,0.919284,15.665157,21.496869
7,Soft Voting,CatBoost,RandomForest,0.915145,15.369713,22.041111
4,Pipeline,CatBoost,ElasticNet,0.914766,15.78643,22.090316
8,Pipeline,XGBoost,ElasticNet,0.911529,16.256698,22.505885
11,Soft Voting,XGBoost,RandomForest,0.911188,15.787296,22.549293
3,Soft Voting,RandomForest,RandomForest,0.905213,16.113788,23.295385
1,Pipeline,RandomForest,RandomForest,0.903512,17.010351,23.503479


### Вывод результатов baseline

In [58]:
results_df = pd.DataFrame(results).sort_values(by=[f"Test RMSE"])
results_df[['Model','Test R²','Test MAE','Test RMSE']]

Unnamed: 0,Model,Test R²,Test MAE,Test RMSE
1,ElasticNet,0.91427,16.065511,22.154541
2,RandomForest,0.905332,16.31732,23.280833
0,KNN,0.861995,19.347841,28.108901


### Прирост точности (в процентах)

Лучший 'Soft Voting' (CatBoost + ElasticNet) по сравнению с baseline

In [70]:
(0.928056 - 0.914270)/0.914270*100, (16.065511 - 14.026155)/14.026155*100, (22.154541 - 20.295159)/20.295159*100

(1.5078696665098894, 14.53966536089186, 9.161702059096932)

Лучший 'Pipeline' (CatBoost + RandomForest) по сравнению с baseline

In [71]:
(0.919284 - 0.914270)/0.914270*100, (16.065511 - 15.665157)/15.665157*100, (22.154541 - 21.496869)/21.496869*100

(0.5484156758944253, 2.5556973351751284, 3.0593850667276152)