In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from custom_aggregator import GroupStatsAggregator

# Load the modelling table in a single chain: read the CSV, mark the two
# grouping columns as categoricals, and discard the index column that an
# earlier `to_csv` call left behind as "Unnamed: 0".
data = (
    pd.read_csv("final.csv", encoding="latin-1")
    .astype({"Industry": "category", "cluster": "category"})
    .drop(columns=["Unnamed: 0"])
)

In [2]:
from catboost import CatBoostRegressor
from sklearn.base import BaseEstimator, RegressorMixin
class CatBoostWrapper(BaseEstimator, RegressorMixin):
    """scikit-learn compatible wrapper around ``CatBoostRegressor``.

    Parameters
    ----------
    cat_features : list
        Forwarded as ``cat_features`` to ``CatBoostRegressor.fit``.
    **kparams
        Keyword arguments forwarded unchanged to the ``CatBoostRegressor``
        constructor each time ``fit`` is called.

    Attributes
    ----------
    model_ : CatBoostRegressor
        The fitted regressor; only available after ``fit``.
    model : CatBoostRegressor
        Read-only alias of ``model_`` kept for code that used the old name.
    """

    def __init__(self, cat_features, **kparams):
        self.kparams = kparams
        self.cat_features = cat_features

    def get_params(self, deep=True):
        """Return ``cat_features`` plus every forwarded CatBoost keyword.

        ``BaseEstimator.get_params`` only inspects named ``__init__``
        arguments and skips ``**kparams``. Without this override
        ``sklearn.base.clone`` (used by ``cross_val_score``, grid search, ...)
        rebuilds the wrapper with an empty ``kparams`` and the configured
        hyper-parameters are silently lost.
        """
        params = {"cat_features": self.cat_features}
        params.update(self.kparams)
        return params

    def set_params(self, **params):
        """Set ``cat_features`` and/or CatBoost keyword arguments; return self."""
        if "cat_features" in params:
            self.cat_features = params.pop("cat_features")
        self.kparams.update(params)
        return self

    @property
    def model(self):
        """Backward-compatible alias for the fitted ``model_`` attribute."""
        return self.model_

    def fit(self, X, y):
        """Fit a fresh ``CatBoostRegressor`` on ``X``/``y`` and return self."""
        # Trailing underscore follows the scikit-learn convention for state
        # learned during fit, and matches the name that predict() reads.
        self.model_ = CatBoostRegressor(**self.kparams)
        self.model_.fit(X, y, cat_features=self.cat_features, verbose=0)
        return self

    def predict(self, X):
        """Return the fitted regressor's predictions for ``X``."""
        return self.model_.predict(X)

In [3]:

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer

# Raw numeric features present in the input frame.
base_numeric_columns = ['MR', 'TRC', 'BAB', 'EV', 'P/B', 'PSR', 'ROA', 'C/A', 'D/A', 'PG', 'AG']
# Categorical columns of the input frame (see X.info(): "Industry", "cluster").
categorical_columns = ["Industry", "cluster"]

# Group-statistic columns follow the pattern "<group>-<feature>-<stat>".
# NOTE(review): presumably these are produced by GroupStatsAggregator - confirm
# against custom_aggregator.py.
group_prefixes = ['Industry-cluster', 'Industry', 'cluster']
group_stats = ['mean', 'median', 'max']
numeric_columns = base_numeric_columns + [
    f"{prefix}-{column}-{stat}"
    for prefix in group_prefixes
    for column in base_numeric_columns
    for stat in group_stats
]
assert len(set(numeric_columns)) == len(numeric_columns), "duplicate numeric column names"

# Several scalings/encodings of the same numeric block, stacked side by side.
transforms = [
    ('mms', MinMaxScaler()),
    ('ss', StandardScaler()),
    ('rs', RobustScaler()),
    ('qt', QuantileTransformer(n_quantiles=100, output_distribution='normal')),
    ('kbd', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')),
    ('svd', TruncatedSVD(n_components=7)),
]
fu = FeatureUnion(transforms)

# ColumnTransformer drops every column that is not listed (remainder='drop'),
# so the categorical columns must be passed through explicitly or the regressor
# never sees them. Pandas output with non-prefixed names keeps "Industry" and
# "cluster" addressable by name in the final step.
# NOTE(review): assumes GroupStatsAggregator.transform returns a DataFrame that
# still contains the categorical columns - confirm.
preprocessor = ColumnTransformer(
    [
        ('num', fu, numeric_columns),
        ('cat', 'passthrough', categorical_columns),
    ],
    verbose_feature_names_out=False,
)
preprocessor.set_output(transform="pandas")

steps = [
    ("gsa", GroupStatsAggregator()),
    ("preprocess", preprocessor),
    # The second categorical feature is "cluster"; the frame has no "Category" column.
    ("regressor", CatBoostWrapper(categorical_columns, iterations=100, depth=5, learning_rate=0.1, verbose=0)),
]
model = Pipeline(steps)

In [4]:
from sklearn.model_selection import cross_val_score
# Split the table into features and the target column "Yt.1M".
X = data.drop(columns=["Yt.1M"])
y = data["Yt.1M"]
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
X.info()
# 5-fold cross-validation; scores are negated RMSE (closer to 0 is better).
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Industry  911 non-null    category
 1   MR        911 non-null    float64 
 2   TRC       911 non-null    float64 
 3   BAB       911 non-null    float64 
 4   EV        911 non-null    float64 
 5   P/B       911 non-null    float64 
 6   PSR       911 non-null    float64 
 7   ROA       911 non-null    float64 
 8   C/A       911 non-null    float64 
 9   D/A       911 non-null    float64 
 10  PG        911 non-null    float64 
 11  AG        911 non-null    float64 
 12  cluster   911 non-null    category
dtypes: category(2), float64(11)
memory usage: 81.7 KB
None
       MR     TRC     BAB            EV   P/B      PSR   ROA     C/A    D/A  \
0 -0.3978 -0.6397  0.9093  1.476000e+09  1.92  47.3836  0.55  0.5335  50.90   
1 -0.3970 -0.3205  0.7637  7.002769e+10  6.59  27.1569  1.78  0.5480  23.16  

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\USER\.conda\envs\rust-pruning\lib\site-packages\pandas\core\indexes\base.py", line 3629, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas\_libs\index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Industry'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\USER\.conda\envs\rust-pruning\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\USER\.conda\envs\rust-pruning\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\USER\.conda\envs\rust-pruning\lib\site-packages\sklearn\pipeline.py", line 423, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\USER\.conda\envs\rust-pruning\lib\site-packages\sklearn\pipeline.py", line 377, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\USER\.conda\envs\rust-pruning\lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\USER\.conda\envs\rust-pruning\lib\site-packages\sklearn\pipeline.py", line 957, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\Users\USER\.conda\envs\rust-pruning\lib\site-packages\sklearn\utils\_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\USER\.conda\envs\rust-pruning\lib\site-packages\sklearn\base.py", line 919, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "c:\Users\USER\python\hw1_qa\custom_aggregator.py", line 46, in fit
    final_stats_df[col] = final_stats_df["Industry"].map(i_stats_df.set_index('Industry')[col])
  File "c:\Users\USER\.conda\envs\rust-pruning\lib\site-packages\pandas\core\frame.py", line 3505, in __getitem__
    indexer = self.columns.get_loc(key)
  File "c:\Users\USER\.conda\envs\rust-pruning\lib\site-packages\pandas\core\indexes\base.py", line 3631, in get_loc
    raise KeyError(key) from err
KeyError: 'Industry'


In [None]:
# Plot gain-based feature importance for `model`.
# NOTE(review): `xgb` is not imported in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel.
# NOTE(review): at this point `model` is bound to the scikit-learn Pipeline
# built earlier (CatBoost-based), not to an XGBoost model; this cell looks like
# a leftover from an XGBoost version of the notebook - confirm which model is
# intended before re-enabling it.
xgb.plot_importance(model, importance_type="gain")
plt.title("Feature Importance")
plt.show()

In [None]:
# Build a SHAP explainer for `model` and draw the summary plot.
# NOTE(review): `X_train` is not defined at notebook scope in any visible cell
# (it only exists as a local inside `objective`), so this cell raises NameError
# on a fresh kernel - define the training frame explicitly before this cell.
# NOTE(review): `model` is the scikit-learn Pipeline at this point; confirm
# that shap.Explainer is being handed the estimator that was intended.
import shap
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_train)
shap.summary_plot(shap_values, X_train)

In [None]:
# Force plot explaining the first row of `shap_values`, rendered with matplotlib.
# (A dangling, unterminated `print(` used to follow this call and made the
# whole cell a SyntaxError; it has been removed.)
shap.plots.force(shap_values[0], matplotlib=True)

In [None]:
def objective(trial):
    """Optuna objective: mean RMSE of an ``XGBRegressor`` over the ``kf`` folds.

    Hyper-parameters are sampled from ``trial``; for every
    ``(train, test)`` split produced by ``kf.split(X)`` a new regressor is
    fitted on the training rows of the notebook-level ``X``/``y`` and scored on
    the held-out rows. The average of the per-fold RMSE values is returned.
    """
    # Settings that stay constant across trials.
    fixed_params = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        "booster": "gbtree",
        "enable_categorical": True,
    }
    # Settings sampled by Optuna for this trial.
    tuned_params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
    }
    params = {**fixed_params, **tuned_params}

    def _fold_rmse(train_rows, test_rows):
        # Fit on the training rows and return the RMSE on the held-out rows.
        features_fit, features_eval = X.iloc[train_rows], X.iloc[test_rows]
        target_fit, target_eval = y.iloc[train_rows], y.iloc[test_rows]
        regressor = XGBRegressor(**params)
        regressor.fit(features_fit, target_fit)
        predictions = regressor.predict(features_eval)
        return np.sqrt(mean_squared_error(target_eval, predictions))

    fold_scores = [_fold_rmse(train_rows, test_rows) for train_rows, test_rows in kf.split(X)]
    return np.mean(fold_scores)

In [None]:
import optuna
# Number of Optuna trials to run; raise it for a more thorough search.
N_TRIALS = 1000

# `objective` returns a mean RMSE, so the study searches for the minimum.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=N_TRIALS)

In [None]:
# Show how the best objective value evolved over the trials of `study`.
# Alternative view, currently disabled:
#   optuna.visualization.plot_param_importances(study).show()
history_figure = optuna.visualization.plot_optimization_history(study)
history_figure.show()