In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from custom_aggregator import GroupStatsAggregator
from sklearn.preprocessing import LabelEncoder

# Load the dataset from disk (the CSV is read as latin-1).
data = pd.read_csv("final.csv", encoding="latin-1")

# Replace the Industry labels with integer codes; the fitted encoder stays
# available under the name `encoder`.
encoder = LabelEncoder()
industry_codes = encoder.fit_transform(data["Industry"])

# Swap in the encoded column, then discard the "Unnamed: 0" column and the
# "cluster" column shipped with the file.
data = (
    data
    .assign(Industry=industry_codes)
    .drop(columns=["Unnamed: 0", "cluster"])
)

In [37]:
from custom_aggregator import DataFrameWrapper
from catboost import CatBoostRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import RFE

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

# Base feature names, followed by one "<group>-<feature>-mean" column per
# (group, feature) pair. Groups are iterated in the outer loop so the resulting
# order is: all base features, then every Industry-cluster mean, every Industry
# mean, and every cluster mean.
base_features = ['MR', 'TRC', 'BAB', 'EV', 'P/B', 'PSR', 'ROA', 'C/A', 'D/A', 'PG', 'AG']
group_keys = ['Industry-cluster', 'Industry', 'cluster']
numeric_columns = base_features + [
    f'{group}-{feature}-mean'
    for group in group_keys
    for feature in base_features
]
# One seed shared by every stochastic component (QuantileTransformer
# subsampling, TruncatedSVD's randomized solver, both CatBoost models) so that
# re-running the notebook reproduces the same scores.
RANDOM_SEED = 42

# Each entry produces a differently scaled / encoded view of the same numeric
# columns; FeatureUnion concatenates all of these views side by side.
transforms = [
    ('mms', DataFrameWrapper(MinMaxScaler(), columns=numeric_columns)),
    ('ss', DataFrameWrapper(StandardScaler(), columns=numeric_columns)),
    ('rs', DataFrameWrapper(RobustScaler(), columns=numeric_columns)),
    ('qt', DataFrameWrapper(
        QuantileTransformer(n_quantiles=100, output_distribution='normal', random_state=RANDOM_SEED),
        columns=numeric_columns,
    )),
    ('kbd', DataFrameWrapper(
        KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform'),
        columns=numeric_columns,
    )),
    ('svd', DataFrameWrapper(
        TruncatedSVD(n_components=7, random_state=RANDOM_SEED),
        columns=numeric_columns,
    )),
]
fu = FeatureUnion(transforms).set_output(transform="pandas")
fu = DataFrameWrapper(fu)

# NOTE(review): cat_columns is not referenced anywhere in this cell; it is kept
# in case another cell relies on it.
cat_columns = ["Industry", "cluster"]

# Only numeric_columns are routed into the union; no other columns are listed,
# so ColumnTransformer's default remainder='drop' discards the rest.
preprocessor = ColumnTransformer([
    ('num', fu, numeric_columns)
])
preprocessor.set_output(transform="pandas")
wrapped_preprocessor = DataFrameWrapper(preprocessor)

# A shallow CatBoost model ranks features for recursive elimination down to 8.
rfe_estimator = CatBoostRegressor(
    iterations=100, depth=3, learning_rate=0.1, verbose=0, random_seed=RANDOM_SEED
)
rfe = RFE(estimator=rfe_estimator, n_features_to_select=8)

# Full pipeline: group statistics -> feature views -> feature selection -> regressor.
steps = [
    ("gsa", GroupStatsAggregator()),
    ("preprocess", wrapped_preprocessor),
    ("rfe", rfe),
    ("regressor", CatBoostRegressor(
        iterations=100, depth=5, learning_rate=0.1, verbose=0, random_seed=RANDOM_SEED
    )),
]
model = Pipeline(steps)

In [38]:
from sklearn.model_selection import cross_val_score
# Split the frame into the target ("Yt.1M") and everything else as features.
target_column = "Yt.1M"
y = data[target_column]
X = data.drop(columns=[target_column])

# 5-fold cross-validation of the whole pipeline using all available cores.
# The scorer is negated RMSE, so values closer to zero are better.
cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
print(cv_scores)

[-0.09586233 -0.09833973 -0.11007229 -0.15970213 -0.15227857]


In [None]:

# cross_val_score only fits clones of the pipeline, so `model` itself is still
# unfitted at this point and its RFE step has no support mask yet. Fit it on
# the full data before inspecting the selected features.
model.fit(X, y)

selector = model.named_steps["rfe"]
# Boolean mask over the preprocessed features: True marks a feature RFE kept.
print(selector.get_support())
# regressor = model.named_steps["regressor"]
# feature_importances = regressor.get_feature_importance()
# feature_names = regressor.feature_names_
# importance_df = pd.DataFrame({
#     'Feature': feature_names,
#     'Importance': feature_importances
# }).sort_values(by='Importance', ascending=False)

# print(importance_df)

[-0.0928226  -0.09991592 -0.11065924 -0.16234354 -0.15295522]
