# Start

Start by reading the original data, label-encoding `Industry`, and keeping only the basic columns. TVC and ROE are removed because they are highly correlated with TRC and ROA, respectively. Rows with null values are dropped rather than (properly) imputed.

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from custom_aggregator import GroupStatsAggregator
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score


# Columns kept for modelling: the target ("Yt.1M") plus the basic features.
# TVC and ROE are left out because they are highly correlated with TRC and ROA.
selected_columns = ["Yt.1M", "Industry", "MR", "TRC", "BAB", "EV", "P/B", "PSR", "ROA", "C/A", "D/A", "PG", "AG"]
data = pd.read_csv("homework_1.csv", encoding="latin-1")[selected_columns]

# EV is written with "," as a thousands separator and PSR can contain spreadsheet
# error tokens such as "#DIV/0!". Coerce both to float; anything that cannot be
# parsed becomes NaN and is removed by the dropna() below. The astype(str) keeps
# the .str accessor usable even if pandas already parsed EV as numeric.
data = data.assign(
    EV=pd.to_numeric(data["EV"].astype(str).str.replace(",", "", regex=False), errors="coerce"),
    PSR=pd.to_numeric(data["PSR"], errors="coerce"),
)
data = data.dropna()

# Encode Industry only after incomplete rows are gone, so a missing Industry is
# dropped like any other null instead of being turned into its own label.
encoder = LabelEncoder()
data = data.assign(Industry=encoder.fit_transform(data["Industry"]))


In [2]:
from custom_aggregator import DataFrameWrapper
from catboost import CatBoostRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import RFE

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

def prepare_model_pipeline(iterations=100,
                           depth=3,
                           lr=0.1,
                           n_clusters=8,
                           n_neighbors=3,
                           n_features_to_select=12):
    """Assemble the (unfitted) modelling pipeline.

    Steps, in order:
      1. ``gsa``        - GroupStatsAggregator(n_clusters, n_neighbors)
      2. ``preprocess`` - every numeric column is fed through six transformers
                          side by side (FeatureUnion); remaining columns pass through
      3. ``rfe``        - recursive feature elimination driven by a CatBoost regressor
      4. ``regressor``  - the final CatBoost regressor

    Parameters
    ----------
    iterations, depth, lr
        CatBoost ``iterations``, ``depth`` and ``learning_rate``; the same values
        are used for the RFE estimator and the final regressor.
    n_clusters, n_neighbors
        Forwarded to GroupStatsAggregator.
    n_features_to_select
        Forwarded to RFE.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    base_features = ['MR', 'TRC', 'BAB', 'EV', 'P/B', 'PSR', 'ROA', 'C/A', 'D/A', 'PG', 'AG']
    # Raw features first, then one "<group>-<feature>-mean" column per grouping.
    # NOTE(review): presumably these group-mean columns are what the "gsa" step
    # adds - confirm against custom_aggregator.
    group_prefixes = ('Industry-cluster', 'Industry', 'cluster')
    numeric_columns = base_features + [
        f'{prefix}-{feature}-mean'
        for prefix in group_prefixes
        for feature in base_features
    ]

    def wrap(transformer):
        # Every branch of the union operates on the same numeric column set.
        return DataFrameWrapper(transformer, columns=numeric_columns)

    def make_catboost():
        # A fresh, identically configured regressor for RFE and for the final step.
        return CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=lr, verbose=0)

    feature_union = FeatureUnion([
        ('mms', wrap(MinMaxScaler())),
        ('ss', wrap(StandardScaler())),
        ('rs', wrap(RobustScaler())),
        ('qt', wrap(QuantileTransformer(n_quantiles=100, output_distribution='normal'))),
        ('kbd', wrap(KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform'))),
        ('svd', wrap(TruncatedSVD(n_components=7))),
    ]).set_output(transform="pandas")

    column_transformer = ColumnTransformer(
        [('num', DataFrameWrapper(feature_union), numeric_columns)],
        remainder="passthrough",
    )
    column_transformer.set_output(transform="pandas")

    return Pipeline([
        ("gsa", GroupStatsAggregator(n_clusters=n_clusters, n_neighbors=n_neighbors)),
        ("preprocess", DataFrameWrapper(column_transformer)),
        ("rfe", RFE(estimator=make_catboost(), n_features_to_select=n_features_to_select)),
        ("regressor", make_catboost()),
    ])

# Baseline pipeline built with the function's default hyperparameters.
model = prepare_model_pipeline()

In [None]:
X = data.drop(columns=["Yt.1M"])
y = data["Yt.1M"]
# 5-fold CV. The scorer returns *negated* RMSE (so that higher is better),
# one value per fold.
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
# Flip the sign back and end the cell on a summary so the run reports its result.
pd.Series(-cv_scores, name="RMSE").agg(["mean", "std"])

In [None]:
# This part can be used to see the selected features.

# from sklearn.metrics import mean_squared_error, mean_absolute_error
# from sklearn.model_selection import KFold
# kf = KFold(n_splits=3, shuffle=True, random_state=42)
# # Loop over each fold for cross-validation
# fold_metrics = []
# fold_counter = 1
# x = data.drop(columns=["Yt.1M"])
# y = data["Yt.1M"]
# params = {
#     'objective': 'regression',
#     'metric': 'rmse',
#     'learning_rate': 0.05,
#     'num_leaves': 31,
#     'verbose': -1
# }
# for train_index, test_index in kf.split(x):
#     X_train, X_test = x.iloc[train_index], x.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
#     model = prepare_model_pipeline()
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     mse = mean_squared_error(y_test, y_pred)
#     mae = mean_absolute_error(y_test, y_pred)
#     rmse = np.sqrt(mse)
#     fold_metrics.append({
#     'Fold': fold_counter,
#     'MSE': mse,
#     'RMSE': rmse,
#     'MAE': mae
#     })
#     print(f"Fold {fold_counter} -- MSE: {mse:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}")

# selector = model.named_steps['rfe']
# fu = model.named_steps["preprocess"]
# columns = np.asanyarray(fu.final_columns.tolist())
# selection_mask = selector.support_
# print(columns[selection_mask])

# Hyperparameter tuning

In [8]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of one sampled pipeline.

    Parameters
    ----------
    trial
        Optuna trial used to sample the pipeline hyperparameters.

    Returns
    -------
    float
        Mean RMSE over the folds (lower is better).
    """
    X = data.drop(columns=["Yt.1M"])
    y = data["Yt.1M"]

    # Every key sampled here is consumed by prepare_model_pipeline() below.
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "depth": trial.suggest_int("depth", 3, 8),
        "iterations": trial.suggest_int("iterations", 50, 150),
        "n_clusters": trial.suggest_int("n_clusters", 4, 8),
        "n_neighbors": trial.suggest_int("n_neighbors", 1, 5),
        "n_features_to_select": trial.suggest_int("n_features_to_select", 5, 15),
    }
    model = prepare_model_pipeline(iterations=param["iterations"],
                                   depth=param["depth"],
                                   lr=param["learning_rate"],
                                   n_clusters=param["n_clusters"],
                                   n_neighbors=param["n_neighbors"],
                                   n_features_to_select=param["n_features_to_select"])
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
    # The scorer yields negated RMSE; negate again so the study can minimise it.
    return float(-np.mean(cv_scores))

In [None]:
import optuna

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across kernel restarts; TPE is also what create_study() uses by default.
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
# Each trial runs a full 5-fold CV (about 3 minutes in the logged run below),
# so lower n_trials for a quick pass.
study.optimize(objective, n_trials=500)

[I 2025-04-15 00:03:28,324] A new study created in memory with name: no-name-7aecfb8b-b402-46ac-bc4a-c13eb10959b9
[I 2025-04-15 00:06:25,357] Trial 0 finished with value: 0.12036309620828649 and parameters: {'learning_rate': 0.07592800733509816, 'depth': 6, 'iterations': 51, 'n_clusters': 6, 'n_neighbors': 4, 'n_features_to_select': 9}. Best is trial 0 with value: 0.12036309620828649.
