In [None]:

import polars as pl

# Load the cleaned dataset from the Kaggle input mount.
source_parquet = "/kaggle/input/stock-market-1/final_cleaned.parquet"
df = pl.read_parquet(source_parquet)
print("loading finished")

# Export the same frame as a CSV file in the working directory.
csv_target = "stock_market_1.csv"
df.write_csv(csv_target)


In [None]:
# List every column name of the loaded frame, one per line.
for column_name in df.columns:
    print(column_name)

In [None]:
# Preview each column on its own: a header line, its first five values, then a spacer line.
columns = df.columns
for column_name in columns:
    first_rows = df.select(column_name).head(5)
    print(f"Column: {column_name}")
    print(first_rows)
    print()

In [None]:
!zip stock_market_1_csv.zip stock_market_1.csv

### Testing on sample of 200 companies

In [None]:
import polars as pl

# Load the 200-company sample from the Kaggle input mount.
df_sam = pl.read_csv("/kaggle/input/stock-market-200-sample/top_200_companies.csv")

# Pair every column name with its dtype, in column order.
column_types = list(df_sam.schema.items())

# Emit one "name: dtype" line per column.
for name, dtype in column_types:
    print(f"{name}: {dtype}")


In [None]:
# Step 2: Identify unique companies
# maintain_order=True returns the IDs in first-appearance order. Without it the order of
# unique() is not guaranteed, which makes the row order of the concatenated train/test
# frames built from this list differ between runs.
company_ids = (
    df_sam.select("Company_ID").unique(maintain_order=True).to_series().to_list()
)
print(f"üè¢ Found {len(company_ids)} unique companies.")

# Step 3: Define function to create target columns for one company
def create_targets(df_company, n_targets=10):
    """Append forward-looking close prices as target columns for one company.

    For every step ``i`` in ``1..n_targets`` a column ``close_{i}`` is added that
    holds ``Close_x`` shifted up by ``i`` rows, i.e. the value found ``i`` rows
    later. The rows are used in the order given, so the caller must sort the
    frame chronologically first.

    Args:
        df_company: Polars DataFrame containing a ``Close_x`` column.
        n_targets: Number of look-ahead target columns to create.

    Returns:
        The frame with the target columns appended and the last ``n_targets``
        rows removed (those rows cannot have all targets). A frame with fewer
        than ``n_targets`` rows yields an empty frame with the full schema.
    """
    future_closes = [
        pl.col("Close_x").shift(-step).alias(f"close_{step}")
        for step in range(1, n_targets + 1)
    ]
    # All shifts read the untouched Close_x column, so they can be added in one pass.
    if future_closes:
        df_company = df_company.with_columns(future_closes)

    # Clamp at zero: a short history must produce an empty frame, never a negative length.
    rows_to_keep = max(df_company.height - n_targets, 0)
    return df_company.slice(0, rows_to_keep)

# Step 4: Build the look-ahead targets one company at a time
processed_dfs = []
n_companies = len(company_ids)
print("üõ†Ô∏è Creating target variables and trimming last 10 rows per company...")
for idx, company_id in enumerate(company_ids):
    # Rows are sorted by Date so that the shifts inside create_targets look forward in time.
    company_rows = df_sam.filter(pl.col("Company_ID") == company_id).sort("Date")
    processed_dfs.append(create_targets(company_rows))

    done = idx + 1
    if done == n_companies or done % 20 == 0:
        print(f"‚úÖ Processed {done}/{n_companies} companies")

# Stack the per-company frames back into a single frame
df_all = pl.concat(processed_dfs)
print("üì¶ All companies processed and combined.")

# Step 5: Chronological split per company — first 80% of rows to train, last 20% to test
train_dfs, test_dfs = [], []

print("‚úÇÔ∏è Splitting into train/test (80/20) for each company...")
for idx, company_id in enumerate(company_ids):
    company_rows = df_all.filter(pl.col("Company_ID") == company_id).sort("Date")
    n_rows = company_rows.height
    n_test = int(n_rows * 0.2)
    n_train = n_rows - n_test

    train_dfs.append(company_rows.slice(0, n_train))
    test_dfs.append(company_rows.slice(n_train, n_test))

    done = idx + 1
    if done == n_companies or done % 20 == 0:
        print(f"üìä Split {done}/{n_companies} companies")

# Stack the per-company pieces into the final train and test frames
df_train = pl.concat(train_dfs)
df_test = pl.concat(test_dfs)

print(f"‚úÖ Train set size: {df_train.height}")
print(f"‚úÖ Test set size: {df_test.height}")

# Final output
print("üéâ Data preprocessing complete. Ready for modeling!")

In [None]:
df_train.columns

In [None]:
# Report the number of null values in every column of df_train.
# The column names are taken from df_train itself (previously df_test.columns), so the
# report always matches the frame being measured even if the two schemas ever differ.
for col in df_train.columns:
    missing_count = df_train[col].null_count()
    print(f"Column '{col}': {missing_count} missing values")

In [None]:
# Annotated reference of the LightGBM settings experimented with in this notebook.
# Stored as a dict so the cell is valid Python (a bare "(name=value, ...)" group is a
# SyntaxError under Restart & Run All). Nothing below reads this dict.
lgbm_param_reference = dict(
    # ---------------- Core parameters ----------------
    boosting_type='gbdt',               # Type of boosting: 'gbdt' (Gradient Boosting), 'dart', etc.
    objective='regression',            # Task type, e.g., regression, regression_l1, huber, fair
    metric='rmse',                     # Evaluation metric, e.g., rmse, mae, mse
    n_estimators=100,                  # Number of boosting rounds (trees)
    learning_rate=0.1,                 # Shrinks the contribution of each tree (step size shrinkage)

    # ---------------- Tree parameters ----------------
    num_leaves=63,                     # Max number of leaves per tree; higher = more complex model
    max_depth=5,                       # Max depth of each tree; limits model complexity
    min_child_samples=20,              # Minimum number of samples in a leaf node
    min_child_weight=1e-3,             # Minimum sum of instance weight (hessian) in a leaf
    min_split_gain=0.0,                # Minimum gain required to make a further partition (regularization)

    # ---------------- Regularization ----------------
    reg_alpha=0.0,                     # L1 regularization term on weights
    reg_lambda=0.0,                    # L2 regularization term on weights

    # ---------------- Sampling ----------------
    subsample=1.0,                     # Fraction of rows sampled when bagging is enabled
    subsample_freq=0,                  # Bagging frequency; 0 disables bagging, so subsample then has no effect
    colsample_bytree=1.0,              # Fraction of features (columns) randomly sampled for each tree

    # ---------------- Advanced sampling ----------------
    feature_fraction_bynode=1.0,       # Fraction of features used per split (node-level)
    feature_fraction_seed=42,          # Random seed for feature_fraction
    bagging_fraction=1.0,              # Same setting as subsample under LightGBM's own name; set only one of the two
    bagging_seed=42,                   # Random seed for bagging

    # ---------------- Execution ----------------
    n_jobs=-1,                         # Number of threads to use (-1 means use all cores)
    random_state=42,                   # Random seed for reproducibility
    verbosity=-1,                      # Controls verbosity: <0 silent, 0 warnings, >0 all messages
    force_col_wise=True,               # Forces column-wise histogram building (documented for the CPU device)
    device='cpu',                      # 'cpu' or 'gpu' - device to train the model on
    max_bin=255,                       # Max number of bins for discretizing continuous features

    # ---------------- Other advanced options ----------------
    importance_type='split',           # Feature importance type: 'split' (frequency) or 'gain' (information gain)
    monotone_constraints=None,         # List to enforce monotonic relationship (e.g., [1, -1, 0])
    boosting='gbdt',                   # Alias for boosting_type; set only one of the two
    class_weight=None,                 # Class weighting for classification; not meaningful for a regressor
    is_unbalance=False,                # Binary-classification option; not meaningful for a regressor
    scale_pos_weight=1.0,              # Binary-classification option; not meaningful for a regressor
    force_row_wise=False,              # Forces row-wise histogram building (documented for the CPU device)
    path_smooth=0.0,                   # Smoothing applied to tree nodes; 0 disables it
    drop_rate=0.1,                     # Dropout rate, used only by DART boosting
    skip_drop=0.5,                     # Probability of skipping the dropout in an iteration (DART only)
    xgboost_dart_mode=False,           # Whether to use xgboost dart mode in DART boosting
    gpu_use_dp=False,                  # Whether to use double precision on GPU (if device='gpu')
    boosting_rounds=None,              # NOTE(review): not a documented alias of n_estimators - verify before use
    early_stopping_rounds=None,        # Enables early stopping if no improvement over given rounds
    callbacks=None,                    # NOTE(review): callbacks are an argument of fit(), not a model parameter
)


In [None]:
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# LightGBM and scikit-learn are fed pandas objects here, so leave Polars at this point
print("üîÅ Converting Polars DataFrames to Pandas...")
df_train_pd, df_test_pd = df_train.to_pandas(), df_test.to_pandas()

# The ten look-ahead columns are the prediction targets
target_cols = [f"close_{horizon}" for horizon in range(1, 11)]

# Everything that is neither a target nor an identifier/date column is a feature
drop_cols = ['Date', 'Company_ID', 'COMPANY']
excluded_cols = target_cols + drop_cols
feature_cols = [name for name in df_train.columns if name not in excluded_cols]

# Feature matrix / target matrix for the training rows
X_train, y_train = df_train_pd[feature_cols], df_train_pd[target_cols]

# Same for the held-out rows, which are split once more below
X_test_full, y_test_full = df_test_pd[feature_cols], df_test_pd[target_cols]

# Halve the held-out rows at random (fixed seed) into a validation and a final test part
print("üì§ Splitting test data into validation and test...")
X_val, X_test, y_val, y_test = train_test_split(
    X_test_full,
    y_test_full,
    test_size=0.5,
    random_state=42,
)

print(f"üìä Train size: {len(X_train)}")
print(f"üìä Validation size: {len(X_val)}")
print(f"üìä Test size: {len(X_test)}")

# Define the base LightGBM regressor that MultiOutputRegressor is fitted with below.
# Every setting is passed under one name only. LightGBM documents `boosting` as an alias
# of `boosting_type` and `subsample` as an alias of `bagging_fraction`; passing both
# spellings makes it unclear which value applies as soon as they diverge, so the
# duplicate spellings (`boosting`, `bagging_fraction`) were removed.
# `boosting_rounds=None` and `callbacks=None` were removed as well: neither is a
# documented model parameter (callbacks are an argument of fit()).
# The remaining values are unchanged, so training behaviour is the same as before.
lgb_regressor = lgb.LGBMRegressor(
    # Core parameters
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    n_estimators=100,
    learning_rate=0.1,

    # Tree parameters
    num_leaves=31,
    max_depth=5,
    min_child_samples=20,
    min_child_weight=1e-3,
    min_split_gain=0.0,

    # Regularization
    reg_alpha=0.0,
    reg_lambda=0.0,

    # Sampling (subsample_freq=0 disables bagging, so subsample only matters once it is > 0)
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,

    # Advanced sampling
    feature_fraction_bynode=1.0,
    feature_fraction_seed=42,
    bagging_seed=42,

    # Execution
    n_jobs=1,
    random_state=42,
    verbosity=-1,
    force_col_wise=True,
    device='cpu',
    max_bin=255,

    # Other advanced options
    importance_type='split',
    monotone_constraints=None,
    class_weight=None,
    is_unbalance=False,
    scale_pos_weight=1.0,
    force_row_wise=False,
    path_smooth=0.0,
    drop_rate=0.1,
    skip_drop=0.5,
    xgboost_dart_mode=False,
    gpu_use_dp=False,
    early_stopping_rounds=None,
)
# Wrap in MultiOutputRegressor so all target columns in y_train are fitted from one call
print("üß† Training LightGBM Multi-Output Regressor...")
multi_model = MultiOutputRegressor(lgb_regressor)
multi_model.fit(X_train, y_train)

In [None]:
# Mean Absolute Percentage Error (MAPE), reported separately for every target column
def calculate_mape_per_target(y_true, y_pred, target_names):
    """Return the mean absolute percentage error of each target column.

    Args:
        y_true: 2-D array of actual values, indexed as ``[:, i]`` per target.
        y_pred: 2-D array of predictions with the same layout as ``y_true``.
        target_names: Names for the columns; entry ``i`` labels column ``i``.

    Returns:
        Dict mapping each name to the MAPE of its column, in percent.
    """
    def _column_mape(position):
        actual = y_true[:, position]
        predicted = y_pred[:, position]
        # Floor the denominator at 1e-8 so an actual value of zero cannot divide by zero.
        denominator = np.clip(np.abs(actual), 1e-8, None)
        return np.mean(np.abs((actual - predicted) / denominator) * 100)

    return {name: _column_mape(position) for position, name in enumerate(target_names)}

# Names of the ten look-ahead targets, in the column order of y_val / y_test
target_cols = [f"close_{horizon}" for horizon in range(1, 11)]

# --- Validation split ---
print("üìà Evaluating on validation set (with percentage error)...")
y_pred_val = multi_model.predict(X_val)
val_mape_scores = calculate_mape_per_target(y_val.values, y_pred_val, target_cols)

print("üìä Validation Average Percentage Errors (per target):")
for target_name in val_mape_scores:
    print(f"  {target_name}: {val_mape_scores[target_name]:.2f}%")

# --- Test split ---
print("üß™ Evaluating on test set (with percentage error)...")
y_pred_test = multi_model.predict(X_test)
test_mape_scores = calculate_mape_per_target(y_test.values, y_pred_test, target_cols)

print("üìä Test Average Percentage Errors (per target):")
for target_name in test_mape_scores:
    print(f"  {target_name}: {test_mape_scores[target_name]:.2f}%")


In [None]:
1. (
    # Core parameters
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    n_estimators=100,
    learning_rate=0.1,

    # Tree parameters
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=1e-3,
    min_split_gain=0.0,

    # Regularization
    reg_alpha=0.0,
    reg_lambda=0.0,

    # Sampling
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,

    # Advanced sampling
    feature_fraction_bynode=1.0,
    feature_fraction_seed=42,
    bagging_fraction=1.0,
    bagging_seed=42,

    # Execution
    n_jobs=-1,
    random_state=42,
    verbosity=-1,
    force_col_wise=True,
    device='cpu',
    max_bin=255,

    # Other advanced options
    importance_type='split',
    monotone_constraints=None,
    boosting='gbdt',
    class_weight=None,
    is_unbalance=False,
    scale_pos_weight=1.0,
    force_row_wise=False,
    path_smooth=0.0,
    drop_rate=0.1,
    skip_drop=0.5,
    xgboost_dart_mode=False,
    gpu_use_dp=False,
    boosting_rounds=None,
    early_stopping_rounds=None,
    callbacks=None,
)
------------------------üìà Evaluating on validation set (with percentage error)...
üìä Validation Average Percentage Errors (per target):
  close_1: 4.15%
  close_2: 5.31%
  close_3: 6.16%
  close_4: 6.91%
  close_5: 7.88%
  close_6: 8.43%
  close_7: 8.94%
  close_8: 9.26%
  close_9: 10.20%
  close_10: 10.49%
üß™ Evaluating on test set (with percentage error)...
üìä Test Average Percentage Errors (per target):
  close_1: 4.27%
  close_2: 5.46%
  close_3: 6.30%
  close_4: 7.09%
  close_5: 8.06%
  close_6: 8.60%
  close_7: 9.08%
  close_8: 9.43%
  close_9: 10.39%
  close_10: 10.68%
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2. (
    # Core parameters
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    n_estimators=1000,
    learning_rate=0.01,

    # Tree parameters
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=1e-3,
    min_split_gain=0.0,

    # Regularization
    reg_alpha=0.0,
    reg_lambda=0.0,

    # Sampling
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,

    # Advanced sampling
    feature_fraction_bynode=1.0,
    feature_fraction_seed=42,
    bagging_fraction=1.0,
    bagging_seed=42,

    # Execution
    n_jobs=-1,
    random_state=42,
    verbosity=-1,
    force_col_wise=True,
    device='cpu',
    max_bin=255,

    # Other advanced options
    importance_type='split',
    monotone_constraints=None,
    boosting='gbdt',
    class_weight=None,
    is_unbalance=False,
    scale_pos_weight=1.0,
    force_row_wise=False,
    path_smooth=0.0,
    drop_rate=0.1,
    skip_drop=0.5,
    xgboost_dart_mode=False,
    gpu_use_dp=False,
    boosting_rounds=None,
    early_stopping_rounds=None,
    callbacks=None,
)
--------------------------------------------
 close_1: 4.11%
  close_2: 5.24%
  close_3: 6.12%
  close_4: 6.91%
  close_5: 7.73%
  close_6: 8.28%
  close_7: 8.92%
  close_8: 9.54%
  close_9: 9.93%
  close_10: 10.34%
üß™ Evaluating on test set (with percentage error)...
üìä Test Average Percentage Errors (per target):
  close_1: 4.24%
  close_2: 5.39%
  close_3: 6.27%
  close_4: 7.09%
  close_5: 7.92%
  close_6: 8.45%
  close_7: 9.06%
  close_8: 9.69%
  close_9: 10.09%
  close_10: 10.52%
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3. (
    # Core parameters
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    n_estimators=100,
    learning_rate=0.1,

    # Tree parameters
    num_leaves=31,
    max_depth=5,
    min_child_samples=20,
    min_child_weight=1e-3,
    min_split_gain=0.0,

    # Regularization
    reg_alpha=0.0,
    reg_lambda=0.0,

    # Sampling
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,

    # Advanced sampling
    feature_fraction_bynode=1.0,
    feature_fraction_seed=42,
    bagging_fraction=1.0,
    bagging_seed=42,

    # Execution
    n_jobs=-1,
    random_state=42,
    verbosity=-1,
    force_col_wise=True,
    device='cpu',
    max_bin=255,

    # Other advanced options
    importance_type='split',
    monotone_constraints=None,
    boosting='gbdt',
    class_weight=None,
    is_unbalance=False,
    scale_pos_weight=1.0,
    force_row_wise=False,
    path_smooth=0.0,
    drop_rate=0.1,
    skip_drop=0.5,
    xgboost_dart_mode=False,
    gpu_use_dp=False,
    boosting_rounds=None,
    early_stopping_rounds=None,
    callbacks=None,
)----------------------------------------------------------------
close_1: 3.63%
  close_2: 4.75%
  close_3: 5.42%
  close_4: 6.38%
  close_5: 7.15%
  close_6: 7.86%
  close_7: 8.62%
  close_8: 9.18%
  close_9: 9.50%
  close_10: 9.86%
üß™ Evaluating on test set (with percentage error)...
üìä Test Average Percentage Errors (per target):
  close_1: 3.59%
  close_2: 4.71%
  close_3: 5.42%
  close_4: 6.37%
  close_5: 7.13%
  close_6: 7.84%
  close_7: 8.65%
  close_8: 9.19%
  close_9: 9.45%
  close_10: 9.79%
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4. (
    # Core parameters
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    n_estimators=100,
    learning_rate=0.1,

    # Tree parameters
    num_leaves=31,
    max_depth=5,
    min_child_samples=20,
    min_child_weight=1e-3,
    min_split_gain=0.0,

    # Regularization
    reg_alpha=0.0,
    reg_lambda=0.0,

    # Sampling
    subsample=0.6,
    subsample_freq=0,
    colsample_bytree=0.3,

    # Advanced sampling
    feature_fraction_bynode=1.0,
    feature_fraction_seed=42,
    bagging_fraction=1.0,
    bagging_seed=42,

    # Execution
    n_jobs= 1,
    random_state=42,
    verbosity=-1,
    force_col_wise=True,
    device='cpu',
    max_bin=255,

    # Other advanced options
    importance_type='split',
    monotone_constraints=None,
    boosting='gbdt',
    class_weight=None,
    is_unbalance=False,
    scale_pos_weight=1.0,
    force_row_wise=False,
    path_smooth=0.0,
    drop_rate=0.1,
    skip_drop=0.5,
    xgboost_dart_mode=False,
    gpu_use_dp=False,
    boosting_rounds=None,
    early_stopping_rounds=None,
    callbacks=None,
)-------------------------------------------------------------------
close_1: 3.84%
  close_2: 4.77%
  close_3: 5.48%
  close_4: 6.51%
  close_5: 7.21%
  close_6: 8.16%
  close_7: 8.72%
  close_8: 8.83%
  close_9: 9.46%
  close_10: 9.84%
üß™ Evaluating on test set (with percentage error)...
üìä Test Average Percentage Errors (per target):
  close_1: 3.97%
  close_2: 4.95%
  close_3: 5.68%
  close_4: 6.77%
  close_5: 7.63%
  close_6: 8.65%
  close_7: 9.28%
  close_8: 9.15%
  close_9: 9.80%
  close_10: 10.17%
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5. (
    # Core parameters
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    n_estimators=100,
    learning_rate=0.1,

    # Tree parameters
    num_leaves=31,
    max_depth=5,
    min_child_samples=20,
    min_child_weight=1e-3,
    min_split_gain=0.0,

    # Regularization
    reg_alpha=0.0,
    reg_lambda=0.0,

    # Sampling
    subsample=0.1,
    subsample_freq=0,
    colsample_bytree=0.1,

    # Advanced sampling
    feature_fraction_bynode=1.0,
    feature_fraction_seed=42,
    bagging_fraction=1.0,
    bagging_seed=42,

    # Execution
    n_jobs= 1,
    random_state=42,
    verbosity=-1,
    force_col_wise=True,
    device='cpu',
    max_bin=255,

    # Other advanced options
    importance_type='split',
    monotone_constraints=None,
    boosting='gbdt',
    class_weight=None,
    is_unbalance=False,
    scale_pos_weight=1.0,
    force_row_wise=False,
    path_smooth=0.0,
    drop_rate=0.1,
    skip_drop=0.5,
    xgboost_dart_mode=False,
    gpu_use_dp=False,
    boosting_rounds=None,
    early_stopping_rounds=None,
    callbacks=None,
)--------------------------------------------------------------------------------
close_1: 4.94%
  close_2: 5.68%
  close_3: 6.46%
  close_4: 6.98%
  close_5: 7.73%
  close_6: 8.07%
  close_7: 8.69%
  close_8: 8.91%
  close_9: 9.54%
  close_10: 9.63%
üß™ Evaluating on test set (with percentage error)...
üìä Test Average Percentage Errors (per target):
  close_1: 5.07%
  close_2: 5.77%
  close_3: 6.59%
  close_4: 7.10%
  close_5: 7.89%
  close_6: 8.18%
  close_7: 8.88%
  close_8: 9.01%
  close_9: 9.65%
  close_10: 9.79%
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

In [5]:
import lightgbm
import sklearn
from sklearn.multioutput import MultiOutputRegressor

print("LightGBM version:", lightgbm.__version__)
print("Scikit-learn version:", sklearn.__version__)
print("MultiOutputRegressor module location:", MultiOutputRegressor.__module__)


LightGBM version: 4.5.0
Scikit-learn version: 1.2.2
MultiOutputRegressor module location: sklearn.multioutput


### Creating the train/test split for the full dataset

In [None]:
import polars as pl

# Load the full cleaned dataset from the Kaggle input mount.
df1 = pl.read_parquet("/kaggle/input/stock-market-1/final_cleaned.parquet")

# Pair every column name with its dtype, in column order.
column_types = list(df1.schema.items())

# Emit one "name: dtype" line per column.
for name, dtype in column_types:
    print(f"{name}: {dtype}")


In [None]:
# Step 2: Identify unique companies
# maintain_order=True returns the IDs in first-appearance order. Without it the order of
# unique() is not guaranteed, which makes the row order of the concatenated train/test
# frames built from this list differ between runs.
company_ids = (
    df1.select("Company_ID").unique(maintain_order=True).to_series().to_list()
)
print(f"üè¢ Found {len(company_ids)} unique companies.")

# Step 3: Define function to create target columns for one company
def create_targets(df_company, n_targets=10):
    """Append forward-looking close prices as target columns for one company.

    For every step ``i`` in ``1..n_targets`` a column ``close_{i}`` is added that
    holds ``Close_x`` shifted up by ``i`` rows, i.e. the value found ``i`` rows
    later. The rows are used in the order given, so the caller must sort the
    frame chronologically first.

    Args:
        df_company: Polars DataFrame containing a ``Close_x`` column.
        n_targets: Number of look-ahead target columns to create.

    Returns:
        The frame with the target columns appended and the last ``n_targets``
        rows removed (those rows cannot have all targets). A frame with fewer
        than ``n_targets`` rows yields an empty frame with the full schema.
    """
    future_closes = [
        pl.col("Close_x").shift(-step).alias(f"close_{step}")
        for step in range(1, n_targets + 1)
    ]
    # All shifts read the untouched Close_x column, so they can be added in one pass.
    if future_closes:
        df_company = df_company.with_columns(future_closes)

    # Clamp at zero: a short history must produce an empty frame, never a negative length.
    rows_to_keep = max(df_company.height - n_targets, 0)
    return df_company.slice(0, rows_to_keep)

# Step 4: Build the look-ahead targets one company at a time
processed_dfs = []
n_companies = len(company_ids)
print("üõ†Ô∏è Creating target variables and trimming last 10 rows per company...")
for idx, company_id in enumerate(company_ids):
    # Rows are sorted by Date so that the shifts inside create_targets look forward in time.
    company_rows = df1.filter(pl.col("Company_ID") == company_id).sort("Date")
    processed_dfs.append(create_targets(company_rows))

    done = idx + 1
    if done == n_companies or done % 20 == 0:
        print(f"‚úÖ Processed {done}/{n_companies} companies")

# Stack the per-company frames back into a single frame
df_all = pl.concat(processed_dfs)
print("üì¶ All companies processed and combined.")

# Step 5: Chronological split per company — first 80% of rows to train, last 20% to test
train_dfs, test_dfs = [], []

print("‚úÇÔ∏è Splitting into train/test (80/20) for each company...")
for idx, company_id in enumerate(company_ids):
    company_rows = df_all.filter(pl.col("Company_ID") == company_id).sort("Date")
    n_rows = company_rows.height
    n_test = int(n_rows * 0.2)
    n_train = n_rows - n_test

    train_dfs.append(company_rows.slice(0, n_train))
    test_dfs.append(company_rows.slice(n_train, n_test))

    done = idx + 1
    if done == n_companies or done % 20 == 0:
        print(f"üìä Split {done}/{n_companies} companies")

# Stack the per-company pieces into the final train and test frames
df_train = pl.concat(train_dfs)
df_test = pl.concat(test_dfs)

print(f"‚úÖ Train set size: {df_train.height}")
print(f"‚úÖ Test set size: {df_test.height}")

# Final output
print("üéâ Data preprocessing complete. Ready for modeling!")

In [None]:
del df1

In [None]:
del company_ids, create_targets,processed_dfs, df_all

In [None]:
del train_dfs, test_dfs,idx, company_id

In [None]:
# Save to Parquet
df_train.write_parquet("/kaggle/working/stock_df_train.parquet")
df_test.write_parquet("/kaggle/working/stock_df_test.parquet")

In [None]:
from IPython.display import FileLink, display 

# Display download links
print("Download links:")
display(FileLink("stock_df_train.parquet"))
display(FileLink("stock_df_test.parquet"))

### Training with full data

In [1]:
import polars as pl

# Read back the train/test frames that were exported as Parquet
test_parquet = "/kaggle/input/stock-final-train-test/stock_df_test.parquet"
train_parquet = "/kaggle/input/stock-final-train-test/stock_df_train.parquet"
df_test = pl.read_parquet(test_parquet)
df_train = pl.read_parquet(train_parquet)

# Show (rows, columns) of both frames as a quick sanity check
print("‚úÖ Loaded DataFrames:")
train_shape, test_shape = df_train.shape, df_test.shape
print(f"üì¶ Train shape: {train_shape}")
print(f"üì¶ Test shape: {test_shape}")


‚úÖ Loaded DataFrames:
üì¶ Train shape: (5112156, 101)
üì¶ Test shape: (1277054, 101)


In [2]:
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import time

# Hand the data over to pandas for LightGBM / scikit-learn
print("üîÅ Converting Polars DataFrames to Pandas...")
df_train_pd, df_test_pd = df_train.to_pandas(), df_test.to_pandas()

# The ten look-ahead columns are the prediction targets
target_cols = [f"close_{horizon}" for horizon in range(1, 11)]

# Everything that is neither a target nor an identifier/date column is a feature
drop_cols = ['Date', 'Company_ID', 'COMPANY']
excluded_cols = target_cols + drop_cols
feature_cols = [name for name in df_train.columns if name not in excluded_cols]

# Feature matrix / target matrix for the training rows
X_train, y_train = df_train_pd[feature_cols], df_train_pd[target_cols]

# Same for the held-out rows, which are split once more below
X_test_full, y_test_full = df_test_pd[feature_cols], df_test_pd[target_cols]

# Halve the held-out rows at random (fixed seed) into a validation and a final test part
print("üì§ Splitting test data into validation and test...")
X_val, X_test, y_val, y_test = train_test_split(
    X_test_full,
    y_test_full,
    test_size=0.5,
    random_state=42,
)

print(f"üìä Train size: {len(X_train)}")
print(f"üìä Validation size: {len(X_val)}")
print(f"üìä Test size: {len(X_test)}")

# Start timing (covers model construction and the full multi-output fit)
start_time = time.time()

# Base LightGBM regressor configured for GPU training.
# Every setting is passed under one name only. LightGBM documents `boosting` as an alias
# of `boosting_type` and `subsample` as an alias of `bagging_fraction`; passing both
# spellings makes it unclear which value applies as soon as they diverge, so the
# duplicate spellings (`boosting`, `bagging_fraction`) were removed.
# `boosting_rounds=None` and `callbacks=None` were removed as well: neither is a
# documented model parameter (callbacks are an argument of fit()).
# The remaining values are unchanged, so training behaviour is the same as before.
lgb_regressor = lgb.LGBMRegressor(
    # Core parameters
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    n_estimators=7000,
    learning_rate=0.1,

    # Tree parameters
    num_leaves=31,
    max_depth=20,
    min_child_samples=20,
    min_child_weight=1e-3,
    min_split_gain=0.0,

    # Regularization
    reg_alpha=0.0,
    reg_lambda=0.0,

    # Sampling (subsample_freq=0 disables bagging, so subsample only matters once it is > 0)
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,

    # Advanced sampling
    feature_fraction_bynode=1.0,
    feature_fraction_seed=42,
    bagging_seed=42,

    # Execution
    random_state=42,
    verbosity=-1,
    force_col_wise=True,           # NOTE(review): documented for the CPU device; likely ignored with device='gpu' - confirm
    n_jobs=-1,                     # Number of CPU threads LightGBM may use (-1 = all cores)
    device='gpu',                  # Train on the GPU
    gpu_platform_id=0,
    gpu_device_id=0,
    max_bin=255,

    # Other advanced options
    importance_type='split',
    monotone_constraints=None,
    class_weight=None,
    is_unbalance=False,
    scale_pos_weight=1.0,
    force_row_wise=False,
    path_smooth=0.0,
    drop_rate=0.1,
    skip_drop=0.5,

    xgboost_dart_mode=False,
    gpu_use_dp=False,
    early_stopping_rounds=None,
)
# Alternative, smaller configuration kept for reference:
# (
#     boosting_type='gbdt',
#     objective='regression',
#     metric='rmse',
#     n_estimators=1000,             # More trees, GPU can handle it
#     learning_rate=0.05,
#     num_leaves=64,
#     max_depth=-1,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     random_state=42,
#     n_jobs=-1,                     # Number of CPU threads (-1 = all cores)
#     device='gpu',                  # Train on the GPU
#     gpu_platform_id=0,
#     gpu_device_id=0,
#     verbosity=-1,
# )

# Wrap with MultiOutputRegressor so all target columns in y_train are fitted from one call
print("‚ö° Training LightGBM Multi-Output Regressor using GPU...")
multi_model = MultiOutputRegressor(lgb_regressor)

multi_model.fit(X_train, y_train)

# End timing
end_time = time.time()
print(f"‚è±Ô∏è Training completed in {end_time - start_time:.2f} seconds.")


üîÅ Converting Polars DataFrames to Pandas...
üì§ Splitting test data into validation and test...
üìä Train size: 5112156
üìä Validation size: 638527
üìä Test size: 638527
‚ö° Training LightGBM Multi-Output Regressor using GPU...
‚è±Ô∏è Training completed in 3727.73 seconds.


In [None]:
# # Function to calculate Mean Absolute Percentage Error (MAPE) per column
# def calculate_mape_per_target(y_true, y_pred, target_names):
#     mape_scores = {}
#     for i, col in enumerate(target_names):
#         # Avoid division by zero
#         true_vals = y_true[:, i]
#         pred_vals = y_pred[:, i]
#         percentage_errors = np.abs((true_vals - pred_vals) / np.clip(np.abs(true_vals), 1e-8, None)) * 100
#         mape_scores[col] = np.mean(percentage_errors)
#     return mape_scores

# # Define target column names
# target_cols = [f"close_{i}" for i in range(1, 11)]

# # Validation evaluation
# print("üìà Evaluating on validation set (with percentage error)...")
# y_pred_val = multi_model.predict(X_val)
# val_mape_scores = calculate_mape_per_target(y_val.values, y_pred_val, target_cols)

# print("üìä Validation Average Percentage Errors (per target):")
# for col, error in val_mape_scores.items():
#     print(f"  {col}: {error:.2f}%")

# # Test evaluation
# print("üß™ Evaluating on test set (with percentage error)...")
# y_pred_test = multi_model.predict(X_test)
# test_mape_scores = calculate_mape_per_target(y_test.values, y_pred_test, target_cols)

# print("üìä Test Average Percentage Errors (per target):")
# for col, error in test_mape_scores.items():
#     print(f"  {col}: {error:.2f}%")


In [3]:
import cupy as cp  # Use CuPy instead of NumPy

# MAPE per target column, computed with CuPy arrays
def calculate_mape_per_target_gpu(y_true, y_pred, target_names):
    """Return the mean absolute percentage error of each target column via CuPy.

    Both inputs are converted with ``cp.asarray`` once, then every column is
    scored separately.

    Args:
        y_true: 2-D array-like of actual values, indexed as ``[:, i]`` per target.
        y_pred: 2-D array-like of predictions with the same layout as ``y_true``.
        target_names: Names for the columns; entry ``i`` labels column ``i``.

    Returns:
        Dict mapping each name to the MAPE of its column in percent, as a
        Python scalar obtained with ``.item()``.
    """
    actual_all = cp.asarray(y_true)
    predicted_all = cp.asarray(y_pred)

    def _column_mape(position):
        actual = actual_all[:, position]
        predicted = predicted_all[:, position]
        # Floor the denominator at 1e-8 so an actual value of zero cannot divide by zero.
        denominator = cp.clip(cp.abs(actual), 1e-8, None)
        pct_errors = cp.abs((actual - predicted) / denominator) * 100
        return cp.mean(pct_errors).item()

    return {name: _column_mape(position) for position, name in enumerate(target_names)}

# Names of the ten look-ahead targets, in the column order of y_val / y_test
target_cols = [f"close_{horizon}" for horizon in range(1, 11)]

# --- Validation split ---
print("üìà Evaluating on validation set (with percentage error, GPU accelerated)...")
y_pred_val = multi_model.predict(X_val)
val_mape_scores = calculate_mape_per_target_gpu(y_val.values, y_pred_val, target_cols)

print("üìä Validation Average Percentage Errors (per target):")
for target_name in val_mape_scores:
    print(f"  {target_name}: {val_mape_scores[target_name]:.2f}%")

# --- Test split ---
print("üß™ Evaluating on test set (with percentage error, GPU accelerated)...")
y_pred_test = multi_model.predict(X_test)
test_mape_scores = calculate_mape_per_target_gpu(y_test.values, y_pred_test, target_cols)

print("üìä Test Average Percentage Errors (per target):")
for target_name in test_mape_scores:
    print(f"  {target_name}: {test_mape_scores[target_name]:.2f}%")


üìà Evaluating on validation set (with percentage error, GPU accelerated)...
üìä Validation Average Percentage Errors (per target):
  close_1: 5.60%
  close_2: 8.46%
  close_3: 10.28%
  close_4: 10.88%
  close_5: 12.47%
  close_6: 13.68%
  close_7: 14.17%
  close_8: 14.62%
  close_9: 15.49%
  close_10: 17.36%
üß™ Evaluating on test set (with percentage error, GPU accelerated)...
üìä Test Average Percentage Errors (per target):
  close_1: 5.87%
  close_2: 8.82%
  close_3: 10.75%
  close_4: 10.84%
  close_5: 12.63%
  close_6: 13.83%
  close_7: 14.35%
  close_8: 14.68%
  close_9: 15.57%
  close_10: 17.37%


In [None]:
close_1: 9.87%
  close_2: 13.36%
  close_3: 15.78%
  close_4: 17.85%
  close_5: 18.71%
  close_6: 19.90%
  close_7: 21.61%
  close_8: 22.01%
  close_9: 22.45%
  close_10: 23.46%
----------------------------
(10, 2000)
üìà Evaluating on validation set (with percentage error, GPU accelerated)...
üìä Validation Average Percentage Errors (per target):
  close_1: 6.82%
  close_2: 8.60%
  close_3: 11.41%
  close_4: 11.71%
  close_5: 13.72%
  close_6: 13.98%
  close_7: 16.38%
  close_8: 16.98%
  close_9: 18.48%
  close_10: 18.49%
üß™ Evaluating on test set (with percentage error, GPU accelerated)...
üìä Test Average Percentage Errors (per target):
  close_1: 6.98%
  close_2: 8.76%
  close_3: 11.24%
  close_4: 11.62%
  close_5: 13.80%
  close_6: 13.67%
  close_7: 16.22%
  close_8: 16.94%
  close_9: 18.19%
  close_10: 18.28%
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
(15, 2000)
close_1: 6.16%
  close_2: 8.16%
  close_3: 9.76%
  close_4: 11.41%
  close_5: 12.48%
  close_6: 14.17%
  close_7: 15.65%
  close_8: 16.55%
  close_9: 18.37%
  close_10: 17.32%
üß™ Evaluating on test set (with percentage error, GPU accelerated)...
üìä Test Average Percentage Errors (per target):
  close_1: 7.02%
  close_2: 8.56%
  close_3: 10.00%
  close_4: 11.06%
  close_5: 12.50%
  close_6: 14.13%
  close_7: 15.47%
  close_8: 16.35%
  close_9: 17.78%
  close_10: 17.08%
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
(15, 4000)
close_1: 5.84%
  close_2: 8.16%
  close_3: 10.44%
  close_4: 11.49%
  close_5: 13.14%
  close_6: 14.38%
  close_7: 15.72%
  close_8: 16.20%
  close_9: 17.69%
  close_10: 16.73%
üß™ Evaluating on test set (with percentage error, GPU accelerated)...
üìä Test Average Percentage Errors (per target):
  close_1: 6.85%
  close_2: 8.52%
  close_3: 10.92%
  close_4: 11.14%
  close_5: 13.29%
  close_6: 14.58%
  close_7: 15.69%
  close_8: 16.11%
  close_9: 17.16%
  close_10: 16.58%
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++==

In [4]:
import joblib

# Persist `multi_model` to the working directory; the call stays the cell's
# last expression so the list of written file names is shown as output.
joblib.dump(value=multi_model, filename='multi_model_stock1.pkl')


['multi_model_stock1.pkl']

In [None]:
# Check that CuPy can be imported in this kernel and print its installed version.
import cupy as cp
print(cp.__version__)

### Hyperparameter tuning

In [None]:
import polars as pl

# Read the saved train/test splits from the Kaggle input dataset.
df_train = pl.read_parquet("/kaggle/input/stock-final-train-test/stock_df_train.parquet")
df_test = pl.read_parquet("/kaggle/input/stock-final-train-test/stock_df_test.parquet")

# Sanity check: one header line followed by the shape of each frame.
print(
    "‚úÖ Loaded DataFrames:",
    f"üì¶ Train shape: {df_train.shape}",
    f"üì¶ Test shape: {df_test.shape}",
    sep="\n",
)


In [None]:
import pandas as pd
import numpy as np
import optuna
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import time

In [None]:
# Column roles for the multi-horizon model. `df_train` is indexed directly here;
# no pandas copy is created in this cell.
target_cols = [f"close_{i}" for i in range(1, 11)]  # close_1 ... close_10
drop_cols = ['Date', 'Company_ID', 'COMPANY']       # excluded from the model inputs

# Every column that is neither a target nor in `drop_cols` is a feature;
# the original column order of `df_train` is kept.
feature_cols = [
    col
    for col in df_train.columns
    if col not in target_cols and col not in drop_cols
]

X = df_train[feature_cols]
y = df_train[target_cols]

In [None]:
def calculate_mape_per_target(y_true, y_pred, target_names):
    """Compute the mean absolute percentage error separately for each target.

    Args:
        y_true: 2-D array of actual values, read column-wise as ``y_true[:, i]``.
        y_pred: 2-D array of predicted values with the same layout as ``y_true``.
        target_names: Iterable of names; the i-th name labels column ``i``.

    Returns:
        dict mapping each target name to its MAPE expressed in percent.
    """
    def _column_mape(idx):
        actual = y_true[:, idx]
        predicted = y_pred[:, idx]
        # Floor the denominator at 1e-8 so an actual value of zero cannot
        # cause a division by zero.
        denominator = np.clip(np.abs(actual), 1e-8, None)
        return np.mean(np.abs((actual - predicted) / denominator) * 100)

    return {name: _column_mape(idx) for idx, name in enumerate(target_names)}


In [None]:
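# NOTE: superseded earlier draft of `objective`, kept commented out for reference only;
# the active version is defined in the next cell. This draft passes `step` to
# `trial.suggest_int` positionally (e.g. `5, 10, 15`), whereas the active version uses `step=...`.
# # Store MAPE scores from each trial for plotting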
# trial_mape_history = {}

# def objective(trial):
#     params = {
#         'objective': 'regression',
#         'metric': 'rmse',
#         'boosting_type': 'gbdt',
#         'device': 'gpu',
#         'verbosity': -1,
#         'learning_rate': 0.1,
#         'num_leaves': trial.suggest_int("num_leaves", 31, 45),
#         'max_depth': trial.suggest_int("max_depth", 5, 10, 15),
#         'subsample': trial.suggest_float("subsample", 0.6, 1.0),
#         'colsample_bytree': trial.suggest_float("colsample_bytree", 0.3, 1.0),
#         # 'reg_alpha': trial.suggest_float("reg_alpha", 0.0, 0.05),
#         # 'reg_lambda': trial.suggest_float("reg_lambda", 0.0, 0.05),
#         'n_estimators': trial.suggest_int( "n_estimators", 100, 150, 200)
#     }

#     kf = KFold(n_splits= 5, shuffle=False)  # shuffle=False for time series
#     rmses = []
#     mape_scores = {col: [] for col in target_cols}

#     for train_idx, val_idx in kf.split(X):
#         X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         preds = []
#         true_vals = []

#         for i, col in enumerate(target_cols):
#             model = lgb.LGBMRegressor(**params)
#             model.fit(X_train, y_train[col])
#             pred = model.predict(X_val)
#             preds.append(pred)
#             true_vals.append(y_val[col].values)

#         preds = np.stack(preds, axis=1)
#         true_vals = np.stack(true_vals, axis=1)

#         # RMSE for Optuna
#         rmse = mean_squared_error(true_vals, preds, squared=False)
#         rmses.append(rmse)

#         # MAPE tracking
#         fold_mape = calculate_mape_per_target(true_vals, preds, target_cols)
#         for col in target_cols:
#             mape_scores[col].append(fold_mape[col])

#     # Save average MAPE per target for this trial
#     avg_mape_per_target = {col: np.mean(mape_scores[col]) for col in target_cols}
#     trial_mape_history[trial.number] = avg_mape_per_target

#     return np.mean(rmses)


In [None]:
# Average MAPE per target for every finished trial, keyed by Optuna trial number
# (read later when plotting).
trial_mape_history = {}

def objective(trial):
    """Optuna objective: 5-fold cross-validated RMSE of per-target LightGBM models.

    For every fold, one ``lgb.LGBMRegressor`` is fitted per column in
    ``target_cols`` on the training rows of ``X``/``y`` and evaluated on the
    validation rows. The fold score is the mean of the per-target RMSEs.

    Args:
        trial: Optuna trial used to sample ``max_depth``, ``num_leaves`` and
            ``n_estimators``; all other parameters are fixed.

    Returns:
        The mean of the fold scores (lower is better).

    Side effects:
        Stores this trial's fold-averaged MAPE per target in
        ``trial_mape_history[trial.number]`` and prints a summary of the trial.
    """
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'device': 'gpu',
        'verbosity': -1,
        'learning_rate': 0.1,
        'max_depth': trial.suggest_int("max_depth", 5, 15, step=5),
        'num_leaves': trial.suggest_categorical("num_leaves", [31, 45]),
        'subsample': 1.0,
        'colsample_bytree': 1.0,
        'n_estimators': trial.suggest_int("n_estimators", 1000, 8000, step=500)
    }

    # NOTE(review): with shuffle=False the folds are contiguous row blocks, but
    # every fold except the last still trains on rows that come after its
    # validation block. If the rows are time-ordered this leaks future data --
    # confirm whether a forward-chaining split is wanted instead.
    kf = KFold(n_splits=5, shuffle=False)
    rmses = []
    mape_scores = {col: [] for col in target_cols}

    for train_idx, val_idx in kf.split(X):
        # Select the fold's rows by integer position, then convert to pandas.
        X_train = X[train_idx].to_pandas()
        X_val = X[val_idx].to_pandas()
        y_train = y[train_idx].to_pandas()
        y_val = y[val_idx].to_pandas()

        preds = []
        true_vals = []

        # One independent regressor per forecast horizon.
        for col in target_cols:
            model = lgb.LGBMRegressor(**params)
            model.fit(X_train, y_train[col])
            preds.append(model.predict(X_val))
            true_vals.append(y_val[col].values)

        # Shape (n_validation_rows, n_targets) for both arrays.
        preds = np.stack(preds, axis=1)
        true_vals = np.stack(true_vals, axis=1)

        # RMSE for Optuna optimization: mean of the per-target RMSEs.
        # The `squared=False` keyword was removed from `mean_squared_error` in
        # recent scikit-learn releases, so take the square root of the
        # per-target MSEs explicitly; this works on old and new versions alike.
        per_target_mse = mean_squared_error(true_vals, preds, multioutput="raw_values")
        rmses.append(np.sqrt(per_target_mse).mean())

        # MAPE tracking (reporting only; not part of the optimized value).
        fold_mape = calculate_mape_per_target(true_vals, preds, target_cols)
        for col in target_cols:
            mape_scores[col].append(fold_mape[col])

    avg_mape_per_target = {col: np.mean(mape_scores[col]) for col in target_cols}
    trial_mape_history[trial.number] = avg_mape_per_target

    # Concise per-trial report.
    print(f"\nTrial {trial.number}")
    print(f"Params: {params}")
    print(f"Final RMSE: {np.mean(rmses):.4f}")
    print(f"Avg MAPE per target:")
    for col in target_cols:
        print(f"  {col}: {avg_mape_per_target[col]:.2f}%")
    print("-" * 40)

    return np.mean(rmses)


In [None]:
# Time the whole search so the cost of a full re-run is visible in the output.
start_time = time.time()

# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every Restart & Run All (an unseeded sampler proposes different trials each run).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)  # You can increase this to 50+

end_time = time.time()
print(f"‚è±Ô∏è Hyperparameter tuning completed in {end_time - start_time:.2f} seconds.")
print("‚úÖ Best Trial:", study.best_trial.params)


In [None]:
# One line per Optuna trial: x = target column, y = that trial's average MAPE.
fig, ax = plt.subplots(figsize=(14, 7))
for trial_num, per_target_mape in trial_mape_history.items():
    ax.plot(
        target_cols,
        [per_target_mape[col] for col in target_cols],
        label=f'Trial {trial_num}',
    )

ax.set_title("MAPE per Target Variable across Trials")
ax.set_ylabel("MAPE (%)")
ax.set_xlabel("Target Variable")
# Anchor the legend outside the axes so it does not cover the lines.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
plt.show()
