In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [None]:
# Load the two training frames from the local `datasets` directory.
# Forward slashes are used so the relative paths resolve on Windows and POSIX alike
# (the previous backslash-separated paths only worked on Windows).
df = pd.read_parquet("datasets/df_train_wclusters.parquet", engine='pyarrow')
df_weighted = pd.read_parquet("datasets/df_train_wclusters_weighted.parquet", engine='pyarrow')

In [3]:
# Preview the first five rows of the unweighted training frame.
df.head(n=5)

Unnamed: 0,STOCK,DATE,Sector,Industry,SIG_0,SIG_1,SIG_2,SIG_3,SIG_4,SIG_5,...,SIG_32,SIG_33,SIG_34,SIG_35,SIG_36,SIG_37,SIG_38,SIG_39,RET,CLUSTER
0,A,2020-03-30,Healthcare,Diagnostics & Research,1.0,11.0,-0.038863,-0.010217,60.5,-0.510401,...,0.002732,0.000254,0.000122,-1.5e-05,-6.135842e-07,0.000173,-1.825497e-06,-1.777726e-07,0.027428,3
1,A,2020-03-31,Healthcare,Diagnostics & Research,1.0,13.0,-0.06679,-0.006509,84.5,0.333487,...,-0.000634,3e-05,-0.002507,-8e-06,1.496576e-06,6.6e-05,5.322834e-07,-4.595116e-08,-0.014449,3
2,A,2020-04-01,Healthcare,Diagnostics & Research,1.0,13.0,0.007987,-0.008074,84.5,-0.178082,...,0.001056,0.000109,-0.001418,-2e-06,1.909471e-07,0.000219,-5.734002e-08,-8.770939e-08,-0.037699,3
3,A,2020-04-02,Healthcare,Diagnostics & Research,1.0,13.0,0.007781,-0.004409,84.5,-0.496324,...,0.001558,-2.2e-05,-0.000267,-4e-06,-2.70919e-08,0.000104,-3.363939e-07,-1.428739e-08,0.048897,3
4,A,2020-04-03,Healthcare,Diagnostics & Research,1.0,13.0,0.037151,-0.004974,84.5,0.601383,...,-0.00192,1.4e-05,-0.000157,-7e-06,-3.126028e-08,9.1e-05,5.925123e-07,-2.050992e-08,-0.025868,3


In [4]:
# Preview the first five rows of the weighted training frame.
df_weighted.head(n=5)

Unnamed: 0,STOCK,DATE,Sector,Industry,SIG_0,SIG_1,SIG_2,SIG_3,SIG_4,SIG_5,...,SIG_33,SIG_34,SIG_35,SIG_36,SIG_37,SIG_38,SIG_39,RET,adaptive_weight,CLUSTER
0,A,2020-03-30,Healthcare,Diagnostics & Research,1.0,11.0,-0.038863,-0.010217,60.5,-0.510401,...,0.000254,0.000122,-1.5e-05,-6.135842e-07,0.000173,-1.825497e-06,-1.777726e-07,0.027428,0.0,3
1,A,2020-03-31,Healthcare,Diagnostics & Research,1.0,13.0,-0.06679,-0.006509,84.5,0.333487,...,3e-05,-0.002507,-8e-06,1.496576e-06,6.6e-05,5.322834e-07,-4.595116e-08,-0.014449,1.127868e-15,3
2,A,2020-04-01,Healthcare,Diagnostics & Research,1.0,13.0,0.007987,-0.008074,84.5,-0.178082,...,0.000109,-0.001418,-2e-06,1.909471e-07,0.000219,-5.734002e-08,-8.770939e-08,-0.037699,7.355835e-05,3
3,A,2020-04-02,Healthcare,Diagnostics & Research,1.0,13.0,0.007781,-0.004409,84.5,-0.496324,...,-2.2e-05,-0.000267,-4e-06,-2.70919e-08,0.000104,-3.363939e-07,-1.428739e-08,0.048897,1.305433e-09,3
4,A,2020-04-03,Healthcare,Diagnostics & Research,1.0,13.0,0.037151,-0.004974,84.5,0.601383,...,1.4e-05,-0.000157,-7e-06,-3.126028e-08,9.1e-05,5.925123e-07,-2.050992e-08,-0.025868,9.3846e-08,3


In [15]:
import ridgeRegression_groupby_Apr1st

In [None]:
# Fit and evaluate ridge models per group on the unweighted frame `df`, print the
# per-group and overall metrics, and write one results CSV per grouping column.

# Parse the DATE column into a datetime `date` column on `df`.
# NOTE(review): presumably `train_test_split_by_time` reads `date` -- confirm in the helper module.
df['date'] = pd.to_datetime(df['DATE'])

print("\nSplitting dataset by time...")
# NOTE: despite the `df_weighted_*` names, this split is taken from the unweighted `df`.
# The names are kept unchanged so other cells that reference them keep working.
df_weighted_train, df_weighted_test = ridgeRegression_groupby_Apr1st.train_test_split_by_time(df, train_ratio=0.7)

# Define grouping columns to test, including CLUSTER
group_columns = ['CLUSTER']

# Result columns that are eligible for the printed report (besides the grouping column itself).
report_columns = ['Sector', 'Train_Samples', 'Test_Samples',
                  'Best_Alpha', 'MSE', 'RMSE', 'MAE', 'R2',
                  'Direction_Accuracy']

# Process each grouping column
for group_col in group_columns:
    print(f"\nStarting model training by {group_col}...")
    results_df, y_true_all, y_pred_all = ridgeRegression_groupby_Apr1st.ridge_regression_by_group(
        df_weighted_train, df_weighted_test, group_col
    )

    # Output results for current group column
    print(f"\nTraining results for each {group_col}:")
    # Keep only the report columns actually present, in the order they appear in results_df.
    available_cols = [col for col in results_df.columns
                      if col == group_col or col in report_columns]

    # The grouping column always leads; for Industry, its parent Sector is shown right after it.
    if group_col == 'Industry' and 'Sector' in available_cols:
        lead_cols = ['Industry', 'Sector']
    else:
        lead_cols = [group_col]
    display_cols = lead_cols + [col for col in available_cols if col not in lead_cols]

    print(results_df[display_cols])

    # Calculate overall evaluation metrics
    overall_metrics = ridgeRegression_groupby_Apr1st.evaluate_predictions(y_true_all, y_pred_all)
    print(f"\n{group_col} overall evaluation metrics:")
    for metric_name, metric_value in overall_metrics.items():
        print(f"{metric_name}: {metric_value:.4f}")

    # Save results
    # This must be an f-string: the previous raw string never interpolated the placeholder,
    # so every grouping column wrote to one file literally named "..._{group_col.lower()}.csv".
    # Forward slashes keep the relative path valid on both Windows and POSIX.
    output_file = f"datasets/ridge_regression_results_{group_col.lower()}.csv"
    results_df.to_csv(output_file, index=False)
    print(f"\nResults saved to: {output_file}")


Splitting dataset by time...
Training set end date: 2023-09-20 00:00:00
Training samples: 432182
Testing samples: 185222

Training set return statistics:
count    432182.000000
mean          0.000951
std           0.022293
min          -0.456763
25%          -0.010151
50%           0.000730
75%           0.011716
max           0.439739
Name: RET, dtype: float64

Testing set return statistics:
count    185222.000000
mean          0.000626
std           0.018865
min          -0.531402
25%          -0.008071
50%           0.000814
75%           0.009439
max           0.359363
Name: RET, dtype: float64

Starting model training by Sector...


Processing Sectors: 100%|██████████| 11/11 [00:06<00:00,  1.63it/s]



Training results for each Sector:
                    Sector  Train_Samples  Test_Samples   Best_Alpha  \
0        Consumer Cyclical          47115         20218  1000.000000   
1               Technology          72585         30828   215.443469   
2       Consumer Defensive          33393         14490   215.443469   
3               Healthcare          54499         23627  1000.000000   
4              Industrials          60427         25982  1000.000000   
5   Communication Services          19271          8173  1000.000000   
6       Financial Services          56051         23777  1000.000000   
7                   Energy          19272          8172  1000.000000   
8                Utilities          24049         10631  1000.000000   
9              Real Estate          26262         11148  1000.000000   
10         Basic Materials          19258          8176  1000.000000   

         MSE      RMSE       MAE        R2  Direction_Accuracy  
0   0.000399  0.019965  0.013474 -0

Processing Industrys: 100%|██████████| 110/110 [00:12<00:00,  8.54it/s]



Training results for each Industry:
                               Industry              Sector  Train_Samples  \
0                           Restaurants   Consumer Cyclical           6142   
1                        Semiconductors          Technology          13164   
2             Beverages - Non-Alcoholic  Consumer Defensive           5274   
3          Drug Manufacturers - General          Healthcare           9662   
4                       Discount Stores  Consumer Defensive           6146   
..                                  ...                 ...            ...   
105                                Gold     Basic Materials            876   
106              Footwear & Accessories   Consumer Cyclical           1751   
107  Furnishings, Fixtures & Appliances   Consumer Cyclical            875   
108               Utilities - Renewable           Utilities            409   
109         Health Information Services          Healthcare            180   

     Test_Samples   Best_A

Processing CLUSTERs: 100%|██████████| 9/9 [00:03<00:00,  2.95it/s]


Training results for each CLUSTER:
   CLUSTER  Train_Samples  Test_Samples  Best_Alpha       MSE      RMSE  \
0        1          35222         14858        10.0  0.000180  0.013407   
1        3         200508         85055      1000.0  0.000265  0.016284   
2        9         128689         54614      1000.0  0.000370  0.019248   
3        0          42024         17832      1000.0  0.000688  0.026223   
4        8            875           372      1000.0  0.006766  0.082258   
5        6           4379          1856      1000.0  0.000756  0.027487   
6        7          18386          7801      1000.0  0.000391  0.019785   
7        4           1834          1115      1000.0  0.001297  0.036007   
8        2            265          1204      1000.0  0.000483  0.021985   

        MAE        R2  Direction_Accuracy  
0  0.008793  0.304589            0.676807  
1  0.010934 -0.005796            0.510999  
2  0.012668 -0.060300            0.515802  
3  0.017671 -0.003971            0.51




In [None]:
# Fit and evaluate ridge models per CLUSTER on the weighted frame `df_weighted` using the
# adaptive-weight variant of the helper, print the metrics, and write the results CSV.

# Parse the DATE column of `df_weighted` itself into a datetime `date` column.
# The previous version read `df['DATE']` from the other frame, which only gave the right
# values if both frames happened to share the same index and row order.
# NOTE(review): presumably `train_test_split_by_time` reads `date` -- confirm in the helper module.
df_weighted['date'] = pd.to_datetime(df_weighted['DATE'])

print("\nSplitting dataset by time...")
df_weighted_train, df_weighted_test = ridgeRegression_groupby_Apr1st.train_test_split_by_time(df_weighted, train_ratio=0.7)

# Set grouping column to "CLUSTER" only
group_col = "CLUSTER"

print(f"\nStarting model training by {group_col}...")
results_df, y_true_all, y_pred_all = ridgeRegression_groupby_Apr1st.ridge_regression_by_group_adaptive_weight(
    df_weighted_train, df_weighted_test, group_col
)

# Output results for CLUSTER grouping
print(f"\nTraining results for each {group_col}:")
display_cols = [group_col, 'Train_Samples', 'Test_Samples', 'Best_Alpha', 'MSE', 'RMSE', 'MAE', 'R2', 'Direction_Accuracy']
print(results_df[display_cols])

# Calculate overall evaluation metrics
overall_metrics = ridgeRegression_groupby_Apr1st.evaluate_predictions(y_true_all, y_pred_all)
print(f"\n{group_col} overall evaluation metrics:")
for metric_name, metric_value in overall_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")


# Save results
# This must be an f-string: the previous raw string never interpolated the placeholder,
# so the file was literally named "..._{group_col.lower()}.csv".
# Forward slashes keep the relative path valid on both Windows and POSIX.
output_file = f"datasets/ridge_regression_weight_results_{group_col.lower()}.csv"
results_df.to_csv(output_file, index=False)
print(f"\nResults saved to: {output_file}")


Splitting dataset by time...
Training set end date: 2023-09-20 00:00:00
Training samples: 432182
Testing samples: 185222

Training set return statistics:
count    432182.000000
mean          0.000951
std           0.022293
min          -0.456763
25%          -0.010151
50%           0.000730
75%           0.011716
max           0.439739
Name: RET, dtype: float64

Testing set return statistics:
count    185222.000000
mean          0.000626
std           0.018865
min          -0.531402
25%          -0.008071
50%           0.000814
75%           0.009439
max           0.359363
Name: RET, dtype: float64

Starting model training by CLUSTER...


Processing CLUSTERs: 100%|██████████| 9/9 [00:06<00:00,  1.41it/s]


Training results for each CLUSTER:
   CLUSTER  Train_Samples  Test_Samples   Best_Alpha       MSE      RMSE  \
0        1          35222         14858    10.000000  0.000194  0.013930   
1        3         200508         85055  1000.000000  0.000264  0.016238   
2        9         128689         54614  1000.000000  0.000351  0.018738   
3        0          42024         17832  1000.000000  0.000686  0.026189   
4        8            875           372   215.443469  0.005112  0.071498   
5        6           4379          1856  1000.000000  0.000756  0.027504   
6        7          18386          7801   215.443469  0.000390  0.019751   
7        4           1834          1115   215.443469  0.001261  0.035512   
8        2            265          1204  1000.000000  0.000480  0.021899   

        MAE        R2  Direction_Accuracy  
0  0.009096  0.249192            0.674451  
1  0.010897 -0.000152            0.526295  
2  0.012565 -0.004842            0.524371  
3  0.017650 -0.001342      


