In [13]:
%%capture
%run Data_Cleaning.ipynb

In [14]:
random_seed = 42
training_size = 0.85
validation_size = 0.1

# Get unique month periods
month_periods = sorted(three_ff_cleaned_df['Date'].dt.to_period('M').unique())
# Split size calculation
total_periods = three_ff_cleaned_df['Date'].dt.to_period('M').nunique()
train_size = int(total_periods * training_size)
val_size = int(total_periods * validation_size)
test_size = total_periods - train_size - val_size

# print(month_periods)
print("train_size is :", train_size, "months")
print("val_size is :", val_size, "months")
print("test_size is :", test_size, "months")

# Define training and testing date boundaries
train_cutoff = pd.to_datetime("2023-12-31")
test_start_date = pd.to_datetime("2024-01-01")
test_end_date = pd.to_datetime("2024-12-31")

train_size is : 182 months
val_size is : 21 months
test_size is : 12 months


In [15]:
# Preview the 3ff result
three_ff_cleaned_df = three_ff_cleaned_df.sort_values('Date').reset_index(drop=True)
three_ff_cleaned_df.head(4)

Unnamed: 0,Date,AAPL.O,NVDA.O,MSFT.O,AMZN.O,LLY,WMT,XOM,MA,UNH,...,ASTI.O,AWH.O,SUNE.O,TCRT.O,WINT.O,DGLY.O,Mkt-RF,SMB,HML,RF
0,2007-02-28,-1.315037,1.135388,-9.120327,3.828094,-2.867782,1.291682,-3.320376,-3.996326,-0.153198,...,36.080434,-27.996003,4.710145,-0.564442,-13.182539,-16.907633,-1.96,1.19,-0.14,0.38
1,2007-03-31,9.357873,-7.430645,-1.070674,1.647066,2.107322,-2.855538,5.125842,-0.880898,1.502644,...,66.263841,7.410797,-1.991531,-3.650742,7.443259,14.090545,0.68,0.16,-0.97,0.43
2,2007-04-30,7.153685,13.348787,7.164454,43.26535,9.613921,2.044975,5.077626,4.993695,0.169763,...,12.77838,2.817088,7.561204,4.776457,22.5672,31.585295,3.49,-2.16,-1.45,0.44
3,2007-05-31,19.419975,5.181152,2.474475,11.986431,-0.866247,-0.670016,4.664026,29.199315,3.171925,...,-26.859959,-20.763936,-13.074411,-1.125715,13.226797,-0.417537,3.24,0.24,-0.65,0.41


### Read tuned model files for 3ff

In [16]:
# Read the Excel file
tuned_3ff_df = pd.read_excel(r"Output\models\3ff_tuned_para.xlsx")
tuned_3ff_df.head(3)

Unnamed: 0,Stock,Ridge,Lasso,ElasticNet,SVR,DecisionTree,XGBoost
0,AAPL.O,α=100.0,α=10.0,"α=10.0, l1=0.5","C=0.1, ε=0.1",depth=2,"est=200, lr=0.01"
1,NVDA.O,α=100.0,α=10.0,"α=10.0, l1=0.5","C=1, ε=0.01",depth=2,"est=200, lr=0.01"
2,MSFT.O,α=100.0,α=1.0,"α=1.0, l1=0.9","C=0.1, ε=0.01",depth=2,"est=200, lr=0.01"


### 3ff model prediction

In [21]:
# --------------------
# Start global timer
# --------------------
start_time = time.time()

# Inputs
returns_df = three_ff_cleaned_df.copy()
params_df = tuned_3ff_df.copy()

# FF factors
ff_factors = ['Mkt-RF', 'SMB', 'HML', 'RF']
returns_df['Date'] = pd.to_datetime(returns_df['Date'])

# Outputs
rmse_results = []
all_predictions = []

# Get stock list
stock_columns = [col for col in returns_df.columns if col not in ['Date'] + ff_factors]

for idx, stock in enumerate(stock_columns, start=1):
    print(f"\n🔁 Processing stock: {stock}")
    
    stock_data = returns_df[['Date'] + ff_factors + [stock]].copy()
    stock_data['Stock_Return'] = stock_data[stock]
    stock_data['Excess_Mkt_Return'] = stock_data['Mkt-RF'] - stock_data['RF']

    stock_data = stock_data[['Date', 'Stock_Return', 'Excess_Mkt_Return', 'SMB', 'HML', 'RF']]
    stock_data['Stock_Return_Shifted'] = stock_data['Stock_Return'].shift(-1)
    stock_data = stock_data.dropna().reset_index(drop=True)
    print(len(stock_data))

    X_full = stock_data[['Excess_Mkt_Return', 'SMB', 'HML', 'RF']].values
    y_full = stock_data['Stock_Return_Shifted'].values
    dates = stock_data['Date'].values

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_full)

    row = params_df[params_df['Stock'] == stock]
    if row.empty:
        continue

    try:
        models = {}

        # Tuned models
        models['Ridge'] = Ridge(alpha=float(re.findall(r"α=(\d+\.?\d*)", row['Ridge'].values[0])[0]))
        models['Lasso'] = Lasso(alpha=float(re.findall(r"α=(\d+\.?\d*)", row['Lasso'].values[0])[0]))
        
        en_alpha, en_l1 = re.findall(r"α=(\d+\.?\d*), l1=(\d+\.?\d*)", row['ElasticNet'].values[0])[0]
        models['ElasticNet'] = ElasticNet(alpha=float(en_alpha), l1_ratio=float(en_l1))
        
        svr_C, svr_eps = re.findall(r"C=(\d+\.?\d*), ε=([\d\.-]+)", row['SVR'].values[0])[0]
        models['SVR'] = SVR(C=float(svr_C), epsilon=float(svr_eps))
        
        dt_depth = int(re.findall(r"depth=(\d+)", row['DecisionTree'].values[0])[0])
        models['DecisionTree'] = DecisionTreeRegressor(max_depth=dt_depth)
        
        xgb_est, xgb_lr = re.findall(r"est=(\d+), lr=([\d\.]+)", row['XGBoost'].values[0])[0]
        models['XGBoost'] = XGBRegressor(n_estimators=int(xgb_est), learning_rate=float(xgb_lr), verbosity=0)

        # Benchmark model
        models['OLS'] = LinearRegression()

    except Exception as e:
        print(f"⚠️ Error parsing hyperparameters for {stock}: {e}")
        continue

    preds_by_model = {m: [] for m in models}
    actuals = []

    for i in range(len(stock_data)):
        current_date = stock_data.loc[i, 'Date']
        if test_start_date <= current_date <= test_end_date:
            train_mask = stock_data['Date'] < current_date
            test_mask = stock_data['Date'] == current_date

            X_train = X_scaled[train_mask]
            y_train = y_full[train_mask]
            X_test = X_scaled[test_mask]
            y_test = y_full[test_mask]

            if len(X_train) == 0 or X_test.shape[0] == 0:
                continue

            actual = y_test[0]

            for name, model in models.items():
                try:
                    model.fit(X_train, y_train)
                    pred = model.predict(X_test)[0]
                    preds_by_model[name].append(pred)
                    all_predictions.append({
                        'Stock': stock,
                        'Date': current_date,
                        'Model': name,
                        'Actual': actual,
                        'Predicted': pred
                    })
                except Exception as e:
                    print(f"⚠️ Prediction error for {stock} - {name}: {e}")
                    continue

            actuals.append(actual)

    for name in preds_by_model:
        pred_arr = np.array(preds_by_model[name])
        if len(pred_arr) == len(actuals) and len(actuals) > 0:
            rmse = mean_squared_error(actuals, pred_arr, squared=False)
            rmse_results.append({
                'Stock': stock,
                'Model': name,
                'RMSE': rmse
            })

# Convert results
rmse_df = pd.DataFrame(rmse_results)
predictions_df = pd.DataFrame(all_predictions)

# --------------------
# End global timer
# --------------------
end_time = time.time()
print(f"\n✅ Total Time: {(end_time - start_time)/60:.2f} minutes")



🔁 Processing stock: AAPL.O
214

🔁 Processing stock: NVDA.O
214

🔁 Processing stock: MSFT.O
214

🔁 Processing stock: AMZN.O
214

🔁 Processing stock: LLY
214

🔁 Processing stock: WMT
214

🔁 Processing stock: XOM
214

🔁 Processing stock: MA
214

🔁 Processing stock: UNH
214

🔁 Processing stock: ORCL.K
214

🔁 Processing stock: COST.O
214

🔁 Processing stock: NFLX.O
214

🔁 Processing stock: HD
214

🔁 Processing stock: CVX
214

🔁 Processing stock: CRM
214

🔁 Processing stock: CSCO.O
214

🔁 Processing stock: MRK
214

🔁 Processing stock: ABT
214

🔁 Processing stock: MCD
214

🔁 Processing stock: TMO
214

🔁 Processing stock: ISRG.O
214

🔁 Processing stock: RTX
214

🔁 Processing stock: QCOM.O
214

🔁 Processing stock: ADBE.O
214

🔁 Processing stock: AMGN.O
214

🔁 Processing stock: INTU.O
214

🔁 Processing stock: AMD.O
214

🔁 Processing stock: CAT
214

🔁 Processing stock: TXN.O
214

🔁 Processing stock: NEE
214

🔁 Processing stock: DHR
214

🔁 Processing stock: PFE
214

🔁 Processing stock: BLK
214

🔁


🔁 Processing stock: HALO.O
214

🔁 Processing stock: GAP
214

🔁 Processing stock: EXLS.O
214

🔁 Processing stock: ATI
214

🔁 Processing stock: TTC
214

🔁 Processing stock: ZION.O
214

🔁 Processing stock: BIO
214

🔁 Processing stock: MHK
214

🔁 Processing stock: APA.O
214

🔁 Processing stock: CVLT.O
214

🔁 Processing stock: AGCO.K
214

🔁 Processing stock: BRKR.O
214

🔁 Processing stock: RLI
214

🔁 Processing stock: IVZ
214

🔁 Processing stock: ONTO.K
214

🔁 Processing stock: CWST.O
214

🔁 Processing stock: QRVO.O
214

🔁 Processing stock: AAON.O
214

🔁 Processing stock: ALK
214

🔁 Processing stock: UFPI.O
214

🔁 Processing stock: FLS
214

🔁 Processing stock: SNV
214

🔁 Processing stock: TGTX.O
214

🔁 Processing stock: TFX
214

🔁 Processing stock: CELH.O
214

🔁 Processing stock: EAT
214

🔁 Processing stock: PEGA.O
214

🔁 Processing stock: OSK
214

🔁 Processing stock: LNC
214

🔁 Processing stock: AWI
214

🔁 Processing stock: CHH
214

🔁 Processing stock: WEX
214

🔁 Processing stock: CORT.O



🔁 Processing stock: OPK.O
214

🔁 Processing stock: SAFT.O
214

🔁 Processing stock: RES
214

🔁 Processing stock: UCTT.O
214

🔁 Processing stock: GYRE.O
214

🔁 Processing stock: MODG.K
214

🔁 Processing stock: TILE.O
214

🔁 Processing stock: MSEX.O
214

🔁 Processing stock: MRTN.O
214

🔁 Processing stock: PRG
214

🔁 Processing stock: INVA.O
214

🔁 Processing stock: DGII.O
214

🔁 Processing stock: APOG.O
214

🔁 Processing stock: GERN.O
214

🔁 Processing stock: CLMT.O
214

🔁 Processing stock: BBSI.O
214

🔁 Processing stock: WGO
214

🔁 Processing stock: AMN
214

🔁 Processing stock: AORT.K
214

🔁 Processing stock: ALGT.O
214

🔁 Processing stock: SBGI.O
214

🔁 Processing stock: GRC
214

🔁 Processing stock: AMSF.O
214

🔁 Processing stock: KFRC.K
214

🔁 Processing stock: BELFA.O
214

🔁 Processing stock: THRM.O
214

🔁 Processing stock: LQDT.O
214

🔁 Processing stock: MYGN.O
214

🔁 Processing stock: NSSC.O
214

🔁 Processing stock: UTL
214

🔁 Processing stock: WOLF.K
214

🔁 Processing stock: KSS
2


🔁 Processing stock: IVAC.O
214

🔁 Processing stock: PLCE.O
214

🔁 Processing stock: LTRX.O
214

🔁 Processing stock: PPIH.O
214

🔁 Processing stock: GNSS.O
214

🔁 Processing stock: GALT.O
214

🔁 Processing stock: GAIA.O
214

🔁 Processing stock: KVHI.O
214

🔁 Processing stock: HURC.O
214

🔁 Processing stock: NOTV.O
214

🔁 Processing stock: DXLG.O
214

🔁 Processing stock: TAYD.O
214

🔁 Processing stock: PCG_pa
214

🔁 Processing stock: GEOS.O
214

🔁 Processing stock: SLNG.O
214

🔁 Processing stock: ORMP.O
214

🔁 Processing stock: INTT.K
214

🔁 Processing stock: ACHV.O
214

🔁 Processing stock: ANIX.O
214

🔁 Processing stock: ZYXI.O
214

🔁 Processing stock: QUIK.O
214

🔁 Processing stock: BRID.O
214

🔁 Processing stock: UEIC.O
214

🔁 Processing stock: QMCO.O
214

🔁 Processing stock: LPSN.O
214

🔁 Processing stock: FONR.O
214

🔁 Processing stock: MCHX.O
214

🔁 Processing stock: LPTH.O
214

🔁 Processing stock: HGBL.O
214

🔁 Processing stock: PETS.O
214

🔁 Processing stock: DIT
214

🔁 Processi

In [22]:
predictions_df

Unnamed: 0,Stock,Date,Model,Actual,Predicted
0,AAPL.O,2024-01-31,Ridge,-1.999245,2.727184
1,AAPL.O,2024-01-31,Lasso,-1.999245,2.025251
2,AAPL.O,2024-01-31,ElasticNet,-1.999245,2.025251
3,AAPL.O,2024-01-31,SVR,-1.999245,3.228082
4,AAPL.O,2024-01-31,DecisionTree,-1.999245,1.916913
...,...,...,...,...,...
75070,DGLY.O,2024-11-30,ElasticNet,-39.532439,-2.695937
75071,DGLY.O,2024-11-30,SVR,-39.532439,-2.580654
75072,DGLY.O,2024-11-30,DecisionTree,-39.532439,-4.964388
75073,DGLY.O,2024-11-30,XGBoost,-39.532439,-4.732873


In [23]:
# Write results to Excel
rmse_df.to_excel("3ff_rmse_results.xlsx", index=False)
predictions_df.to_excel("3ff_predictions_results.xlsx", index=False)

In [24]:
# Identify the best model per stock (lowest RMSE)
best_models = rmse_df.loc[rmse_df.groupby("Stock")["RMSE"].idxmin()]

# Count how many times each model was selected as best
best_model_counts = best_models["Model"].value_counts().reset_index()
best_model_counts.columns = ["Model", "Count"]
best_model_counts.head(10)

Unnamed: 0,Model,Count
0,SVR,225
1,Lasso,205
2,XGBoost,165
3,OLS,162
4,DecisionTree,146
5,Ridge,46
6,ElasticNet,26


In [25]:
merged_data.head(4)

Unnamed: 0,Period,RIC,Closing Price,SIC Industry Code,SIC Industry Name,Company Market Cap,Industry
0,2025-01-31,AAPL.O,236.0,3571.0,Electronic Computers,3210436000000.0,Hi-Tech
1,2024-12-31,AAPL.O,250.42,3571.0,Electronic Computers,3210436000000.0,Hi-Tech
2,2024-11-30,AAPL.O,237.33,3571.0,Electronic Computers,3210436000000.0,Hi-Tech
3,2024-10-31,AAPL.O,225.91,3571.0,Electronic Computers,3210436000000.0,Hi-Tech


In [26]:
rmse_df.head(3)

Unnamed: 0,Stock,Model,RMSE
0,AAPL.O,Ridge,5.358828
1,AAPL.O,Lasso,5.111489
2,AAPL.O,ElasticNet,5.111489


In [27]:
# Step 1: Get industry info per stock (use latest entry only to avoid duplication)
industry_info = merged_data[['RIC', 'Industry']].drop_duplicates(subset='RIC')

# Step 2: Merge with RMSE results
rmse_with_industry = rmse_df.merge(industry_info, left_on='Stock', right_on='RIC', how='left')

# Step 3: Group by Industry and Model to compute average RMSE
avg_rmse_by_industry = (
    rmse_with_industry
    .groupby(['Industry', 'Model'])['RMSE']
    .mean()
    .reset_index()
    .sort_values(['Industry', 'RMSE'])
)
avg_rmse_by_industry.to_excel("avg_rmse_3ff_by_industry.xlsx", index=False)

### Read tuned model files for 5ff

In [28]:
# Preview the 3ff result
five_ff_cleaned_df = five_ff_cleaned_df.sort_values('Date').reset_index(drop=True)
five_ff_cleaned_df.head(3)

Unnamed: 0,Date,AAPL.O,NVDA.O,MSFT.O,AMZN.O,LLY,WMT,XOM,MA,UNH,...,SUNE.O,TCRT.O,WINT.O,DGLY.O,Mkt-RF,SMB,HML,RMW,CMA,RF
0,2007-02-28,-1.315037,1.135388,-9.120327,3.828094,-2.867782,1.291682,-3.320376,-3.996326,-0.153198,...,4.710145,-0.564442,-13.182539,-16.907633,-1.96,1.29,-0.14,-0.51,-0.71,0.38
1,2007-03-31,9.357873,-7.430645,-1.070674,1.647066,2.107322,-2.855538,5.125842,-0.880898,1.502644,...,-1.991531,-3.650742,7.443259,14.090545,0.68,0.2,-0.97,0.64,-0.65,0.43
2,2007-04-30,7.153685,13.348787,7.164454,43.26535,9.613921,2.044975,5.077626,4.993695,0.169763,...,7.561204,4.776457,22.5672,31.585295,3.49,-2.04,-1.45,1.15,1.03,0.44


In [31]:
# Read the Excel file
tuned_5ff_df = pd.read_excel(r"Output\models\5ff_tuned_para.xlsx")
tuned_5ff_df.head(3)

Unnamed: 0,Stock,Ridge,Lasso,ElasticNet,SVR,DecisionTree,XGBoost
0,AAPL.O,α=100.0,α=10.0,"α=10.0, l1=0.5","C=10, ε=0.01",depth=2,"est=200, lr=0.01"
1,NVDA.O,α=100.0,α=1.0,"α=1.0, l1=0.9","C=10, ε=0.01",depth=2,"est=50, lr=0.1"
2,MSFT.O,α=100.0,α=1.0,"α=1.0, l1=0.9","C=0.1, ε=0.1",depth=2,"est=100, lr=0.05"


In [33]:
# --------------------
# Start global timer
# --------------------
start_time = time.time()

# Inputs
returns_df = five_ff_cleaned_df.copy()
params_df = tuned_5ff_df.copy()

# FF factors
ff_factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
returns_df['Date'] = pd.to_datetime(returns_df['Date'])

# Outputs
rmse_results = []
all_predictions = []

# Get stock list
stock_columns = [col for col in returns_df.columns if col not in ['Date'] + ff_factors]

for idx, stock in enumerate(stock_columns, start=1):
    print(f"\n🔁 Processing stock: {stock}")
    
    stock_data = returns_df[['Date'] + ff_factors + [stock]].copy()
    stock_data['Stock_Return'] = stock_data[stock]
    stock_data['Excess_Mkt_Return'] = stock_data['Mkt-RF'] - stock_data['RF']

    stock_data = stock_data[['Date', 'Stock_Return', 'Excess_Mkt_Return', 'SMB', 'HML', 'RMW', 'CMA', 'RF']]
    stock_data['Stock_Return_Shifted'] = stock_data['Stock_Return'].shift(-1)
    stock_data = stock_data.dropna().reset_index(drop=True)

    X_full = stock_data[['Excess_Mkt_Return', 'SMB', 'HML', 'RMW', 'CMA', 'RF']].values
    y_full = stock_data['Stock_Return_Shifted'].values
    dates = stock_data['Date'].values

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_full)

    row = params_df[params_df['Stock'] == stock]
    if row.empty:
        continue

    try:
        models = {}

        # Tuned models
        models['Ridge'] = Ridge(alpha=float(re.findall(r"α=(\d+\.?\d*)", row['Ridge'].values[0])[0]))
        models['Lasso'] = Lasso(alpha=float(re.findall(r"α=(\d+\.?\d*)", row['Lasso'].values[0])[0]))
        
        en_alpha, en_l1 = re.findall(r"α=(\d+\.?\d*), l1=(\d+\.?\d*)", row['ElasticNet'].values[0])[0]
        models['ElasticNet'] = ElasticNet(alpha=float(en_alpha), l1_ratio=float(en_l1))
        
        svr_C, svr_eps = re.findall(r"C=(\d+\.?\d*), ε=([\d\.-]+)", row['SVR'].values[0])[0]
        models['SVR'] = SVR(C=float(svr_C), epsilon=float(svr_eps))
        
        dt_depth = int(re.findall(r"depth=(\d+)", row['DecisionTree'].values[0])[0])
        models['DecisionTree'] = DecisionTreeRegressor(max_depth=dt_depth)
        
        xgb_est, xgb_lr = re.findall(r"est=(\d+), lr=([\d\.]+)", row['XGBoost'].values[0])[0]
        models['XGBoost'] = XGBRegressor(n_estimators=int(xgb_est), learning_rate=float(xgb_lr), verbosity=0)

        # Benchmark model
        models['OLS'] = LinearRegression()

    except Exception as e:
        print(f"⚠️ Error parsing hyperparameters for {stock}: {e}")
        continue

    preds_by_model = {m: [] for m in models}
    actuals = []

    for i in range(len(stock_data)):
        current_date = stock_data.loc[i, 'Date']
        if test_start_date <= current_date <= test_end_date:
            train_mask = stock_data['Date'] < current_date
            test_mask = stock_data['Date'] == current_date

            X_train = X_scaled[train_mask]
            y_train = y_full[train_mask]
            X_test = X_scaled[test_mask]
            y_test = y_full[test_mask]

            if len(X_train) == 0 or X_test.shape[0] == 0:
                continue

            actual = y_test[0]

            for name, model in models.items():
                try:
                    model.fit(X_train, y_train)
                    pred = model.predict(X_test)[0]
                    preds_by_model[name].append(pred)
                    all_predictions.append({
                        'Stock': stock,
                        'Date': current_date,
                        'Model': name,
                        'Actual': actual,
                        'Predicted': pred
                    })
                except Exception as e:
                    print(f"⚠️ Prediction error for {stock} - {name}: {e}")
                    continue

            actuals.append(actual)

    for name in preds_by_model:
        pred_arr = np.array(preds_by_model[name])
        if len(pred_arr) == len(actuals) and len(actuals) > 0:
            rmse = mean_squared_error(actuals, pred_arr, squared=False)
            rmse_results.append({
                'Stock': stock,
                'Model': name,
                'RMSE': rmse
            })

# Convert results
rmse_5ff_df = pd.DataFrame(rmse_results)
predictions_5ff_df = pd.DataFrame(all_predictions)

# --------------------
# End global timer
# --------------------
end_time = time.time()
print(f"\n✅ Total Time: {(end_time - start_time)/60:.2f} minutes")



🔁 Processing stock: AAPL.O

🔁 Processing stock: NVDA.O

🔁 Processing stock: MSFT.O

🔁 Processing stock: AMZN.O

🔁 Processing stock: LLY

🔁 Processing stock: WMT

🔁 Processing stock: XOM

🔁 Processing stock: MA

🔁 Processing stock: UNH

🔁 Processing stock: ORCL.K

🔁 Processing stock: COST.O

🔁 Processing stock: NFLX.O

🔁 Processing stock: HD

🔁 Processing stock: CVX

🔁 Processing stock: CRM

🔁 Processing stock: CSCO.O

🔁 Processing stock: MRK

🔁 Processing stock: ABT

🔁 Processing stock: MCD

🔁 Processing stock: TMO

🔁 Processing stock: ISRG.O

🔁 Processing stock: RTX

🔁 Processing stock: QCOM.O

🔁 Processing stock: ADBE.O

🔁 Processing stock: AMGN.O

🔁 Processing stock: INTU.O

🔁 Processing stock: AMD.O

🔁 Processing stock: CAT

🔁 Processing stock: TXN.O

🔁 Processing stock: NEE

🔁 Processing stock: DHR

🔁 Processing stock: PFE

🔁 Processing stock: BLK

🔁 Processing stock: UNP

🔁 Processing stock: BSX

🔁 Processing stock: SYK

🔁 Processing stock: GILD.O

🔁 Processing stock: HON.O

🔁 P


🔁 Processing stock: CACC.O

🔁 Processing stock: BYD

🔁 Processing stock: DDS

🔁 Processing stock: IDCC.O

🔁 Processing stock: ACIW.O

🔁 Processing stock: USM

🔁 Processing stock: CROX.O

🔁 Processing stock: GNTX.O

🔁 Processing stock: CRVL.O

🔁 Processing stock: ARW

🔁 Processing stock: NOV

🔁 Processing stock: RHI

🔁 Processing stock: SKY

🔁 Processing stock: CRK

🔁 Processing stock: OPCH.O

🔁 Processing stock: CGNX.O

🔁 Processing stock: LSTR.O

🔁 Processing stock: CRUS.O

🔁 Processing stock: CMC

🔁 Processing stock: CYTK.O

🔁 Processing stock: QXO

🔁 Processing stock: IONS.O

🔁 Processing stock: TKR

🔁 Processing stock: LFUS.O

🔁 Processing stock: NXST.O

🔁 Processing stock: LAZ

🔁 Processing stock: MTH

🔁 Processing stock: TXNM.K

🔁 Processing stock: LUMN.K

🔁 Processing stock: POR

🔁 Processing stock: AMKR.O

🔁 Processing stock: NSIT.O

🔁 Processing stock: AMG

🔁 Processing stock: ITRI.O

🔁 Processing stock: FSS

🔁 Processing stock: HXL

🔁 Processing stock: FELE.O

🔁 Processing s


🔁 Processing stock: OPY

🔁 Processing stock: REPX.K

🔁 Processing stock: CVLG.K

🔁 Processing stock: OSPN.O

🔁 Processing stock: FWRD.O

🔁 Processing stock: SCVL.O

🔁 Processing stock: JACK.O

🔁 Processing stock: SENEA.O

🔁 Processing stock: WLDN.O

🔁 Processing stock: CASS.O

🔁 Processing stock: SMP

🔁 Processing stock: CTLP.O

🔁 Processing stock: FARO.O

🔁 Processing stock: EBF

🔁 Processing stock: USNA.K

🔁 Processing stock: SCHL.O

🔁 Processing stock: DJCO.O

🔁 Processing stock: NPKI.K

🔁 Processing stock: AXL

🔁 Processing stock: CMCO.O

🔁 Processing stock: SLP.O

🔁 Processing stock: CRDb

🔁 Processing stock: TWI

🔁 Processing stock: HZO

🔁 Processing stock: CLMB.O

🔁 Processing stock: MLR

🔁 Processing stock: CCRN.O

🔁 Processing stock: YORW.O

🔁 Processing stock: BBW

🔁 Processing stock: VLGEA.O

🔁 Processing stock: LGTY.O

🔁 Processing stock: PBT

🔁 Processing stock: MYE

🔁 Processing stock: WNC

🔁 Processing stock: GTN

🔁 Processing stock: KELYA.O

🔁 Processing stock: ALRS.O



🔁 Processing stock: NSYS.O

🔁 Processing stock: OCC.O

🔁 Processing stock: NTWK.O

🔁 Processing stock: INTG.O

🔁 Processing stock: DRRX.O

🔁 Processing stock: IGC

🔁 Processing stock: TRT

🔁 Processing stock: ENZ

🔁 Processing stock: GTIM.O

🔁 Processing stock: TENX.O

🔁 Processing stock: CKX

🔁 Processing stock: CRIS.O

🔁 Processing stock: CVM

🔁 Processing stock: CYTH.O

🔁 Processing stock: DAIO.O

🔁 Processing stock: JOB

🔁 Processing stock: RVP

🔁 Processing stock: INTZ.O

🔁 Processing stock: NAII.O

🔁 Processing stock: CVV.O

🔁 Processing stock: LSTA.O

🔁 Processing stock: PTN

🔁 Processing stock: CREX.O

🔁 Processing stock: MXC

🔁 Processing stock: MTEX.O

🔁 Processing stock: NNVC.K

🔁 Processing stock: IRIX.O

🔁 Processing stock: AMS

🔁 Processing stock: SIF

🔁 Processing stock: BRN

🔁 Processing stock: NEPH.O

🔁 Processing stock: HUSA.K

🔁 Processing stock: CLRO.O

🔁 Processing stock: COHN.K

🔁 Processing stock: ELSE.O

🔁 Processing stock: MEIP.O

🔁 Processing stock: TAIT.O

🔁

In [36]:
# Write results to Excel
rmse_5ff_df.to_excel("5ff_rmse_results.xlsx", index=False)
predictions_5ff_df.to_excel("5ff_predictions_results.xlsx", index=False)

In [37]:
# Step 1: Get industry info per stock (use latest entry only to avoid duplication)
industry_info = merged_data[['RIC', 'Industry']].drop_duplicates(subset='RIC')

# Step 2: Merge with RMSE results
rmse_with_industry = rmse_5ff_df.merge(industry_info, left_on='Stock', right_on='RIC', how='left')

# Step 3: Group by Industry and Model to compute average RMSE
avg_rmse_5ff_by_industry = (
    rmse_with_industry
    .groupby(['Industry', 'Model'])['RMSE']
    .mean()
    .reset_index()
    .sort_values(['Industry', 'RMSE'])
)
avg_rmse_5ff_by_industry.to_excel("avg_rmse_5ff_by_industry.xlsx", index=False)


#Doing by avg rmse by industry for all models

# Step 1: Rename RMSE columns to identify the factor model
avg_rmse_by_industry_3f_renamed = avg_rmse_by_industry.rename(columns={'RMSE': 'RMSE (3FF)'})
avg_rmse_5ff_by_industry_renamed = avg_rmse_5ff_by_industry.rename(columns={'RMSE': 'RMSE (5FF)'})

# Step 2: Merge on Industry and Model
rmse_comparison = pd.merge(
    avg_rmse_by_industry_3f_renamed,
    avg_rmse_5ff_by_industry_renamed,
    on=['Industry', 'Model'],
    how='outer'  # in case one side is missing models
)
rmse_comparison
# Step 3: Sort and display
rmse_comparison = rmse_comparison.sort_values(by=['Industry', 'Model'])
rmse_comparison.to_excel("rmse_comparison.xlsx", index=False)

In [38]:
# doing by count here

# Step 1: Get best model per stock for 3FF
best_3ff = rmse_df.loc[rmse_df.groupby("Stock")["RMSE"].idxmin()]
count_3ff = best_3ff["Model"].value_counts().reset_index()
count_3ff.columns = ["Model", "Count (3FF)"]

# Step 2: Get best model per stock for 5FF
best_5ff = rmse_5ff_df.loc[rmse_5ff_df.groupby("Stock")["RMSE"].idxmin()]
count_5ff = best_5ff["Model"].value_counts().reset_index()
count_5ff.columns = ["Model", "Count (5FF)"]

# Step 3: Merge the two counts side-by-side
best_model_counts_combined = pd.merge(count_3ff, count_5ff, on="Model", how="outer").fillna(0)
best_model_counts_combined[["Count (3FF)", "Count (5FF)"]] = best_model_counts_combined[["Count (3FF)", "Count (5FF)"]].astype(int)

# Optional: Sort by total count or model name
best_model_counts_combined = best_model_counts_combined.sort_values(by="Model")
best_model_counts_combined

Unnamed: 0,Model,Count (3FF),Count (5FF)
4,DecisionTree,146,133
6,ElasticNet,26,23
1,Lasso,205,178
3,OLS,162,209
5,Ridge,46,59
0,SVR,225,213
2,XGBoost,165,160


In [39]:
# doing by size here

# Step 1: Categorize Market Cap
def categorize_market_cap(value):
    if value >= 10_000_000_000:
        return 'Large Cap'
    elif value >= 2_000_000_000:
        return 'Mid Cap'
    elif value >= 300_000_000:
        return 'Small Cap'
    else:
        return 'Micro Cap'  # Optional catch-all

# Extract unique market cap info
marketcap_info = merged_data[['RIC', 'Company Market Cap']].drop_duplicates(subset='RIC')
marketcap_info['MarketCapCategory'] = marketcap_info['Company Market Cap'].apply(categorize_market_cap)

# Step 2: Merge with 3FF and 5FF RMSE data
rmse_3f = rmse_df.merge(marketcap_info, left_on='Stock', right_on='RIC', how='left')
rmse_5f = rmse_5ff_df.merge(marketcap_info, left_on='Stock', right_on='RIC', how='left')

# Step 3: Compute average RMSE by model and market cap
avg_rmse_3f = (
    rmse_3f.groupby(['MarketCapCategory', 'Model'])['RMSE']
    .mean().reset_index().rename(columns={'RMSE': 'RMSE (3FF)'})
)

avg_rmse_5f = (
    rmse_5f.groupby(['MarketCapCategory', 'Model'])['RMSE']
    .mean().reset_index().rename(columns={'RMSE': 'RMSE (5FF)'})
)

# Step 4: Merge 3FF and 5FF side-by-side
rmse_by_cap = pd.merge(avg_rmse_3f, avg_rmse_5f, on=['MarketCapCategory', 'Model'], how='outer')
rmse_by_cap = rmse_by_cap.sort_values(['MarketCapCategory', 'Model']).reset_index(drop=True)
rmse_by_cap

Unnamed: 0,MarketCapCategory,Model,RMSE (3FF),RMSE (5FF)
0,Large Cap,DecisionTree,8.910442,8.57015
1,Large Cap,ElasticNet,7.924144,7.92407
2,Large Cap,Lasso,7.925102,7.928441
3,Large Cap,OLS,8.079037,8.041491
4,Large Cap,Ridge,7.986602,7.959748
5,Large Cap,SVR,8.05272,7.99966
6,Large Cap,XGBoost,8.532621,8.281921
7,Micro Cap,DecisionTree,22.823438,23.04008
8,Micro Cap,ElasticNet,19.47412,19.450079
9,Micro Cap,Lasso,19.465708,19.46235
