In [1]:
import numpy as np
import pandas as pd
import iisignature
from iisignature import sig, prepare, logsig, logsiglength
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge, RidgeClassifier, LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

In [2]:
# Load the train / test frames from the local parquet files.
df_train = pd.read_parquet(
    "./datasets/df_train_wdate_wclusters.parquet"
)
df_test = pd.read_parquet(
    "./datasets/df_test_wdate_wclusters.parquet"
)

# Distinct values of each grouping column, taken from the training frame only.
(
    industry_lst,
    industry_group_lst,
    sector_lst,
    sub_industry_lst,
    stock_lst,
    cluster_lst,
) = (
    df_train[group_col].unique()
    for group_col in (
        "INDUSTRY",
        "INDUSTRY_GROUP",
        "SECTOR",
        "SUB_INDUSTRY",
        "STOCK",
        "CLUSTER",
    )
)

# Candidate values for the "C" hyperparameter: 10 log-spaced points
# from 1e-3 to 1e3.
param_grid = {"C": np.logspace(start=-3, stop=3, num=10)}

In [3]:
# Per-cluster evaluation.
# For every value in `cluster_lst`: standardize the "SIG_" features, grid-search
# C for an L2-penalized LogisticRegression on the training rows of that cluster,
# then score the refit best model on the test rows of the same cluster with
#   (a) the model's own `predict`, and
#   (b) a custom cut at the median of the predicted class-1 probabilities.
# One summary dict per cluster is collected in `results`; true labels and the
# custom predictions are pooled in `y_true_all` / `y_pred_all`.
results = []
y_true_all = []
y_pred_all = []

# Feature columns (those starting with "SIG_"). Every cluster subset shares
# df_train's columns, so this is resolved once rather than on each iteration.
feature_cols = [c for c in df_train.columns if c.startswith("SIG_")]

for cluster_idx in tqdm(cluster_lst, total=len(cluster_lst), desc="Processing clusters"):
    try:
        # Split train/test data for this cluster
        df_cluster_train = df_train[df_train["CLUSTER"] == cluster_idx]
        df_cluster_test = df_test[df_test["CLUSTER"] == cluster_idx]

        # Skip cluster if no data in train or test. This also guarantees that
        # X_train / X_test below have at least one row each.
        if len(df_cluster_train) == 0 or len(df_cluster_test) == 0:
            print(f"Skipping cluster {cluster_idx} - no data.")
            continue

        # Extract X and y for training and testing
        X_train = df_cluster_train[feature_cols].values
        y_train = df_cluster_train["RET"].values  # assume labels are 0/1
        X_test = df_cluster_test[feature_cols].values
        y_test = df_cluster_test["RET"].values

        # Standardize features: fit on training data then transform both sets.
        # NOTE(review): the scaler is fit on the full cluster training set before
        # cross-validation, so CV folds share scaling statistics - confirm this
        # is acceptable or move the scaler into a Pipeline.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Use GridSearchCV to tune LogisticRegression with L2 penalty (ridge logistic regression)
        gs = GridSearchCV(
            estimator=LogisticRegression(
                penalty="l2",
                solver="saga",     # chosen as the faster solver
                max_iter=500,      # raised iteration cap
                tol=1e-2,          # loosened stopping tolerance
                # NOTE(review): intended to start from the previous solution;
                # GridSearchCV fits clones of this estimator, so this flag
                # presumably has no effect during the search - confirm.
                warm_start=True,
                random_state=42    # keep results reproducible
            ),
            param_grid=param_grid,
            cv=3,                 # fewer cross-validation folds
            scoring="f1",
            n_jobs=-1             # use all CPU cores
        )
        gs.fit(X_train_scaled, y_train)

        # Extract the best hyperparameter and model
        best_C = gs.best_params_["C"]
        best_model = gs.best_estimator_

        # Predicted probabilities for class 1 from the best model
        y_val_proba = best_model.predict_proba(X_test_scaled)[:, 1]

        # --- Custom Threshold: median of the predicted probabilities ---
        # The median is taken over this cluster's *test* probabilities, so the
        # threshold depends on the batch being scored.
        custom_threshold = np.median(y_val_proba)
        # Custom predictions: 1 if probability >= median, else 0
        y_val_pred_custom = (y_val_proba >= custom_threshold).astype(int)

        # Default predictions using the model's own predict()
        y_val_pred_default = best_model.predict(X_test_scaled)

        # Evaluate default predictions
        acc_default = accuracy_score(y_test, y_val_pred_default)
        report_default = classification_report(y_test, y_val_pred_default, zero_division=0)

        # Evaluate custom predictions (using median threshold)
        acc_custom = accuracy_score(y_test, y_val_pred_custom)
        report_custom = classification_report(y_test, y_val_pred_custom, zero_division=0)

        # Aggregate the CUSTOM (median-threshold) predictions for the overall score
        y_true_all.extend(y_test)
        y_pred_all.extend(y_val_pred_custom)

        # Save the results for this cluster
        results.append({
            "Cluster": cluster_idx,
            "Train_Samples": len(df_cluster_train),
            "Test_Samples": len(df_cluster_test),
            "Best_C": best_C,
            "Default_Accuracy": acc_default,
            "Custom_Accuracy": acc_custom,
            "Default_Report": report_default,
            "Custom_Report": report_custom,
            "Custom_Threshold": custom_threshold,
            "Model": best_model
        })

    except Exception as e:
        # Best-effort: one failing cluster must not abort the whole run, but
        # report the exception type so the failure can be diagnosed.
        print(f"Error processing cluster {cluster_idx}: {type(e).__name__}: {e}")
        continue

# -----------------------------------------------------------------
# 3) SUMMARIZE RESULTS
# -----------------------------------------------------------------
df_results = pd.DataFrame(results)

# Only score when at least one cluster was evaluated; otherwise there are no
# pooled labels/predictions to compare.
if y_true_all:
    overall_accuracy = accuracy_score(y_true_all, y_pred_all)
    print(f"\nOverall Custom Accuracy across all clusters: {overall_accuracy:.4f}")
else:
    print("\nNo cluster was evaluated - overall accuracy is undefined.")
df_results

# Optionally, you can inspect the classification reports:
# for idx, row in df_results.iterrows():
#     print(f"\nCluster {row['Cluster']} - Default Report:")
#     print(row['Default_Report'])
#     print(f"Cluster {row['Cluster']} - Custom Report:")
#     print(row['Custom_Report'])

Processing clusters:   0%|          | 0/12 [00:00<?, ?it/s]

1
1




Skipping cluster 6 - no data.
1
1
Skipping cluster 11 - no data.
Skipping cluster 4 - no data.
1
1
Skipping cluster 10 - no data.
Skipping cluster 7 - no data.
Skipping cluster 8 - no data.

Overall Custom Accuracy across all clusters: 0.5011


Unnamed: 0,Cluster,Train_Samples,Test_Samples,Best_C,Default_Accuracy,Custom_Accuracy,Default_Report,Custom_Report,Custom_Threshold,Model
0,5,32381,2074,0.464159,0.5135,0.500964,precision recall f1-score ...,precision recall f1-score ...,0.497721,LogisticRegression(C=np.float64(0.464158883361...
1,2,177418,2568,0.001,0.492991,0.498832,precision recall f1-score ...,precision recall f1-score ...,0.512504,"LogisticRegression(C=np.float64(0.001), max_it..."
2,9,37462,6766,10.0,0.503399,0.501182,precision recall f1-score ...,precision recall f1-score ...,0.502525,"LogisticRegression(C=np.float64(10.0), max_ite..."
3,3,35459,9086,10.0,0.506604,0.503852,precision recall f1-score ...,precision recall f1-score ...,0.49692,"LogisticRegression(C=np.float64(10.0), max_ite..."
4,1,6175,165729,0.464159,0.498778,0.500631,precision recall f1-score ...,precision recall f1-score ...,0.477101,LogisticRegression(C=np.float64(0.464158883361...
5,0,14072,12206,0.001,0.49828,0.50639,precision recall f1-score ...,precision recall f1-score ...,0.493371,"LogisticRegression(C=np.float64(0.001), max_it..."


In [9]:
# Per-sector evaluation.
# For every value in `sector_lst`: standardize the "SIG_" features, grid-search
# C for an L2-penalized LogisticRegression on the training rows of that sector,
# then score the refit best model on the test rows of the same sector with
#   (a) the model's own `predict`, and
#   (b) a custom cut at the median of the predicted class-1 probabilities.
# One summary dict per sector is collected in `results`; true labels and the
# custom predictions are pooled in `y_true_all` / `y_pred_all`.
results = []
y_true_all = []
y_pred_all = []

# Feature columns (those starting with "SIG_"). Every sector subset shares
# df_train's columns, so this is resolved once rather than on each iteration.
feature_cols = [c for c in df_train.columns if c.startswith("SIG_")]

for sector_idx in tqdm(sector_lst, total=len(sector_lst), desc="Processing sectors"):
    try:
        # Split train/test data for this sector
        df_sector_train = df_train[df_train["SECTOR"] == sector_idx]
        df_sector_test = df_test[df_test["SECTOR"] == sector_idx]

        # Skip sector if no data in train or test. This also guarantees that
        # X_train / X_test below have at least one row each.
        if len(df_sector_train) == 0 or len(df_sector_test) == 0:
            print(f"Skipping sector {sector_idx} - no data.")
            continue

        # Extract X and y for training and testing
        X_train = df_sector_train[feature_cols].values
        y_train = df_sector_train["RET"].values  # assume labels are 0/1
        X_test = df_sector_test[feature_cols].values
        y_test = df_sector_test["RET"].values

        # Standardize features: fit on training data then transform both sets.
        # NOTE(review): the scaler is fit on the full sector training set before
        # cross-validation, so CV folds share scaling statistics - confirm this
        # is acceptable or move the scaler into a Pipeline.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Use GridSearchCV to tune LogisticRegression with L2 penalty (ridge logistic regression)
        gs = GridSearchCV(
            estimator=LogisticRegression(
                penalty="l2",
                solver="saga",     # chosen as the faster solver
                max_iter=500,      # raised iteration cap
                tol=1e-2,          # loosened stopping tolerance
                # NOTE(review): intended to start from the previous solution;
                # GridSearchCV fits clones of this estimator, so this flag
                # presumably has no effect during the search - confirm.
                warm_start=True,
                random_state=42    # keep results reproducible
            ),
            param_grid=param_grid,
            cv=3,                 # fewer cross-validation folds
            scoring="f1",
            n_jobs=-1             # use all CPU cores
        )
        gs.fit(X_train_scaled, y_train)

        # Extract the best hyperparameter and model
        best_C = gs.best_params_["C"]
        best_model = gs.best_estimator_

        # Predicted probabilities for class 1 from the best model
        y_val_proba = best_model.predict_proba(X_test_scaled)[:, 1]

        # --- Custom Threshold: median of the predicted probabilities ---
        # The median is taken over this sector's *test* probabilities, so the
        # threshold depends on the batch being scored.
        custom_threshold = np.median(y_val_proba)
        # Custom predictions: 1 if probability >= median, else 0
        y_val_pred_custom = (y_val_proba >= custom_threshold).astype(int)

        # Default predictions using the model's own predict()
        y_val_pred_default = best_model.predict(X_test_scaled)

        # Evaluate default predictions
        acc_default = accuracy_score(y_test, y_val_pred_default)
        report_default = classification_report(y_test, y_val_pred_default, zero_division=0)

        # Evaluate custom predictions (using median threshold)
        acc_custom = accuracy_score(y_test, y_val_pred_custom)
        report_custom = classification_report(y_test, y_val_pred_custom, zero_division=0)

        # Aggregate the CUSTOM (median-threshold) predictions for the overall score
        y_true_all.extend(y_test)
        y_pred_all.extend(y_val_pred_custom)

        # Save the results for this sector
        results.append({
            "SECTOR": sector_idx,
            "Train_Samples": len(df_sector_train),
            "Test_Samples": len(df_sector_test),
            "Best_C": best_C,
            "Default_Accuracy": acc_default,
            "Custom_Accuracy": acc_custom,
            "Default_Report": report_default,
            "Custom_Report": report_custom,
            "Custom_Threshold": custom_threshold,
            "Model": best_model
        })

    except Exception as e:
        # Best-effort: one failing sector must not abort the whole run, but
        # report the exception type so the failure can be diagnosed.
        print(f"Error processing SECTOR {sector_idx}: {type(e).__name__}: {e}")
        continue

# -----------------------------------------------------------------
# 3) SUMMARIZE RESULTS
# -----------------------------------------------------------------
df_results = pd.DataFrame(results)

# Only score when at least one sector was evaluated; otherwise there are no
# pooled labels/predictions to compare.
if y_true_all:
    overall_accuracy = accuracy_score(y_true_all, y_pred_all)
    print(f"\nOverall Custom Accuracy across all sectors: {overall_accuracy:.4f}")
else:
    print("\nNo sector was evaluated - overall accuracy is undefined.")
df_results

# Optionally, you can inspect the classification reports:
# for idx, row in df_results.iterrows():
#     print(f"\nSector {row['SECTOR']} - Default Report:")
#     print(row['Default_Report'])
#     print(f"Sector {row['SECTOR']} - Custom Report:")
#     print(row['Custom_Report'])

Processing clusters:   0%|          | 0/12 [00:00<?, ?it/s]

5.0
1
3.0
1
6.0
1
8.0
1
1.0
1
2.0
1
0.0
1
4.0
1
7.0
1
11.0
1
10.0
1
9.0
1

Overall Custom Accuracy across all sectors: 0.5008


Unnamed: 0,SECTOR,Train_Samples,Test_Samples,Best_C,Default_Accuracy,Custom_Accuracy,Default_Report,Custom_Report,Custom_Threshold,Model
0,5.0,17295,7977,0.001,0.502445,0.503573,precision recall f1-score ...,precision recall f1-score ...,0.51472,"LogisticRegression(C=np.float64(0.001), max_it..."
1,3.0,55473,27312,0.001,0.50476,0.504723,precision recall f1-score ...,precision recall f1-score ...,0.499888,"LogisticRegression(C=np.float64(0.001), max_it..."
2,6.0,55123,27706,0.001,0.502924,0.503032,precision recall f1-score ...,precision recall f1-score ...,0.499627,"LogisticRegression(C=np.float64(0.001), max_it..."
3,8.0,70843,30092,0.001,0.498272,0.499435,precision recall f1-score ...,precision recall f1-score ...,0.503763,"LogisticRegression(C=np.float64(0.001), max_it..."
4,1.0,21264,11844,0.001,0.497974,0.503462,precision recall f1-score ...,precision recall f1-score ...,0.514521,"LogisticRegression(C=np.float64(0.001), max_it..."
5,2.0,18967,9176,0.001,0.501526,0.502398,precision recall f1-score ...,precision recall f1-score ...,0.501566,"LogisticRegression(C=np.float64(0.001), max_it..."
6,0.0,6304,2177,0.021544,0.488287,0.495636,precision recall f1-score ...,precision recall f1-score ...,0.485783,LogisticRegression(C=np.float64(0.021544346900...
7,4.0,63519,27951,0.464159,0.494043,0.494186,precision recall f1-score ...,precision recall f1-score ...,0.49842,LogisticRegression(C=np.float64(0.464158883361...
8,7.0,87903,41187,0.001,0.499454,0.499259,precision recall f1-score ...,precision recall f1-score ...,0.499664,"LogisticRegression(C=np.float64(0.001), max_it..."
9,11.0,3054,3916,0.004642,0.505363,0.505363,precision recall f1-score ...,precision recall f1-score ...,0.474035,LogisticRegression(C=np.float64(0.004641588833...
