In [2]:
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

Loading Data: Note that you may have to change the dataset location based on where you stored it.

In [3]:
# Read the feature matrix and its column labels from the NumPy archive.
# np.load on an .npz returns a lazily-read archive object that keeps the file
# open; the context manager closes it as soon as both arrays are in memory.
with np.load("X_train.npz") as train_data:
    X_train = train_data["X"]
    feature_names = train_data["feature_names"]

# Wrap the raw matrix in a DataFrame so columns can be addressed by name.
X_train_df = pd.DataFrame(X_train, columns=feature_names)

# Targets live in a separate CSV (read below by its "ID" and "RET" columns).
y_train_df = pd.read_csv("datasets/y_train.csv")

In [4]:
# Show the column labels of the feature frame (the bare expression is the cell's output).
X_train_df.columns

Index(['ID', 'Stock', 'Industry', 'Industry_Group', 'Sub_Industry', 'Sector',
       'Start Time', 'End Time', 'Sig_0', 'Sig_1',
       ...
       'Sig_1453', 'Sig_1454', 'Sig_1455', 'Sig_1456', 'Sig_1457', 'Sig_1458',
       'Sig_1459', 'Sig_1460', 'Sig_1461', 'Sig_1462'],
      dtype='object', length=1471)

STEP 2: EXTRACT & MATCH TARGET VALUES (y_train)

In [5]:

# IDs of the training rows, in the same order as the rows of X_train_df.
train_ids = X_train_df["ID"].astype(int)

# Convert RET to binary classification: 1 when the return is positive, else 0.
y_train_df["RET"] = (y_train_df["RET"] > 0).astype(int)

# {ID: RET} lookup table; with a plain dict, a repeated ID resolves to its last row.
y_dict = dict(zip(y_train_df["ID"], y_train_df["RET"]))

# Vectorised lookup. IDs that have no label come back as NaN, so they can be
# counted and reported instead of silently becoming class 0.
matched_labels = train_ids.map(y_dict)
n_missing = int(matched_labels.isna().sum())
if n_missing:
    warnings.warn(
        f"{n_missing} of {len(train_ids)} training IDs have no RET entry; "
        "their label defaults to 0."
    )

# Same fallback as before (0 for unmatched IDs), returned as a NumPy array
# aligned row-for-row with train_ids.
y_train = matched_labels.fillna(0).astype(int).to_numpy()

STEP 3: Feature extraction and preprocessing

In [6]:
# 🏷️ Drop identifier / timestamp columns that are not fed to the model.
# errors="ignore" tolerates columns that are already absent.
X_train_df = X_train_df.drop(
    columns=["ID", "Stock", "Industry", "Start Time", "End Time"], errors="ignore"
)

# 🏷️ Select categorical columns and one-hot encode them.
categorical_features = ["Industry_Group", "Sub_Industry", "Sector"]
# Names of the signature columns. Not used further in this cell; kept in case
# later cells rely on it.
signature_features = [col for col in X_train_df.columns if "Sig_" in col]

# Ask for a dense array from the encoder. The keyword was renamed from `sparse`
# to `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4,
# so try the new spelling first and fall back for older installations.
try:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

encoded_cats = encoder.fit_transform(X_train_df[categorical_features])
cat_feature_names = encoder.get_feature_names_out(categorical_features)
# Reuse the frame's index so the concat below aligns row-for-row.
encoded_cats_df = pd.DataFrame(encoded_cats, columns=cat_feature_names, index=X_train_df.index)

# Replace the raw categorical columns with their one-hot expansion.
X_train_df = pd.concat(
    [X_train_df.drop(columns=categorical_features), encoded_cats_df], axis=1
)

# 🔢 Standardize the assembled matrix. The scaler is fitted on every column of
# X_train_df, i.e. the one-hot indicator columns are standardized as well.
# NOTE(review): encoder and scaler are fitted on the full training frame, before
# any train/validation split, so held-out rows influence the fitted statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_df)




### 🎯 STEP 4: TRAIN/VALIDATE BEFORE TESTING

In [7]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
# The ID series is split alongside X and y so predictions can be traced back to rows.
split_parts = train_test_split(
    X_scaled,
    y_train,
    train_ids,
    test_size=0.2,
    random_state=42,
)
X_train_sub, X_val, y_train_sub, y_val, id_train_sub, id_val = split_parts

# Fit the ridge classifier on the training portion (fit returns the estimator itself).
ridge = RidgeClassifier(alpha=1.0).fit(X_train_sub, y_train_sub)

# 🏹 Predict on validation set
y_val_pred = ridge.predict(X_val)

In [8]:
# Overall hit rate on the held-out rows, followed by the per-class breakdown.
accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
summary_line = f"✅ Validation Accuracy: {accuracy:.4f}"
print(summary_line)

per_class_report = classification_report(y_true=y_val, y_pred=y_val_pred)
print(per_class_report)

✅ Validation Accuracy: 0.5121
              precision    recall  f1-score   support

           0       0.51      0.52      0.52     39866
           1       0.51      0.50      0.51     39598

    accuracy                           0.51     79464
   macro avg       0.51      0.51      0.51     79464
weighted avg       0.51      0.51      0.51     79464



In [9]:
# Pair each validation row's ID with the class the model predicted for it.
val_columns = {"ID": id_val, "Predicted_RET": y_val_pred}
val_results = pd.DataFrame(val_columns)

# Preview the first few rows as plain text.
preview = val_results.head()
print(preview)

            ID  Predicted_RET
119251  106278              1
96701    26766              0
285464  134211              1
126266  149857              0
87325   146806              0
