# MGTF 424 Final Project
This notebook builds a baseline model to predict `return_on_asset` using panel data with anonymized indicators.

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

# Load data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
sample_submission_df = pd.read_csv("sample_submission.csv")

  from pandas.core import (


In [2]:
from sklearn.preprocessing import StandardScaler

# Remove outliers from train_df based on return_on_asset (e.g., 1st and 99th percentiles)
q_low = train_df['return_on_asset'].quantile(0.01)
q_high = train_df['return_on_asset'].quantile(0.99)
train_df_clean = train_df[(train_df['return_on_asset'] >= q_low) & (train_df['return_on_asset'] <= q_high)].copy()

# Normalize target
scaler = StandardScaler()
train_df_clean['return_on_asset_norm'] = scaler.fit_transform(train_df_clean[['return_on_asset']])

# Add lag features and rolling stats for each asset_id
lag_features = []
rolling_features = []
window = 3

for col in [c for c in train_df_clean.columns if c.startswith('indicator_')]:
    # Lag 1
    lag_col = f"{col}_lag1"
    train_df_clean[lag_col] = train_df_clean.groupby('asset_id')[col].shift(1)
    lag_features.append(lag_col)
    # Rolling mean
    roll_mean_col = f"{col}_roll{window}_mean"
    train_df_clean[roll_mean_col] = train_df_clean.groupby('asset_id')[col].rolling(window, min_periods=1).mean().reset_index(level=0, drop=True)
    rolling_features.append(roll_mean_col)
    # Rolling std
    roll_std_col = f"{col}_roll{window}_std"
    train_df_clean[roll_std_col] = train_df_clean.groupby('asset_id')[col].rolling(window, min_periods=1).std().reset_index(level=0, drop=True)
    rolling_features.append(roll_std_col)

# Drop rows with NaN after lagging (optional, or impute later)
train_df_clean = train_df_clean.dropna(subset=lag_features)

In [3]:
# Step 1: Aggregate features
def aggregate_features(df, is_train=True):
    agg_funcs = ['mean', 'std', 'min', 'max']
    feature_cols = [col for col in df.columns if col.startswith("indicator_")]

    aggregated = df.groupby('asset_id')[feature_cols].agg(agg_funcs)
    aggregated.columns = ['_'.join(col).strip() for col in aggregated.columns.values]

    if is_train:
        static_cols = ['return_on_asset', 'company_age', 'company_size', 'revenue']
    else:
        static_cols = ['company_age', 'company_size', 'revenue']

    static_info = df.groupby('asset_id')[static_cols].first()
    return aggregated.join(static_info)

train_agg = aggregate_features(train_df, is_train=True)
test_agg = aggregate_features(test_df, is_train=False)

In [4]:
from sklearn.model_selection import GroupKFold

# Prepare features and target
X = train_agg.drop(columns=["return_on_asset"])
y = train_agg["return_on_asset"]
groups = train_agg.index  # asset_id as group

cv = GroupKFold(n_splits=5)
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    print(f"Fold {fold+1}: Train shape {X_tr.shape}, Val shape {X_val.shape}")
    # You can fit your model here using X_tr, y_tr and validate on X_val, y_val

Fold 1: Train shape (408, 51), Val shape (103, 51)
Fold 2: Train shape (409, 51), Val shape (102, 51)
Fold 3: Train shape (409, 51), Val shape (102, 51)
Fold 4: Train shape (409, 51), Val shape (102, 51)
Fold 5: Train shape (409, 51), Val shape (102, 51)


In [7]:
from sklearn.preprocessing import StandardScaler

# Step 2: Prepare training data
X_train = train_agg.drop(columns=["return_on_asset"])
y_train_raw = train_agg["return_on_asset"]

# Normalize target
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train_raw.values.reshape(-1, 1)).ravel()

In [9]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Step 3: Impute missing values using IterativeImputer
imputer = IterativeImputer(random_state=42)
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
test_agg_imputed = pd.DataFrame(imputer.transform(test_agg), index=test_agg.index, columns=test_agg.columns)



In [10]:
y_train = train_agg["return_on_asset"]
y_train = np.clip(y_train, a_min=-100, a_max=100)

In [12]:
# Ensure rf is defined and trained
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train_imputed, y_train)

# Use the trained RandomForestRegressor (rf) and imputed test features
y_pred = rf.predict(test_agg_imputed)
y_pred_original = target_scaler.inverse_transform(y_pred.reshape(-1, 1)).ravel()

In [13]:
# Step 4: Cross-validated training

# Ensure X_train_imputed is defined
if 'X_train_imputed' not in globals():
    imputer = SimpleImputer(strategy='mean')
    X_train = train_agg.drop(columns=["return_on_asset"])
    X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), index=X_train.index, columns=X_train.columns)

cv = GroupKFold(n_splits=5)
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
cv_scores = []

# Use 'groups' as defined in cell 4 (groups = train_agg.index)
for train_idx, val_idx in cv.split(X_train_imputed, y_train, groups):
    X_tr, X_val = X_train_imputed.iloc[train_idx], X_train_imputed.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model.fit(X_tr, y_tr)
    preds = model.predict(X_val)
    score = mean_absolute_error(y_val, preds)
    cv_scores.append(score)

print("CV MAE scores:", cv_scores)
print("Average MAE:", np.mean(cv_scores))


CV MAE scores: [8.001262135922332, 7.372254901960783, 7.230686274509804, 6.949901960784314, 7.492450980392157]
Average MAE: 7.409311250713879


In [14]:
from sklearn.ensemble import GradientBoostingRegressor

# Bagging: RandomForest (already used in your notebook)
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train_imputed, y_train)
rf_preds = rf.predict(test_agg_imputed)

# Boosting: GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=200, random_state=42)
gbr.fit(X_train_imputed, y_train)
gbr_preds = gbr.predict(test_agg_imputed)

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader

import torch.nn as nn

# Prepare sequence data for each asset_id
class AssetSequenceDataset(Dataset):
    def __init__(self, df, target_col, seq_len=32, feature_cols=None):
        self.groups = []
        self.sequences = []
        self.targets = []
        self.seq_len = seq_len

        if feature_cols is None:
            feature_cols = [c for c in df.columns if c.startswith('indicator_')]
        self.feature_cols = feature_cols

        grouped = df.groupby('asset_id')
        for asset_id, group in grouped:
            group = group.sort_values('timestamp')
            features = group[self.feature_cols].values
            target = group[target_col].values if target_col in group else None
            # Pad or truncate
            if len(features) < seq_len:
                pad_width = seq_len - len(features)
                features = np.pad(features, ((pad_width,0),(0,0)), 'constant')
                if target is not None:
                    target = np.pad(target, (pad_width,0), 'constant')
            else:
                features = features[-seq_len:]
                if target is not None:
                    target = target[-seq_len:]
            self.sequences.append(features)
            if target is not None:
                self.targets.append(target[-1])  # predict last value
            self.groups.append(asset_id)

    def __len__(self):
        return len(self.sequences)
    def __getitem__(self, idx):
        x = torch.tensor(self.sequences[idx], dtype=torch.float32)
        y = torch.tensor(self.targets[idx], dtype=torch.float32) if self.targets else torch.tensor(0.0)
        return x, y

# Temporal Transformer Model
class TemporalTransformer(nn.Module):
    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=2, seq_len=32):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(d_model * seq_len, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
    def forward(self, x):
        x = self.input_proj(x)
        x = self.transformer(x)
        return self.head(x)
    


In [16]:
print("CV MAE scores:", cv_scores)
print("Average MAE:", np.mean(cv_scores))

CV MAE scores: [8.001262135922332, 7.372254901960783, 7.230686274509804, 6.949901960784314, 7.492450980392157]
Average MAE: 7.409311250713879


In [17]:
# Average predictions from bagging and boosting models
ensemble_preds = 0.5 * rf_preds + 0.5 * gbr_preds

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader

# Use TemporalTransformer on aggregated features to predict return_on_asset


# Prepare a dataset for tabular (aggregated) features
class TabularAggDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.X[idx], self.y[idx]
        else:
            return self.X[idx]

# Use only the features (no target) for test set
X_test = test_agg_imputed.values.astype(np.float32)
test_ds = TabularAggDataset(test_agg_imputed)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)

# For training, use X_train_imputed and y_train
X_train = X_train_imputed.values.astype(np.float32)
y_train_tensor = y_train.values.astype(np.float32)
train_ds = TabularAggDataset(X_train_imputed, y_train)
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)

# Define a simple transformer for tabular data
import torch.nn as nn

class TabularTransformer(nn.Module):
    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=2):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        # x: (batch, features)
        x = self.input_proj(x).unsqueeze(1)  # (batch, 1, d_model)
        x = self.transformer(x)
        return self.head(x[:, 0])

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TabularTransformer(input_dim=X_train.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
criterion = nn.L1Loss()

best_mae = float('inf')
for epoch in range(1000):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb).squeeze()
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
    # Validation on train set (for simplicity)
    model.eval()
    with torch.no_grad():
        preds = []
        for xb, yb in train_loader:
            xb = xb.to(device)
            pred = model(xb).squeeze().cpu().numpy()
            preds.append(pred)
        preds = np.concatenate([p.reshape(-1) for p in preds])
        mae = np.mean(np.abs(preds - y_train_tensor))
        if mae < best_mae:
            best_mae = mae
    print(f"Epoch {epoch+1}, Train MAE: {mae:.4f}")

print("Best Train MAE:", best_mae)

# Predict on test set
model.eval()
test_preds = []
with torch.no_grad():
    for xb in test_loader:
        xb = xb.to(device)
        pred = model(xb).squeeze().cpu().numpy()
        test_preds.append(pred)
test_preds = np.concatenate([p.reshape(-1) for p in test_preds])

Epoch 1, Train MAE: 42.0881
Epoch 2, Train MAE: 40.7746
Epoch 3, Train MAE: 39.0368
Epoch 4, Train MAE: 36.7443
Epoch 5, Train MAE: 33.7636
Epoch 6, Train MAE: 30.0104
Epoch 7, Train MAE: 25.5614
Epoch 8, Train MAE: 21.0863
Epoch 9, Train MAE: 17.7691
Epoch 10, Train MAE: 16.0074
Epoch 11, Train MAE: 17.7047
Epoch 12, Train MAE: 16.2360
Epoch 13, Train MAE: 16.7903
Epoch 14, Train MAE: 17.1822
Epoch 15, Train MAE: 17.4312
Epoch 16, Train MAE: 18.5508
Epoch 17, Train MAE: 18.1766
Epoch 18, Train MAE: 19.0176
Epoch 19, Train MAE: 18.0119
Epoch 20, Train MAE: 18.9379
Epoch 21, Train MAE: 18.7898
Epoch 22, Train MAE: 18.0815
Epoch 23, Train MAE: 19.6503
Epoch 24, Train MAE: 19.0065
Epoch 25, Train MAE: 18.9751
Epoch 26, Train MAE: 20.4866
Epoch 27, Train MAE: 20.6664
Epoch 28, Train MAE: 20.1449
Epoch 29, Train MAE: 20.8311
Epoch 30, Train MAE: 20.1986
Epoch 31, Train MAE: 18.9953
Epoch 32, Train MAE: 20.0800
Epoch 33, Train MAE: 19.3053
Epoch 34, Train MAE: 20.4862
Epoch 35, Train MAE: 19

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

# Step 1: Get transformer embeddings for train and test sets
# We'll use the output of the penultimate layer (before the regression head) as features

def get_transformer_embeddings(model, loader, device):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for xb, _ in loader:
            xb = xb.to(device)
            # Forward pass up to the transformer encoder
            x_proj = model.input_proj(xb)
            x_enc = model.transformer(x_proj)
            # Flatten for each sample
            emb = x_enc.reshape(x_enc.shape[0], -1).cpu().numpy()
            embeddings.append(emb)
    return np.vstack(embeddings)

# Prepare train sequence dataset and loader
seq_len = 32  # Use the same as in previous cells
feature_cols = [c for c in train_df.columns if c.startswith('indicator_')]
train_seq_ds = AssetSequenceDataset(train_df, target_col='return_on_asset', seq_len=seq_len, feature_cols=feature_cols)
train_seq_loader = DataLoader(train_seq_ds, batch_size=64, shuffle=False)

# Prepare test sequence dataset and loader
test_seq_ds = AssetSequenceDataset(test_df, target_col=None, seq_len=seq_len, feature_cols=feature_cols)
test_seq_loader = DataLoader(test_seq_ds, batch_size=64, shuffle=False)

# Define and (optionally) train the temporal_model before using it
# Assume input_dim is the number of features in your sequence data
input_dim = len(feature_cols)
temporal_model = TemporalTransformer(input_dim=input_dim, d_model=64, nhead=4, num_layers=2, seq_len=seq_len).to(device)

# (Optional) You may want to train the model here before extracting embeddings
# For demonstration, we'll just use the randomly initialized model

# Get embeddings
train_embeddings = get_transformer_embeddings(temporal_model, train_seq_loader, device)
test_embeddings = get_transformer_embeddings(temporal_model, test_seq_loader, device)

# Step 2: Fit RandomForest on transformer embeddings

# Use y_train as target (already aligned with train_seq_ds)
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(train_embeddings, y_train.values)

# Cross-validated MAE (optional)
cv_mae = -cross_val_score(rf, train_embeddings, y_train.values, cv=5, scoring='neg_mean_absolute_error')
print("RandomForest on Transformer Embeddings CV MAE:", cv_mae)
print("Mean MAE:", np.mean(cv_mae))

# Step 3: Predict on test set
test_preds_rf = rf.predict(test_embeddings)

  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


RandomForest on Transformer Embeddings CV MAE: [12.85859223 13.30279412 13.13404902 11.35607843 13.17696078]
Mean MAE: 12.765694917190174


In [20]:
from sklearn.metrics import mean_absolute_error
import numpy as np

def expanding_window_cv(model, X, y, 
                        initial_window=1000, 
                        horizon=200, 
                        step=200, 
                        verbose=True):
    """
    Expanding window time series cross-validation.
    
    Args:
        model: scikit-learn-like regressor (with fit/predict).
        X: DataFrame or np.ndarray of features.
        y: Series or np.ndarray of targets.
        initial_window: Size of initial training window.
        horizon: Size of each validation/test split.
        step: How much to move forward each time.
        verbose: Print split info and scores.
        
    Returns:
        List of MAE scores for each fold.
    """
    n = len(X)
    cv_scores = []
    folds = 0

    for start in range(0, n - initial_window - horizon + 1, step):
        train_end = start + initial_window
        test_end = train_end + horizon

        X_train = X.iloc[start:train_end]
        y_train = y.iloc[start:train_end]
        X_val = X.iloc[train_end:test_end]
        y_val = y.iloc[train_end:test_end]

        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        score = mean_absolute_error(y_val, preds)
        cv_scores.append(score)
        folds += 1

        if verbose:
            print(f"[Fold {folds}] Train: {start}:{train_end}, Test: {train_end}:{test_end}, MAE: {score:.4f}")

    print("\n Expanding Window CV Complete")
    print(f"Folds run: {folds}")
    print(f"MAE: mean={np.mean(cv_scores):.4f}, std={np.std(cv_scores):.4f}")
    return cv_scores

In [21]:
print("CV MAE scores:", cv_scores)
print("Average MAE:", np.mean(cv_scores))

CV MAE scores: [8.001262135922332, 7.372254901960783, 7.230686274509804, 6.949901960784314, 7.492450980392157]
Average MAE: 7.409311250713879


In [22]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for RandomForestRegressor
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    verbose=2,
    n_jobs=-1
)

grid_search.fit(train_embeddings, y_train.values)
print("Best parameters:", grid_search.best_params_)
print("Best MAE:", -grid_search.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  16.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  16.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  17.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  30.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  30.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  30.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  14.1s
[CV] END m

In [23]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_score

# Step 6: Fit LightGBM
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Cross-validation (optional, but good for tuning)
mae_scores = -cross_val_score(lgb_model, X_train_imputed, y_train, 
                              scoring='neg_mean_absolute_error', cv=5)
print(f"LightGBM CV MAE: {mae_scores.mean():.4f}")

# Final fit
lgb_model.fit(X_train_imputed, y_train)

# Prediction on test set
lgb_pred = lgb_model.predict(test_agg_imputed)
lgb_pred_original = target_scaler.inverse_transform(lgb_pred.reshape(-1, 1)).ravel()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000438 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6164
[LightGBM] [Info] Number of data points in the train set: 408, number of used features: 51
[LightGBM] [Info] Start training from score 43.897059
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000273 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6210
[LightGBM] [Info] Number of data points in the train set: 409, number of used features: 51
[LightGBM] [Info] Start training from score 44.603912
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000302 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6174
[LightGBM] [Info] Number of data points in the train set: 409, number of used features: 51
[LightGBM] [Info] Start trainin

In [25]:
from sklearn.metrics import mean_absolute_error

# Compute final MAE on train and test sets for both models

# For TabularTransformer predictions (train set only, since test set targets are known)
model.eval()
tabular_train_preds = []
with torch.no_grad():
    for xb, yb in train_loader:
        xb = xb.to(device)
        pred = model(xb).squeeze().cpu().numpy()
        tabular_train_preds.append(pred)
tabular_train_preds = np.concatenate([p.reshape(-1) for p in tabular_train_preds])

# Remove NaNs from both y_train and tabular_train_preds before computing MAE
mask = ~(
    pd.isna(y_train.values) | 
    pd.isna(tabular_train_preds)
)
mae_train_tabular = mean_absolute_error(y_train.values[mask], tabular_train_preds[mask])
print("TabularTransformer Train MAE:", mae_train_tabular)

# Test predictions (TabularTransformer)
# Since test set has no targets, just show shape
model.eval()
tabular_test_preds = []
with torch.no_grad():
    for xb in test_loader:
        xb = xb.to(device)
        pred = model(xb).squeeze().cpu().numpy()
        tabular_test_preds.append(pred)
tabular_test_preds = np.concatenate([p.reshape(-1) for p in tabular_test_preds])
print("TabularTransformer Test predictions shape:", tabular_test_preds.shape)

# For RandomForest on Transformer Embeddings (test_preds_rf)
# Ensure rf is fitted on transformer embeddings
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(train_embeddings, y_train.values)

# Train set
rf_train_preds = rf.predict(train_embeddings)
mae_train_rf = mean_absolute_error(y_train, rf_train_preds)
print("RandomForest Embedding Train MAE:", mae_train_rf)

# Test set
test_preds_rf = rf.predict(test_embeddings)
print("RandomForest Embedding Test predictions shape:", test_preds_rf.shape)

TabularTransformer Train MAE: 22.97878111971801
TabularTransformer Test predictions shape: (315,)
RandomForest Embedding Train MAE: 5.305508806262232
RandomForest Embedding Test predictions shape: (315,)


In [26]:
# Combine predictions from bagging (RandomForest) and boosting (GradientBoostingRegressor) to reduce variance

# Average predictions from both models (already computed as rf_preds and gbr_preds)
ensemble_preds = 0.5 * rf_preds + 0.5 * gbr_preds

# Optionally, evaluate ensemble performance on train set (if you have validation targets)
# Example (if you have y_val and val_preds from ensemble):
# mae_ensemble = mean_absolute_error(y_val, ensemble_preds_on_val)

print("Ensemble predictions (bagging + boosting) computed.")

Ensemble predictions (bagging + boosting) computed.


In [27]:
print("TemporalTransformer Train MAE:", mae_train_tabular)
print("TemporalTransformer Test predictions shape:", tabular_test_preds.shape)
print("RandomForest Embedding Train MAE:", mae_train_rf)
# Test set
print("RandomForest Embedding Test predictions shape:", test_preds_rf.shape)

TemporalTransformer Train MAE: 22.97878111971801
TemporalTransformer Test predictions shape: (315,)
RandomForest Embedding Train MAE: 5.305508806262232
RandomForest Embedding Test predictions shape: (315,)


In [29]:
# Step 6: Create submission file
submission = sample_submission_df.copy()
submission['return_on_asset'] = submission['asset_id'].map(
    dict(zip(test_agg.index, test_preds))
).fillna(0)

# Save the CSV file
submission.to_csv("submission_simple_Temporal_randomF_2+.csv", index=False)
print("Submission file saved as kaggle_submission.csv")


Submission file saved as kaggle_submission.csv
