# MGTF 424 Final Project
This notebook builds a baseline model to predict `return_on_asset` using panel data with anonymized indicators.

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

# Load data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
sample_submission_df = pd.read_csv("sample_submission.csv")

  from pandas.core import (


In [None]:
# from sklearn.preprocessing import StandardScaler

# # Remove outliers from train_df based on return_on_asset (e.g., 1st and 99th percentiles)
# q_low = train_df['return_on_asset'].quantile(0.01)
# q_high = train_df['return_on_asset'].quantile(0.99)
# train_df_clean = train_df[(train_df['return_on_asset'] >= q_low) & (train_df['return_on_asset'] <= q_high)].copy()

# # Normalize target
# scaler = StandardScaler()
# train_df_clean['return_on_asset_norm'] = scaler.fit_transform(train_df_clean[['return_on_asset']])

# # Add lag features and rolling stats for each asset_id
# lag_features = []
# rolling_features = []
# window = 3

# for col in [c for c in train_df_clean.columns if c.startswith('indicator_')]:
#     # Lag 1
#     lag_col = f"{col}_lag1"
#     train_df_clean[lag_col] = train_df_clean.groupby('asset_id')[col].shift(1)
#     lag_features.append(lag_col)
#     # Rolling mean
#     roll_mean_col = f"{col}_roll{window}_mean"
#     train_df_clean[roll_mean_col] = train_df_clean.groupby('asset_id')[col].rolling(window, min_periods=1).mean().reset_index(level=0, drop=True)
#     rolling_features.append(roll_mean_col)
#     # Rolling std
#     roll_std_col = f"{col}_roll{window}_std"
#     train_df_clean[roll_std_col] = train_df_clean.groupby('asset_id')[col].rolling(window, min_periods=1).std().reset_index(level=0, drop=True)
#     rolling_features.append(roll_std_col)

# # Drop rows with NaN after lagging (optional, or impute later)
# train_df_clean = train_df_clean.dropna(subset=lag_features)

In [2]:
# Step 1: Aggregate features
def aggregate_features(df, is_train=True):
    agg_funcs = ['mean', 'std', 'min', 'max']
    feature_cols = [col for col in df.columns if col.startswith("indicator_")]

    aggregated = df.groupby('asset_id')[feature_cols].agg(agg_funcs)
    aggregated.columns = ['_'.join(col).strip() for col in aggregated.columns.values]

    if is_train:
        static_cols = ['return_on_asset', 'company_age', 'company_size', 'revenue']
    else:
        static_cols = ['company_age', 'company_size', 'revenue']

    static_info = df.groupby('asset_id')[static_cols].first()
    return aggregated.join(static_info)

train_agg = aggregate_features(train_df, is_train=True)
test_agg = aggregate_features(test_df, is_train=False)

In [3]:
# Step 2: Prepare training data
X_train = train_agg.drop(columns=["return_on_asset"])
y_train = train_agg["return_on_asset"]
groups = train_agg.index

# Step 3: Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
test_agg_imputed = pd.DataFrame(imputer.transform(test_agg), index=test_agg.index, columns=test_agg.columns)

In [4]:
y_train = train_agg["return_on_asset"]
y_train = np.clip(y_train, a_min=-100, a_max=100)

In [5]:
# Step 4: Cross-validated training

# Ensure X_train_imputed is defined
if 'X_train_imputed' not in globals():
    imputer = SimpleImputer(strategy='mean')
    X_train = train_agg.drop(columns=["return_on_asset"])
    X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), index=X_train.index, columns=X_train.columns)

cv = GroupKFold(n_splits=5)
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
cv_scores = []

# Use 'groups' as defined in cell 4 (groups = train_agg.index)
for train_idx, val_idx in cv.split(X_train_imputed, y_train, groups):
    X_tr, X_val = X_train_imputed.iloc[train_idx], X_train_imputed.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model.fit(X_tr, y_tr)
    preds = model.predict(X_val)
    score = mean_absolute_error(y_val, preds)
    cv_scores.append(score)

print("CV MAE scores:", cv_scores)
print("Average MAE:", np.mean(cv_scores))


CV MAE scores: [8.135922330097088, 7.294019607843136, 7.232156862745098, 7.036274509803922, 7.384607843137253]
Average MAE: 7.4165962307252995


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

import torch.nn as nn

# Prepare sequence data for each asset_id
class AssetSequenceDataset(Dataset):
    def __init__(self, df, target_col, seq_len=32, feature_cols=None):
        self.groups = []
        self.sequences = []
        self.targets = []
        self.seq_len = seq_len

        if feature_cols is None:
            feature_cols = [c for c in df.columns if c.startswith('indicator_')]
        self.feature_cols = feature_cols

        grouped = df.groupby('asset_id')
        for asset_id, group in grouped:
            group = group.sort_values('timestamp')
            features = group[self.feature_cols].values
            target = group[target_col].values if target_col in group else None
            # Pad or truncate
            if len(features) < seq_len:
                pad_width = seq_len - len(features)
                features = np.pad(features, ((pad_width,0),(0,0)), 'constant')
                if target is not None:
                    target = np.pad(target, (pad_width,0), 'constant')
            else:
                features = features[-seq_len:]
                if target is not None:
                    target = target[-seq_len:]
            self.sequences.append(features)
            if target is not None:
                self.targets.append(target[-1])  # predict last value
            self.groups.append(asset_id)

    def __len__(self):
        return len(self.sequences)
    def __getitem__(self, idx):
        x = torch.tensor(self.sequences[idx], dtype=torch.float32)
        y = torch.tensor(self.targets[idx], dtype=torch.float32) if self.targets else torch.tensor(0.0)
        return x, y

# Temporal Transformer Model
class TemporalTransformer(nn.Module):
    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=2, seq_len=32):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(d_model * seq_len, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
    def forward(self, x):
        x = self.input_proj(x)
        x = self.transformer(x)
        return self.head(x)
    


In [7]:
print("CV MAE scores:", cv_scores)
print("Average MAE:", np.mean(cv_scores))

CV MAE scores: [8.135922330097088, 7.294019607843136, 7.232156862745098, 7.036274509803922, 7.384607843137253]
Average MAE: 7.4165962307252995


In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

# Use TemporalTransformer on aggregated features to predict return_on_asset


# Prepare a dataset for tabular (aggregated) features
class TabularAggDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.X[idx], self.y[idx]
        else:
            return self.X[idx]

# Use only the features (no target) for test set
X_test = test_agg_imputed.values.astype(np.float32)
test_ds = TabularAggDataset(test_agg_imputed)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)

# For training, use X_train_imputed and y_train
X_train = X_train_imputed.values.astype(np.float32)
y_train_tensor = y_train.values.astype(np.float32)
train_ds = TabularAggDataset(X_train_imputed, y_train)
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)

# Define a simple transformer for tabular data
import torch.nn as nn

class TabularTransformer(nn.Module):
    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=2):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        # x: (batch, features)
        x = self.input_proj(x).unsqueeze(1)  # (batch, 1, d_model)
        x = self.transformer(x)
        return self.head(x[:, 0])

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TabularTransformer(input_dim=X_train.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
criterion = nn.L1Loss()

best_mae = float('inf')
for epoch in range(1000):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb).squeeze()
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
    # Validation on train set (for simplicity)
    model.eval()
    with torch.no_grad():
        preds = []
        for xb, yb in train_loader:
            xb = xb.to(device)
            pred = model(xb).squeeze().cpu().numpy()
            preds.append(pred)
        preds = np.concatenate([p.reshape(-1) for p in preds])
        mae = np.mean(np.abs(preds - y_train_tensor))
        if mae < best_mae:
            best_mae = mae
    print(f"Epoch {epoch+1}, Train MAE: {mae:.4f}")

print("Best Train MAE:", best_mae)

# Predict on test set
model.eval()
test_preds = []
with torch.no_grad():
    for xb in test_loader:
        xb = xb.to(device)
        pred = model(xb).squeeze().cpu().numpy()
        test_preds.append(pred)
test_preds = np.concatenate([p.reshape(-1) for p in test_preds])

Epoch 1, Train MAE: 42.7811
Epoch 2, Train MAE: 41.7912
Epoch 3, Train MAE: 40.4059
Epoch 4, Train MAE: 38.4099
Epoch 5, Train MAE: 35.7605
Epoch 6, Train MAE: 32.3637
Epoch 7, Train MAE: 28.1732
Epoch 8, Train MAE: 23.5077
Epoch 9, Train MAE: 19.4445
Epoch 10, Train MAE: 16.8237
Epoch 11, Train MAE: 15.6708
Epoch 12, Train MAE: 15.5329
Epoch 13, Train MAE: 15.9844
Epoch 14, Train MAE: 16.9048
Epoch 15, Train MAE: 17.0939
Epoch 16, Train MAE: 17.0719
Epoch 17, Train MAE: 16.8157
Epoch 18, Train MAE: 18.4560
Epoch 19, Train MAE: 18.5953
Epoch 20, Train MAE: 17.6262
Epoch 21, Train MAE: 18.5178
Epoch 22, Train MAE: 19.4617
Epoch 23, Train MAE: 18.6489
Epoch 24, Train MAE: 20.4509
Epoch 25, Train MAE: 20.0274
Epoch 26, Train MAE: 18.1570
Epoch 27, Train MAE: 21.0977
Epoch 28, Train MAE: 18.9685
Epoch 29, Train MAE: 19.0544
Epoch 30, Train MAE: 21.0080
Epoch 31, Train MAE: 18.4218
Epoch 32, Train MAE: 20.6532
Epoch 33, Train MAE: 19.4987
Epoch 34, Train MAE: 20.8378
Epoch 35, Train MAE: 19

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

# Step 1: Get transformer embeddings for train and test sets
# We'll use the output of the penultimate layer (before the regression head) as features

def get_transformer_embeddings(model, loader, device):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for xb, _ in loader:
            xb = xb.to(device)
            # Forward pass up to the transformer encoder
            x_proj = model.input_proj(xb)
            x_enc = model.transformer(x_proj)
            # Flatten for each sample
            emb = x_enc.reshape(x_enc.shape[0], -1).cpu().numpy()
            embeddings.append(emb)
    return np.vstack(embeddings)

# Prepare train sequence dataset and loader
seq_len = 32  # Use the same as in previous cells
feature_cols = [c for c in train_df.columns if c.startswith('indicator_')]
train_seq_ds = AssetSequenceDataset(train_df, target_col='return_on_asset', seq_len=seq_len, feature_cols=feature_cols)
train_seq_loader = DataLoader(train_seq_ds, batch_size=64, shuffle=False)

# Prepare test sequence dataset and loader
test_seq_ds = AssetSequenceDataset(test_df, target_col=None, seq_len=seq_len, feature_cols=feature_cols)
test_seq_loader = DataLoader(test_seq_ds, batch_size=64, shuffle=False)

# Define and (optionally) train the temporal_model before using it
# Assume input_dim is the number of features in your sequence data
input_dim = len(feature_cols)
temporal_model = TemporalTransformer(input_dim=input_dim, d_model=64, nhead=4, num_layers=2, seq_len=seq_len).to(device)

# (Optional) You may want to train the model here before extracting embeddings
# For demonstration, we'll just use the randomly initialized model

# Get embeddings
train_embeddings = get_transformer_embeddings(temporal_model, train_seq_loader, device)
test_embeddings = get_transformer_embeddings(temporal_model, test_seq_loader, device)

# Step 2: Fit RandomForest on transformer embeddings

# Use y_train as target (already aligned with train_seq_ds)
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(train_embeddings, y_train.values)

# Cross-validated MAE (optional)
cv_mae = -cross_val_score(rf, train_embeddings, y_train.values, cv=5, scoring='neg_mean_absolute_error')
print("RandomForest on Transformer Embeddings CV MAE:", cv_mae)
print("Mean MAE:", np.mean(cv_mae))

# Step 3: Predict on test set
test_preds_rf = rf.predict(test_embeddings)

  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


RandomForest on Transformer Embeddings CV MAE: [12.85038835 13.67769608 12.77223039 11.49504902 13.79617647]
Mean MAE: 12.918308062059774


In [14]:
from sklearn.metrics import mean_absolute_error
import numpy as np

def expanding_window_cv(model, X, y, 
                        initial_window=1000, 
                        horizon=200, 
                        step=200, 
                        verbose=True):
    """
    Expanding window time series cross-validation.
    
    Args:
        model: scikit-learn-like regressor (with fit/predict).
        X: DataFrame or np.ndarray of features.
        y: Series or np.ndarray of targets.
        initial_window: Size of initial training window.
        horizon: Size of each validation/test split.
        step: How much to move forward each time.
        verbose: Print split info and scores.
        
    Returns:
        List of MAE scores for each fold.
    """
    n = len(X)
    cv_scores = []
    folds = 0

    for start in range(0, n - initial_window - horizon + 1, step):
        train_end = start + initial_window
        test_end = train_end + horizon

        X_train = X.iloc[start:train_end]
        y_train = y.iloc[start:train_end]
        X_val = X.iloc[train_end:test_end]
        y_val = y.iloc[train_end:test_end]

        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        score = mean_absolute_error(y_val, preds)
        cv_scores.append(score)
        folds += 1

        if verbose:
            print(f"[Fold {folds}] Train: {start}:{train_end}, Test: {train_end}:{test_end}, MAE: {score:.4f}")

    print("\n Expanding Window CV Complete")
    print(f"Folds run: {folds}")
    print(f"MAE: mean={np.mean(cv_scores):.4f}, std={np.std(cv_scores):.4f}")
    return cv_scores


In [15]:
print("CV MAE scores:", cv_scores)
print("Average MAE:", np.mean(cv_scores))

CV MAE scores: [8.135922330097088, 7.294019607843136, 7.232156862745098, 7.036274509803922, 7.384607843137253]
Average MAE: 7.4165962307252995


In [21]:
from sklearn.metrics import mean_absolute_error

# Compute final MAE on train and test sets for both models


# For TabularTransformer predictions (test_preds)
# Assume y_train_tensor is the true train targets, and test_preds is for test set

# Train predictions (TabularTransformer)
model.eval()
train_preds = []
with torch.no_grad():
    for xb, _ in train_loader:
        xb = xb.to(device)
        pred = model(xb).squeeze().cpu().numpy()
        train_preds.append(pred)
train_preds = np.concatenate([p.reshape(-1) for p in train_preds])

mae_train_tabular = mean_absolute_error(y_train_tensor, train_preds)
print("TabularTransformer Train MAE:", mae_train_tabular)

# Test predictions (TabularTransformer)
# test_preds already computed
print("TabularTransformer Test predictions shape:", test_preds.shape)

# For RandomForest on Transformer Embeddings (test_preds_rf)
# Train set
rf_train_preds = rf.predict(train_embeddings)
mae_train_rf = mean_absolute_error(y_train, rf_train_preds)
print("RandomForest Embedding Train MAE:", mae_train_rf)

# Test set
print("RandomForest Embedding Test predictions shape:", test_preds_rf.shape)



TabularTransformer Train MAE: 22.622737884521484
TabularTransformer Test predictions shape: (315,)
RandomForest Embedding Train MAE: 5.2196673189823874
RandomForest Embedding Test predictions shape: (315,)


In [18]:
# Step 6: Create submission file
submission = sample_submission_df.copy()
submission['return_on_asset'] = submission['asset_id'].map(
    dict(zip(test_agg.index, test_preds))
).fillna(0)

# Save the CSV file
submission.to_csv("submission_simple_Temporal_randomF.csv", index=False)
print("Submission file saved as kaggle_submission.csv")


Submission file saved as kaggle_submission.csv
