# MGTF 424 Final Project
This notebook builds a baseline model to predict `return_on_asset` using panel data with anonymized indicators.

In [20]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupKFold
from torch.utils.data import DataLoader, Dataset

# Load the three competition files from the working directory:
# the training panel, the test panel, and the submission template.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)


In [21]:
# Step 1: Aggregate features
def aggregate_features(df, is_train=True):
    """Collapse a panel to one row per ``asset_id``.

    Every column whose name starts with ``indicator_`` is summarised per asset
    by its mean, std, min and max; the resulting columns are named
    ``<indicator>_<statistic>``. The first observed value of each static column
    (``company_age``, ``company_size``, ``revenue``, preceded by
    ``return_on_asset`` when ``is_train`` is true) is appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel containing ``asset_id``, the indicator columns and the static columns.
    is_train : bool, default True
        Whether to carry the ``return_on_asset`` column into the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``asset_id``: summary statistics followed by the static columns.
    """
    indicator_cols = [name for name in df.columns if name.startswith("indicator_")]
    per_asset = df.groupby('asset_id')

    stats = per_asset[indicator_cols].agg(['mean', 'std', 'min', 'max'])
    # Flatten the (indicator, statistic) MultiIndex into single string labels.
    stats.columns = [f"{indicator}_{stat}".strip() for indicator, stat in stats.columns]

    static_cols = ['company_age', 'company_size', 'revenue']
    if is_train:
        static_cols = ['return_on_asset'] + static_cols

    return stats.join(per_asset[static_cols].first())

# Collapse both panels to one row per asset_id. Only the training call
# (is_train=True) carries the return_on_asset target column into the result.
train_agg = aggregate_features(train_df, is_train=True)
test_agg = aggregate_features(test_df, is_train=False)

In [22]:
# Step 2: Prepare training data (target, feature matrix, group labels)
y_train = train_agg["return_on_asset"]
X_train = train_agg.drop(columns=["return_on_asset"])
groups = train_agg.index  # the aggregated table is indexed by asset_id

# Step 3: Impute missing values with column means learned from the training features
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)


def _impute_frame(frame):
    """Run the fitted imputer over ``frame`` and restore its index and column labels."""
    return pd.DataFrame(imputer.transform(frame), index=frame.index, columns=frame.columns)


X_train_imputed = _impute_frame(X_train)
test_agg_imputed = _impute_frame(test_agg)


In [6]:
# Step 4: GroupKFold cross-validated training (disabled: every line of code in this cell is commented out)
# cv = GroupKFold(n_splits=5)
# model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
# cv_scores = []

# for train_idx, val_idx in cv.split(X_train_imputed, y_train, groups):
#     X_tr, X_val = X_train_imputed.iloc[train_idx], X_train_imputed.iloc[val_idx]
#     y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

#     model.fit(X_tr, y_tr)
#     preds = model.predict(X_val)
#     score = mean_absolute_error(y_val, preds)
#     cv_scores.append(score)

# print("CV MAE scores:", cv_scores)
# print("Average MAE:", np.mean(cv_scores))


In [23]:
class MonteCarloCV:
    """Monte Carlo (repeated random sub-sampling) cross-validation.

    Each split draws a fresh random test subset without replacement and uses
    all remaining positions for training, so the test subsets of different
    splits may overlap.

    Parameters
    ----------
    n_splits : int, default=100
        Number of random train/test splits to generate.
    test_size : float, default=0.2
        Fraction of samples placed in the test subset; the subset holds
        ``int(n_samples * test_size)`` positions (rounded down).
    random_state : int or None, default=None
        Seed for a generator private to each ``split`` call. With ``None`` the
        draws come from NumPy's global generator, whatever its current state.
    """
    def __init__(self, n_splits=100, test_size=0.2, random_state=None):
        self.n_splits = n_splits
        self.test_size = test_size
        self.random_state = random_state

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` pairs of positional indices.

        Parameters
        ----------
        X : sized
            Only ``len(X)`` is used.
        y, groups : ignored
            Accepted so the call signature matches other splitters; the random
            draw does not take targets or groups into account.

        Yields
        ------
        train_indices : numpy.ndarray
            Sorted positions not selected for the test subset.
        test_indices : numpy.ndarray
            Randomly drawn positions, in draw order.
        """
        n_samples = len(X)
        n_test = int(n_samples * self.test_size)

        if self.random_state is None:
            # No seed requested: keep drawing from NumPy's global generator.
            rng = np.random
        else:
            # A private legacy RandomState produces the same stream that
            # np.random.seed(seed) would, but leaves the global generator
            # untouched so iterating the splitter has no hidden side effect.
            rng = np.random.RandomState(self.random_state)

        all_indices = np.arange(n_samples)
        for _ in range(self.n_splits):
            test_indices = rng.choice(n_samples, n_test, replace=False)
            train_indices = np.setdiff1d(all_indices, test_indices)
            yield train_indices, test_indices
# Step 5: Monte Carlo cross-validation training
N_MC_SPLITS = 100
print(f"Using Monte Carlo CV with {N_MC_SPLITS} splits...")

mc_cv = MonteCarloCV(n_splits=N_MC_SPLITS, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)

# One mean-absolute-error value per random split.
cv_scores = []
for train_idx, val_idx in mc_cv.split(X_train_imputed, y_train, groups):
    X_tr, y_tr = X_train_imputed.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train_imputed.iloc[val_idx], y_train.iloc[val_idx]

    preds = model.fit(X_tr, y_tr).predict(X_val)
    score = mean_absolute_error(y_val, preds)
    cv_scores.append(score)

print("CV MAE scores:", cv_scores)
print("Average MAE:", np.mean(cv_scores))



Using Monte Carlo CV with 100 splits...
CV MAE scores: [7.3020588235294115, 8.205490196078431, 7.1470588235294095, 7.483137254901964, 6.850980392156862, 7.051960784313725, 6.95078431372549, 6.322058823529412, 7.183529411764706, 7.818039215686277, 7.546764705882351, 7.628529411764707, 7.1793137254901955, 7.238431372549019, 8.012450980392158, 7.611078431372549, 7.109901960784313, 7.404803921568628, 6.6331372549019605, 8.833235294117644, 7.326960784313726, 6.694019607843137, 7.169607843137257, 9.286666666666667, 8.37313725490196, 8.491470588235295, 7.359411764705882, 8.342352941176472, 7.888627450980391, 7.0814705882352955, 8.048039215686272, 7.339215686274509, 7.150686274509804, 8.678235294117645, 6.657156862745098, 7.125, 7.6922549019607835, 6.936862745098041, 7.1225490196078445, 8.966372549019608, 8.041568627450982, 7.584705882352941, 8.457450980392156, 7.632254901960783, 7.842843137254901, 6.3116666666666665, 8.537058823529412, 6.3781372549019615, 6.526176470588235, 7.3623529411764705

In [None]:
# from pytorch_tabnet.tab_model import TabNetRegressor

# Prepare data (already imputed)
# X = X_train_imputed.values
# y = y_train.values

# TabNet expects numpy arrays for groups as well
# groups_arr = np.array(list(groups))

# Cross-validation with TabNet
# cv = GroupKFold(n_splits=5)
# tabnet_scores = []

# for train_idx, val_idx in cv.split(X, y, groups_arr):
#     X_tr, X_val = X[train_idx], X[val_idx]
#     y_tr, y_val = y[train_idx], y[val_idx]

#     tabnet = TabNetRegressor(verbose=0, seed=42)
#     tabnet.fit(
#         X_tr, y_tr.reshape(-1, 1),
#         eval_set=[(X_val, y_val.reshape(-1, 1))],
#         eval_metric=['mae'],
#         max_epochs=200,
#         patience=20,
#         batch_size=256,
#         virtual_batch_size=64
#     )
#     preds = tabnet.predict(X_val).reshape(-1)
#     score = mean_absolute_error(y_val, preds)
#     tabnet_scores.append(score)

# print("TabNet CV MAE scores:", tabnet_scores)
# print("TabNet Average MAE:", np.mean(tabnet_scores))

In [24]:
class WindowedTabularDataset(Dataset):
    """Sliding-window samples built separately for every asset.

    Rows are grouped by ``asset_id`` and ordered by ``timestamp``. Each run of
    ``window_size`` consecutive rows becomes one sample: the window's feature
    rows paired with the target value of the window's last row. Assets with
    fewer than ``window_size`` rows contribute no samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``asset_id``, ``timestamp``, ``feature_cols`` and ``target_col``.
    feature_cols : list of str
        Columns that form each window's feature matrix.
    target_col : str
        Column holding the value to predict.
    window_size : int, default=5
        Number of consecutive rows per window; must be at least 1.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    def __init__(self, df, feature_cols, target_col, window_size=5):
        if window_size < 1:
            # A non-positive size would produce empty windows whose target
            # index wraps around to the end of the series.
            raise ValueError(f"window_size must be >= 1, got {window_size}")

        self.samples = []
        for _, group in df.groupby('asset_id'):
            group = group.sort_values('timestamp')
            X = group[feature_cols].values.astype(np.float32)
            y = group[target_col].values.astype(np.float32)
            for start in range(len(group) - window_size + 1):
                end = start + window_size
                # (window of features, target of the last row in the window)
                self.samples.append((X[start:end], y[end - 1]))

    def __len__(self):
        """Return the number of windows across all assets."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.samples[idx]

# Example usage: build 5-row sliding windows over the raw (un-aggregated)
# training panel and serve them in shuffled batches of 32.
feature_cols = [col for col in train_df.columns if col.startswith("indicator_")]
target_col = "return_on_asset"
window_size = 5
windowed_ds = WindowedTabularDataset(train_df, feature_cols, target_col, window_size)
windowed_loader = DataLoader(windowed_ds, batch_size=32, shuffle=True)


In [25]:
import torch
from torch.utils.data import DataLoader

# Step 5: Final model and prediction

# Define the ImprovedMLP regressor used as the final model
import torch.nn as nn
import torch.optim as optim

class ImprovedMLP(nn.Module):
    """Feed-forward regressor with a single output.

    Three hidden stages of Linear -> BatchNorm1d -> ReLU -> Dropout
    (widths 256, 128, 64 with dropout 0.4, 0.3, 0.2) followed by a final
    Linear layer mapping 64 features to one value.
    """

    def __init__(self, input_dim):
        super().__init__()
        # (layer width, dropout probability) for each hidden stage, in order.
        hidden_spec = [(256, 0.4), (128, 0.3), (64, 0.2)]

        layers = []
        in_features = input_dim
        for width, drop_p in hidden_spec:
            layers.extend([
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                nn.ReLU(),
                nn.Dropout(drop_p),
            ])
            in_features = width
        layers.append(nn.Linear(in_features, 1))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        return self.net(x)

# Seed torch's global generator so weight initialisation, batch shuffling and
# dropout masks are repeatable between runs.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mlp_final = ImprovedMLP(X_train_imputed.shape[1]).to(device)
optimizer = optim.AdamW(mlp_final.parameters(), lr=1e-3, weight_decay=1e-2)
criterion = nn.L1Loss()  # mean absolute error, the same metric used in cross-validation

X_full = X_train_imputed.values.astype(np.float32)
y_full = y_train.values.astype(np.float32).reshape(-1, 1)

# TabularDataset is not defined anywhere in this notebook, so the original line
# raised NameError. TensorDataset yields (features, target) tensor pairs, which
# is what the loops below unpack.
full_ds = torch.utils.data.TensorDataset(torch.from_numpy(X_full), torch.from_numpy(y_full))

BATCH_SIZE = 64
# BatchNorm1d raises in training mode when a batch holds a single row, so the
# trailing batch is dropped only when it would contain exactly one sample.
drop_single_row_batch = len(full_ds) > BATCH_SIZE and len(full_ds) % BATCH_SIZE == 1
full_loader = DataLoader(
    full_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=drop_single_row_batch
)
# Separate unshuffled loader so the monitoring pass always covers every sample.
eval_loader = DataLoader(full_ds, batch_size=BATCH_SIZE, shuffle=False)

# Train on all data with early stopping.
# NOTE: there is no held-out set here; training stops once the *training* MAE
# has failed to improve for `patience` consecutive epochs.
best_loss = float('inf')
best_state = None
patience, patience_counter = 10, 0
for epoch in range(100):
    mlp_final.train()
    for xb, yb in full_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = mlp_final(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

    # Monitoring pass in eval mode (dropout off, BatchNorm running statistics).
    mlp_final.eval()
    abs_error_sum = 0.0
    with torch.no_grad():
        for xb, yb in eval_loader:
            xb, yb = xb.to(device), yb.to(device)
            # Weight each batch mean by its row count to get an exact average.
            abs_error_sum += criterion(mlp_final(xb), yb).item() * len(xb)
    train_loss = abs_error_sum / len(full_ds)

    if train_loss < best_loss:
        best_loss = train_loss
        patience_counter = 0
        # Snapshot the weights so the best epoch can be restored after the loop.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in mlp_final.state_dict().items()
        }
    else:
        patience_counter += 1
    if patience_counter >= patience:
        break

# Without this restore, the model would keep the weights of the last epoch run
# (up to `patience` epochs past the best one).
if best_state is not None:
    mlp_final.load_state_dict(best_state)

# Predict on test set
X_test = test_agg_imputed.values.astype(np.float32)
mlp_final.eval()
with torch.no_grad():
    test_preds = mlp_final(torch.from_numpy(X_test).to(device)).cpu().numpy().reshape(-1)




In [27]:
# Step 6: Create submission file
submission = sample_submission_df.copy()

# Look up each template asset_id in the per-asset predictions.
prediction_by_asset = dict(zip(test_agg.index, test_preds))
mapped_preds = submission['asset_id'].map(prediction_by_asset)

# Entries without a usable prediction (asset_id absent from test_agg, or a NaN
# model output) are still filled with 0, but the count is reported so a
# mismatch between the template and the test data does not go unnoticed.
n_missing = int(mapped_preds.isna().sum())
if n_missing:
    print(f"Warning: {n_missing} submission row(s) had no usable prediction and were filled with 0")
submission['return_on_asset'] = mapped_preds.fillna(0)

# Save the CSV file
submission.to_csv("kaggle_sequence_submission.csv", index=False)
print("Submission file saved as kaggle_sequence_submission.csv")

Submission file saved as kaggle_sequence_submission.csv
