# MGTF 424 Final Project
This notebook builds a baseline model to predict `return_on_asset` using panel data with anonymized indicators.

In [23]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

# Load data: read the three CSV inputs (paths are relative to the working directory).
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)


In [24]:
# from sklearn.preprocessing import StandardScaler

# # Remove outliers from train_df based on return_on_asset (e.g., 1st and 99th percentiles)
# q_low = train_df['return_on_asset'].quantile(0.01)
# q_high = train_df['return_on_asset'].quantile(0.99)
# train_df_clean = train_df[(train_df['return_on_asset'] >= q_low) & (train_df['return_on_asset'] <= q_high)].copy()

# # Normalize target
# scaler = StandardScaler()
# train_df_clean['return_on_asset_norm'] = scaler.fit_transform(train_df_clean[['return_on_asset']])

# # Add lag features and rolling stats for each asset_id
# lag_features = []
# rolling_features = []
# window = 3

# for col in [c for c in train_df_clean.columns if c.startswith('indicator_')]:
#     # Lag 1
#     lag_col = f"{col}_lag1"
#     train_df_clean[lag_col] = train_df_clean.groupby('asset_id')[col].shift(1)
#     lag_features.append(lag_col)
#     # Rolling mean
#     roll_mean_col = f"{col}_roll{window}_mean"
#     train_df_clean[roll_mean_col] = train_df_clean.groupby('asset_id')[col].rolling(window, min_periods=1).mean().reset_index(level=0, drop=True)
#     rolling_features.append(roll_mean_col)
#     # Rolling std
#     roll_std_col = f"{col}_roll{window}_std"
#     train_df_clean[roll_std_col] = train_df_clean.groupby('asset_id')[col].rolling(window, min_periods=1).std().reset_index(level=0, drop=True)
#     rolling_features.append(roll_std_col)

# # Drop rows with NaN after lagging (optional, or impute later)
# train_df_clean = train_df_clean.dropna(subset=lag_features)

In [25]:
# Step 1: Aggregate features
def aggregate_features(df, is_train=True):
    """Collapse a panel into one row per ``asset_id``.

    Every column whose name starts with ``indicator_`` is summarised per asset
    by its mean, std, min and max; the resulting columns are named
    ``<indicator>_<stat>``. The per-asset ``GroupBy.first`` value of
    ``company_age``, ``company_size`` and ``revenue`` is joined on, preceded
    by ``return_on_asset`` when ``is_train`` is truthy.

    Args:
        df: Frame containing ``asset_id``, the indicator columns and the
            static columns listed above.
        is_train: Whether to carry the ``return_on_asset`` column through.

    Returns:
        DataFrame indexed by ``asset_id``: aggregated indicator columns
        first, then the static columns.
    """
    indicator_cols = [name for name in df.columns if name.startswith("indicator_")]
    by_asset = df.groupby('asset_id')

    stats = by_asset[indicator_cols].agg(['mean', 'std', 'min', 'max'])
    # Multi-function agg yields (indicator, stat) column pairs; flatten them.
    stats.columns = ['_'.join(pair).strip() for pair in stats.columns.to_flat_index()]

    static_cols = ['company_age', 'company_size', 'revenue']
    if is_train:
        static_cols = ['return_on_asset'] + static_cols

    return stats.join(by_asset[static_cols].first())

# Build the per-asset feature tables; only the training table carries the
# `return_on_asset` target column (is_train=True).
train_agg = aggregate_features(train_df, is_train=True)
test_agg = aggregate_features(test_df, is_train=False)

In [26]:
# Step 2: Prepare training data -- separate the target from the features.
y_train = train_agg["return_on_asset"]
X_train = train_agg.drop(columns=["return_on_asset"])
groups = train_agg.index  # asset_id index: one row (and hence one group) per asset

# Step 3: Impute missing values with column means learned on the training
# features only, then apply the same means to the test features.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = pd.DataFrame(
    imputer.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
test_agg_imputed = pd.DataFrame(
    imputer.transform(test_agg),
    index=test_agg.index,
    columns=test_agg.columns,
)


In [27]:
# Step 4: Cross-validated training
# cv = GroupKFold(n_splits=5)
# model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
# cv_scores = []

# for train_idx, val_idx in cv.split(X_train_imputed, y_train, groups):
#     X_tr, X_val = X_train_imputed.iloc[train_idx], X_train_imputed.iloc[val_idx]
#     y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

#     model.fit(X_tr, y_tr)
#     preds = model.predict(X_val)
#     score = mean_absolute_error(y_val, preds)
#     cv_scores.append(score)

# print("CV MAE scores:", cv_scores)
# print("Average MAE:", np.mean(cv_scores))


In [28]:
# class MonteCarloCV:
#     """Monte Carlo cross-validation with random splits"""
#     def __init__(self, n_splits=100, test_size=0.2, random_state=None):
#         self.n_splits = n_splits
#         self.test_size = test_size
#         self.random_state = random_state
#         
#     def split(self, X, y=None, groups=None):
#         n_samples = len(X)
#         n_test = int(n_samples * self.test_size)
#         
#         if self.random_state is not None:
#             np.random.seed(self.random_state)
#         
#         for i in range(self.n_splits):
#             # Random test indices
#             test_indices = np.random.choice(n_samples, n_test, replace=False)
#             train_indices = np.setdiff1d(np.arange(n_samples), test_indices)
#             yield train_indices, test_indices
# print("Using Monte Carlo CV with 100 splits...")
# # Step 5: Monte Carlo cross-validation training
# mc_cv = MonteCarloCV(n_splits=100, test_size=0.2, random_state=42)
# model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
# cv_scores = []
# for train_idx, val_idx in mc_cv.split(X_train_imputed, y_train, groups):
#     X_tr, X_val = X_train_imputed.iloc[train_idx], X_train_imputed.iloc[val_idx]
#     y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
# 
#     model.fit(X_tr, y_tr)
#     preds = model.predict(X_val)
#     score = mean_absolute_error(y_val, preds)
#     cv_scores.append(score)
# print("CV MAE scores:", cv_scores)
# print("Average MAE:", np.mean(cv_scores))


In [29]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

print("Using Time Series Split CV...")

# NOTE(review): X_train_imputed holds one aggregated row per asset_id, so the
# "time" order used by TimeSeriesSplit is the asset_id row order, not a
# temporal one -- confirm an expanding-window split is really intended here.
n_splits = 100  # number of expanding-window folds
tscv = TimeSeriesSplit(n_splits=n_splits)

# The same estimator object is refit from scratch on every fold.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

cv_scores = []
for train_idx, val_idx in tscv.split(X_train_imputed):
    model.fit(X_train_imputed.iloc[train_idx], y_train.iloc[train_idx])
    preds = model.predict(X_train_imputed.iloc[val_idx])
    cv_scores.append(mean_absolute_error(y_train.iloc[val_idx], preds))

print("CV MAE scores:", cv_scores)
print("Average MAE:", np.mean(cv_scores))

Using Time Series Split CV...
CV MAE scores: [8.528000000000002, 9.629999999999999, 11.274, 9.37, 9.51, 12.054, 5.629999999999999, 19.532, 23.165999999999997, 10.972, 9.838, 14.219999999999999, 8.887999999999998, 5.095999999999999, 15.580000000000002, 18.94, 5.024000000000001, 9.748000000000001, 15.443999999999999, 9.187999999999999, 6.358, 7.465999999999999, 7.933999999999996, 17.77, 10.174, 7.586, 9.146, 3.4339999999999975, 7.601999999999999, 7.504000000000002, 12.330000000000002, 10.66, 21.798, 1.7619999999999998, 7.58, 7.162000000000001, 6.828000000000001, 10.558000000000002, 10.984000000000002, 4.876, 9.834, 9.898, 7.251999999999998, 6.088, 8.595999999999998, 17.869999999999997, 8.646, 2.3939999999999984, 4.376000000000001, 2.620000000000001, 14.402000000000005, 5.806, 8.654, 3.6500000000000012, 10.152, 8.662000000000003, 5.5779999999999985, 5.430000000000001, 3.478, 1.5659999999999996, 13.526, 4.467999999999999, 3.6119999999999997, 5.788, 6.457999999999998, 1.8939999999999997, 7.

In [30]:
# from pytorch_tabnet.tab_model import TabNetRegressor

# Prepare data (already imputed)
# X = X_train_imputed.values
# y = y_train.values

# TabNet expects numpy arrays for groups as well
# groups_arr = np.array(list(groups))

# Cross-validation with TabNet
# cv = GroupKFold(n_splits=5)
# tabnet_scores = []

# for train_idx, val_idx in cv.split(X, y, groups_arr):
#     X_tr, X_val = X[train_idx], X[val_idx]
#     y_tr, y_val = y[train_idx], y[val_idx]

#     tabnet = TabNetRegressor(verbose=0, seed=42)
#     tabnet.fit(
#         X_tr, y_tr.reshape(-1, 1),
#         eval_set=[(X_val, y_val.reshape(-1, 1))],
#         eval_metric=['mae'],
#         max_epochs=200,
#         patience=20,
#         batch_size=256,
#         virtual_batch_size=64
#     )
#     preds = tabnet.predict(X_val).reshape(-1)
#     score = mean_absolute_error(y_val, preds)
#     tabnet_scores.append(score)

# print("TabNet CV MAE scores:", tabnet_scores)
# print("TabNet Average MAE:", np.mean(tabnet_scores))

In [31]:
import torch
from torch.utils.data import Dataset, DataLoader

import torch.nn as nn
import torch.optim as optim

# Prepare data for PyTorch: float32 NumPy arrays of the imputed features and
# the target. The target is reshaped to a column (n, 1) so it lines up with the
# single-output final Linear layer of the models defined below.
X = X_train_imputed.values.astype(np.float32)
y = y_train.values.astype(np.float32).reshape(-1, 1)

# Simple PyTorch Dataset
class TabularDataset(Dataset):
    """Map-style dataset that pairs item ``i`` of ``X`` with item ``i`` of ``y``."""

    def __init__(self, X, y):
        # Inputs are stored as given (no copy, no conversion).
        self.X, self.y = X, y

    def __len__(self):
        # The sample count is taken from the feature container.
        n_rows = len(self.X)
        return n_rows

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target

# MLP Model with BatchNorm and Dropout
class MLPRegressor(nn.Module):
    """MLP with two hidden layers (128 then 64 units) and a single output.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(0.3); the
    head is a Linear layer mapping 64 features to 1 value.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = []
        in_features = input_dim
        # Build the hidden stages in order so Sequential indices stay 0..8.
        for width in (128, 64):
            layers += [
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                nn.ReLU(),
                nn.Dropout(0.3),
            ]
            in_features = width
        layers.append(nn.Linear(in_features, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        return self.net(x)

In [32]:
from sklearn.metrics import mean_absolute_error

# Training loop with GroupKFold CV
# cv = GroupKFold(n_splits=5)
# cv_scores = []
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# for train_idx, val_idx in cv.split(X, y, groups):
#     X_tr, X_val = X[train_idx], X[val_idx]
#     y_tr, y_val = y[train_idx], y[val_idx]

#     train_ds = TabularDataset(X_tr, y_tr)
#     val_ds = TabularDataset(X_val, y_val)
#     train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
#     val_loader = DataLoader(val_ds, batch_size=256, shuffle=False)

#     model = MLPRegressor(X.shape[1]).to(device)
#     optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
#     criterion = nn.L1Loss()

#     # Training
#     best_val_loss = float('inf')
#     patience, patience_counter = 10, 0
#     for epoch in range(100):
#         model.train()
#         for xb, yb in train_loader:
#             xb, yb = xb.to(device), yb.to(device)
#             optimizer.zero_grad()
#             preds = model(xb)
#             loss = criterion(preds, yb)
#             loss.backward()
#             optimizer.step()
#         # Validation
#         model.eval()
#         val_losses = []
#         with torch.no_grad():
#             for xb, yb in val_loader:
#                 xb, yb = xb.to(device), yb.to(device)
#                 preds = model(xb)
#                 val_losses.append(criterion(preds, yb).item())
#         val_loss = np.mean(val_losses)
#         if val_loss < best_val_loss:
#             best_val_loss = val_loss
#             patience_counter = 0
#         else:
#             patience_counter += 1
#         if patience_counter >= patience:
#             break
#     cv_scores.append(best_val_loss)

# NOTE: the MLP GroupKFold loop above is commented out, so `cv_scores` is still
# the list produced by the RandomForest TimeSeriesSplit cell. Label it as such
# instead of presenting random-forest numbers as MLP results. If the MLP loop
# is re-enabled, switch these labels back to "MLP".
print("RandomForest (TimeSeriesSplit) CV MAE scores:", cv_scores)
print("RandomForest (TimeSeriesSplit) Average MAE:", np.mean(cv_scores))


MLP CV MAE scores: [8.528000000000002, 9.629999999999999, 11.274, 9.37, 9.51, 12.054, 5.629999999999999, 19.532, 23.165999999999997, 10.972, 9.838, 14.219999999999999, 8.887999999999998, 5.095999999999999, 15.580000000000002, 18.94, 5.024000000000001, 9.748000000000001, 15.443999999999999, 9.187999999999999, 6.358, 7.465999999999999, 7.933999999999996, 17.77, 10.174, 7.586, 9.146, 3.4339999999999975, 7.601999999999999, 7.504000000000002, 12.330000000000002, 10.66, 21.798, 1.7619999999999998, 7.58, 7.162000000000001, 6.828000000000001, 10.558000000000002, 10.984000000000002, 4.876, 9.834, 9.898, 7.251999999999998, 6.088, 8.595999999999998, 17.869999999999997, 8.646, 2.3939999999999984, 4.376000000000001, 2.620000000000001, 14.402000000000005, 5.806, 8.654, 3.6500000000000012, 10.152, 8.662000000000003, 5.5779999999999985, 5.430000000000001, 3.478, 1.5659999999999996, 13.526, 4.467999999999999, 3.6119999999999997, 5.788, 6.457999999999998, 1.8939999999999997, 7.382000000000002, 10.338000

In [33]:
# from sklearn.preprocessing import StandardScaler

# # Standardize input features
# X_scaler = StandardScaler()
# X_tr_scaled = X_scaler.fit_transform(X[train_idx])
# X_val_scaled = X_scaler.transform(X[val_idx])

# # Standardize output (target)
# y_scaler = StandardScaler()
# y_tr_scaled = y_scaler.fit_transform(y[train_idx])
# y_val_scaled = y_scaler.transform(y[val_idx])

# # Update datasets with standardized data
# train_ds = TabularDataset(X_tr_scaled.astype(np.float32), y_tr_scaled.astype(np.float32))
# val_ds = TabularDataset(X_val_scaled.astype(np.float32), y_val_scaled.astype(np.float32))
# train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
# val_loader = DataLoader(val_ds, batch_size=256, shuffle=False)

# # Define balanced loss: average of MSE and MAE
# class BalancedLoss(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.mse = nn.MSELoss()
#         self.mae = nn.L1Loss()
#     def forward(self, pred, target):
#         return 0.5 * self.mse(pred, target) + 0.5 * self.mae(pred, target)

# criterion = BalancedLoss()
# print("MLP CV MAE scores:", cv_scores)
# print("MLP Average MAE:", np.mean(cv_scores))

In [34]:
# Step 5: Final model and prediction

# ImprovedMLP: the network used for the final fit (defined unconditionally in this cell)
import torch.nn as nn

class ImprovedMLP(nn.Module):
    """MLP with three hidden layers (256, 128, 64 units) and a single output.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout, with dropout
    rates 0.4, 0.3 and 0.2 respectively; the head is Linear(64, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        stages = ((256, 0.4), (128, 0.3), (64, 0.2))
        layers = []
        in_features = input_dim
        # Build the hidden stages in order so Sequential indices stay 0..12.
        for width, drop_rate in stages:
            layers += [
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                nn.ReLU(),
                nn.Dropout(drop_rate),
            ]
            in_features = width
        layers.append(nn.Linear(in_features, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        return self.net(x)

# Seed PyTorch so weight initialisation, dropout masks and DataLoader shuffling
# are repeatable on a fresh "Restart & Run All" (the random forest earlier in
# the notebook is already pinned with random_state=42).
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mlp_final = ImprovedMLP(X_train_imputed.shape[1]).to(device)
optimizer = optim.AdamW(mlp_final.parameters(), lr=1e-3, weight_decay=1e-2)
criterion = nn.L1Loss()  # MAE, the same metric used for the CV scores

# Full training set as float32 arrays; target as an (n, 1) column to match the
# model's single-output head.
X_full = X_train_imputed.values.astype(np.float32)
y_full = y_train.values.astype(np.float32).reshape(-1, 1)
full_ds = TabularDataset(X_full, y_full)
full_loader = DataLoader(full_ds, batch_size=64, shuffle=True)

# Train on all data with early stopping driven by the eval-mode training loss.
# The weights of the best epoch are snapshotted and restored afterwards;
# otherwise the model used for prediction would be the one from the last epoch
# run, i.e. up to `patience` epochs past the best one.
best_loss = float('inf')
best_state = None
patience, patience_counter = 10, 0
for epoch in range(100):
    mlp_final.train()
    for xb, yb in full_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = mlp_final(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

    # Re-score the training data in eval mode (dropout off, BatchNorm using
    # running statistics) to obtain the early-stopping signal.
    mlp_final.eval()
    train_losses = []
    with torch.no_grad():
        for xb, yb in full_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = mlp_final(xb)
            train_losses.append(criterion(preds, yb).item())
    train_loss = np.mean(train_losses)

    if train_loss < best_loss:
        best_loss = train_loss
        patience_counter = 0
        # state_dict() returns references to the live tensors, so clone them.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in mlp_final.state_dict().items()
        }
    else:
        patience_counter += 1
    if patience_counter >= patience:
        break

# Roll back to the best epoch (skipped only if no epoch ever improved on inf,
# e.g. when the loss is NaN throughout).
if best_state is not None:
    mlp_final.load_state_dict(best_state)

# Predict on test set
X_test = test_agg_imputed.values.astype(np.float32)
mlp_final.eval()
with torch.no_grad():
    test_preds = mlp_final(torch.from_numpy(X_test).to(device)).cpu().numpy().reshape(-1)



In [35]:
# Step 6: Create submission file
# Start from the template and look up each asset_id's prediction.
submission = sample_submission_df.copy()
asset_predictions = submission['asset_id'].map(
    dict(zip(test_agg.index, test_preds))
)
# NOTE(review): asset_ids with no prediction are silently scored as 0 --
# confirm that this fallback is intended rather than an error condition.
submission['return_on_asset'] = asset_predictions.fillna(0)

# Save the CSV file
submission.to_csv("kaggle_submission.csv", index=False)
print("Submission file saved as kaggle_submission.csv")


Submission file saved as kaggle_submission.csv
