# MGTF 424 Final Project
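This notebook builds a baseline model to predict `return_on_asset` from panel data with anonymized indicators. The `indicator_*` columns are aggregated to one row per `asset_id` (mean, std, min, max), missing values are mean-imputed, and a random forest is evaluated with 5-fold cross-validation (MAE) before predictions are written to `kaggle_submission.csv`.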

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

# Load data: the training panel, the test panel, and the submission template.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)


In [None]:
# Step 1: Aggregate features
def aggregate_features(df, is_train=True):
    """Collapse the per-row panel data into one feature row per asset.

    Every column whose name starts with ``indicator_`` is summarised within
    each ``asset_id`` by its mean, standard deviation, minimum and maximum;
    the resulting columns are named ``<indicator>_<statistic>``. The static
    company columns are taken from the first non-null value seen per asset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``asset_id``, the ``indicator_*`` columns,
        ``company_age``, ``company_size`` and ``revenue`` (plus
        ``return_on_asset`` when ``is_train`` is true).
    is_train : bool, default True
        When true, the ``return_on_asset`` column is carried over as well.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``asset_id``: the indicator statistics followed by the
        static columns.
    """
    per_asset = df.groupby('asset_id')

    indicator_names = [name for name in df.columns if name.startswith("indicator_")]
    indicator_stats = per_asset[indicator_names].agg(['mean', 'std', 'min', 'max'])
    # Flatten the (indicator, statistic) column pairs into plain string names.
    indicator_stats.columns = [
        f"{indicator}_{stat}".strip() for indicator, stat in indicator_stats.columns.values
    ]

    # The target only exists in the training data, so it is carried conditionally.
    carried_cols = ['company_age', 'company_size', 'revenue']
    if is_train:
        carried_cols.insert(0, 'return_on_asset')

    return indicator_stats.join(per_asset[carried_cols].first())

# Build one feature row per asset_id for each split; only the training frame
# carries the return_on_asset target column.
train_agg = aggregate_features(train_df, is_train=True)
test_agg = aggregate_features(test_df, is_train=False)


In [None]:
# Step 2: Prepare training data
# Split the aggregated training frame into the feature matrix and the target.
X_train = train_agg.drop(columns=["return_on_asset"])
y_train = train_agg["return_on_asset"]
# train_agg holds one row per asset_id, so every group label below is unique.
groups = train_agg.index

# Step 3: Impute missing values
# Line the test features up with the training columns (same names, same order)
# before reusing the fitted imputer. This is a no-op when the columns already
# match; a column absent from the test data becomes all-NaN here and is then
# filled with the training mean, and test-only columns are dropped.
test_features = test_agg.reindex(columns=X_train.columns)

# The column means are learned from the training features only and then
# applied unchanged to the test features.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
test_agg_imputed = pd.DataFrame(imputer.transform(test_features), index=test_features.index, columns=test_features.columns)


In [None]:
# Step 4: Cross-validated training
# The imputer is re-fitted inside every fold on that fold's training rows only,
# so the validation rows never contribute to the fill values they are scored
# with (fitting it once on all rows leaks validation statistics into training).
cv = GroupKFold(n_splits=5)
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
cv_scores = []

for train_idx, val_idx in cv.split(X_train, y_train, groups):
    fold_imputer = SimpleImputer(strategy='mean')
    X_tr = fold_imputer.fit_transform(X_train.iloc[train_idx])
    X_val = fold_imputer.transform(X_train.iloc[val_idx])
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # fit() rebuilds the forest from scratch (warm_start is left at its
    # default), so reusing the same estimator object across folds is safe.
    model.fit(X_tr, y_tr)
    preds = model.predict(X_val)
    score = mean_absolute_error(y_val, preds)
    cv_scores.append(score)

print("CV MAE scores:", cv_scores)
print("Average MAE:", np.mean(cv_scores))


CV MAE scores: [8.220970873786406, 7.308725490196078, 7.278627450980392, 7.131078431372549, 7.444509803921568]
Average MAE: 7.476782410051399


In [None]:
# Step 5: Final model and prediction
# Refit on every aggregated training row, then score the aggregated test rows.
final_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
final_model.fit(X_train_imputed, y_train)
test_preds = final_model.predict(test_agg_imputed)

# Step 6: Create submission file
# Look predictions up by asset_id so the row order of the sample submission
# is preserved regardless of how the test rows were ordered.
preds_by_asset = pd.Series(test_preds, index=test_agg.index)
submission = sample_submission_df.copy()
submission['return_on_asset'] = submission['asset_id'].map(preds_by_asset)

# Ids without a matching prediction still fall back to 0 so the file stays
# complete, but the count is reported instead of being filled silently.
n_missing = int(submission['return_on_asset'].isna().sum())
if n_missing:
    print(f"Warning: {n_missing} asset_id(s) have no prediction; filling with 0")
submission['return_on_asset'] = submission['return_on_asset'].fillna(0)

# Save the CSV file
submission.to_csv("kaggle_submission.csv", index=False)
print("Submission file saved as kaggle_submission.csv")


Submission file saved as kaggle_submission.csv
