# Baseline Modeling Notebook
This notebook builds a baseline wage prediction model using a `VotingRegressor` ensemble that combines linear regression, random forest, and gradient boosting regressors.

In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Path to the engineered wage dataset, relative to the notebook's working directory.
data_path = 'data/engineered_wage_data.csv'

# Read the whole CSV into a DataFrame, then preview the first five rows
# (the trailing expression is what the notebook displays for this cell).
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

In [None]:
# Candidate target columns, in order of preference: 'log_wage' is used when it
# exists, otherwise the cell falls back to 'wage'.
target_candidates = ('log_wage', 'wage')
target_col = 'log_wage' if 'log_wage' in df.columns else 'wage'

# Remove EVERY candidate target column from the feature matrix, not only the
# selected one. If both columns are present, the unselected one is presumably a
# direct transform of the target (the names suggest wage vs. log(wage) — TODO
# confirm against the feature-engineering step); leaving it in X would leak the
# answer into the features and inflate every metric computed downstream.
target_like_cols = [col for col in target_candidates if col in df.columns]

# Select y first so a dataset with neither column still fails with a KeyError.
y = df[target_col]
X = df.drop(columns=target_like_cols)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print('Train shape:', X_train.shape)
print('Test shape:', X_test.shape)

In [None]:
# Base learners for the ensemble: one linear model and two tree ensembles.
# The tree-based models are seeded so repeated runs give the same fit.
lin_reg = LinearRegression()
rf_reg = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
gb_reg = GradientBoostingRegressor(random_state=42)

# (name, estimator) pairs; the names are the keys under which VotingRegressor
# exposes each fitted member.
base_estimators = [('lr', lin_reg), ('rf', rf_reg), ('gb', gb_reg)]

# No weights are passed, so the ensemble prediction is the plain average of
# the three members' predictions.
voting_reg = VotingRegressor(estimators=base_estimators)
voting_reg.fit(X_train, y_train)

In [None]:
# Score the fitted ensemble on the held-out split.
preds = voting_reg.predict(X_test)

# MAE and RMSE are expressed in the units of the target column chosen earlier
# (log units when 'log_wage' was selected).
r2 = r2_score(y_test, preds)
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))

# Report each metric to four decimal places, one per line.
for metric_label, metric_value in (('R^2', r2), ('MAE', mae), ('RMSE', rmse)):
    print(f'{metric_label}: {metric_value:.4f}')

## Feature Importance (Tree-Based Proxy)
Impurity-based importances from the random forest alone (not the full voting ensemble) are used as a baseline proxy for which engineered features are most influential.

In [None]:
# VotingRegressor fits clones of the estimators it is given, so `rf_reg` itself
# is still unfitted after the ensemble was trained; fit it directly here.
rf_reg.fit(X_train, y_train)

# Label each importance with its feature name, then rank from most to least
# important and show the top 15.
raw_importances = pd.Series(data=rf_reg.feature_importances_, index=X_train.columns)
importances = raw_importances.sort_values(ascending=False)
importances.head(n=15)