# ML Experiment Template

**Experiment:** [Experiment name/ID]

**Hypothesis:** [What are you testing?]

**Date:** [YYYY-MM-DD]

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import json
import joblib

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Set random seed (seeds NumPy's global RNG; estimators below also receive SEED explicitly)
SEED = 42
np.random.seed(SEED)

# Paths (relative paths, resolved against the notebook's working directory)
DATA_DIR = Path("../../data")
MODELS_DIR = Path("../../models")
REPORTS_DIR = Path("../../reports")

# Create output directories up front. parents=True keeps this from raising
# FileNotFoundError when an intermediate directory does not exist yet;
# exist_ok=True makes re-running the cell harmless.
for output_dir in (MODELS_DIR, REPORTS_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

## Load & Prepare Data

In [None]:
# Name of the column to predict -- change it here only; features and target
# below are both derived from this single constant.
TARGET_COL = 'target'

# Load data
df = pd.read_parquet(DATA_DIR / "processed" / "dataset.parquet")

# Fail early with a readable message if the dataset lacks the target column
if TARGET_COL not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COL!r} not found in dataset; "
        f"available columns: {list(df.columns)}"
    )

# Define features and target
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Train-test split (20% held out, seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

## Baseline Model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a plain linear regression as the reference model
baseline = LinearRegression().fit(X_train, y_train)

# Score it on the held-out split
y_pred_baseline = baseline.predict(X_test)

# Collect the three evaluation metrics used throughout the notebook
baseline_rmse = np.sqrt(mean_squared_error(y_test, y_pred_baseline))
baseline_mae = mean_absolute_error(y_test, y_pred_baseline)
baseline_r2 = r2_score(y_test, y_pred_baseline)
baseline_metrics = {
    'rmse': baseline_rmse,
    'mae': baseline_mae,
    'r2': baseline_r2,
}

print("Baseline Metrics:")
for metric_name, metric_value in baseline_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## Experiment Model

In [None]:
# TODO: Replace with your model
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters for the candidate model, kept together so they are easy to edit
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': SEED,
    'n_jobs': -1,
}

# Train model
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test)

# Same three metrics as the baseline, so the two can be compared side by side
experiment_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
experiment_mae = mean_absolute_error(y_test, y_pred)
experiment_r2 = r2_score(y_test, y_pred)
metrics = {
    'rmse': experiment_rmse,
    'mae': experiment_mae,
    'r2': experiment_r2,
}

print("Experiment Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## Comparison

In [None]:
# Compare metrics side by side (rows = metric names, columns = models)
comparison = pd.DataFrame({
    'Baseline': baseline_metrics,
    'Experiment': metrics
})

# Error metrics improve when they go DOWN; everything else (e.g. r2) improves
# when it goes UP. Without this, a lower RMSE would show as a negative
# "improvement".
LOWER_IS_BETTER = {'rmse', 'mae'}
direction = pd.Series(
    [-1.0 if name in LOWER_IS_BETTER else 1.0 for name in comparison.index],
    index=comparison.index,
)

# Percentage improvement relative to the baseline; positive always means the
# experiment is better. The denominator is the absolute baseline value so a
# negative baseline (possible for r2) does not flip the sign. A baseline of
# exactly 0 yields inf/NaN.
delta = comparison['Experiment'] - comparison['Baseline']
comparison['Improvement'] = (direction * delta / comparison['Baseline'].abs() * 100).round(2)
comparison

## Visualization

In [None]:
# Predictions vs Actuals: one panel per model, baseline on the left
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axis, predictions, title) for each panel, in drawing order
panels = [
    (axes[0], y_pred_baseline, f'Baseline (R²={baseline_metrics["r2"]:.3f})'),
    (axes[1], y_pred, f'Experiment (R²={metrics["r2"]:.3f})'),
]

for ax, predictions, title in panels:
    ax.scatter(y_test, predictions, alpha=0.5)
    # Dashed red diagonal marks perfect predictions (predicted == actual)
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.set_title(title)

plt.tight_layout()
plt.show()

## Save Results

In [None]:
# Save model
model_path = MODELS_DIR / "experiment_model.joblib"
joblib.dump(model, model_path)
print(f"Model saved to: {model_path}")

# Save metrics
# Metric values are coerced to built-in floats so they are always written as
# JSON numbers: NumPy scalars that are not float subclasses (e.g. np.float32)
# would otherwise fall through to default=str and be stored as strings.
results = {
    'experiment_name': '[EXPERIMENT_NAME]',
    'date': '[YYYY-MM-DD]',
    'baseline_metrics': {name: float(value) for name, value in baseline_metrics.items()},
    'experiment_metrics': {name: float(value) for name, value in metrics.items()},
    'model_params': model.get_params(),
    'data_shape': {'train': list(X_train.shape), 'test': list(X_test.shape)}
}

results_path = REPORTS_DIR / "experiment_results.json"
# Explicit encoding keeps the report identical across platforms.
# default=str remains as a fallback for model parameters that are not
# JSON-serializable.
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, default=str)
print(f"Results saved to: {results_path}")

## Conclusions

**Results:**
- [Summary of findings]

**Next Steps:**
- [ ] [Action 1]
- [ ] [Action 2]