# 3. Model Training and Evaluation

This notebook brings everything together. We'll load our processed features, label them using SPC reports, and then train and evaluate our models.

In [None]:
import pandas as pd
import os
import sys
import numpy as np

sys.path.append('../src')
from modeling import label_data, train_classification_model, train_regression_model

## Step 1: Load Data

**Note:** For a real project, you would first run the feature engineering pipeline on many NEXRAD files and concatenate the results into a single large CSV. Here, we use the single-scan file as a placeholder, so the resulting dataset is likely too small for meaningful training.

In [None]:
# Load the engineered features. This notebook reads the single-scan placeholder
# file; point `features_file` at a combined multi-scan CSV when one is available.
features_file = '../data/processed/features_single_scan.csv'
if not os.path.exists(features_file):
    raise FileNotFoundError("Processed features not found. Run notebook 02 first.")
features_df = pd.read_csv(features_file)

# Load SPC reports. Fail early with an actionable message, mirroring the
# features-file check, rather than surfacing a bare pandas traceback.
spc_file = '../data/raw/spc_reports/240528_rpts.csv'
if not os.path.exists(spc_file):
    raise FileNotFoundError(f"SPC reports file not found: {spc_file}")
spc_df = pd.read_csv(spc_file)

# The hail filter below requires a 'Type' column. Raise the same exception type
# the lookup would raise anyway (KeyError), but say what is actually wrong.
# NOTE(review): raw SPC daily report CSVs may not ship with a 'Type' column --
# confirm this file has been preprocessed into the expected schema.
if 'Type' not in spc_df.columns:
    raise KeyError(
        f"Expected a 'Type' column in {spc_file}; found columns: {list(spc_df.columns)}"
    )

# Keep only the hail rows; .copy() yields an independent frame so later
# modifications do not trigger chained-assignment warnings against spc_df.
hail_reports = spc_df[spc_df['Type'] == 'Hail'].copy()

## Step 2: Label the Feature Data

In [None]:
# Combine the feature rows with the hail reports via the project's label_data
# helper; the returned frame carries the 'hail_report' column used below.
labeled_df = label_data(features_df, hail_reports)

# Count the positive rows once and derive the negatives from the total row count.
hail_count = labeled_df['hail_report'].sum()
non_hail_count = len(labeled_df) - hail_count
print(f"Number of cells matched with hail reports: {hail_count}")
print(f"Number of non-hail cells: {non_hail_count}")

# Preview the first few labeled rows as the cell's rich output.
labeled_df.head()

## Step 3: Prepare Data for Modeling

In [None]:
# Persist the labeled dataset under the final-data directory, creating the
# directory on demand so a fresh checkout can run this cell.
FINAL_DATA_PATH = '../data/final/'
os.makedirs(FINAL_DATA_PATH, exist_ok=True)
final_dataset_csv = os.path.join(FINAL_DATA_PATH, 'final_labeled_dataset.csv')
labeled_df.to_csv(final_dataset_csv, index=False)

# Feature matrix (X): the three predictor columns, selected by label.
feature_cols = ['max_reflectivity_dbz', 'max_mesh_mm', 'echo_top_km']
X = labeled_df.loc[:, feature_cols]

# Targets (y): 'hail_report' for the classifier, 'hail_size_in' for the regressor.
y_class, y_reg = labeled_df['hail_report'], labeled_df['hail_size_in']

## Step 4: Train and Evaluate Classification Model (Will it hail?)

In [None]:
# The dataset is too small from one scan for meaningful training,
# but this shows the process.
classifier_path = '../models/hail_classifier.joblib'

# Train only when there is more than one row and more than one distinct value
# in 'hail_report'.
if len(labeled_df) > 1 and labeled_df['hail_report'].nunique() > 1:
    # Ensure the output directory exists before handing the path to the trainer.
    # It is not visible from this notebook whether train_classification_model
    # creates it, and os.makedirs(..., exist_ok=True) is a no-op if it is
    # already there.
    os.makedirs(os.path.dirname(classifier_path), exist_ok=True)
    classifier = train_classification_model(X, y_class, classifier_path)
else:
    print("Not enough data or class diversity to train the classification model.")

## Step 5: Train and Evaluate Regression Model (How large?)

In [None]:
regressor_path = '../models/hail_regressor.joblib'

# Train only when more than one row is flagged in 'hail_report'.
if labeled_df['hail_report'].sum() > 1:
    # Ensure the output directory exists before handing the path to the trainer.
    # It is not visible from this notebook whether train_regression_model
    # creates it, and os.makedirs(..., exist_ok=True) is a no-op if it is
    # already there.
    os.makedirs(os.path.dirname(regressor_path), exist_ok=True)
    # NOTE(review): the guard counts hail rows, but the full X / y_reg (including
    # non-hail rows) is passed in. Confirm train_regression_model restricts itself
    # to rows with a valid 'hail_size_in', or filter here before calling.
    regressor = train_regression_model(X, y_reg, regressor_path)
else:
    print("Not enough hail samples to train the regression model.")