# Weather ML Model Training

## Objective
Train a Random Forest regression model to predict temperature based on engineered features.

This notebook:
1. Loads the engineered features from `weather_features` table
2. Builds an ML pipeline with feature scaling
3. Trains a Random Forest regressor
4. Evaluates model performance (RMSE, MAE, R²)
5. Displays feature importance
6. Saves predictions to `weather_predictions` table
7. Saves the trained model to `/tmp/weather_temperature_model`

- **Input Table:** `weather_features`
- **Output Model:** `/tmp/weather_temperature_model`
- **Output Predictions:** `weather_predictions` table

In [None]:
# Import Required Libraries
import logging

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql.functions import abs as spark_abs
from pyspark.sql.functions import avg
from pyspark.sql.functions import col
from pyspark.sql.functions import round as spark_round
from pyspark.sql.types import DoubleType

# Setup logging
# basicConfig configures the root logger at INFO level (it does nothing if the
# root logger already has handlers); `logger` is the module-level logger that
# later cells use for load/save status messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Printed confirmation that this cell ran to completion
print("✓ Libraries imported successfully")

## Section 1: Load and Explore Engineered Features

Load the engineered features from the `weather_features` table

In [None]:
# Read the engineered feature table. On failure the cause is logged and the
# exception re-raised, so the notebook stops here rather than continuing
# without `df`.
try:
    df = spark.table("weather_features")
except Exception as e:
    logger.error(f"Failed to load table: {e}")
    raise
else:
    logger.info("✓ Loaded weather_features table")

# Basic shape of the loaded data (count() runs a Spark job)
print("Loaded Data:")
record_count = df.count()
column_names = df.columns
print(f"  Records: {record_count:,}")
print(f"  Columns: {len(column_names)}")
print(f"\nColumns: {column_names}")

# Summary statistics for the label column
print("\nTarget Variable (temperature) Statistics:")
target_stats = df.select("temperature").describe()
target_stats.show()

## Section 2: Define Feature Columns

Define which columns to use as features for the model

In [None]:
# Define feature columns
# `candidate_features` is the full wish list; `feature_columns` (used by the
# pipeline and the feature-importance cell) ends up holding only the candidates
# that actually exist in `df`, in the same order.
#
# NOTE(review): "temp_humidity_interaction" and the temperature_rolling_*
# columns may be derived from the current temperature, which is the label —
# confirm against the feature-engineering step to rule out target leakage.
candidate_features = [
    # Time features
    "hour",
    "day_of_week",
    "day_of_year",
    "month",
    "timestamp_unix",
    # Base weather features
    "humidity",
    "pressure",
    "wind_speed",
    "visibility",
    "cloudiness",
    # Lag features
    "temperature_lag_1",
    "temperature_lag_3",
    "temperature_lag_6",
    "temperature_lag_12",
    "humidity_lag_1",
    "humidity_lag_3",
    "humidity_lag_6",
    "humidity_lag_12",
    "pressure_lag_1",
    "pressure_lag_3",
    "pressure_lag_6",
    "pressure_lag_12",
    # Rolling features
    "temperature_rolling_mean_3",
    "temperature_rolling_std_3",
    "temperature_rolling_mean_6",
    "temperature_rolling_std_6",
    "temperature_rolling_mean_12",
    "temperature_rolling_std_12",
    "humidity_rolling_mean_3",
    "humidity_rolling_mean_6",
    "humidity_rolling_mean_12",
    "pressure_rolling_mean_3",
    "pressure_rolling_mean_6",
    "pressure_rolling_mean_12",
    # Interaction features
    "temp_humidity_interaction",
    "cloud_visibility_ratio",
    "pressure_humidity_interaction",
]

# Filter to only columns that exist, and remember which ones were dropped so a
# schema change upstream does not silently shrink the feature set.
available_columns = set(df.columns)
missing_features = [f for f in candidate_features if f not in available_columns]
feature_columns = [f for f in candidate_features if f in available_columns]

if missing_features:
    print(f"WARNING: {len(missing_features)} expected feature column(s) not found and skipped:")
    for col_name in missing_features:
        print(f"  - {col_name}")

# Fail here with a clear message instead of later inside VectorAssembler.
if not feature_columns:
    raise ValueError(
        "None of the expected feature columns exist in weather_features; "
        "cannot build a feature vector"
    )

print("Feature Columns to Use:")
print(f"  Total: {len(feature_columns)}")
for i, col_name in enumerate(feature_columns, 1):
    print(f"  {i:2d}. {col_name}")

## Section 3: Split Data into Training and Testing Sets

Split the data into training and testing sets with an approximate 80/20 random split (fixed seed for reproducibility)

In [None]:
# Split data into training and testing sets
# randomSplit treats the weights as approximate proportions, so the realized
# percentages are reported below rather than assumed to be exactly 80/20.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Every count() launches a Spark job; run each one once and reuse the result.
total_count = df.count()
train_count = train_df.count()
test_count = test_df.count()

# Guard the percentage against an empty input table.
train_pct = 100 * train_count / total_count if total_count else 0.0
test_pct = 100 * test_count / total_count if total_count else 0.0

print("Data Split:")
print(f"  Training set: {train_count:,} records ({train_pct:.1f}%)")
print(f"  Testing set:  {test_count:,} records ({test_pct:.1f}%)")
print(f"  Total:        {total_count:,} records")

## Section 4: Build ML Pipeline

Create a pipeline with feature assembly, scaling, and Random Forest model

In [None]:
# Assemble the three-stage pipeline: vector assembly -> standardization -> random forest
print("Building ML Pipeline...")

# Stage 1: pack the selected feature columns into a single "features" vector.
# handleInvalid="skip" drops rows containing invalid (null/NaN) feature values.
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
    handleInvalid="skip",
)

# Stage 2: standardize each feature (center on the mean, scale to unit std)
scaler_options = {"withMean": True, "withStd": True}
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", **scaler_options)

# Stage 3: random forest regressor trained on the scaled vector to predict "temperature"
forest_options = {
    "numTrees": 50,
    "maxDepth": 10,
    "minInstancesPerNode": 5,
    "subsamplingRate": 0.8,
    "featureSubsetStrategy": "sqrt",
    "seed": 42,
}
rf = RandomForestRegressor(
    featuresCol="scaled_features",
    labelCol="temperature",
    **forest_options,
)

# Chain the stages in execution order
pipeline_stages = [assembler, scaler, rf]
pipeline = Pipeline(stages=pipeline_stages)

for summary_line in (
    "✓ Pipeline created with 3 stages:",
    "  1. Feature Assembly",
    "  2. Feature Scaling",
    "  3. Random Forest Regressor (50 trees, max depth=10)",
):
    print(summary_line)

## Section 5: Train the Machine Learning Model

Train the Random Forest model on the training data

In [None]:
# Fit all pipeline stages on the training split; `model` is the fitted pipeline
for status_line in ("Training Random Forest model...", "This may take a few minutes..."):
    print(status_line)

model = pipeline.fit(train_df)

print("✓ Model training completed successfully")
training_rows = train_df.count()
feature_total = len(feature_columns)
print(f"  Trained on {training_rows:,} records")
print(f"  Using {feature_total} features")

## Section 6: Evaluate Model Performance

Calculate performance metrics on the test set (RMSE, MAE, R²)

In [None]:
# Score the held-out split and attach the per-row absolute error
print("Evaluating model on test set...")
scored = model.transform(test_df)
abs_error_expr = spark_abs(col("prediction") - col("temperature")).cast(DoubleType())
predictions = scored.withColumn("abs_error", abs_error_expr)

# One evaluator per metric, all comparing "prediction" against "temperature"
evaluator_rmse, evaluator_mae, evaluator_r2 = (
    RegressionEvaluator(labelCol="temperature", predictionCol="prediction", metricName=metric)
    for metric in ("rmse", "mae", "r2")
)

# Evaluate in the order RMSE, MAE, R²
rmse, mae, r2 = (
    evaluator.evaluate(predictions)
    for evaluator in (evaluator_rmse, evaluator_mae, evaluator_r2)
)

# Metrics report
banner = "=" * 70
print("\n" + banner)
print("MODEL PERFORMANCE METRICS")
print(banner)

evaluated_rows = predictions.count()
for report_line in (
    f"\nRegression Metrics (on {evaluated_rows:,} test samples):",
    f"  RMSE (Root Mean Squared Error): {rmse:.4f}°C",
    f"  MAE  (Mean Absolute Error):    {mae:.4f}°C",
    f"  R²   (Coefficient of Determination): {r2:.4f}",
    "\nInterpretation:",
    f"  - RMSE of {rmse:.4f}°C: Average prediction error magnitude",
    f"  - MAE of {mae:.4f}°C: Average absolute prediction error",
    f"  - R² of {r2:.4f}: Model explains {100 * r2:.1f}% of temperature variance",
    banner,
):
    print(report_line)

## Section 7: Display Feature Importance

Show the most important features used by the Random Forest model

In [None]:
# The fitted random forest is the last stage of the fitted pipeline
rf_model = model.stages[-1]
importances = rf_model.featureImportances.toArray()

# Pair each feature name with its importance score, highest score first
feature_importance_pairs = list(zip(feature_columns, importances, strict=False))
feature_importance_sorted = sorted(
    feature_importance_pairs,
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the 20 highest-scoring features as a fixed-width table
heavy_rule = "=" * 70
light_rule = "-" * 70

print("\n" + heavy_rule)
print("TOP 20 FEATURE IMPORTANCE SCORES")
print(heavy_rule)
print(f"\n{'Rank':<6} {'Feature Name':<40} {'Importance':<15}")
print(light_rule)

top_features = feature_importance_sorted[:20]
for rank, (feature_name, importance) in enumerate(top_features, 1):
    print(f"{rank:<6} {feature_name:<40} {importance:.6f}")

print("\n" + heavy_rule)

## Section 8: Make Weather Predictions

View sample predictions comparing actual vs predicted temperatures

In [None]:
# Display sample predictions
# Columns shown for both the best and the worst rows
sample_columns = ["city", "timestamp", "temperature", "prediction", "abs_error"]

print("\nSample Predictions (Best and Worst):")
print("\nBEST PREDICTIONS (Lowest Error):")
predictions.orderBy("abs_error").select(*sample_columns).show(5, truncate=False)

print("\nWORST PREDICTIONS (Highest Error):")
predictions.orderBy(col("abs_error").desc()).select(*sample_columns).show(5, truncate=False)

# Prediction accuracy statistics
# Count the scored rows once: each count() is a separate Spark job, and the
# total is the shared denominator for all three buckets.
total_predictions = predictions.count()

if total_predictions:
    accuracy_within_1c = (
        predictions.filter(col("abs_error") <= 1.0).count() / total_predictions * 100
    )
    accuracy_within_2c = (
        predictions.filter(col("abs_error") <= 2.0).count() / total_predictions * 100
    )
    accuracy_within_3c = (
        predictions.filter(col("abs_error") <= 3.0).count() / total_predictions * 100
    )
else:
    # No scored rows (e.g. every test row was skipped by the assembler's
    # handleInvalid="skip"); report 0% instead of dividing by zero.
    accuracy_within_1c = accuracy_within_2c = accuracy_within_3c = 0.0

print("\nPrediction Accuracy:")
print(f"  Within 1°C: {accuracy_within_1c:.1f}% of predictions")
print(f"  Within 2°C: {accuracy_within_2c:.1f}% of predictions")
print(f"  Within 3°C: {accuracy_within_3c:.1f}% of predictions")

## Section 9: Visualize Predictions vs Actual Values

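Compare predicted and actual values per city, report the overall mean absolute error, and save the predictions to the `weather_predictions` table (this cell prints tables; it does not draw plots)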

In [None]:
# Prediction vs Actual analysis by city
# Uses `avg` from pyspark.sql.functions (imported with the other Spark
# functions in the import cell) to average error, actual and predicted
# temperature per city, best-performing city first.
print("\nPrediction Performance by City:")
print("-" * 70)
city_performance = (
    predictions.groupBy("city")
    .agg(
        spark_round(avg("abs_error"), 4).alias("avg_error"),
        spark_round(avg("temperature"), 2).alias("avg_temp"),
        spark_round(avg("prediction"), 2).alias("avg_prediction"),
    )
    .orderBy("avg_error")
)

city_performance.show(truncate=False)

# Residual distribution
print("\nResidual Analysis:")
residuals_stats = predictions.select(
    spark_round(avg("abs_error"), 4).alias("mean_error"),
).collect()[0]

# avg() over zero rows yields NULL (None in Python), which cannot be formatted
# with :.4f — report that explicitly instead of raising.
mean_error = residuals_stats["mean_error"]
if mean_error is None:
    print("  Mean Absolute Error: n/a (no predictions)")
else:
    print(f"  Mean Absolute Error: {mean_error:.4f}°C")

# Save predictions for later analysis (replaces any existing table contents)
print("\nSaving predictions to weather_predictions table...")
predictions.write.mode("overwrite").option("mergeSchema", "true").saveAsTable("weather_predictions")
print("✓ Predictions saved successfully")

## Section 10: Save Trained Model

Save the trained model for future predictions and deployment

In [None]:
# Save model
# NOTE(review): on a cluster, "/tmp/..." is resolved against Spark's default
# filesystem, which may not be the driver's local disk — confirm the intended
# location.
model_path = "/tmp/weather_temperature_model"

# Saving stays best-effort (a failure does not abort the notebook), but the
# outcome is tracked so the summary below never claims a model that was not
# written. No alternative path is attempted.
print(f"\nSaving trained model to {model_path}...")
try:
    model.write().overwrite().save(model_path)
except Exception as e:
    model_saved = False
    logger.warning(f"Could not save model to {model_path}: {e}")
else:
    model_saved = True
    print(f"✓ Model saved successfully to {model_path}")

# Final summary of the run
print("\n" + "=" * 70)
if model_saved:
    print("MODEL TRAINING COMPLETED SUCCESSFULLY")
else:
    print("MODEL TRAINING COMPLETED (MODEL NOT SAVED)")
print("=" * 70)
print("\nModel Details:")
print("  Algorithm: Random Forest Regressor")
print("  Number of Trees: 50")
print("  Max Depth: 10")
print(f"  Features Used: {len(feature_columns)}")
print(f"  Training Samples: {train_df.count():,}")
print(f"  Test Samples: {test_df.count():,}")
print("\nPerformance:")
print(f"  RMSE: {rmse:.4f}°C")
print(f"  MAE:  {mae:.4f}°C")
print(f"  R²:   {r2:.4f}")
print("\nOutputs Generated:")
print("  ✓ weather_predictions table (predictions + actual values)")
if model_saved:
    print(f"  ✓ {model_path} (trained model)")
else:
    print(f"  ✗ {model_path} (trained model — save failed, see warning above)")
print("\nNext Steps:")
print("  1. Run analyze_predictions.py for detailed analysis")
print("  2. Deploy model for production use")
print("  3. Monitor prediction accuracy over time")
print("=" * 70)