# Weather ML Pipeline - Local Run

This notebook runs the weather ML pipeline locally using sample.csv


In [1]:
# Import necessary libraries
import os
from pyspark.sql import SparkSession
from data_loader import load_and_preprocess_data
from model_trainer import train_models
from feature_pipeline import create_feature_pipeline

print("Libraries imported successfully!")


Libraries imported successfully!


In [2]:
# Create local Spark session
spark = SparkSession.builder \
    .appName("WeatherML_Local") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.ui.showConsoleProgress", "true") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")
print("Spark session created successfully!")
print(f"Spark version: {spark.version}")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/29 22:41:08 WARN Utils: Your hostname, Rics-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.134 instead (on interface en0)
25/10/29 22:41:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/29 22:41:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/29 22:41:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark session created successfully!
Spark version: 4.0.1


In [3]:
# Load and preprocess data from sample.csv
data_path = "sample.csv"

print("Loading and preprocessing data...")
df = load_and_preprocess_data(spark, data_path)

print(f"\nTotal records after preprocessing: {df.count()}")
print("\nDataFrame schema:")
df.printSchema()


Loading and preprocessing data...
Loading weather data from: sample.csv
Loading data from directory...




Sample of processed features:
+-----------+---------+---------+----------+-------------+----------+-------------------+-------------------+------------------+---------+------------------+-------------------+-------------------+------------------+
|temperature|dew_clean|slp_clean|wind_speed|cloud_ceiling|visibility|              geo_x|              geo_y|             geo_z|elevation|          hour_sin|           hour_cos|    day_of_year_sin|   day_of_year_cos|
+-----------+---------+---------+----------+-------------+----------+-------------------+-------------------+------------------+---------+------------------+-------------------+-------------------+------------------+
|       15.0|     12.2|   1003.8|       5.2|       1524.0|   16000.0|0.04731207462675889|-0.8319094570208571|0.5528907875103138|    187.5|               0.0|                1.0|0.01721335615583582|0.9998518392091162|
|       14.4|     12.2|   1003.7|       5.7|        762.0|    4000.0|0.04731207462675889|-0.8319094570

In [4]:
# Display sample of preprocessed features
print("Sample of preprocessed features:")
df.select(
    "temperature",
    "dew_clean",
    "slp_clean",
    "wind_speed",
    "cloud_ceiling",
    "visibility",
    "geo_x",
    "geo_y",
    "geo_z",
    "elevation",
).show(10, truncate=False)

# Check feature distributions
print("\nFeature Statistics:")
df.describe().show()


Sample of preprocessed features:
+-----------+---------+---------+----------+-------------+----------+-------------------+-------------------+------------------+---------+
|temperature|dew_clean|slp_clean|wind_speed|cloud_ceiling|visibility|geo_x              |geo_y              |geo_z             |elevation|
+-----------+---------+---------+----------+-------------+----------+-------------------+-------------------+------------------+---------+
|15.0       |12.2     |1003.8   |5.2       |1524.0       |16000.0   |0.04731207462675889|-0.8319094570208571|0.5528907875103138|187.5    |
|14.4       |12.2     |1003.7   |5.7       |762.0        |4000.0    |0.04731207462675889|-0.8319094570208571|0.5528907875103138|187.5    |
|13.9       |12.8     |1003.7   |6.2       |427.0        |4800.0    |0.04731207462675889|-0.8319094570208571|0.5528907875103138|187.5    |
|13.9       |11.7     |1003.7   |4.1       |3353.0       |16000.0   |0.04731207462675889|-0.8319094570208571|0.5528907875103138|187.5

25/10/29 22:41:16 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/10/29 22:41:16 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv


+-------+---------------+-------------------+-------------------+---------+--------------------+-----------+---------+---------------+-------+-----------+-----------+-----------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+------------------+------------------+------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+------------------+-----------------+------------------+------------------+
|summary|        STATION|               DATE|             SOURCE|elevation|                NAME|REPORT_TYPE|CALL_SIGN|QUALITY_CONTROL|    TMP|        AA1|        AA2|        AA3| AJ1| AY1| AY2| GA1| GA2| GA3| GE1| GF1| IA1| KA1| KA2| MA1| MD1| MW1| OC1| OD1| SA1| UA1| REM| EQD|         tmp_value|          tmp_flag|       temperature|            hour_sin|            hour_cos|    day_of_year_sin|    day_of_year_cos|           

In [5]:
# Split data into training and test sets
train_ratio = 0.7
train_df, test_df = df.randomSplit([train_ratio, 1 - train_ratio], seed=42)

print(f"Training set: {train_df.count()} records")
print(f"Test set: {test_df.count()} records")


Training set: 560 records
Test set: 196 records


25/10/29 22:41:17 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv
25/10/29 22:41:17 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv


In [6]:
# Train models (using a local output path)
output_path = "file://" + os.path.abspath("results")

print("Training models...")
print(f"Results will be saved to: {output_path}\n")

metrics = train_models(train_df, test_df, output_path, spark)

print("\n=== Training Complete! ===")


Training models...
Results will be saved to: file:///Users/ric/proj/weather/pyspark_weather_data/results

Creating feature pipeline...
Sample of training data features:


25/10/29 22:41:17 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv


+-------------------+-------------------+------------------+---------+---------+---------+----------+-------------+----------+------------------+-------------------+-------------------+------------------+
|              geo_x|              geo_y|             geo_z|elevation|dew_clean|slp_clean|wind_speed|cloud_ceiling|visibility|          hour_sin|           hour_cos|    day_of_year_sin|   day_of_year_cos|
+-------------------+-------------------+------------------+---------+---------+---------+----------+-------------+----------+------------------+-------------------+-------------------+------------------+
|0.04731207462675889|-0.8319094570208571|0.5528907875103138|    187.5|     12.2|   1003.8|       5.2|       1524.0|   16000.0|               0.0|                1.0|0.01721335615583582|0.9998518392091162|
|0.04731207462675889|-0.8319094570208571|0.5528907875103138|    187.5|     12.2|   1003.7|       5.7|        762.0|    4000.0|0.2588190451025374| 0.9659258262890639|0.0172133561558

25/10/29 22:41:18 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv


Transforming training data...
Transforming test data...
Training Gradient Boosting Trees with 3-phase hyperparameter tuning
Starting 3-phase hyperparameter tuning for GBT...
  Phase 1: Grid search on 2% of data...


25/10/29 22:41:18 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv
25/10/29 22:41:18 WARN BlockManager: Block rdd_67_0 already exists on this machine; not re-adding it
25/10/29 22:41:18 WARN BlockManager: Block rdd_67_0 already exists on this machine; not re-adding it
25/10/29 22:41:18 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 64 to 7 (= number of training instances)
25/10/29 22:41:18 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 7 (= number of training instances)
25/10/29 22:41:18 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 7 (= number of training instances)
25/10/29 22:41:18 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 64 to 7 (= number of training instances)
25/10/29 22:41:18 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to

  Phase 1 complete. Top 5 candidates:
    1. RMSE: 5.9136 | maxBins=32, maxDepth=5, maxIter=20, stepSize=0.05
    2. RMSE: 5.9136 | maxBins=64, maxDepth=5, maxIter=20, stepSize=0.05
    3. RMSE: 5.9136 | maxBins=32, maxDepth=8, maxIter=20, stepSize=0.05
    4. RMSE: 5.9136 | maxBins=64, maxDepth=8, maxIter=20, stepSize=0.05
    5. RMSE: 5.9136 | maxBins=32, maxDepth=3, maxIter=20, stepSize=0.05
  Phase 2: Narrow search on 10% of data with top 5 candidates...


25/10/29 22:41:37 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 64 to 44 (= number of training instances)
25/10/29 22:41:37 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 64 to 44 (= number of training instances)
25/10/29 22:41:41 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 64 to 51 (= number of training instances)


  Phase 2 complete. Best RMSE: 5.6721
  Best hyperparameters: {'featureSubsetStrategy': 'all', 'maxBins': 64, 'maxDepth': 5, 'maxIter': 20, 'minInstancesPerNode': 1, 'stepSize': 0.05, 'subsamplingRate': 0.8}
  Phase 3: Training final model on full dataset...
  Phase 3 complete. Final model trained on full dataset.
Training Random Forest with 3-phase hyperparameter tuning
Starting 3-phase hyperparameter tuning for RF...
  Phase 1: Grid search on 2% of data...


25/10/29 22:41:43 WARN CacheManager: Asked to cache already cached data.
25/10/29 22:41:44 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 7 (= number of training instances)
25/10/29 22:41:44 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 7 (= number of training instances)
25/10/29 22:41:44 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 64 to 7 (= number of training instances)
25/10/29 22:41:44 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 7 (= number of training instances)
25/10/29 22:41:44 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 64 to 7 (= number of training instances)
25/10/29 22:41:44 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 64 to 7 (= number of training instances)
25/10/29 22:41:44 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 7 (= number of training instances)
25/10/29 22:41:44 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 

  Phase 1 complete. Top 5 candidates:
    1. RMSE: 6.6281 | featureSubsetStrategy=auto, maxBins=32, maxDepth=6, numTrees=50, subsamplingRate=1.0
    2. RMSE: 6.6281 | featureSubsetStrategy=sqrt, maxBins=32, maxDepth=6, numTrees=50, subsamplingRate=1.0
    3. RMSE: 6.6281 | featureSubsetStrategy=auto, maxBins=64, maxDepth=6, numTrees=50, subsamplingRate=1.0
    4. RMSE: 6.6281 | featureSubsetStrategy=sqrt, maxBins=64, maxDepth=6, numTrees=50, subsamplingRate=1.0
    5. RMSE: 6.6281 | featureSubsetStrategy=auto, maxBins=32, maxDepth=10, numTrees=50, subsamplingRate=1.0
  Phase 2: Narrow search on 10% of data with top 5 candidates...


25/10/29 22:41:50 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 64 to 44 (= number of training instances)
25/10/29 22:41:50 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 64 to 44 (= number of training instances)


  Phase 2 complete. Best RMSE: 3.9611
  Best hyperparameters: {'featureSubsetStrategy': 'auto', 'maxBins': 32, 'maxDepth': 6, 'minInstancesPerNode': 10, 'numTrees': 50, 'subsamplingRate': 1.0}
  Phase 3: Training final model on full dataset...
  Phase 3 complete. Final model trained on full dataset.
Saving models...
Evaluating models...


25/10/29 22:41:53 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv



=== Training Complete! ===


                                                                                

In [7]:
# Display results
print("\n" + "="*50)
print("MODEL PERFORMANCE RESULTS")
print("="*50)

for model_metrics in metrics["models"]:
    print(f"\n{model_metrics['model']}:")
    print(f"  RMSE: {model_metrics['rmse']:.4f}°C")
    print(f"  R²:   {model_metrics['r2']:.4f}")
    print(f"  MAE:  {model_metrics['mae']:.4f}°C")

print(f"\n✓ Best model: {metrics['best_model']}")
print(f"✓ Results saved to: {output_path}")
print("\n" + "="*50)



MODEL PERFORMANCE RESULTS

Gradient Boosting Trees:
  RMSE: 1.8886°C
  R²:   0.9121
  MAE:  1.4106°C

Random Forest:
  RMSE: 1.9072°C
  R²:   0.9104
  MAE:  1.4582°C

✓ Best model: Gradient Boosting Trees
✓ Results saved to: file:///Users/ric/proj/weather/pyspark_weather_data/results



In [8]:
# Stop Spark session
print("Stopping Spark session...")
spark.stop()
print("Spark session stopped.")


Stopping Spark session...
Spark session stopped.
