# Weather ML Pipeline - Local Run

This notebook runs the weather ML pipeline locally using sample.csv


In [1]:
# Import necessary libraries
import os
from pyspark.sql import SparkSession
from data_loader import load_and_preprocess_data
from model_trainer import train_models
from feature_pipeline import create_feature_pipeline

print("Libraries imported successfully!")


Libraries imported successfully!


In [2]:
# Create local Spark session
spark = SparkSession.builder \
    .appName("WeatherML_Local") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")
print("Spark session created successfully!")
print(f"Spark version: {spark.version}")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/27 19:06:37 WARN Utils: Your hostname, Rics-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.249.83.62 instead (on interface en0)
25/10/27 19:06:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/27 19:06:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark session created successfully!
Spark version: 4.0.1


In [3]:
# Load and preprocess data from sample.csv
data_path = "sample.csv"

print("Loading and preprocessing data...")
df = load_and_preprocess_data(spark, data_path)

print(f"\nTotal records after preprocessing: {df.count()}")
print("\nDataFrame schema:")
df.printSchema()


Loading and preprocessing data...
Loading weather data from: sample.csv
Loading data from directory...
Loaded 1583 records




After preprocessing: 756 records
Sample of processed features:
+-----------+---------+---------+----------+-------------+----------+-------------------+-------------------+------------------+---------+------------------+-------------------+-------------------+------------------+
|temperature|dew_clean|slp_clean|wind_speed|cloud_ceiling|visibility|              geo_x|              geo_y|             geo_z|elevation|          hour_sin|           hour_cos|    day_of_year_sin|   day_of_year_cos|
+-----------+---------+---------+----------+-------------+----------+-------------------+-------------------+------------------+---------+------------------+-------------------+-------------------+------------------+
|       15.0|     12.2|   1003.8|       5.2|       1524.0|   16000.0|0.04731207462675889|-0.8319094570208571|0.5528907875103138|    187.5|               0.0|                1.0|0.01721335615583582|0.9998518392091162|
|       14.4|     12.2|   1003.7|       5.7|        762.0|    4000.0|

In [4]:
# Display sample of preprocessed features
print("Sample of preprocessed features:")
df.select(
    "temperature",
    "dew_clean",
    "slp_clean",
    "wind_speed",
    "cloud_ceiling",
    "visibility",
    "geo_x",
    "geo_y",
    "geo_z",
    "elevation",
).show(10, truncate=False)

# Check feature distributions
print("\nFeature Statistics:")
df.describe().show()


Sample of preprocessed features:
+-----------+---------+---------+----------+-------------+----------+-------------------+-------------------+------------------+---------+
|temperature|dew_clean|slp_clean|wind_speed|cloud_ceiling|visibility|geo_x              |geo_y              |geo_z             |elevation|
+-----------+---------+---------+----------+-------------+----------+-------------------+-------------------+------------------+---------+
|15.0       |12.2     |1003.8   |5.2       |1524.0       |16000.0   |0.04731207462675889|-0.8319094570208571|0.5528907875103138|187.5    |
|14.4       |12.2     |1003.7   |5.7       |762.0        |4000.0    |0.04731207462675889|-0.8319094570208571|0.5528907875103138|187.5    |
|13.9       |12.8     |1003.7   |6.2       |427.0        |4800.0    |0.04731207462675889|-0.8319094570208571|0.5528907875103138|187.5    |
|13.9       |11.7     |1003.7   |4.1       |3353.0       |16000.0   |0.04731207462675889|-0.8319094570208571|0.5528907875103138|187.5

25/10/27 19:06:45 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/10/27 19:06:45 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv


+-------+---------------+-------------------+-------------------+---------+--------------------+-----------+---------+---------------+-------+-----------+-----------+-----------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+------------------+------------------+------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+------------------+-----------------+------------------+------------------+
|summary|        STATION|               DATE|             SOURCE|elevation|                NAME|REPORT_TYPE|CALL_SIGN|QUALITY_CONTROL|    TMP|        AA1|        AA2|        AA3| AJ1| AY1| AY2| GA1| GA2| GA3| GE1| GF1| IA1| KA1| KA2| MA1| MD1| MW1| OC1| OD1| SA1| UA1| REM| EQD|         tmp_value|          tmp_flag|       temperature|            hour_sin|            hour_cos|    day_of_year_sin|    day_of_year_cos|           

In [5]:
# Split data into training and test sets
train_ratio = 0.7
train_df, test_df = df.randomSplit([train_ratio, 1 - train_ratio], seed=42)

print(f"Training set: {train_df.count()} records")
print(f"Test set: {test_df.count()} records")


Training set: 560 records
Test set: 196 records


25/10/27 19:07:03 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv
25/10/27 19:07:03 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv


In [6]:
# Train models (using a local output path)
output_path = "file://" + os.path.abspath("results")

print("Training models...")
print(f"Results will be saved to: {output_path}\n")

metrics = train_models(train_df, test_df, output_path, spark)

print("\n=== Training Complete! ===")


Training models...
Results will be saved to: file:///Users/ric/proj/weather/pyspark_weather_data/results

Creating feature pipeline...
Training data count: 560


25/10/27 19:07:05 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv
25/10/27 19:07:06 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv


Test data count: 196
Sample of training data features:
+-------------------+-------------------+------------------+---------+---------+---------+----------+-------------+----------+------------------+-------------------+-------------------+------------------+
|              geo_x|              geo_y|             geo_z|elevation|dew_clean|slp_clean|wind_speed|cloud_ceiling|visibility|          hour_sin|           hour_cos|    day_of_year_sin|   day_of_year_cos|
+-------------------+-------------------+------------------+---------+---------+---------+----------+-------------+----------+------------------+-------------------+-------------------+------------------+
|0.04731207462675889|-0.8319094570208571|0.5528907875103138|    187.5|     12.2|   1003.8|       5.2|       1524.0|   16000.0|               0.0|                1.0|0.01721335615583582|0.9998518392091162|
|0.04731207462675889|-0.8319094570208571|0.5528907875103138|    187.5|     12.2|   1003.7|       5.7|        762.0|    4000.0

25/10/27 19:07:06 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv
25/10/27 19:07:06 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv


Transforming training data...
Transforming test data...
Training Gradient Boosting Trees (similar to LightGBM)...


25/10/27 19:07:06 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv
25/10/27 19:07:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


Training Random Forest...
Saving models...
Evaluating models...


25/10/27 19:07:35 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 78, schema size: 39
CSV file: file:///Users/ric/proj/weather/pyspark_weather_data/sample.csv


Py4JJavaError: An error occurred while calling o15004.saveAsTextFile.
: org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory file:/Users/ric/proj/weather/pyspark_weather_data/results/metrics already exists
	at org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)
	at org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.assertConf(SparkHadoopWriter.scala:303)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:73)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopDataset$1(PairRDDFunctions.scala:1094)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:417)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1092)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$4(PairRDDFunctions.scala:1065)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:417)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1029)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$3(PairRDDFunctions.scala:1011)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:417)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1010)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$2(PairRDDFunctions.scala:967)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:417)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$2(RDD.scala:1631)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:417)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1631)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$1(RDD.scala:1617)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:417)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1617)
	at org.apache.spark.api.java.JavaRDDLike.saveAsTextFile(JavaRDDLike.scala:565)
	at org.apache.spark.api.java.JavaRDDLike.saveAsTextFile$(JavaRDDLike.scala:564)
	at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:46)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:840)


In [7]:
# Display results
print("\n" + "="*50)
print("MODEL PERFORMANCE RESULTS")
print("="*50)

for model_metrics in metrics["models"]:
    print(f"\n{model_metrics['model']}:")
    print(f"  RMSE: {model_metrics['rmse']:.4f}°C")
    print(f"  R²:   {model_metrics['r2']:.4f}")
    print(f"  MAE:  {model_metrics['mae']:.4f}°C")

print(f"\n✓ Best model: {metrics['best_model']}")
print(f"✓ Results saved to: {output_path}")
print("\n" + "="*50)



MODEL PERFORMANCE RESULTS


NameError: name 'metrics' is not defined

In [9]:
# Stop Spark session
print("Stopping Spark session...")
spark.stop()
print("Spark session stopped.")


Stopping Spark session...
Spark session stopped.
