In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import * # Import necessary functions
from pyspark.sql.types import * # Import necessary types

# Build the basic SparkSession
spark = SparkSession.builder \
    .appName("GlobalHourlyWeather_NativeLGBM") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

print(spark.version)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/27 20:06:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


4.0.1


In [2]:
# Update this path to where your unzipped '2024' folder is
data_path = "2024.tar.gz"

# Load all CSVs, using the first file's header
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Let's see what we've got!
df.printSchema()
df.show(5, truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

root
 |-- 01001099999.csv                                                                                     0100644 0000000 0000000 00010620251 14760163177 011523  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION": string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- SOURCE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- ELEVATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- REPORT_TYPE: string (nullable = true)
 |-- CALL_SIGN: string (nullable = true)
 |-- QUALITY_CONTROL: string (nullable = true)
 |-- WND: string (nullable = true)
 |-- CIG: string (nullable = true)
 |-- VIS:

25/10/27 20:10:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [3]:
# ----- THIS IS THE FIX -----
# First, filter out any rows where TMP doesn't have a comma (or is null)
df_with_comma = df.where(col("TMP").contains(","))
# ---------------------------

# 1. Split TMP into its value and quality flag
# Now, we use df_with_comma, not df
df_parsed = df_with_comma.withColumn("tmp_parts", split(col("TMP"), ","))

# 2. Separate the value and flag into their own columns
# This is now safe because we know a comma exists
df_parsed = df_parsed.withColumn("tmp_value", col("tmp_parts")[0]) \
                     .withColumn("tmp_flag", col("tmp_parts")[1])

# 3. Filter out bad data *before* scaling
df_cleaned = df_parsed.where(
    (col("tmp_value") != "+9999") &
    (col("tmp_flag").isin(['1', '5']))
)

# 4. Cast to a number and apply the scaling factor (divide by 10)
df_with_temp = df_cleaned.withColumn(
    "temperature",
    col("tmp_value").cast(DoubleType()) / 10.0
)

# Let's check our work!
print("Original vs. Cleaned Temperature:")
df_with_temp.select("TMP", "temperature").show(10)

# See how many rows we kept
print(f"Total records after cleaning TMP: {df_with_temp.count()}")

Original vs. Cleaned Temperature:
+-------+-----------+
|    TMP|temperature|
+-------+-----------+
|-0070,1|       -7.0|
|-0065,1|       -6.5|
|-0065,1|       -6.5|
|-0064,1|       -6.4|
|-0070,1|       -7.0|
|-0057,1|       -5.7|
|-0052,1|       -5.2|
|-0057,1|       -5.7|
|-0047,1|       -4.7|
|-0042,1|       -4.2|
+-------+-----------+
only showing top 10 rows


[Stage 4:>                                                          (0 + 1) / 1]

Total records after cleaning TMP: 122341374


                                                                                

In [4]:
from pyspark.sql.functions import col, split
from pyspark.sql.types import DoubleType

def clean_weather_column(input_df, col_name, missing_code, quality_flags, scale_factor):
    """
    Cleans a NOAA weather column that has a 'value,flag' format.
    
    :param input_df: The DataFrame to transform
    :param col_name: The name of the raw column to clean (e.g., "DEW", "SLP")
    :param missing_code: The string code for missing values (e.g., "+9999", "99999")
    :param quality_flags: A list of good-quality flags to keep (e.g., ['1', '5'])
    :param scale_factor: The number to divide the value by (e.g., 10.0)
    :return: A new DataFrame with a clean column named '<col_name>_clean'
    """
    print(f"Cleaning column: {col_name}...")
    
    # 1. Filter out rows without a comma (like we did for TMP)
    df_with_comma = input_df.where(col(col_name).contains(","))

    # 2. Split into value and flag
    df_parsed = df_with_comma.withColumn(f"{col_name}_parts", split(col(col_name), ","))
    
    df_parsed = df_parsed.withColumn(f"{col_name}_value", col(f"{col_name}_parts")[0]) \
                         .withColumn(f"{col_name}_flag", col(f"{col_name}_parts")[1])

    # 3. Filter out bad data
    df_cleaned = df_parsed.where(
        (col(f"{col_name}_value") != missing_code) &
        (col(f"{col_name}_flag").isin(quality_flags))
    )
    
    # 4. Create the final scaled, numeric column
    clean_col_name = col_name.lower() + "_clean" # e.g., 'dew_clean'
    
    df_final = df_cleaned.withColumn(
        clean_col_name,
        col(f"{col_name}_value").cast(DoubleType()) / scale_factor
    )
    
    # 5. Drop the intermediate columns
    df_final = df_final.drop(col_name, f"{col_name}_parts", f"{col_name}_value", f"{col_name}_flag")
    
    return df_final

In [5]:
# Remove the month for feature engineering
# Make sure these imports are at the top
from pyspark.sql.functions import sin, cos, pi, hour, month, dayofyear, col
from pyspark.sql.types import DoubleType, TimestampType

# 1. Parse the DATE column
df_featured = df_with_temp.withColumn("timestamp", col("DATE").cast(TimestampType()))

# 2. Extract base time features
# (NOTICE 'month' IS NO LONGER EXTRACTED)
df_with_time = df_featured.withColumn("hour", hour(col("timestamp"))) \
                            .withColumn("day_of_year", dayofyear(col("timestamp")))

# 3. Create cyclical features
df_cyclical = df_with_time.withColumn("hour_sin", sin(2 * pi() * col("hour") / 24)) \
                           .withColumn("hour_cos", cos(2 * pi() * col("hour") / 24)) \
                           .withColumn("day_sin", sin(2 * pi() * col("day_of_year") / 366)) \
                           .withColumn("day_cos", cos(2 * pi() * col("day_of_year") / 366))

# 4. Cast geographic features to numeric
df_featured = df_cyclical.withColumn("latitude", col("LATITUDE").cast(DoubleType())) \
                         .withColumn("longitude", col("LONGITUDE").cast(DoubleType())) \
                         .withColumn("elevation", col("ELEVATION").cast(DoubleType()))

# # 5. Run your custom 'clean_weather_column' functions
# # (e.g., for DEW and SLP)
# df_featured = clean_weather_column(
#     input_df=df_featured,
#     col_name="DEW",
#     missing_code="+9999",
#     quality_flags=['1', '5'],
#     scale_factor=10.0
# )

# df_featured = clean_weather_column(
#     input_df=df_featured,
#     col_name="SLP",
#     missing_code="99999",
#     quality_flags=['1', '5'],
#     scale_factor=10.0
# )

# 6. Check our new features
print("Schema after feature engineering (no 'month' cyclical features):")
df_featured.printSchema()

Schema after feature engineering (no 'month' cyclical features):
root
 |-- 01001099999.csv                                                                                     0100644 0000000 0000000 00010620251 14760163177 011523  0                                                                                                    ustar 00                                                                0000000 0000000                                                                                                                                                                        "STATION": string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- SOURCE: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- elevation: double (nullable = true)
 |-- NAME: string (nullable = true)
 |-- REPORT_TYPE: string (nullable = true)
 |-- CALL_SIGN: string (nullable = true)
 |-- QUALITY_CONTROL: string (nullable = true)
 |-- WND: stri

In [6]:
# We start with df_featured from the previous step
df_final_features = df_featured

# Clean the DEW column
# Missing code: +9999, Quality flags: '1', '5', Scale: 10.0
df_final_features = clean_weather_column(
    input_df=df_final_features,
    col_name="DEW",
    missing_code="+9999",
    quality_flags=['1', '5'],
    scale_factor=10.0
)

# Clean the SLP column
# Missing code: 99999, Quality flags: '1', '5', Scale: 10.0
df_final_features = clean_weather_column(
    input_df=df_final_features,
    col_name="SLP",
    missing_code="99999",
    quality_flags=['1', '5'],
    scale_factor=10.0
)

# Check our new clean columns
df_final_features.select("temperature", "dew_clean", "slp_clean").show(10)

Cleaning column: DEW...
Cleaning column: SLP...
+-----------+---------+---------+
|temperature|dew_clean|slp_clean|
+-----------+---------+---------+
|       -7.0|    -13.0|   1020.8|
|       -6.5|    -12.4|   1020.4|
|       -6.5|    -11.3|   1020.5|
|       -6.4|    -10.5|   1020.2|
|       -7.0|    -10.6|   1020.0|
|       -5.7|     -9.9|   1019.6|
|       -5.2|     -9.5|   1019.6|
|       -5.7|     -9.9|   1019.5|
|       -4.7|     -8.6|   1019.2|
|       -4.2|     -7.7|   1018.7|
+-----------+---------+---------+
only showing top 10 rows


In [7]:
# 1. Split WND into its 5 parts
wnd_parts = split(col("WND"), ",")

# 2. Get the Speed (index 3) and SpeedQuality (index 4)
df_wind = df_final_features.withColumn("wind_speed_raw", wnd_parts[3])
df_wind = df_wind.withColumn("wind_speed_flag", wnd_parts[4])

# 3. Clean and scale it
df_wind_cleaned = df_wind.where(
    (col("wind_speed_raw") != "9999") &
    (col("wind_speed_flag").isin(['1', '5']))
)

df_wind_final = df_wind_cleaned.withColumn(
    "wind_speed_clean",
    col("wind_speed_raw").cast(DoubleType()) / 10.0  # Assuming 10.0 scale, check docs!
)

# See the result
df_wind_final.select("WND", "wind_speed_clean").show(10)

+--------------+----------------+
|           WND|wind_speed_clean|
+--------------+----------------+
|318,1,N,0061,1|             6.1|
|330,1,N,0051,1|             5.1|
|348,1,N,0035,1|             3.5|
|357,1,N,0019,1|             1.9|
|241,1,N,0008,1|             0.8|
|076,1,N,0048,1|             4.8|
|084,1,N,0054,1|             5.4|
|040,1,N,0023,1|             2.3|
|098,1,N,0057,1|             5.7|
|086,1,N,0063,1|             6.3|
+--------------+----------------+
only showing top 10 rows


In [8]:
# 1. Define the list of all our final numeric features
# (We check the sin/cos features, not the originals)
numeric_cols = [
    "temperature", 
    "dew_clean", 
    "slp_clean", 
    "latitude", 
    "longitude", 
    "elevation", 
    "hour_sin", "hour_cos",   # Check the new features
    "day_sin", "day_cos"      # Check the new features
]

# 2. Get the statistics for just those columns
print("Checking statistics for new features:")
df_final_features.select(numeric_cols).describe().show()

Checking statistics for new features:


[Stage 9:>                                                          (0 + 1) / 1]

+-------+------------------+------------------+------------------+------------------+-------------------+------------------+--------------------+--------------------+--------------------+-------------------+
|summary|       temperature|         dew_clean|         slp_clean|          latitude|          longitude|         elevation|            hour_sin|            hour_cos|             day_sin|            day_cos|
+-------+------------------+------------------+------------------+------------------+-------------------+------------------+--------------------+--------------------+--------------------+-------------------+
|  count|          49055736|          49055736|          49055736|          49055736|           49055736|          49055736|            49055736|            49055736|            49055736|           49055736|
|   mean|12.831891946655992| 7.024021272452314|1014.4458740646248| 32.88926715142788|-14.876760894417405|281.07311849390624|-0.00301160989368...|-0.01194671339957...|-0

                                                                                

In [9]:
# Checking for Null or Nan Value
from pyspark.sql.functions import count, when, isnan

# We'll use the same list of columns from before
cols_to_check = numeric_cols 

# This command counts nulls AND NaNs for each column
df_final_features.select([
    count(when(col(c).isNull() | isnan(c), c)).alias(c) 
    for c in cols_to_check
]).show()

[Stage 12:>                                                         (0 + 1) / 1]

+-----------+---------+---------+--------+---------+---------+--------+--------+-------+-------+
|temperature|dew_clean|slp_clean|latitude|longitude|elevation|hour_sin|hour_cos|day_sin|day_cos|
+-----------+---------+---------+--------+---------+---------+--------+--------+-------+-------+
|          0|        0|        0|       0|        0|        0|       0|       0|      0|      0|
+-----------+---------+---------+--------+---------+---------+--------+--------+-------+-------+



                                                                                

In [10]:
# 1. Define our final feature and label columns
feature_cols = [
    "latitude", 
    "longitude", 
    "elevation", 
    "dew_clean", 
    "slp_clean", 
    "hour_sin", "hour_cos",
    "day_sin", "day_cos"
]

label_col = "temperature"

# 2. Select only these columns AND filter out the bad elevation data
model_df = df_final_features \
    .select(*feature_cols, label_col) \
    .where(col("elevation") != -999.9)

print(f"Original count: {df_final_features.count()}")
print(f"Count after elevation filter: {model_df.count()}")

                                                                                

Original count: 49055736


[Stage 18:>                                                         (0 + 1) / 1]

Count after elevation filter: 49045972


                                                                                

## Train Native LightGBM on sampled data

### 1. Sample and convert to Pandas 

In [11]:
import pandas as pd # Make sure pandas is imported

print("--- Step 1: Sample and Convert Data ---")

# 1. Sample the Spark DataFrame
# Using .limit() is faster for a quick test.
sample_size = 1000000 
print(f"Sampling {sample_size} rows from Spark DataFrame...")
pandas_df = model_df.limit(sample_size).toPandas()
print(f"Successfully converted {len(pandas_df)} rows to Pandas.")

# Display the first few rows of the Pandas DataFrame
print("\nSample Pandas DataFrame head:")
print(pandas_df.head())

--- Step 1: Sample and Convert Data ---
Sampling 1000000 rows from Spark DataFrame...


                                                                                

Successfully converted 1000000 rows to Pandas.

Sample Pandas DataFrame head:
    latitude  longitude  elevation  dew_clean  slp_clean  hour_sin  hour_cos  \
0  70.933333  -8.666667        9.0      -13.0     1020.8  0.000000  1.000000   
1  70.933333  -8.666667        9.0      -12.4     1020.4  0.258819  0.965926   
2  70.933333  -8.666667        9.0      -11.3     1020.5  0.500000  0.866025   
3  70.933333  -8.666667        9.0      -10.5     1020.2  0.707107  0.707107   
4  70.933333  -8.666667        9.0      -10.6     1020.0  0.866025  0.500000   

    day_sin   day_cos  temperature  
0  0.017166  0.999853         -7.0  
1  0.017166  0.999853         -6.5  
2  0.017166  0.999853         -6.5  
3  0.017166  0.999853         -6.4  
4  0.017166  0.999853         -7.0  


### 2. Define features and split data

In [12]:
from sklearn.model_selection import train_test_split

print("--- Step 2: Define Features and Split Data ---")

# 1. Define features (X) and label (y) for scikit-learn/LightGBM
# Ensure 'feature_cols' and 'label_col' were defined in a previous cell or define them here
feature_cols = [
    "latitude", 
    "longitude", 
    "elevation", 
    "dew_clean", 
    "slp_clean", 
    "hour_sin", "hour_cos",
    "day_sin", "day_cos"
]
label_col = "temperature"

X = pandas_df[feature_cols] 
y = pandas_df[label_col]

# 2. Split the Pandas data into training and testing sets (70/30 split)
print("Splitting Pandas data into train/test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

--- Step 2: Define Features and Split Data ---
Splitting Pandas data into train/test sets...
Training set size: 700000, Test set size: 300000


### 3. Train  Native LightGBM Model

In [13]:
import lightgbm as lgb

print("--- Step 3: Train Native LightGBM Model ---")

# 1. Define the LightGBM model
lgbm_native = lgb.LGBMRegressor(objective='regression_l2', n_estimators=100, random_state=42)

# 2. Train the model
print("Training native LightGBM model...")
lgbm_native.fit(X_train, y_train)
print("Training complete.")

--- Step 3: Train Native LightGBM Model ---
Training native LightGBM model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001468 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1639
[LightGBM] [Info] Number of data points in the train set: 700000, number of used features: 9
[LightGBM] [Info] Start training from score 6.497027
Training complete.


### 4. Evaluate the model

In [14]:
from sklearn.metrics import mean_squared_error
import numpy as np

print("--- Step 4: Evaluate the Model ---")

# 1. Make predictions on the test set
print("Evaluating model on the test set...")
predictions = lgbm_native.predict(X_test)

# 2. Calculate RMSE (Root Mean Squared Error)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

# 3. Calculate R-squared (R2)
r2 = lgbm_native.score(X_test, y_test)

# 4. Print the results
print(f"\n--- Native LightGBM Results (on {sample_size} rows sample) ---")
print(f"Root Mean Squared Error (RMSE) = {rmse:.4f}") # Format to 4 decimal places
print(f"R-squared (R2) = {r2:.4f}") # Format to 4 decimal places

--- Step 4: Evaluate the Model ---
Evaluating model on the test set...

--- Native LightGBM Results (on 1000000 rows sample) ---
Root Mean Squared Error (RMSE) = 2.0457
R-squared (R2) = 0.9464


LightGBM was considered as an alternative model because it's a popular gradient boosting framework known for its high speed and accuracy, often outperforming standard GBTRegressor. However, integrating it directly into the PySpark pipeline presented significant challenges.

Challenges with Spark Integration (SynapseML)
1. Not Native to PySpark

LightGBM requires an external library for Spark integration, unlike built-in models such as LinearRegression or GBTRegressor.

2. SynapseML Dependency Conflicts

We attempted using Microsoft's SynapseML library (which wraps LightGBM for Spark), but encountered critical version incompatibilities:

Spark Version: Project uses Spark 4.0.1 (built on Scala 2.13).

SynapseML Version: Latest stable releases (e.g., 1.0.15) target older Spark 3.x versions (built on Scala 2.12).

Issue: Spark 4.x dropped support for Scala 2.12.

Result:
This mismatch caused JVM crashes (JAVA_GATEWAY_EXITED, Py4JError) when loading incompatible SynapseML JARs, preventing Spark from starting correctly.
Even experimental SynapseML versions compatible with Scala 2.13 led to classpath conflicts that were difficult to resolve.

-------------------------------------------------------------------------------------------------------------------------------------------------

Workaround: Native LightGBM on Sampled Data

To still evaluate LightGBM‚Äôs potential, a practical workaround was adopted:

1. Leverage Spark for Preprocessing:
Use PySpark for cleaning and feature engineering on the full dataset.

2. Sample Data:
Extract a manageable sample of 1 million rows from the clean Spark DataFrame (model_df).

3. Convert to Pandas:
Transform the sample into a Pandas DataFrame for local processing.

4. Use Native Libraries:
Train LightGBM using the native lightgbm Python package with scikit-learn for splitting and evaluation.

This enabled a quick and efficient test of LightGBM‚Äôs performance while bypassing Spark integration issues.

-----------------------------------------------------------------------------------------------------------------------------------------------

Why Sampling Was Necessary (Memory Limitations) üíæ

Running native LightGBM on the full dataset wasn‚Äôt feasible due to memory constraints:

* Pandas & RAM: Pandas must load all data into RAM.

* Dataset Size: 49 million rows √ó ~10 features far exceed typical laptop memory (8‚Äì32 GB).

* Spark‚Äôs Advantage: Spark processes data in distributed chunks, enabling it to handle much larger datasets.

Attempting .toPandas() on the full model_df would result in an ‚ÄúOut of Memory‚Äù error.
Hence, sampling was essential for this experiment.

---------------------------------------------------------------------------------------------------------------------------------------

Alternative Solutions (for Full Dataset Integration)

Integrating LightGBM with Spark at scale would typically require:

1. Compatible SynapseML Version:
Wait for or locate a SynapseML release for Spark 4.x / Scala 2.13.
(Experimental versions may exist but can be unstable or poorly documented.)

2. Managed Spark Platforms:
Use cloud-based Spark environments like:

* Databricks

* Azure Synapse Analytics

* AWS EMR

* Google Cloud Dataproc
These platforms simplify dependency management and may include pre-configured LightGBM support.

---------------------------------------------------------------------------------------------------------------------------------------

Recommendation for This Project

Given the local environment constraints and Spark 4.x compatibility issues, integrating LightGBM directly into Spark is impractical for this project.

**Recommended Approach**:

1. Proceed with PySpark‚Äôs native GBTRegressor:

* Continue with this model for final hyperparameter tuning using CrossValidator.

* It‚Äôs fully supported in PySpark and scales to the entire dataset.

2. Document the Native LightGBM Experiment:

* Include the Pandas/scikit-learn LightGBM results under ‚ÄúAdditional Exploration.‚Äù

* Emphasize its strong sample performance but note:

    * Limited to 1M rows due to memory constraints

    * Incompatibility with Spark 4.x environment