In [1]:
# Core Spark Imports
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import * # Import common functions like col, split, sin, cos, etc.
from pyspark.sql.types import * # Import data types like StringType, DoubleType, etc.
import math # Import math for pi constant

# Progress marker only: nothing is started here; the SparkSession itself is
# built in the following cell.
print("Starting Spark Session...")

Starting Spark Session...


In [2]:
# Create (or reuse) the notebook's SparkSession with 4g for both the driver
# and the executors. getOrCreate() returns an already-running session if one
# exists.
spark = (
    SparkSession.builder
    .appName("WeatherPreprocessing_Notebook")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

print(f"Spark Version: {spark.version}")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/29 00:56:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Version: 4.0.1


In [3]:
# --- Define Helper Function ---
# (We define it early so subsequent cells can use it)
def clean_weather_column(
    input_df,
    col_name,
    missing_code,
    quality_flags,
    scale_factor,
    handle_signs=False,
):
    """Turn a raw "value,flag" string column into a scaled numeric column.

    Rows are kept only when the column is non-null, contains a comma, its
    first comma-separated item differs from ``missing_code`` and its second
    item is one of ``quality_flags``. All other rows are filtered out of the
    returned DataFrame.

    Args:
        input_df: DataFrame holding the raw column.
        col_name: Name of the raw "value,flag" column.
        missing_code: String the value part is compared against; rows whose
            value equals it are removed.
        quality_flags: Collection of accepted flag strings.
        scale_factor: Divisor applied to the value after casting to double.
        handle_signs: When True, a single leading '+' is stripped from the
            value before the cast (a leading '-' is left untouched).

    Returns:
        The filtered DataFrame with a new ``<col_name.lower()>_clean`` double
        column; the raw column and all helper columns are dropped.
    """
    raw = col(col_name)
    parts_name = f"{col_name}_parts"
    value_name = f"{col_name}_value"
    flag_name = f"{col_name}_flag"

    # Only rows that actually carry a "value,flag" pair can be split.
    has_pair = raw.isNotNull() & raw.contains(",")
    split_df = (
        input_df.where(has_pair)
        .withColumn(parts_name, split(raw, ","))
        .withColumn(value_name, col(parts_name).getItem(0))
        .withColumn(flag_name, col(parts_name).getItem(1))
    )

    # Discard missing-value sentinels and readings with an unaccepted flag.
    is_usable = (col(value_name) != missing_code) & (
        col(flag_name).isin(quality_flags)
    )
    kept = split_df.where(is_usable)

    # Columns that exist only to get from the raw string to the clean value.
    intermediates = [col_name, parts_name, value_name, flag_name]

    numeric_source = value_name
    if handle_signs:
        # Strip a leading '+' into a temporary column and cast from that one.
        numeric_source = f"{col_name}_signed_value_temp"
        kept = kept.withColumn(
            numeric_source,
            regexp_replace(col(value_name), r"^\+", ""),
        )
        intermediates.append(numeric_source)

    result = kept.withColumn(
        col_name.lower() + "_clean",
        col(numeric_source).cast(DoubleType()) / scale_factor,
    )
    return result.drop(*intermediates)

print("Helper function 'clean_weather_column' defined.")

Helper function 'clean_weather_column' defined.


## Define Schema and load data

In [4]:
# --- Define Schema ---
# Hand-written schema: every listed column is declared as a nullable string,
# so numeric-looking fields (LATITUDE, LONGITUDE, ELEVATION) also load as
# strings initially.
# NOTE(review): the load cells visible in this notebook read with a schema
# taken from a sample file rather than this one - confirm whether `schema`
# is still used further down.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "STATION",
        "DATE",
        "SOURCE",
        "LATITUDE",
        "LONGITUDE",
        "ELEVATION",
        "NAME",
        "REPORT_TYPE",
        "CALL_SIGN",
        "QUALITY_CONTROL",
        "WND",
        "CIG",
        "VIS",
        "TMP",
        "DEW",
        "SLP",
        # Further string fields can be appended here if the full dataset
        # documentation calls for them.
        "REM",  # example column, often present
        "EQD",  # example column, often present
    )
])

In [6]:
# Path (relative to the notebook's working directory) to one CSV file inside
# the 2024 folder; it serves as the sample from which the schema is read.
# Replace with an actual file name if needed
sample_file_path = "2024/01001099999.csv"

In [7]:
print("Inferring schema from sample file...")
# Only the "header" option is set, so the schema comes from the sample file's
# header row alone.
inferred_schema = (
    spark.read
    .option("header", "true")
    .csv(sample_file_path)
    .schema
)
print("Schema inferred successfully.")
# Optional: inspect the full inferred schema to see all columns.
# NOTE(review): printTreeString looks like the Scala method name; in PySpark
# try print(inferred_schema.simpleString()) - confirm against your version.
# inferred_schema.printTreeString()

Inferring schema from sample file...
Schema inferred successfully.


In [9]:
# --- Step 1: Infer the Full Schema from a Sample File ---

# Path to one of the CSV files inside the 2024 folder.
# Make sure this file exists!
sample_file_path = "2024/01001099999.csv"

print(f"Inferring schema from sample file: {sample_file_path}")
try:
    # Read the header of just ONE file and take its schema.
    inferred_schema = spark.read.option("header", "true").csv(sample_file_path).schema
    print("Schema inferred successfully.")
    # Optional: See all the columns Spark found
    # print(inferred_schema.simpleString())
except Exception as e:
    print(f"Error inferring schema from {sample_file_path}: {e}")
    # Nothing below can work without a schema, so stop here. A bare `raise`
    # re-raises the active exception with its original traceback.
    raise

# --- Step 2: Load ALL Data Using the Inferred Schema ---
data_path = "2024/"
print(f"Reloading data using FULL inferred schema from folder: {data_path}")

# Load ALL csv files in the folder using the sample file's schema.
# NOTE(review): the schema comes from a single file, but a previous run of
# this cell logged a CSVHeaderChecker warning for another file ("Header
# length: 54, schema size: 39"), so not every file has the same columns.
# Per the Spark CSV docs, a user-supplied schema is applied by position while
# `enforceSchema` is left at its default, so columns after the shared leading
# ones may be misaligned for such files. Confirm before relying on any column
# that is not present in every file.
df_raw = spark.read.option("header", "true").schema(inferred_schema).csv(data_path)

# --- Initial Check ---
# count() and show() each trigger a Spark job over the data.
print(f"Loaded {df_raw.count()} records.")
print("Schema of loaded data:")
df_raw.printSchema()  # Lists every column of the sample file's schema
print("\nSample of raw data:")
df_raw.show(5, truncate=False)

Inferring schema from sample file: 2024/01001099999.csv
Schema inferred successfully.
Reloading data using FULL inferred schema from folder: 2024/




Loaded 130223689 records.
Schema of loaded data:
root
 |-- STATION: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- SOURCE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- ELEVATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- REPORT_TYPE: string (nullable = true)
 |-- CALL_SIGN: string (nullable = true)
 |-- QUALITY_CONTROL: string (nullable = true)
 |-- WND: string (nullable = true)
 |-- CIG: string (nullable = true)
 |-- VIS: string (nullable = true)
 |-- TMP: string (nullable = true)
 |-- DEW: string (nullable = true)
 |-- SLP: string (nullable = true)
 |-- AA1: string (nullable = true)
 |-- AA2: string (nullable = true)
 |-- AA3: string (nullable = true)
 |-- AJ1: string (nullable = true)
 |-- AY1: string (nullable = true)
 |-- AY2: string (nullable = true)
 |-- GA1: string (nullable = true)
 |-- GA2: string (nullable = true)
 |-- GA3: string (nullable = true)
 |-- GE1: string (n

25/10/29 01:02:14 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 54, schema size: 39
CSV file: file:///Users/chris/Desktop/Project/DSA5208/Project%202/2024/99999963894.csv
