# DATA420 A1 — Processing (additions only)
These cells can be appended to your existing **Processing.ipynb**. They avoid guards and focus on straightforward functionality.

In [None]:
# Setup: Spark entry point and the storage locations shared by the cells below
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Azure Blob Storage account and the two containers used in this notebook:
# the data container is only read from, the user container receives outputs.
azure_account_name = "madsstorage002"
azure_data_container_name = "campus-data"
azure_user_container_name = "campus-user"

# Root of the GHCN-Daily inputs and the per-user output prefix
WASBS_DATA = f"wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/ghcnd"
WASBS_USER_BASE = f"wasbs://{azure_user_container_name}@{azure_account_name}.blob.core.windows.net/dew59"

# Individual inputs located under the data root
WASBS_DAILY = f"{WASBS_DATA}/daily"
WASBS_COUNTRIES = f"{WASBS_DATA}/countries.txt"
WASBS_STATES = f"{WASBS_DATA}/states.txt"
WASBS_STATIONS = f"{WASBS_DATA}/stations.txt"
WASBS_INVENTORY = f"{WASBS_DATA}/inventory.txt"

# Reuse the active session if one exists, otherwise start a new one
print("Starting SparkSession for Processing additions...")
spark = SparkSession.builder.getOrCreate()
print("Spark version:", spark.version)

print("DATA:", WASBS_DATA)
print("USER:", WASBS_USER_BASE)

In [None]:
# Q1: Inspect storage and capture year sizes
import re
import subprocess

import pandas as pd


def _parse_listing_line(entry):
    """Turn one `hdfs dfs -ls` output line into a (year, size_bytes, path) tuple.

    The size is read from the fifth whitespace-separated field and the path
    from the last one; the year is the four-digit file stem before `.csv.gz`.
    """
    fields = entry.split()
    n_bytes = int(fields[4])
    file_path = fields[-1]
    file_year = int(re.search(r'/([0-9]{4})\.csv\.gz$', file_path).group(1))
    return file_year, n_bytes, file_path


print("Listing data container...")
# Output goes straight to the notebook; the exit status is not inspected.
_ = subprocess.call(["bash","-lc", f"hdfs dfs -ls {WASBS_DATA}"])

print("Listing daily files...")
ls_output = subprocess.check_output(["bash","-lc", f"hdfs dfs -ls {WASBS_DAILY}"], text=True)

# Keep only the entries that name a gzipped CSV file
lines = [ln for ln in ls_output.splitlines() if ln.strip().endswith(".csv.gz")]
rows = [_parse_listing_line(ln) for ln in lines]

# One row per yearly file, ordered by year, with the size also expressed in MB
year_sizes_df = (
    pd.DataFrame(rows, columns=["year","size_bytes","path"])
    .sort_values("year")
    .assign(size_MB=lambda frame: frame["size_bytes"] / (1024*1024))
)
print("Year-size head:")
print(year_sizes_df.head(10))

print("Total daily size (MB):", round(year_sizes_df["size_MB"].sum(),2))

# Persist the numeric columns (the path column is left out) to the user container
target_year_sizes = f"{WASBS_USER_BASE}/years_size_metrics.parquet/"
year_sizes_sdf = spark.createDataFrame(year_sizes_df[["year","size_bytes","size_MB"]])
print("Writing:", target_year_sizes)
year_sizes_sdf.write.mode("overwrite").parquet(target_year_sizes)
print("Done writing years_size_metrics.parquet")

In [None]:
# Q2(a,b): Daily schema and most recent year subset
# Column layout applied to the yearly daily files: every column is nullable,
# and all are read as strings except VALUE, which is read as an integer.
_daily_columns = [
    ("ID", T.StringType()),
    ("DATE", T.StringType()),
    ("ELEMENT", T.StringType()),
    ("VALUE", T.IntegerType()),
    ("MFLAG", T.StringType()),
    ("QFLAG", T.StringType()),
    ("SFLAG", T.StringType()),
    ("OBSTIME", T.StringType()),
]
daily_schema = T.StructType(
    [T.StructField(col_name, col_type, True) for col_name, col_type in _daily_columns]
)

# The latest year comes from the listing gathered in the Q1 cell
most_recent_year = year_sizes_df["year"].max()
recent_path = f"{WASBS_DAILY}/{most_recent_year}.csv.gz"
print("Most recent year:", most_recent_year)
print("Loading:", recent_path)

# Headerless CSV read with the explicit schema in permissive mode
daily_recent = (
    spark.read
    .schema(daily_schema)
    .option("header", False)
    .option("mode", "PERMISSIVE")
    .csv(recent_path)
)
print("Recent daily count:", daily_recent.count())
daily_recent.show(5, truncate=False)

In [None]:
# Q2(c): Load fixed-width metadata tables
# Each file is read as plain text (one string column, renamed to "raw") and
# sliced with F.substring(column, pos, len), where pos is 1-based.
# Text fields are trimmed; numeric fields are trimmed and then cast.

# Stations: id, coordinates, elevation, state, name, network flags and WMO id
stn = spark.read.text(WASBS_STATIONS).withColumnRenamed("value","raw")
stations_df = stn.select(
    F.substring("raw",1,11).alias("id"),
    F.trim(F.substring("raw",13,8)).cast("double").alias("latitude"),
    F.trim(F.substring("raw",22,9)).cast("double").alias("longitude"),
    F.trim(F.substring("raw",32,6)).cast("double").alias("elevation"),
    F.trim(F.substring("raw",39,2)).alias("state"),
    F.trim(F.substring("raw",42,30)).alias("name"),
    F.trim(F.substring("raw",73,3)).alias("gsn_flag"),
    F.trim(F.substring("raw",77,3)).alias("hcn_crn_flag"),
    F.trim(F.substring("raw",81,5)).alias("wmo_id")
)

# Countries: two-character code followed by the country name
cty = spark.read.text(WASBS_COUNTRIES).withColumnRenamed("value","raw")
countries_df = cty.select(
    F.substring("raw",1,2).alias("code"),
    F.trim(F.substring("raw",4,61)).alias("country_name")
)

# States: two-character code followed by the state name
sta = spark.read.text(WASBS_STATES).withColumnRenamed("value","raw")
states_df = sta.select(
    F.substring("raw",1,2).alias("state_code"),
    F.trim(F.substring("raw",4,47)).alias("state_name")
)

# Inventory: one row per (station, element) with the first and last year on record.
# Fix: the length arguments for lon, firstyear and lastyear previously carried a
# stray double quote (e.g. `9")`), which made the whole cell a SyntaxError.
inv = spark.read.text(WASBS_INVENTORY).withColumnRenamed("value","raw")
inventory_df = inv.select(
    F.substring("raw",1,11).alias("id"),
    F.trim(F.substring("raw",13,8)).cast("double").alias("lat"),
    F.trim(F.substring("raw",22,9)).cast("double").alias("lon"),
    F.trim(F.substring("raw",32,4)).alias("element"),
    F.trim(F.substring("raw",37,4)).cast("int").alias("firstyear"),
    F.trim(F.substring("raw",42,4)).cast("int").alias("lastyear")
)

# Row counts followed by a small sample of each table as a sanity check
print("Stations:", stations_df.count())
print("Countries:", countries_df.count())
print("States:", states_df.count())
print("Inventory:", inventory_df.count())

stations_df.show(3, truncate=False)
countries_df.show(3, truncate=False)
states_df.show(3, truncate=False)
inventory_df.show(3, truncate=False)

In [None]:
# Q3(a–e): Enriched stations and core flags
# The first two characters of the station id are used as the country code.
stations_with_cc = stations_df.withColumn("country_code", F.substring("id",1,2))

# Attach country names, then state names; both are left joins so every station row is kept.
countries_by_cc = countries_df.withColumnRenamed("code","country_code")
stations_with_country = stations_with_cc.join(countries_by_cc, on="country_code", how="left")
stations_cc = (
    stations_with_country
    .join(states_df, stations_with_cc["state"] == states_df["state_code"], how="left")
    .drop(states_df["state_code"])
)

# Per-station inventory summary: year span plus the distinct elements on record
element_set_col = F.collect_set("element")
inv_agg = inventory_df.groupBy("id").agg(
    F.min("firstyear").alias("first_year"),
    F.max("lastyear").alias("last_year"),
    F.size(element_set_col).alias("element_count"),
    element_set_col.alias("element_set"),
)

# Flag column -> element code it tests for (insertion order fixes the column order)
core_flags = {
    "has_prcp": "PRCP",
    "has_snow": "SNOW",
    "has_snwd": "SNWD",
    "has_tmax": "TMAX",
    "has_tmin": "TMIN",
}

enriched_stations = stations_cc.join(inv_agg, on="id", how="left")
for flag_name, element_code in core_flags.items():
    enriched_stations = enriched_stations.withColumn(
        flag_name, F.array_contains("element_set", element_code)
    )
# core_count is the number of core flags that are true for the station
enriched_stations = enriched_stations.withColumn(
    "core_count",
    F.expr("int(has_prcp) + int(has_snow) + int(has_snwd) + int(has_tmax) + int(has_tmin)"),
)

count_all_five = enriched_stations.filter(F.col("core_count")==5).count()

# Precipitation-only: PRCP flag true and each of the other four core flags false
prcp_only_condition = F.col("has_prcp")==True
for other_flag in ("has_snow", "has_snwd", "has_tmax", "has_tmin"):
    prcp_only_condition = prcp_only_condition & (F.col(other_flag)==False)
count_prcp_only = enriched_stations.filter(prcp_only_condition).count()

print("Stations with all five core elements:", count_all_five)
print("Stations with precipitation only:", count_prcp_only)

target_enriched = f"{WASBS_USER_BASE}/enriched_stations.parquet/"
print("Writing enriched stations to:", target_enriched)
enriched_stations.write.mode("overwrite").parquet(target_enriched)
print("Done")

In [None]:
# Q4(a,b): Stations present in stations but not in daily
# Read every yearly file under the daily directory with the shared schema
daily_all = (
    spark.read
    .schema(daily_schema)
    .option("header", False)
    .option("mode", "PERMISSIVE")
    .csv(f"{WASBS_DAILY}/*.csv.gz")
)

# Distinct station ids seen in daily, exposed under the lowercase name used by the metadata tables
daily_station_ids = daily_all.select(F.col("ID").alias("id")).distinct()

# Anti join keeps only the station ids that have no match in daily
station_ids = enriched_stations.select("id")
stations_not_in_daily = station_ids.join(daily_station_ids, on="id", how="left_anti")
missing_count = stations_not_in_daily.count()
print("Stations in stations but not in daily:", missing_count)

target_missing = f"{WASBS_USER_BASE}/stations_missing_in_daily.parquet/"
print("Writing:", target_missing)
stations_not_in_daily.write.mode("overwrite").parquet(target_missing)
print("Done")

In [None]:
# Extra outputs used later in analysis and visualisation

# (1) Station / date / element triples for every daily record
q2a_station_date_element = (daily_all
    .select(F.col("ID").alias("id"), "DATE", "ELEMENT")
)
out1 = f"{WASBS_USER_BASE}/q2a_station_date_element.parquet/"
print("Writing:", out1)
q2a_station_date_element.write.mode("overwrite").parquet(out1)

# (2) Average precipitation per country and year.
# VALUE is divided by 10 and stored as prcp_mm
# (NOTE(review): assumes PRCP is recorded in tenths of a millimetre — confirm against the GHCN-Daily readme).
daily_prcp = daily_all.filter(F.col("ELEMENT")=="PRCP").withColumn("prcp_mm", F.col("VALUE")/10.0)

# Fix: the daily column is named "ID" while the stations column is "id". Joining with
# on="id" only resolved because Spark's analyzer is case-insensitive by default, and it
# fails when spark.sql.caseSensitive is enabled. Rename explicitly, as the other cells do.
station_country = enriched_stations.select("id","country_code")
daily_with_cc = (daily_prcp
    .withColumnRenamed("ID","id")
    .join(station_country, on="id", how="left")
)

# The year is taken from the first four characters of DATE
q2a_prcp_year_country = (daily_with_cc
    .withColumn("year", F.substring("DATE",1,4).cast("int"))
    .groupBy("country_code","year")
    .agg(F.mean("prcp_mm").alias("avg_prcp_mm"))
)

out2 = f"{WASBS_USER_BASE}/q2a_prcp_year_country.parquet/"
print("Writing:", out2)
q2a_prcp_year_country.write.mode("overwrite").parquet(out2)

print("All Processing additions complete.")