In [0]:
%pip install python-dotenv

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import DataFrame
from pyspark.sql.functions import min as spark_min, max as spark_max, when 
from pyspark.sql.types import ArrayType, DoubleType, StringType, StructType, StructField, IntegerType, LongType, FloatType

from delta.tables import DeltaTable

from sedona.spark import *

import random
import geopandas as gpd
from pathlib import Path
import time
import os
import numpy as np
from PIL import Image, ImageDraw
from dotenv import load_dotenv
import requests
import pandas as pd
from shapely import wkt
from shapely.errors import WKTReadingError


In [0]:
# Unity Catalog location used by this notebook. The catalog name contains a
# hyphen, so it is wrapped in backticks to be usable inside SQL statements.
catalog_dev = "`land_topografisk-gdb_dev`"
schema_dev = "ai2025"

# Fully qualified table names: bronze is read as input, silver is written/merged.
bronze_table = f"{catalog_dev}.{schema_dev}.utensnuplass_bronze"
silver_table = f"{catalog_dev}.{schema_dev}.utensnuplass_silver"

# Make the catalog/schema the session default (creating the schema if needed),
# so that unqualified table names used later resolve inside this schema.
spark.sql(f"USE CATALOG {catalog_dev}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_dev}")
spark.sql(f"USE SCHEMA {schema_dev}")

In [0]:
def load_geometry_from_column()-> DataFrame:
    """
    Read the bronze table and attach a parsed geometry column.

    The table named by the module-level ``bronze_table`` is read and its
    ``bbox`` column (WKT text) is parsed with ``ST_GeomFromWKT`` into a new
    ``geometry`` column.

    Returns:
        The bronze DataFrame with the additional ``geometry`` column.
    """
    bronze_rows = spark.read.table(bronze_table)
    parsed_geometry = F.expr("ST_GeomFromWKT(bbox)")
    return bronze_rows.withColumn("geometry", parsed_geometry)

In [0]:
def add_envolope_column(df:DataFrame) -> DataFrame:
    """
    Adds an ``envelope`` column holding the bounding rectangle of ``geometry``.

    ``ST_Envelope`` is used rather than ``ST_Boundary``: the boundary of a
    polygon is its outline, and the centroid of an outline is weighted by
    segment length, so it is generally not the centre of the geometry's
    extent. Code that centres a box on ``ST_Centroid(envelope)`` needs the
    true bounding rectangle, whose centroid is the extent's midpoint.

    Args:
        df: Spark DataFrame with a ``geometry`` column.

    Returns:
        The input DataFrame with the additional ``envelope`` column.
    """
    return df.withColumn("envelope", F.expr("ST_Envelope(geometry)"))


In [0]:
def random_adjusted_bbox_centered(
    envelope: list,
    min_size: int = 256,
    max_size: int = 256,
    margin: int = 30,
    max_offset: float = 80,  # largest shift of the box centre (a few metres of jitter)
    max_attempts: int = 10
) -> dict:
    """
    Build a square bounding box around ``envelope`` with a randomly shifted centre.

    The side length is ``max(width, height) + 2 * margin`` clamped to
    ``[min_size, max_size]``. The centre is shifted randomly by at most
    ``max_offset`` per axis, but never by more than the slack that keeps the
    whole envelope inside the box. Sampling from that feasible range (instead
    of from the full ``±max_offset`` and rejecting) means a box is found
    whenever one exists, rather than failing by chance after ``max_attempts``.

    Args:
        envelope: ``[xmin, ymin, xmax, ymax]`` of the shape to cover.
        min_size: Lower clamp for the box side length.
        max_size: Upper clamp for the box side length.
        margin: Padding added on each side before clamping.
        max_offset: Upper bound on the random centre shift per axis.
        max_attempts: Number of sampling attempts before giving up.

    Returns:
        A dict with ``"bbox"`` (``[xmin, ymin, xmax, ymax]`` of the box) and
        ``"bbox_str"`` (the same values with 6 decimals joined by ``"_"``).

    Raises:
        ValueError: If the envelope cannot fit inside a box of the computed
            size, or no containing box was produced within ``max_attempts``.
    """
    # Kept function-local (as in the original) so the body is self-contained.
    import random

    xmin, ymin, xmax, ymax = envelope
    poly_width = xmax - xmin
    poly_height = ymax - ymin

    # Desired box size, clamped to the allowed range.
    bbox_size = max(poly_width, poly_height) + margin * 2
    bbox_size = min(max(bbox_size, min_size), max_size)
    half_size = bbox_size / 2

    # Centre of the input envelope.
    center_x_orig = (xmin + xmax) / 2
    center_y_orig = (ymin + ymax) / 2

    # How far the centre may move per axis before the envelope sticks out.
    slack_x = half_size - poly_width / 2
    slack_y = half_size - poly_height / 2
    if slack_x < 0 or slack_y < 0:
        # The envelope is larger than the box: no offset can ever succeed.
        raise ValueError("Fant ikke gyldig adjusted_bbox etter flere forsøk")

    limit_x = min(max_offset, slack_x)
    limit_y = min(max_offset, slack_y)

    for _ in range(max_attempts):
        dx = random.uniform(-limit_x, limit_x)
        dy = random.uniform(-limit_y, limit_y)

        center_x = center_x_orig + dx
        center_y = center_y_orig + dy

        adjusted_xmin = center_x - half_size
        adjusted_xmax = center_x + half_size
        adjusted_ymin = center_y - half_size
        adjusted_ymax = center_y + half_size

        # Final containment check guards against floating-point rounding at
        # the edge of the feasible range.
        if (adjusted_xmin <= xmin and adjusted_ymin <= ymin and
            adjusted_xmax >= xmax and adjusted_ymax >= ymax):
            bbox = [adjusted_xmin, adjusted_ymin, adjusted_xmax, adjusted_ymax]
            bbox_str = "_".join(f"{v:.6f}" for v in bbox)
            return {"bbox" : bbox, "bbox_str" : bbox_str}

    raise ValueError("Fant ikke gyldig adjusted_bbox etter flere forsøk")

In [0]:
def make_bbox(df: DataFrame, buffer: float = 20.0) -> DataFrame:
    """
    Derive a square bounding box per row from the ``envelope`` column and
    attach a randomly adjusted variant of it.

    The square is centred on ``ST_Centroid(envelope)``; its half side is half
    of the larger envelope extent plus ``buffer``.

    Args:
        df: Spark DataFrame with an ``envelope`` geometry column.
        buffer: Extra margin added on every side of the square, in the units
            of the geometry coordinates (presumably metres — the notebook
            uses EPSG:25833 elsewhere; confirm).

    Returns:
        DataFrame with ``envelope`` dropped and these columns added or
        overwritten: ``bbox`` (array xmin, ymin, xmax, ymax), ``Polygons``
        (the bbox as a geometry), ``adjusted_struct`` (raw UDF result),
        ``Adjusted_bbox`` and ``bbox_str`` (fields of ``adjusted_struct``).
    """

    # SQL for the [xmin, ymin, xmax, ymax] array of the buffered square.
    bbox_sql = f"""
        array(
            ST_X(ST_Centroid(envelope)) - (GREATEST(ST_XMax(envelope) - ST_XMin(envelope), ST_YMax(envelope) - ST_YMin(envelope)) / 2 + {buffer}),
            ST_Y(ST_Centroid(envelope)) - (GREATEST(ST_XMax(envelope) - ST_XMin(envelope), ST_YMax(envelope) - ST_YMin(envelope)) / 2 + {buffer}),
            ST_X(ST_Centroid(envelope)) + (GREATEST(ST_XMax(envelope) - ST_XMin(envelope), ST_YMax(envelope) - ST_YMin(envelope)) / 2 + {buffer}),
            ST_Y(ST_Centroid(envelope)) + (GREATEST(ST_XMax(envelope) - ST_XMin(envelope), ST_YMax(envelope) - ST_YMin(envelope)) / 2 + {buffer})
        )
        """
    with_bbox = df.withColumn("bbox", F.expr(bbox_sql))

    # The same box as a polygon geometry.
    polygon_expr = F.expr("ST_MakeEnvelope(bbox[0], bbox[1], bbox[2], bbox[3])")
    with_polygons = with_bbox.withColumn("Polygons", polygon_expr)

    # Run the adjusted-bbox UDF once into a struct, then unpack its fields.
    with_struct = with_polygons.withColumn(
        "adjusted_struct", adjusted_bbox_udf(F.col("bbox"))
    )
    with_adjusted = with_struct.withColumn(
        "Adjusted_bbox", F.col("adjusted_struct.bbox")
    )
    with_label = with_adjusted.withColumn(
        "bbox_str", F.col("adjusted_struct.bbox_str")
    )

    # The envelope is not needed past this point.
    return with_label.drop("envelope")


In [0]:
def generate_blank_mask(id:str, save_dir: str= "/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER/utenSlabel"):
    """
    Create an all-black 256x256 single-channel mask and save it as a PNG.

    The file is written as ``mask_{id}.png`` so that it matches the name
    ``mask_file_exists`` looks for in the same directory; writing a bare
    ``{id}.png`` meant a generated mask was never detected.

    Args:
        id: Identifier used in the output file name.
        save_dir: Directory to write into; created if it does not exist.
    """
    file_path = os.path.join(save_dir, f"mask_{id}.png")

    os.makedirs(save_dir, exist_ok=True)

    # Mode "L" is 8-bit greyscale; colour 0 gives a completely black image.
    mask = Image.new("L", (256, 256), color=0)
    mask.save(file_path)

def generate_blank_masks_for_pending(df: pd.DataFrame):
    """
    Generate a blank mask for every row whose ``mask_status`` is 'PENDING'.

    Prints how many masks will be generated, then calls
    ``generate_blank_mask`` with each pending row's ``id``.

    NOTE(review): the status columns in this notebook are computed from
    ``row_hash`` while this function uses ``id`` — confirm both refer to the
    same file identifier.
    """
    is_pending = df["mask_status"] == "PENDING"
    pending_rows = df[is_pending]
    print(f"{len(pending_rows)} mask(s) will be generated.")

    for record in pending_rows.itertuples():
        generate_blank_mask(record.id)

In [0]:
def generate_dom_url(bbox_str):
    """
    Build the WMS GetMap URL for the DOM shaded-relief layer (512x512 PNG).

    Args:
        bbox_str: Either a ready-made BBOX string (inserted unchanged) or a
            sequence of four numbers ``[xmin, ymin, xmax, ymax]``. A sequence
            is rendered as comma-separated values with 6 decimals; without
            this an array column passed from Spark would be inserted as a
            Python list repr (``[1.0, 2.0, ...]``), which is not a valid BBOX.

    Returns:
        The request URL, or ``None`` when ``bbox_str`` is ``None``.
    """
    if bbox_str is None:
        return None
    if not isinstance(bbox_str, str):
        bbox_str = ",".join(f"{float(v):.6f}" for v in bbox_str)

    width, height = [512, 512]
    return (
        f"https://wms.geonorge.no/skwms1/wms.hoyde-dom-nhm-25833?request=GetMap&Format=image/png&"
        f"GetFeatureInfo=text/plain&CRS=EPSG:25833&Layers=NHM_DOM_25833:skyggerelieff&"
        f"BBOX={bbox_str}&width={width}&height={height}"
    )

def generate_image_url(bbox_str):
    """
    Build the WMS 1.3.0 GetMap URL for the orthophoto layer (512x512 PNG).

    Args:
        bbox_str: Either a ready-made BBOX string (inserted unchanged) or a
            sequence of four numbers ``[xmin, ymin, xmax, ymax]``. A sequence
            is rendered as comma-separated values with 6 decimals; without
            this an array column passed from Spark would be inserted as a
            Python list repr (``[1.0, 2.0, ...]``), which is not a valid BBOX.

    Returns:
        The request URL (ending in an empty ``TICKET=`` parameter), or
        ``None`` when ``bbox_str`` is ``None``.
    """
    if bbox_str is None:
        return None
    if not isinstance(bbox_str, str):
        bbox_str = ",".join(f"{float(v):.6f}" for v in bbox_str)

    width, height = [512, 512]
    return (
        f"https://wms.geonorge.no/skwms1/wms.nib?VERSION=1.3.0"
        f"&service=WMS&request=GetMap&Format=image/png&"
        f"GetFeatureInfo=text/plain&CRS=EPSG:25833&Layers=ortofoto&"
        f"BBox={bbox_str}&width={width}&height={height}&TICKET="
    )

def dom_file_exists(id: str) -> str:
    """Return "DOWNLOADED" if ``dom_{id}.png`` exists in the utenSdom folder, else "PENDING"."""
    dom_png = f"/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER/utenSdom/dom_{id}.png"
    if os.path.exists(dom_png):
        return "DOWNLOADED"
    return "PENDING"

def image_file_exists(id: str) -> str:
    """Return "DOWNLOADED" if ``image_{id}.png`` exists in the utenSimage folder, else "PENDING"."""
    image_png = f"/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER/utenSimage/image_{id}.png"
    if os.path.exists(image_png):
        return "DOWNLOADED"
    return "PENDING"

def mask_file_exists(id: str) -> str:
    """Return "DOWNLOADED" if ``mask_{id}.png`` exists in the utenSlabel folder, else "PENDING"."""
    mask_png = f"/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER/utenSlabel/mask_{id}.png"
    if os.path.exists(mask_png):
        return "DOWNLOADED"
    return "PENDING"

In [0]:
def write_delta_table(sdf: DataFrame):
    """
    Persist ``sdf`` into the silver Delta table named by ``silver_table``.

    If the table already exists, ``sdf`` is merged into it on ``row_hash``:
    matching rows are updated with all source columns and non-matching rows
    are inserted. Otherwise the table is created from ``sdf``.

    Args:
        sdf: Spark DataFrame to write; must contain a ``row_hash`` column for
            the merge path.
    """
    if spark.catalog.tableExists(silver_table):
        # Upsert into the existing table, keyed on row_hash.
        existing = DeltaTable.forName(spark, silver_table)
        upsert = existing.alias("target").merge(
            source=sdf.alias("source"),
            condition="target.row_hash = source.row_hash",
        )
        upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
        return

    # First run: create the table from the DataFrame.
    writer = sdf.write.format("delta").option("mergeSchema", "true").mode("overwrite")
    writer.saveAsTable(silver_table)


In [0]:

def safe_load_wkt(val):
    """
    Parse a WKT string into a Shapely geometry without raising on bad input.

    Args:
        val: Value to parse. Anything that is not a ``str`` is skipped.

    Returns:
        The parsed geometry, or ``None`` when ``val`` is not a string or
        cannot be parsed (the offending value is printed in that case).
    """
    if isinstance(val, str):
        # ShapelyError is the common base of the parse errors raised by both
        # Shapely 1.x (WKTReadingError) and Shapely 2.x (GEOSException), so
        # invalid WKT is handled regardless of the installed version.
        # Imported here so non-string input never needs Shapely at all.
        from shapely.errors import ShapelyError

        try:
            return wkt.loads(val)
        except ShapelyError:
            print("❌ Hatalı WKT:", val)
            return None
    return None

def to_geopandas(df: DataFrame, column_name: str):
    """
    Collect a Spark DataFrame to the driver and wrap it as a GeoDataFrame.

    The values of ``column_name`` are parsed with ``safe_load_wkt`` and stored
    in a ``geometry`` column (replacing any existing column of that name).
    Progress information is printed along the way.

    Args:
        df: Spark DataFrame to convert (fully collected via ``toPandas``).
        column_name: Column whose values are parsed as WKT.

    Returns:
        A GeoDataFrame with ``geometry`` as the active geometry column and
        CRS ``EPSG:25833``.
    """
    collected = df.toPandas()
    print("🔍 DataFrame Pandas'a dönüştürüldü.")
    print("Kolonlar:", collected.columns)

    wkt_values = collected[column_name]
    print("İlk 3 satır (WKT):", wkt_values.head(3))

    # Unparseable or non-string values become None.
    collected["geometry"] = wkt_values.apply(safe_load_wkt)
    return gpd.GeoDataFrame(collected, geometry="geometry", crs="EPSG:25833")


In [0]:
# Return type of the adjusted-bbox UDF: the box coordinates plus a string label.
adjusted_bbox_schema = StructType([
    StructField("bbox", ArrayType(DoubleType())),
    StructField("bbox_str", StringType())
])

# The bbox function draws random offsets, so the UDF must be flagged as
# non-deterministic. Spark treats UDFs as deterministic by default and may
# invoke them more often than they appear in the query; since the struct
# result is read twice downstream (.bbox and .bbox_str), that could yield an
# Adjusted_bbox and bbox_str that come from different random draws.
adjusted_bbox_udf = F.udf(
    random_adjusted_bbox_centered, adjusted_bbox_schema
).asNondeterministic()
generate_dom_url_udf = F.udf(generate_dom_url, StringType())
generate_image_url_udf = F.udf(generate_image_url, StringType())
dom_file_exists_udf = F.udf(dom_file_exists, StringType())
image_file_exists_udf = F.udf(image_file_exists, StringType())
mask_file_exists_udf = F.udf(mask_file_exists, StringType())

# Pipeline: bronze rows -> geometry -> envelope -> bbox columns -> URLs/status.
df= load_geometry_from_column()
df = add_envolope_column(df)
df = make_bbox(df)
# The URL UDFs receive the Adjusted_bbox array column; the file-status UDFs
# are keyed on row_hash.
df = df.withColumn("image_path", generate_image_url_udf("Adjusted_bbox")) \
       .withColumn("dom_path", generate_dom_url_udf("Adjusted_bbox")) \
       .withColumn("image_status", image_file_exists_udf("row_hash")) \
       .withColumn("dom_status", dom_file_exists_udf("row_hash")) \
       .withColumn("mask_status", mask_file_exists_udf("row_hash")) \
       .withColumn("lastet_tid", F.current_timestamp())

# NOTE(review): after make_bbox the "bbox" column is an array of doubles, not
# WKT text, so safe_load_wkt yields None for every row here — confirm which
# column is meant to provide the geometry.
# NOTE(review): df is not cached, so each action on it (toPandas, the Delta
# write) presumably re-runs the random UDF and may see different boxes.
gdf= to_geopandas(df, "bbox")
generate_blank_masks_for_pending(gdf) 
write_delta_table(df)






In [0]:
# Render the pipeline result in the notebook output.
# NOTE(review): df is a lazy, uncached Spark DataFrame, so this presumably
# re-executes the pipeline (including the random bbox UDF) — confirm whether
# the silver table should be displayed instead.
df.display()

In [0]:
# Re-read the silver table and publish a slimmed-down status overview.
df_retable = spark.read.table(silver_table)

# Columns left out of the overview. DataFrame.drop ignores names that are not
# present in the table.
# NOTE(review): "image_wms" / "dom_wms" are not created in this notebook (it
# creates image_path / dom_path) — confirm these names are still correct.
overview_excluded_columns = [
    "Shape_Length",
    "Shape_Area",
    "bbox",
    "id",
    "ingest_time",
    "source_file",
    "geometry",
    "Polygons",
    "adjusted_struct",
    "Adjusted_bbox",
    "bbox_str",
    "image_wms",
    "dom_wms",
    "lastet_tid",
]
df_overview = df_retable.drop(*overview_excluded_columns)

# Replace the overview table (data and schema); the unqualified name resolves
# against the session's current catalog and schema.
(
    df_overview.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("utenSnuplass_status_overview")
)
 