## Common functions notebook

This notebook is shared between all notebooks to perform common activities and provide reusable functions.

This notebook is intended to be run from another notebook with `%run ...`; it will not execute standalone.

### Imports

- pandas ➡️ `pd`
- geopandas ➡️ `gpd`
- pyspark functions ➡️ `F`
- pyspark window ➡️ `Window`


### Variables exposed

- `CONFIG`: Provides catalog, schema and secret lookup values as attributes.

### Functions provided

In [0]:
%pip install -r ./requirements.txt

In [0]:
%load_ext autoreload
%autoreload 2

In [0]:
import pandas as pd
import geopandas as gpd
import pyspark.sql.functions as F
from pyspark.sql.window import Window
import os
# Imports used by setup
from pprint import pprint
import json
from pydantic import BaseModel
from typing import Any, Dict
from pathlib import Path
import glob

In [0]:
  
class MyConfig(BaseModel):
    """Pydantic model for the solution accelerator configuration.

    Instances are normally created from a JSON file with :meth:`from_json`.
    Every field below is required; pydantic validates the types on
    construction.
    """

    # Catalog and schema names (presumably where outputs are written --
    # confirm against the consuming notebooks).
    target_catalog: str
    target_schema: str
    # Secret scope / key names used for secret lookups.
    secret_scope: str
    secret_key: str
    # Behaviour flags; their exact effect is defined by the notebooks that
    # read them (not visible here).
    clone_raw_data: bool
    overwrite_data: bool

    @classmethod
    def from_json(cls, json_path: str) -> "MyConfig":
        """Load configuration from a JSON file.

        Args:
            json_path: Path to the JSON configuration file.

        Returns:
            A ``MyConfig`` instance whose attributes match the JSON keys.

        Raises:
            FileNotFoundError: If ``json_path`` does not exist.
        """
        path = Path(json_path)
        if not path.exists():
            raise FileNotFoundError(f"Config file not found: {json_path}")

        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(path, "r", encoding="utf-8") as f:
            config_data = json.load(f)
        # Keys are passed as keyword arguments so pydantic validates them
        # against the declared fields.
        return cls(**config_data)


# Search the current working directory (recursively) for config.json and use
# the first match returned by glob.
config_matches = glob.glob('**/config.json', recursive=True)
if not config_matches:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No config.json found in or below {Path.cwd()}"
    )
json_config_path = config_matches[0]

# CONFIG is the shared configuration object exposed to notebooks that %run
# this one.
CONFIG = MyConfig.from_json(json_config_path)
print("Config created with schema:")
pprint(CONFIG.schema())

In [0]:
# Simple helper function to convert our pandas dataframe with x, y grid coordinate data into a geopandas dataframe
def to_geodf(df: pd.DataFrame, sample_frac: float) -> gpd.GeoDataFrame:
    """Convert a sample of a pandas DataFrame into a GeoDataFrame of points.

    The input frame is left untouched: rows are sampled first and the point
    geometry is attached to the sampled copy only.

    Args:
        df: DataFrame with ``x`` and ``y`` coordinate columns.
        sample_frac: Fraction of rows to keep, passed to ``DataFrame.sample``.

    Returns:
        A GeoDataFrame containing the sampled rows plus a ``geom`` geometry
        column built from ``x``/``y`` with CRS ``EPSG:4326``.
    """
    # Fixed random_state keeps the sample reproducible between runs.
    sampled = df.sample(frac=sample_frac, random_state=42)
    # Build geometry only for the sampled rows; assign() returns a new frame,
    # so the caller's DataFrame does not gain a "geom" column.
    points = gpd.points_from_xy(sampled.x, sampled.y, crs="EPSG:4326")
    return gpd.GeoDataFrame(sampled.assign(geom=points), geometry="geom")