In [1]:
from google.cloud import bigquery
from google.auth import default
import json
import os
from geodoc_loader.download.core import download_file_with_progress, unzip_file
from geodoc_loader.download.validate import find_files_with_extension
from geodoc_loader.handlers.core import save_geojson
from geodoc_loader.handlers.tiff import process_tif_files

In [2]:
# Local scratch directory: the archive is downloaded and unpacked here, the
# generated GeoJSON files are written here, and it is removed in the last cell.
save_path = "./solar_temp"
# JSON file that drives the notebook (opened and parsed in the next cell).
config_path = "./solar_config.json"

In [6]:
# Parse the notebook configuration into `config`.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open(config_path, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

### download solar data

In [6]:
# Make sure the scratch directory exists before anything is written into it.
os.makedirs(save_path, exist_ok=True)

# Download the archive named in the config, then unpack it into the scratch directory.
# NOTE(review): the return value of download_file_with_progress is not checked here.
zip_path = os.path.join(save_path, config['filename'])
download_file_with_progress(url=config['source'], target_path=zip_path)
unzip_file(zip_path, save_path)

Downloading ./solar_temp/solar_data.zip:   0%|          | 0.00/41.0 [00:00<?, ?MB/s]

Successfully extracted ./solar_temp/solar_data.zip to ./solar_temp
Deleted ZIP file: ./solar_temp/solar_data.zip


(True, None)

### check files

In [4]:
# Collect (path, key) pairs for every .tif whose base name, up to the first dot,
# is one of the keys of config['legend'].
files = []
for tif_path in find_files_with_extension(save_path, '.tif'):
    stem = os.path.basename(tif_path).split('.')[0]
    if stem in config['legend']:
        files.append((tif_path, stem))

print(f"Found {len(files)} files matching the legend keys.")
for file_path, key in files:
    print(f"File: {file_path}, Key: {key}")

Found 7 files matching the legend keys.
File: ./solar_temp/Poland_GISdata_LTAy_YearlyMonthlyTotals_GlobalSolarAtlas-v2_GEOTIFF/GTI.tif, Key: GTI
File: ./solar_temp/Poland_GISdata_LTAy_YearlyMonthlyTotals_GlobalSolarAtlas-v2_GEOTIFF/DIF.tif, Key: DIF
File: ./solar_temp/Poland_GISdata_LTAy_YearlyMonthlyTotals_GlobalSolarAtlas-v2_GEOTIFF/TEMP.tif, Key: TEMP
File: ./solar_temp/Poland_GISdata_LTAy_YearlyMonthlyTotals_GlobalSolarAtlas-v2_GEOTIFF/PVOUT.tif, Key: PVOUT
File: ./solar_temp/Poland_GISdata_LTAy_YearlyMonthlyTotals_GlobalSolarAtlas-v2_GEOTIFF/OPTA.tif, Key: OPTA
File: ./solar_temp/Poland_GISdata_LTAy_YearlyMonthlyTotals_GlobalSolarAtlas-v2_GEOTIFF/GHI.tif, Key: GHI
File: ./solar_temp/Poland_GISdata_LTAy_YearlyMonthlyTotals_GlobalSolarAtlas-v2_GEOTIFF/DNI.tif, Key: DNI


In [26]:
import rasterio

# Print basic raster metadata for every matched GeoTIFF.
# `files` holds (path, legend_key) pairs, so each entry is unpacked here:
# rasterio.open() must receive the path itself, not the whole tuple.
for tif_path, legend_key in files:
    print(f"Processing file: {tif_path}")
    with rasterio.open(tif_path) as src:
        data = src.read(1)  # Read the first band
        print(f"Data shape: {data.shape}")
        print(f"Data type: {data.dtype}")
        print(f"CRS: {src.crs}")
        print(f"Transform: {src.transform}")

Processing file: ./solar_temp/Poland_GISdata_LTAy_YearlyMonthlyTotals_GlobalSolarAtlas-v2_GEOTIFF/GTI.tif
Data shape: (2800, 4400)
Data type: float32
CRS: EPSG:4326
Transform: | 0.00, 0.00, 14.00|
| 0.00,-0.00, 55.00|
| 0.00, 0.00, 1.00|
Processing file: ./solar_temp/Poland_GISdata_LTAy_YearlyMonthlyTotals_GlobalSolarAtlas-v2_GEOTIFF/DIF.tif
Data shape: (2800, 4400)
Data type: float32
CRS: EPSG:4326
Transform: | 0.00, 0.00, 14.00|
| 0.00,-0.00, 55.00|
| 0.00, 0.00, 1.00|
Processing file: ./solar_temp/Poland_GISdata_LTAy_YearlyMonthlyTotals_GlobalSolarAtlas-v2_GEOTIFF/TEMP.tif
Data shape: (840, 1320)
Data type: float32
CRS: EPSG:4326
Transform: | 0.01, 0.00, 14.00|
| 0.00,-0.01, 55.00|
| 0.00, 0.00, 1.00|
Processing file: ./solar_temp/Poland_GISdata_LTAy_YearlyMonthlyTotals_GlobalSolarAtlas-v2_GEOTIFF/PVOUT.tif
Data shape: (840, 1320)
Data type: float32
CRS: EPSG:4326
Transform: | 0.01, 0.00, 14.00|
| 0.00,-0.01, 55.00|
| 0.00, 0.00, 1.00|
Processing file: ./solar_temp/Poland_GISdata_LT

### collect tifs and save as geojson

In [37]:
# Pick the table to build and gather its GeoTIFF paths.
# NOTE(review): the table index is hard-coded, so this cell and the ones that
# follow it produce the GeoJSON for a single table per pass; the upload cell
# further down expects one GeoJSON per entry in config['tables'].
table_spec = config['tables'][2]
wanted_symbols = table_spec['symbols']

# Keep the (path, symbol) pairs that belong to this table, ordered the same
# way as the symbols are listed in the table spec.
matching = [pair for pair in files if pair[1] in wanted_symbols]
matching.sort(key=lambda pair: wanted_symbols.index(pair[1]))

# Only the paths are needed from here on.
table_files = [path for path, _symbol in matching]
table_files

['./solar_temp/Poland_GISdata_LTAy_YearlyMonthlyTotals_GlobalSolarAtlas-v2_GEOTIFF/OPTA.tif']

In [38]:
# Turn the selected GeoTIFF path(s) into `gdf` using the project helper.
# NOTE(review): presumably one row per raster cell, judging by the row/col/x/y
# columns in the recorded preview below; confirm in process_tif_files.
gdf = process_tif_files(table_files)

In [39]:
# Number of rows produced for this table (quick sanity check).
len(gdf)

69300

In [40]:
# Preview the first rows.
# NOTE(review): the recorded preview shows OPTA == -9999, which looks like a
# nodata sentinel; confirm whether such rows should be filtered out before
# the data is saved and uploaded.
gdf.head()

Unnamed: 0,OPTA,row,col,x,y,geometry,id
0,-9999,0,0,14.0,55.0,"POLYGON ((14.03333 55.00000, 14.03333 54.96667...",6bfdbe7c-e8a3-4c72-be06-2e6fa9b66898
1,-9999,0,1,14.033333,55.0,"POLYGON ((14.06667 55.00000, 14.06667 54.96667...",580bfdad-c3a6-4087-a382-dd49fc42c063
2,-9999,0,2,14.066667,55.0,"POLYGON ((14.10000 55.00000, 14.10000 54.96667...",df0c9b11-30da-4595-bc86-35a1534736d6
3,-9999,0,3,14.1,55.0,"POLYGON ((14.13333 55.00000, 14.13333 54.96667...",15445c76-541d-4f64-9b25-a21a3e964f7c
4,-9999,0,4,14.133333,55.0,"POLYGON ((14.16667 55.00000, 14.16667 54.96667...",a416790e-25c9-4e2e-bc82-31f8eb2ec9b0


In [44]:
# Show the dtype of each column in gdf.
gdf.dtypes

OPTA           int32
row            int64
col            int64
x            float64
y            float64
geometry    geometry
id            object
dtype: object

In [45]:
# Write gdf as GeoJSON into save_path, using the table name as the file name.
# The recorded output shows a two-element tuple (path, None) coming back; that
# return value is not checked here.
save_geojson(gdf, save_path, table_spec['table_name'])

GeoJSON saved to ./solar_temp/optimum.geojson


('./solar_temp/optimum.geojson', None)

### setup BigQuery tables

In [7]:
from geodoc_loader.download.gcp import create_gcs_bucket, create_bigquery_dataset, create_bigquery_table

def prepare_table_raw_schema_from_symbols(symbols, types):
    """
    Prepare BigQuery table schema from symbols and types.

    Each symbol is paired with the type at the same position and emitted as a
    NULLABLE column definition.

    Args:
        symbols (list): List of column names.
        types (list): List of corresponding BigQuery types.
    Returns:
        list: List of dictionaries representing the schema, one per symbol,
            each with the keys "name", "type" and "mode".
    Raises:
        ValueError: If symbols and types do not have the same length.
    """
    symbols = list(symbols)
    types = list(types)
    # zip() stops at the shorter input, which would silently drop columns from
    # the schema when the two lists disagree; fail loudly instead.
    if len(symbols) != len(types):
        raise ValueError(
            f"symbols and types must have the same length, "
            f"got {len(symbols)} symbols and {len(types)} types"
        )
    return [{"name": symbol, "type": t, "mode": "NULLABLE"} for symbol, t in zip(symbols, types)]

# The BigQuery client supplies the project id used for every resource below.
client = bigquery.Client()
project_id = client.project

# GCS bucket: its name is the project id joined to the configured bucket name.
bucket_name = f"{project_id}-{config['bucket_name']}"
create_gcs_bucket(project_id, bucket_name)

# BigQuery dataset that will hold all of the tables.
dataset_name = config['dataset_name']
create_bigquery_dataset(project_id, dataset_name)

# One BigQuery table per entry in config['tables'].
for table_spec in config['tables']:
    # Schema layout: base columns, then the per-table symbol columns, then the
    # additional columns from the config.
    mid_schema = prepare_table_raw_schema_from_symbols(table_spec['symbols'], table_spec['types'])
    schema = config['base_columns'] + mid_schema + config['additional_columns']

    create_bigquery_table(
        project_id=project_id,
        collection_name=dataset_name,
        table_name=table_spec['table_name'],
        columns_spec=schema,
        additional_columns=[],
    )

Bucket 'geodoc-386107-single-loads' already exists.
Dataset 'solar' created successfully.
Created table geodoc-386107.solar.irradiation
Created table geodoc-386107.solar.pv
Created table geodoc-386107.solar.optimum


### upload to BigQuery

In [8]:
from geodoc_loader.download.gcp import upload_to_gcs, load_geojson_to_bigquery
from google.cloud import storage

client = bigquery.Client()
project_id = client.project
storage_client = storage.Client(project=project_id)

# Go through the tables last-to-first and stop at the first failure.
for table_spec in reversed(config['tables']):
    table_name = table_spec['table_name']
    print(f"Processing table: {table_name}")

    geojson_name = f"{table_name}.geojson"

    # Step 1: copy the local GeoJSON into the bucket, under the dataset folder.
    result, err = upload_to_gcs(
        storage_client=storage_client,
        bucket_name=f"{project_id}-{config['bucket_name']}",
        folder_name=config['dataset_name'],
        file_name=geojson_name,
        local_file_path=os.path.join(save_path, geojson_name),
    )
    if err:
        print(f"Error uploading {table_name}.geojson: {err}")
        break

    # Step 2: load the uploaded object into the matching BigQuery table.
    # `result` from the upload step is passed on as the GCS URI.
    result, err = load_geojson_to_bigquery(
        client=client,
        project_id=project_id,
        dataset_name=config['dataset_name'],
        table_name=table_name,
        gcs_uri=result,
    )
    if err:
        print(f"Error loading {table_name} to BigQuery: {err}")
        break

Uploaded ./solar_temp/optimum.geojson to gs://geodoc-386107-single-loads/solar/optimum.geojson
Loaded 69300 rows into geodoc-386107.solar.optimum
Uploaded ./solar_temp/pv.geojson to gs://geodoc-386107-single-loads/solar/pv.geojson
Loaded 645084 rows into geodoc-386107.solar.pv
Uploaded ./solar_temp/irradiation.geojson to gs://geodoc-386107-single-loads/solar/irradiation.geojson
Loaded 7160121 rows into geodoc-386107.solar.irradiation


### delete temp files

In [9]:
from geodoc_loader.download.gcp import delete_gcs_temp_files
from geodoc_loader.download.core import delete_local_temp_files

# Remove each table's GeoJSON object from the bucket.
for table_spec in config['tables']:
    table_name = table_spec['table_name']
    print(f"Deleting temporary files for table: {table_name}")
    delete_gcs_temp_files(
        storage_client=storage_client,
        bucket_name=f"{project_id}-{config['bucket_name']}",
        folder_name=config['dataset_name'],
        file_name=f"{table_name}.geojson",
    )

# Then remove the local scratch directory.
print("Deleting local temporary files...")
delete_local_temp_files(save_path)

Deleting temporary files for table: irradiation
Deleted GCS object: solar/irradiation.geojson
Deleting temporary files for table: pv
Deleted GCS object: solar/pv.geojson
Deleting temporary files for table: optimum
Deleted GCS object: solar/optimum.geojson
Deleting local temporary files...
Deleted local temporary directory: ./solar_temp
