In [1]:
from pathlib import Path
import geopandas
import pandas as pd

In [2]:
data_dir = Path("../data/")
sources_dir = data_dir / "external" / "usgs" / "tile_index"
sources = [Path(p).resolve() for p in sources_dir.glob("*.zip")]
sources

[PosixPath('/home/dpower/projects/geospatial/culvert-vision/data/external/usgs/tile_index/USGS_MN_SEDriftless_1_2021_TileIndex.zip'),
 PosixPath('/home/dpower/projects/geospatial/culvert-vision/data/external/usgs/tile_index/USGS_MN_SEDriftless_2_2021_TileIndex.zip'),
 PosixPath('/home/dpower/projects/geospatial/culvert-vision/data/external/usgs/tile_index/USGS_MN_SEDriftless_5_2021_TileIndex.zip'),
 PosixPath('/home/dpower/projects/geospatial/culvert-vision/data/external/usgs/tile_index/USGS_MN_MORiverBigSioux_1_B21_TileIndex.zip'),
 PosixPath('/home/dpower/projects/geospatial/culvert-vision/data/external/usgs/tile_index/USGS_MN_LakeSuperior_2_2021_TileIndex.zip'),
 PosixPath('/home/dpower/projects/geospatial/culvert-vision/data/external/usgs/tile_index/USGS_MN_UpperMissRiver_3_B22_TileIndex.zip'),
 PosixPath('/home/dpower/projects/geospatial/culvert-vision/data/external/usgs/tile_index/USGS_MN_BeckerCo_1_2021_TileIndex.zip'),
 PosixPath('/home/dpower/projects/geospatial/culvert-vision

In [3]:
results = []
for file in sources:
    gdf = geopandas.read_file(file, ignore_geometry=True)
    results.append(
        {
            "filename": file.name,
            "first_record": gdf.iloc[0].to_dict(),
        }
    )
results

[{'filename': 'USGS_MN_SEDriftless_1_2021_TileIndex.zip',
  'first_record': {'GridName': '15TVJ650770',
   'Min_X': 465000.0,
   'Max_Y': 4878000.0,
   'Max_X': 466000.0,
   'Min_Y': 4877000.0,
   'area_sqmi': 0.386102158592,
   'PRJ_ID': 222538,
   'WU_ID': 228969,
   'WU_Name': 'MN_SEDriftless_1_2021'}},
 {'filename': 'USGS_MN_SEDriftless_2_2021_TileIndex.zip',
  'first_record': {'MN45INDEX': '6010_48650',
   'GridName': '15TXJ010650',
   'PRJ_ID': 222538,
   'WU_ID': 222535,
   'WU_Name': 'MN_SEDriftless_2_2021'}},
 {'filename': 'USGS_MN_SEDriftless_5_2021_TileIndex.zip',
  'first_record': {'MN45INDEX': '4435_49090',
   'GridName': '15TVK435090',
   'PRJ_ID': '222538',
   'WU_ID': '228991',
   'WU_Name': 'MN_SEDriftless_5_2021'}},
 {'filename': 'USGS_MN_MORiverBigSioux_1_B21_TileIndex.zip',
  'first_record': {'OBJECTID_1': 1,
   'OBJECTID': 15610,
   'XMINEASTIN': 309000,
   'YMINNORTHI': 4822000,
   'Shape_Leng': 4000.0,
   'XMINEAST_1': '309',
   'YMINNORT_1': '4822',
   'MN34INDE

In [4]:
import json

source_schema_json = data_dir / "interim" / "tile_index_schema.json"
with open(source_schema_json, "w") as f:
    f.write(json.dumps(results))

In [5]:
from dataclasses import dataclass


@dataclass
class TileIndexSource:
    file: Path
    include_fields: list[str]
    column_rename_mapper: dict[str, str]

In [16]:
def create_clean_tile_index_gdf(
    tile_index_source: TileIndexSource,
) -> geopandas.GeoDataFrame:
    return geopandas.read_file(
        filename=tile_index_source.file,
        include_fields=tile_index_source.column_rename_mapper.keys(),
    ).rename(columns=tile_index_source.column_rename_mapper)

In [17]:
mn_sedriftless_1_2020 = TileIndexSource(
    file=data_dir
    / "external"
    / "usgs"
    / "tile_index"
    / "USGS_MN_SEDriftless_1_2021_TileIndex.zip",
    include_fields=["GridName", "WU_ID", "WU_Name"],
    column_rename_mapper={
        "GridName": "name",
        "WU_ID": "workunit_id",
        "WU_Name": "workunit_name",
    },
)
mn_sedriftless_1_2020_gdf = create_clean_tile_index_gdf(mn_sedriftless_1_2020)
mn_sedriftless_1_2020_gdf

Unnamed: 0,name,workunit_id,workunit_name,geometry
0,15TVJ650770,228969,MN_SEDriftless_1_2021,"POLYGON ((466000.000 4877000.000, 465000.000 4..."
1,15TVJ660570,228969,MN_SEDriftless_1_2021,"POLYGON ((467000.000 4857000.000, 466688.361 4..."
2,15TVJ870960,228969,MN_SEDriftless_1_2021,"POLYGON ((488000.000 4897000.000, 488000.000 4..."
3,15TVJ910620,228969,MN_SEDriftless_1_2021,"POLYGON ((492000.000 4862000.000, 491000.000 4..."
4,15TWJ200930,228969,MN_SEDriftless_1_2021,"POLYGON ((521000.000 4893000.000, 520000.000 4..."
...,...,...,...,...
4666,15TWJ160720,228969,MN_SEDriftless_1_2021,"POLYGON ((517000.000 4873000.000, 517000.000 4..."
4667,15TVJ730650,228969,MN_SEDriftless_1_2021,"POLYGON ((474000.000 4865000.000, 473000.000 4..."
4668,15TVJ770780,228969,MN_SEDriftless_1_2021,"POLYGON ((477000.000 4878000.000, 477000.000 4..."
4669,15TVK660110,228969,MN_SEDriftless_1_2021,"POLYGON ((467000.000 4911000.000, 466000.000 4..."


In [8]:
mn_sedriftless_2_2021 = TileIndexSource(
    file=data_dir
    / "external"
    / "usgs"
    / "tile_index"
    / "USGS_MN_SEDriftless_2_2021_TileIndex.zip",
    include_fields=["GridName", "WU_ID", "WU_Name"],
    column_rename_mapper={
        "GridName": "name",
        "WU_ID": "workunit_id",
        "WU_Name": "workunit_name",
    },
)
mn_sedriftless_2_2021_gdf = create_clean_tile_index_gdf(mn_sedriftless_2_2021)
mn_sedriftless_2_2021_gdf

Unnamed: 0,name,workunit_id,workunit_name,geometry
0,15TXJ010650,222535,MN_SEDriftless_2_2021,"POLYGON ((602000.000 4865000.000, 601000.000 4..."
1,15TXJ340470,222535,MN_SEDriftless_2_2021,"POLYGON ((635000.000 4847000.000, 634000.000 4..."
2,15TWJ880790,222535,MN_SEDriftless_2_2021,"POLYGON ((589000.000 4879000.000, 588000.000 4..."
3,15TWK510230,222535,MN_SEDriftless_2_2021,"POLYGON ((552000.000 4923000.000, 551000.000 4..."
4,15TWK720120,222535,MN_SEDriftless_2_2021,"POLYGON ((573000.000 4912000.000, 572000.000 4..."
...,...,...,...,...
6076,15TWK830040,222535,MN_SEDriftless_2_2021,"POLYGON ((584000.000 4904000.000, 583000.000 4..."
6077,15TWK520050,222535,MN_SEDriftless_2_2021,"POLYGON ((552000.000 4906000.000, 553000.000 4..."
6078,15TXJ300560,222535,MN_SEDriftless_2_2021,"POLYGON ((631000.000 4856000.000, 630000.000 4..."
6079,15TWJ540940,222535,MN_SEDriftless_2_2021,"POLYGON ((555000.000 4894000.000, 554000.000 4..."


In [10]:
import json

with open(Path("../data/raw/mn_lakecounty_2018.json"), "r") as f:
    workunit_params = json.loads(f.read())
workunit_params

{'workunit': 'MN_LakeCounty_2018',
 'workunit_id': 187243,
 'tile_index_source_zip': 'MN_LakeCounty_2018_C20_TileIndex.zip',
 'tile_name_field': 'Name',
 'ept_json_href': 'https://s3-us-west-2.amazonaws.com/usgs-lidar-public/MN_SEDriftless_5_2021/ept.json'}

In [15]:
tile_index_dir = Path("../data/external/usgs/tile_index/")

tile_index_gdf = geopandas.read_file(
    filename=tile_index_dir / workunit_params["tile_index_source_zip"],
    include_fields=[workunit_params["tile_name_field"]],
)
tile_index_gdf["workunit_id"] = workunit_params["workunit_id"]
tile_index_gdf["workunit"] = workunit_params["workunit"]
tile_index_gdf

Unnamed: 0,Name,geometry,workunit_id,workunit
0,15TXN62772852,"POLYGON ((628500.000 5285250.000, 627750.000 5...",187243,MN_LakeCounty_2018
1,15TXN64572912,"POLYGON ((646500.000 5291250.000, 645750.000 5...",187243,MN_LakeCounty_2018
2,15TXN61652852,"POLYGON ((617250.000 5285250.000, 616500.000 5...",187243,MN_LakeCounty_2018
3,15TXP60003025,"POLYGON ((600046.926 5302500.000, 600000.000 5...",187243,MN_LakeCounty_2018
4,15TXN61422717,"POLYGON ((614250.000 5272500.000, 615000.000 5...",187243,MN_LakeCounty_2018
...,...,...,...,...
3036,15TXN62252867,"POLYGON ((623250.000 5286750.000, 622500.000 5...",187243,MN_LakeCounty_2018
3037,15TXN62022747,"POLYGON ((621000.000 5274750.000, 620250.000 5...",187243,MN_LakeCounty_2018
3038,15TWN59702942,"POLYGON ((597000.000 5295000.000, 597750.000 5...",187243,MN_LakeCounty_2018
3039,15TWN59622950,"POLYGON ((597000.000 5295000.000, 596250.000 5...",187243,MN_LakeCounty_2018
