# NID Validation Checks

Perform a check of the integrity of NID values within the dataset. This is done to make sure that all of the NID values within the dataset adhere to the rules as defined in the documentation.

In [1]:
import geopandas as gpd
import pandas as pd
from pathlib import Path
# used for reading shapefiles - not required for GPKG data
from simpledbf import Dbf5

# STRPLANAME Checks

Using the STRPLANAME dbf data, check for any mismatches in NID vs segment idenfitication fields.

In [2]:
# Path to the dataset and the layer name for multilayered data.
data_path = Path('../../nrn_data/NRN_RRN_PE_18_0_GPKG/NRN_PE_18_0_GPKG/NRN_PE_18_0.gpkg')
data_layer = 'NRN_PE_18_0_STRPLANAME'

In [16]:
# data_path = Path('../../nrn_data/nrn_rrn_nb_shp_en/NRN_NB_8_0_STRPLANAME.dbf')

data_path = Path('../../nrn_data/nrn_rrn_ab_shp_en/NRN_AB_14_0_STRPLANAME.dbf')

In [3]:
# Fields of interest for comparisons.

# The NID field should end up having the same value for every record when the connected segment fields are grouped.
nid_field = 'nid'

# These fields are grouped together to identify which segments should be idenfied as having a common NID value.
connected_segment_fields = ['namebody', 'strtypre', 'strtysuf', 'dirprefix', 'dirsuffix']

# The fields we are initerested in analyzing are the combination of the connected segment fields and the NID field.
fields_of_concern = connected_segment_fields + [nid_field]

## Read the DBF file into a DataFrame

This reads the data from the appropriate source. Both DBF and GPKG sources are supported.

In [17]:
# Shapefiles
dbf = Dbf5(data_path)
df = dbf.to_dataframe().rename(columns=str.lower).filter(fields_of_concern)

In [None]:
# GeoPackage table
df = (pd.read_sql_table(data_layer, f"sqlite:///{data_path.as_posix()}", coerce_float=False)
    .rename(columns=str.lower)
    .filter(fields_of_concern))

In [22]:
# Group items by the connected segment fields, and then look for any differences across the groups.
problem_nids = []
planame_groups = df.groupby(connected_segment_fields)
for group_id, group_records in planame_groups:
    # ignore any single record groups
    if len(group_records) == 1:
        continue

    # look for duplicates on the records - everything should be duplicated
    if not group_records.duplicated().all():
        bad_nids = group_records[nid_field].tolist()
        problem_nids.append(bad_nids)

In [25]:
print(f"Found {len(planame_groups)} groups")
print(f"Found {len(problem_nids)} problem groups")

Found 27706 groups
Found 4129 problem groups


In [14]:
fields_of_concern

['namebody', 'strtypre', 'strtysuf', 'dirprefix', 'dirsuffix', 'nid']