# Title
**Author**:  Greg Slater <br>
**Date**:  24th September 2024 <br>
**Dataset Scope**: `dataset` <br>
**Report Type**: Ad-hoc analysis <br>

## Purpose


In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import urllib
# from sqlite_query_functions import DatasetSqlite
from datetime import datetime

# Local folder for data files; created if it does not exist yet.
data_dir = "data/"
os.makedirs(data_dir, exist_ok=True)

# Show up to 100 rows when a DataFrame is displayed.
pd.set_option("display.max_rows", 100)

# Today's date as a YYYY-MM-DD string.
td = datetime.today().date().isoformat()


In [2]:
def get_pdp_dataset(dataset, geometry_field = "geometry", crs_out=4326, underscore_cols=True):
    """Download a planning.data.gov.uk dataset CSV and return it as a GeoDataFrame.

    The CSV is read with every column as a string. Rows with a null value in
    ``geometry_field`` are dropped, the remaining WKT strings are parsed into
    geometries (treated as EPSG:4326), and the result is reprojected to
    ``crs_out``.

    Parameters
    ----------
    dataset : str
        Dataset name, substituted into
        ``https://files.planning.data.gov.uk/dataset/{dataset}.csv``.
    geometry_field : str, default "geometry"
        Name of the column holding WKT geometry, as it appears after any
        column renaming.
    crs_out : int, default 4326
        EPSG code of the returned GeoDataFrame. The default leaves the
        coordinates in EPSG:4326; pass e.g. 27700 (British National Grid)
        for metre-based measurements.
    underscore_cols : bool, default True
        When True, replace "-" with "_" in all column names.

    Returns
    -------
    geopandas.GeoDataFrame
        Rows that have a geometry, with ``geometry_field`` as the active
        geometry column, in the ``crs_out`` CRS.
    """
    df = pd.read_csv(f"https://files.planning.data.gov.uk/dataset/{dataset}.csv", dtype = "str")

    # Previously the flag was accepted but ignored and columns were always renamed.
    if underscore_cols:
        df.columns = [x.replace("-", "_") for x in df.columns]

    df_valid_geom = df[df[geometry_field].notnull()].copy()

    # Parse the WKT with geopandas (already imported) rather than shapely.wkt,
    # which this notebook never imports.
    df_valid_geom[geometry_field] = gpd.GeoSeries.from_wkt(df_valid_geom[geometry_field])
    gdf = gpd.GeoDataFrame(df_valid_geom, geometry = geometry_field, crs = "EPSG:4326")

    # Reproject from EPSG:4326 to the requested output CRS.
    return gdf.to_crs(epsg=crs_out)

## Data Import

In [60]:
# Seed list of duplicate entity name pairs; everything is read as text.
seed_dupe_df = pd.read_csv("data/DuplicateEntityNames2.csv", dtype=str)

# Force the two entity id columns to str so missing values become "nan"
# and each pair can be sorted without mixing floats and strings.
entity_pair_cols = ["entity_max", "entity_min"]
seed_dupe_df[entity_pair_cols] = seed_dupe_df[entity_pair_cols].astype(str)

# Order-independent pair key: the two ids sorted and joined with "-".
seed_dupe_df["entity_join"] = (
    seed_dupe_df[["entity_min", "entity_max"]]
    .apply(lambda pair: "-".join(sorted(pair)), axis=1)
)

print(len(seed_dupe_df))

# Issues table, also read entirely as text.
issues_df = pd.read_csv("data/issues_all.csv", dtype="str")
print(len(issues_df))
issues_df.head()

47
2347


Unnamed: 0,entity_join,entity_1,entry_date_1,name_1,organisation_entity_1,reference_1,organisation_name_1,entity_2,entry_date_2,name_2,...,organisation_name_2,pct_min_intersection,date_match,entity_old,intersection_type,issue_type,action,priority,hint,multiple_issues
0,44000007-44000009,44000009,2004-06-01,Childwickbury,278,CA3,St Albans City and District Council,44000007,2004-06-01,Harpenden,...,St Albans City and District Council,1.1508995698706748e-06,True,False,tiny edge - ignore,Within organisation - LPA,ignore,,,False
1,44000017-44000770,44000770,2023-07-22,Leominster Town,16,2499,Historic England,44000017,2023-07-22,Leominster River Meadows,...,Historic England,1.1282780843611864e-05,True,False,tiny edge - ignore,Within organisation - Historic England,ignore,,,False
2,44000042-44000043,44000043,2023-07-22,Butterworth Hall,16,7716,Historic England,44000042,2023-07-22,Butterworth Hall (Municipal Buildings),...,Historic England,0.0,True,False,tiny edge - ignore,Within organisation - Historic England,ignore,,,False
3,44000050-44003132,44003132,2023-07-22,Worcester and Birmingham Canal,16,449,Historic England,44000050,2023-07-22,Worcester and Birmingham Canal,...,Historic England,1.6264203793924812e-05,True,False,tiny edge - ignore,Within organisation - Historic England,ignore,,,False
4,44000105-44005296,44005296,2023-07-22,Lewes,16,3207,Historic England,44000105,2023-07-22,Lewes (Malling Deanery),...,Historic England,0.0011782286279958,True,False,tiny edge - ignore,Within organisation - Historic England,ignore,,,False


In [61]:
# Check the column dtypes of the issues table (it was read with dtype="str").
issues_df.dtypes

entity_join              object
entity_1                 object
entry_date_1             object
name_1                   object
organisation_entity_1    object
reference_1              object
organisation_name_1      object
entity_2                 object
entry_date_2             object
name_2                   object
organisation_entity_2    object
reference_2              object
organisation_name_2      object
pct_min_intersection     object
date_match               object
entity_old               object
intersection_type        object
issue_type               object
action                   object
priority                 object
hint                     object
multiple_issues          object
dtype: object

In [62]:
# Columns brought across from the issues table, including the join key.
issue_cols = [
    "entity_join",
    "entity_1",
    "organisation_entity_1",
    "organisation_name_1",
    "pct_min_intersection",
    "intersection_type",
    "issue_type",
    "action",
    "priority",
    "multiple_issues",
]

# Left join so every seed duplicate pair is kept, matched or not.
# NOTE(review): this overwrites seed_dupe_df, so re-running the cell merges
# onto the already-merged frame and produces suffixed duplicate columns.
seed_dupe_df = pd.merge(
    seed_dupe_df,
    issues_df[issue_cols],
    how="left",
    on="entity_join",
)

# Save the flagged pairs to CSV without the index column.
seed_dupe_df.to_csv("seeding_dupes_flagged_w_e1.csv", index=False)

In [54]:
# Count the issue rows whose pair key also appears in the seed duplicates.
in_seed_mask = issues_df["entity_join"].isin(seed_dupe_df["entity_join"])
print(len(issues_df[in_seed_mask]))

24


In [26]:
# Display the seed duplicate pairs table for a visual check.
seed_dupe_df

Unnamed: 0,lpa,entity_name,entity_min,entity_max,count,entity_join
0,Calderdale Metropolitan Borough Council,Elland,44005067.0,44007924.0,2.0,44005067-44007924
1,Calderdale Metropolitan Borough Council,Hebden Bridge,44005068.0,44007920.0,2.0,44005068-44007920
2,Calderdale Metropolitan Borough Council,Todmorden,44003023.0,44007923.0,2.0,44003023-44007923
3,Carlisle City Council,Botchergate,44005069.0,44009142.0,2.0,44005069-44009142
4,Carlisle City Council,Carlisle City Centre,44005070.0,44009143.0,2.0,44005070-44009143
5,Cornwall Council,Helston,44004003.0,44009597.0,2.0,44004003-44009597
6,Cumberland Unitary Authority,Botchergate,44005069.0,44009142.0,2.0,44005069-44009142
7,Cumberland Unitary Authority,Carlisle City Centre,44005070.0,44009143.0,2.0,44005070-44009143
8,East Suffolk Council,Aldeburgh,44007092.0,44007897.0,2.0,44007092-44007897
9,East Suffolk Council,Beccles,44007344.0,44007891.0,2.0,44007344-44007891


In [20]:
# Spot-check the pair key on the first row. entity_join is already a
# "min-max" string, so it is shown as-is; joining it again with "-" would
# put a dash between every character.
seed_dupe_df["entity_join"].iloc[0]

'44005067-44007924'

## Analysis