# Data Alignment

This notebook aims to produce a clean reference version of the Sommarioni by aggregating and aligning the different sources of data relating to the Sommarioni into a single dataset.

## Aligning the geometries and standardisation of the parcel type on Carlo's data:

Carlo's dataset proposes a standardisation of the owner text from the Sommarioni, from which a dataset of the people mentioned was built; that dataset is stored in a separate file. As the index of his version of the Sommarioni is used to link to his work on the individuals, it is a sounder base to use for the project going forward (at least until the right data has been correctly deduced from the original scans). Below is the code used to merge Carlo's dataset with the geometries fetched from the cadaster-interface app and to add the missing columns (standardisation of the type of ownership and of the quality of each parcel). This notebook should in principle be run after `cadaster-interface_data_fetch.ipynb`.

In [1]:
from os.path import join
import pandas as pd
import geopandas as gpd
from datetime import datetime as dt

def today_date() -> str:
    """Return the current local date as a compact ``YYYYMMDD`` string.

    The value is used to stamp the names of the files this notebook writes.
    """
    now = dt.today()
    return now.strftime('%Y%m%d')


CADASTER_INTERFACE_DATA_FOLDER = 'fetched_from_cadaster-interface_app/'

# Load the parcel geometries exported from the cadaster-interface app and
# number the groups of rows sharing a parcel number; rows for which no group
# number is produced get -1.
geometries_path = join(
    CADASTER_INTERFACE_DATA_FOLDER,
    'cadaster_18080_geometries_from_cadaster_inferface.geojson',
)
gdf = gpd.read_file(geometries_path)
parcel_group_numbers = gdf.groupby('parcel_number').ngroup()
gdf['geometry_id'] = parcel_group_numbers.fillna(-1).astype(int)

# One (geometry_id, parcel_number) pair per parcel number: this is what links
# the text data to the geometries.
first_row_per_parcel = gdf.groupby('parcel_number').first().reset_index()
gdf_number = first_row_per_parcel[['geometry_id', 'parcel_number']]

sommarioni_std = pd.read_json('../../named_entity_standardisation/1808_Sommarioni_standardised/sommarioni_dataset_STD.json')
sommarioni_std_geom = gdf_number.merge(
    sommarioni_std,
    left_on='parcel_number',
    right_on='numero_della_mappa',
)

# Geometry table: fully duplicated rows removed, index rebuilt, rows ordered
# by geometry_id.
geometries_correctly_linked = gdf.drop_duplicates().reset_index().drop(columns=['index'])
geometries_correctly_linked = geometries_correctly_linked.sort_values(by='geometry_id')
# The value of this field is always "yes": it is redundant, so it is dropped.
geometries_correctly_linked = geometries_correctly_linked.drop(columns=['parcel_number_written'])
geometries_correctly_linked.to_file(f'../../1808_Sommarioni/sommarioni_geometries_{today_date()}.geojson')

In [2]:
import ast

# Carlo's data lacks the standardised type of ownership and category of each
# parcel. Both were derived from parcelCategoryText/qualita on the cadaster
# interface website, so they are read from that export.
categories_path = join(CADASTER_INTERFACE_DATA_FOLDER, '1808_sommarioni_from_cadaster_interface.json')
sommarioni_categories = pd.read_json(categories_path)[['parcelCategoryText', 'parcelCategories', 'parcelOwnershipType']]

# Lists are not hashable, so they cannot be used when dropping duplicates:
# keep a string rendering of each list column and discard the list columns.
for list_column in ('parcelCategories', 'parcelOwnershipType'):
    sommarioni_categories[f'{list_column}_str'] = sommarioni_categories[list_column].apply(str)
sommarioni_categories = sommarioni_categories.drop(columns=['parcelCategories', 'parcelOwnershipType'])

# Attach the de-duplicated categories to every record through its quality text.
unique_categories = sommarioni_categories.drop_duplicates()
sommarioni_std_full = sommarioni_std_geom.merge(
    unique_categories,
    left_on='qualita',
    right_on='parcelCategoryText',
)
sommarioni_std_full = sommarioni_std_full.drop(columns=['parcelCategoryText', 'numero_della_mappa'])

english_column_names = {
    'subalterno': 'sub_parcel_number',
    'corr_as': 'austrian_cadaster_correspondance',
    'corr_ai': 'austro_italian_cadaster_correspondance',
    'denom_pezzi_di_terra': 'house_number',
    'place_acronym': 'district_acronym',
    'possessore': 'owner',
    'possessore_standardised': 'owner_standardised',
    'qualita': 'quality',
    'uniqueID': 'unique_id',
}
sommarioni_std_full = sommarioni_std_full.rename(columns=english_column_names)


def _parse_stripped_list(list_text):
    """Parse the string rendering of a list and strip whitespace around each item."""
    return [item.strip() for item in ast.literal_eval(list_text)]


# Turn the string renderings back into real lists, then drop the helper columns.
sommarioni_std_full['ownership_types'] = sommarioni_std_full['parcelOwnershipType_str'].apply(_parse_stripped_list)
sommarioni_std_full['qualities'] = sommarioni_std_full['parcelCategories_str'].apply(_parse_stripped_list)
sommarioni_std_full = sommarioni_std_full.drop(columns=['parcelCategories_str', 'parcelOwnershipType_str'])

txt_file_path = f'../../1808_Sommarioni/sommarioni_text_data_{today_date()}.json'

# Write through an explicitly UTF-8 handle, without ASCII escaping, to get rid
# of utf-8 errors.
with open(txt_file_path, 'w', encoding='utf-8') as text_data_file:
    sommarioni_std_full.to_json(text_data_file, orient='records', indent=4, force_ascii=False)

In [3]:
### Generating a single geojson file with all the data for paul
people_path = '../../named_entity_standardisation/1808_Sommarioni_standardised/people_sommarioni_dataset.json'
sommarioni_people = pd.read_json(people_path).drop(columns=['edited', 'merged_ids'])
sommarioni_people = sommarioni_people.astype({'uid': int, 'nucleus_uid': int})

# parcel_ids is text holding the contents of a list literal (presumably
# comma-separated ids): wrapping it in brackets lets it be parsed as a list.
sommarioni_people['parcel_array'] = sommarioni_people['parcel_ids'].apply(
    lambda ids_text: ast.literal_eval('[' + ids_text + ']')
)


def _rows_per_parcel(people):
    """Yield one copy of each person row per entry of its parcel_array, tagged with that entry as parcel_id."""
    for _, person in people.iterrows():
        for parcel_id in person.parcel_array:
            person_for_parcel = person.copy()
            person_for_parcel['parcel_id'] = parcel_id
            yield person_for_parcel


people_duplicated = list(_rows_per_parcel(sommarioni_people))


In [4]:
# One row per (person, parcel) pair; the list/text parcel columns are not
# needed once every row carries its own parcel_id.
people_flattened = pd.DataFrame(people_duplicated)
people_flattened = people_flattened.drop(columns=['parcel_array', 'parcel_ids'])

# Prefix every person column with 'own_' before attaching the people to the
# text data.
rename_owner_dict = {column: 'own_' + column for column in people_flattened.columns.to_list()}
people_flattened = people_flattened.rename(columns=rename_owner_dict)

# Left merge: every record of the text data is kept, with the person columns
# filled in where a person row references its unique_id.
sommarioni_text_people = sommarioni_std_full.merge(
    people_flattened,
    how='left',
    left_on='unique_id',
    right_on='own_parcel_id',
)
sommarioni_text_people = sommarioni_text_people.drop(columns=['own_parcel_id'])

In [5]:
final_file_name = f'../../1808_Sommarioni/aggregated/sommarioni_geometries_function_and_people_standardized_{today_date()}.geojson'

# Attach the text and people data to every geometry (left merge on geometry_id).
aggregated = geometries_correctly_linked.merge(sommarioni_text_people, on='geometry_id', how='left')
# Both inputs carry a parcel_number column: keep the geometry-side one under
# its original name and discard the other.
aggregated = aggregated.rename(columns={'parcel_number_x': 'parcel_number'})
aggregated = aggregated.drop(columns='parcel_number_y')

to_write = aggregated.to_json(indent=2)
with open(final_file_name, 'w', encoding='utf-8') as aggregated_file:
    aggregated_file.write(to_write)

In [6]:
# Finally, write the people file to the production folder with only a few
# changes: identifiers as int, parcel_ids as a proper list, the same field
# labels as in the aggregated version, and the useless columns removed.

# pop() removes parcel_array and hands back its values, which overwrite the
# text version of parcel_ids in place (the column keeps its position).
sommarioni_people['parcel_ids'] = sommarioni_people.pop('parcel_array')
sommarioni_people = sommarioni_people.rename(columns=rename_owner_dict)

people_dataset_fp = f'../../1808_Sommarioni/people_sommarioni_dataset_{today_date()}.json'
with open(people_dataset_fp, 'w', encoding='utf-8') as people_file:
    sommarioni_people.to_json(people_file, orient='records', indent=4, force_ascii=False)

# Aligning with the parishes from 1740

Using the geometries of the parcels that are registered in the Sommarioni (so the parish value can be backpropagated to the corresponding HR), we check each parcel against the best-matching parish geometry, meaning the parish that contains the parcel or, failing that, the parish that covers the highest percentage of the parcel's area.

In [7]:
# Distinct (parcel_number, geometry) pairs, restricted to the rows that have a
# parcel number.
pn_and_geom = geometries_correctly_linked[['parcel_number', 'geometry']].drop_duplicates()
# .copy() makes the filtered table an independent frame: new columns are
# assigned to it further down, which on a boolean-mask slice would trigger
# pandas' SettingWithCopyWarning.
pn_and_geom = pn_and_geom[pn_and_geom.parcel_number.notna()].copy()
parishes = gpd.read_file('../../1740_redrawn_parishes_cleaned_wikidata_standardised.geojson')
# Parish id -> list of the geometries registered under that id. It will be
# used to measure how much of a parcel each candidate parish covers.
ids_geoms_dict = parishes.groupby('id').agg(list)['geometry'].to_dict()

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()
def match_parcel_to_parish(parcel_geom, parish_df, intersection_rather_than_contain = False):
    """Return the ids of the parishes whose geometry matches ``parcel_geom``.

    Args:
        parcel_geom: geometry of the parcel; it is handed to the
            ``contains`` / ``intersects`` method of every parish geometry.
        parish_df: table with an ``id`` and a ``geometry`` column, one row
            per parish geometry.
        intersection_rather_than_contain: when False (the default) a parish
            matches if its geometry contains the parcel; when True it
            matches as soon as the two geometries intersect.

    Returns:
        The matching ids in ``parish_df`` row order (an id appears once per
        matching row), or ``None`` when nothing matched.

    Raises:
        KeyError: if ``parish_df`` lacks the ``id`` or ``geometry`` column.
    """
    candidates = []
    # Iterating the two columns directly avoids building a Series per row.
    for parish_id, parish_geom in zip(parish_df['id'], parish_df['geometry']):
        # Guard each comparison on its own: a geometry that cannot be tested
        # is reported and skipped, instead of cutting the whole scan short
        # and silently hiding the matches further down the table.
        try:
            if intersection_rather_than_contain:
                matched = parish_geom.intersects(parcel_geom)
            else:
                matched = parish_geom.contains(parcel_geom)
        except Exception as e:
            print(f'parish {parish_id}: {e}')
            continue
        if matched:
            candidates.append(parish_id)
    return candidates or None
    

# First pass: for every parcel, the parishes whose geometry contains it.
pn_and_geom['containing_parish'] = pn_and_geom['geometry'].progress_apply(
    lambda parcel_geom: match_parcel_to_parish(parcel_geom, parishes)
)


def _containing_or_intersecting(row):
    """Keep the containing parishes when there are any, else fall back to the intersecting ones."""
    return row['containing_parish'] or match_parcel_to_parish(row['geometry'], parishes, True)


# Second pass: only parcels without a containing parish are re-tested, this
# time against the parishes they merely intersect.
pn_and_geom['intersect_parish'] = pn_and_geom.progress_apply(_containing_or_intersecting, axis=1)


In [9]:
def percentage_overlap(g, geoms) -> float:
    """Return the share of ``g``'s area that is covered by ``geoms``.

    The share covered by each geometry of ``geoms`` is computed separately
    and the shares are added up, so geometries that overlap each other are
    each counted. Despite the name, the result is a ratio (1.0 means the
    whole area), not a value out of 100.

    Args:
        g: geometry exposing ``area`` and ``intersection``.
        geoms: iterable of geometries to intersect with ``g``.

    Returns:
        The summed ratio, or 0.0 when ``geoms`` is empty or ``g`` has no
        area (nothing can cover a zero-area geometry, and dividing by its
        area would raise ZeroDivisionError).
    """
    total_area = g.area
    if not total_area:
        return 0.
    return sum((g.intersection(geom).area / total_area for geom in geoms), 0.)

def intersection_selection(parcel_geom, geometry_ids) -> int:
    '''
    Pick, among candidate parish ids, the parish that covers the largest
    share of the parcel.

    A single candidate is returned as is, without measuring any overlap.
    With several candidates, the geometries of each one are looked up in
    ids_geoms_dict and scored with percentage_overlap; the best score wins.
    Returns None when geometry_ids is empty or None.
    '''
    if not geometry_ids:
        return None
    if len(geometry_ids) == 1:
        return geometry_ids[0]

    def covered_share(parish):
        return percentage_overlap(parcel_geom, ids_geoms_dict[parish])

    ranked = sorted(geometry_ids, key=covered_share, reverse=True)
    return ranked[0]
    
def _most_covering_parish(row):
    """Select, among the candidate parishes of a parcel row, the one covering most of the parcel."""
    return intersection_selection(row.geometry, row.intersect_parish)


# Parish id -> NAME lookup, used to label the selected parish.
parish_id_to_name = parishes.set_index('id')['NAME'].to_dict()

pn_and_geom['parish_id'] = pn_and_geom.apply(_most_covering_parish, axis=1)
pn_and_geom['parish_name'] = pn_and_geom['parish_id'].map(parish_id_to_name)

# The assignment is aligned on the index by pandas, so geometry rows that have
# no counterpart in pn_and_geom end up with a missing parish.
geometries_correctly_linked['parish_standardized'] = pn_and_geom['parish_name']
geometries_correctly_linked.to_file(f'../../1808_Sommarioni/sommarioni_geometries_{today_date()}.geojson')

In [None]:
# easy way to quality check the parish attribution from the parcels.
import folium
from tqdm import tqdm

# Id of the parish to inspect.
geom_id = '34'

map2 = folium.Map(location=[45.433, 12.329], tiles="CartoDB Positron", zoom_start=14.6)

# Parcels attributed to the inspected parish. The attribution is stored in the
# `parish_id` column; pn_and_geom has no `parish` column, so filtering on it
# raised an AttributeError.
parcels_in_parish = pn_and_geom[pn_and_geom.parish_id == geom_id]
for _, r in tqdm(parcels_in_parish.iterrows(), total=len(parcels_in_parish)):
    geo_j = gpd.GeoSeries(r["geometry"]).to_json()
    # A per-parcel style can be plugged in here, e.g.
    # style_function=lambda x: {"fillColor": color_dict[r['parcel_type']], 'color': color_dict[r['parcel_type']]}
    geo_j = folium.GeoJson(data=geo_j)
    folium.Popup(r["parcel_number"]).add_to(geo_j)
    geo_j.add_to(map2)

# Geometries of the inspected parish itself, filled in red so the parcels can
# be checked against them.
parish_rows = parishes[parishes.id == geom_id]
for _, r in tqdm(parish_rows.iterrows(), total=len(parish_rows)):
    geo_j = gpd.GeoSeries(r["geometry"]).to_json()
    geo_j = folium.GeoJson(data=geo_j, style_function=lambda x: {"fillColor": "#FF0000"})
    geo_j.add_to(map2)
map2
