In [1]:
import pandas as pd
# Load the pre-finished standardisation table (semicolon-separated CSV).
df = pd.read_csv(
    'isabella_manual_correction/sommarioni_standardisation_pre_finished_20250514.csv',
    sep=';',
)

# Adding the missing class to religious entities and grand schools.

In [2]:
# owner_standardised_class
# Owners that are linked to a Wikidata entity but carry no class are tagged as religious entities.
owner_class_missing = df['owner_standardised_class'].isna() & df['owner_wd'].notna()
without_class_ent = df[owner_class_missing]
df.loc[owner_class_missing, 'owner_standardised_class'] = "religious_entities"

# old_entity_standardized_class
# Both masks are computed up-front, before any value is written.
old_class_missing = df['old_entity_standardized_class'].isna() & df['old_entity_wd'].notna()
is_scuola_grande_smdc = df['old_entity_standardized'] == 'scuola grande santa maria della carità'
without_class_old = df[old_class_missing]
sucoal_grande_smdc = df[is_scuola_grande_smdc]  # (sic) variable name kept as in the original cell

# The generic default is applied only to rows that are NOT the scuola grande; previously the
# generic fill ran last and overwrote "scuole_grandi_entities" for every scuola row whose
# class was missing, which are exactly the rows this step is meant to classify.
df.loc[old_class_missing & ~is_scuola_grande_smdc, 'old_entity_standardized_class'] = "religious_entities"
df.loc[is_scuola_grande_smdc, 'old_entity_standardized_class'] = "scuole_grandi_entities"

# Misc. Replacements

In [3]:
# Drop the "basilica di" prefix of the Frari entry wherever that exact cell value occurs in the frame.
df.replace(
    to_replace="basilica di santa maria gloriosa dei frari",
    value='santa maria gloriosa dei frari',
    inplace=True,
)
# Misspelt quality keywords (key) and the spelling they are replaced with (value).
qualities_replace_d = dict(
    MAGAZZENO="MAGAZZINO",
    CORTO="CORTE",
    SCALLA="SCALA",
)


def quick_replace(x):
    """Apply every substitution of ``qualities_replace_d`` to ``x``.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    untouched; otherwise each key is replaced by its value as a plain
    substring replacement, in dictionary order.
    """
    if pd.isna(x):
        return x
    for misspelt, corrected in qualities_replace_d.items():
        x = x.replace(misspelt, corrected)
    return x

# Fix the misspelt keywords in the raw `qualities` strings.
df['qualities'] = df['qualities'].apply(quick_replace)
# "LAICO" is renamed to "SECULAR" in the owner type; missing values are left as they are.
df['owner_type'] = df['owner_type'].apply(
    lambda owner_type: owner_type if pd.isna(owner_type) else owner_type.replace("LAICO", "SECULAR")
)
# wrongly classified as "SCUOLA"
parcel_14_idx = df.index[df['parcel_number'] == '14'][0]
df.at[parcel_14_idx, 'ownership_types'] = ['PUBBLICO']

# Final formatting, removing unwanted columns and producing aggregated versions.

In [4]:
# fixing the normalized values in qualities (when doing manual correction, sometimes the quotes were not closed or different)
def fix_std_list(x):
    """Turn a hand-edited, list-like string into a real list of stripped strings.

    - a ``list`` is returned unchanged (it used to be turned into ``None``, and a
      list with several elements made ``pd.isna`` raise);
    - a ``str`` has every quote and square bracket removed, is split on commas and
      each item is stripped;
    - anything else (``None``, ``NaN``, numbers, ...) is returned unchanged.
    """
    if isinstance(x, list):
        # already in the target shape, e.g. a value patched by hand earlier in the notebook
        return x
    if isinstance(x, str):
        for unwanted in ("'", '"', "[", "]"):
            x = x.replace(unwanted, '')
        return [v.strip() for v in x.split(",")]
    return x

# Normalise both list-like columns.
for list_like_col in ('qualities', 'ownership_types'):
    df[list_like_col] = df[list_like_col].apply(fix_std_list)

# Entries whose qualities are overwritten by hand with the single value 'SCUOLA'.
# (unique_id 23443 -> ['LUOGO', 'SCUOLA', 'CAMPANILE'] was disabled in the original cell and stays disabled.)
for scuola_unique_id in (23518, 23476, 23487, 23491):
    scuola_row_idx = df.index[df['unique_id'] == scuola_unique_id][0]
    df.at[scuola_row_idx, 'qualities'] = ['SCUOLA']


def _fix_quality_spelling(quality):
    """Correct the two known misspellings inside a single quality value."""
    return quality.replace("RAFFINARIA", 'RAFFINERIA').replace("SAGRESTIA", 'SACRESTIA')


df['qualities'] = df['qualities'].apply(lambda vs: [_fix_quality_spelling(v) for v in vs])


In [5]:
from pathlib import Path
import geopandas as gpd
sommarioni_p = Path('../../1808_Sommarioni/')
# rglob yields files in a filesystem-dependent order: sort so that every run picks the same file,
# and fail with an explicit message instead of a bare IndexError when nothing matches.
geometry_candidates = sorted(sommarioni_p.rglob('sommarioni_geometries_202*.geojson'))
if not geometry_candidates:
    raise FileNotFoundError(
        f"no 'sommarioni_geometries_202*.geojson' file found under {sommarioni_p.resolve()}"
    )
# NOTE(review): when several files match, the first one in sorted order is used. Confirm whether
# the most recent one (last in sorted order) is the one actually wanted.
sommarioni_geo_fp = geometry_candidates[0]
gdf = gpd.read_file(sommarioni_geo_fp)
# -1 marks a geometry without identifier; every other id is stored as a string.
gdf['geometry_id'] = gdf['geometry_id'].apply(lambda v: None if v == -1 else str(int(v)))
gdf = gdf.rename({'parcel_type': 'geometry_type', 'parish_standardized': 'parish_standardised'}, axis=1)

# Adding the area in square meters of the geometries directly in the gdf instead of in the registry.
# EPSG:4326 (WGS 84) is in degrees, so the geometries must be projected before measuring them.
# Web Mercator (EPSG:3857) is NOT suitable for that: it stretches lengths by 1/cos(latitude), which
# roughly doubles areas at Venice's latitude (~45.4° N). The local UTM zone is used instead.
metric_crs = gdf.estimate_utm_crs()
gdf['area'] = gdf.to_crs(metric_crs)['geometry'].area
# the geometries themselves stay in (or are brought back to) WGS 84
gdf = gdf.to_crs(epsg=4326)

value_to_patch = gdf[gdf['parcel_number'] == '3607 1/2'].index[0]
# because of a subparcel number which is not referenced in the textual entries, we need to patch the geometry id of the parcel 3607 1/2 to the one of the parcel 3607
gdf.at[value_to_patch,'geometry_id'] = "8625"

# Incorporation & Homogenisation of the (yet again) new transcriptions:

In [6]:
# In the old transcription some parcel numbers starting with "K" were actually parcels
# starting with "R": they are removed to avoid collisions with the actual "K" parcels.
is_old_k_parcel = df['parcel_number'].str.startswith('K') & ~df['new_transcription']
removed_idx = df[is_old_k_parcel].index
df = df.drop(removed_idx)

df['owner_standardised'] = df['owner_standardised'].apply(
    lambda s: s.replace('basilica di san pietro di castello', 'chiesa di san pietro di castello')
    if isinstance(s, str) else s
)

# adding the missing transcriptions
dfp = pd.read_excel('isabella_manual_correction/lacking_transcriptions_20250527.xlsx')

# filter out rows with parcel number that have already been treated.
already_treated = dfp['parcel_number'].apply(lambda v: isinstance(v, str) and v.startswith('K'))
dfp = dfp[~already_treated]
# definitely not in the register: 4273
dfp = dfp.rename(columns={'LINK': 'page'})


def _split_qualities(x):
    """Split a comma-separated qualities string into upper-cased items; missing values pass through."""
    if pd.isna(x):
        return x
    return [v.strip().upper() for v in x.split(',') if v]


dfp['qualities'] = dfp['qualities'].apply(_split_qualities)  # ensure qualities are lists
# every new transcription starts with an empty list of ownership types
dfp['ownership_types'] = dfp['ownership_types'].apply(lambda _: [])

def find_geometry_id(v, pn):
    """Return the geometry id of a textual entry, looking it up in ``gdf`` when it is missing.

    Parameters
    ----------
    v : the geometry id already present on the row; returned as-is when it is not missing.
    pn : the parcel number of the row (``str``, ``int`` or ``float``).

    Returns
    -------
    ``v`` when it is not missing; otherwise the ``geometry_id`` (as a string) of the first
    geometry of ``gdf`` that has the same parcel number and a non-null ``geometry_id``.
    When no such geometry exists, or when the parcel number itself is missing, the
    (missing) ``v`` is returned unchanged.
    """
    if not pd.isna(v):
        return v
    if isinstance(pn, str):
        # if the parcel number has 'bis', we need to remove it to find the geometry id
        pn = pn.replace(' bis', '').strip()
    elif isinstance(pn, (float, int)):
        if pd.isna(pn):
            # no parcel number to search with (int(nan) would raise)
            return v
        pn = str(int(pn)).strip()
    # find the geometry id based on parcel_number; geometries without identifier hold None
    # in `geometry_id` and cannot be converted, so they are skipped.
    candidate_ids = gdf.loc[gdf['parcel_number'] == pn, 'geometry_id'].dropna()
    if candidate_ids.empty:
        return v
    return str(int(candidate_ids.iloc[0]))

# Fill the missing geometry ids from the vectorised geometries.
dfp['geometry_id'] = dfp.apply(
    lambda row: find_geometry_id(row['geometry_id'], row['parcel_number']),
    axis=1,
)


def _first_row_of_parcel(parcel_number):
    """Index label of the first `dfp` row carrying `parcel_number` (IndexError when there is none)."""
    return dfp[dfp['parcel_number'] == parcel_number].index[0]


def _parish_of_geometry(geometry_id):
    """Parish of the first geometry with this id; None for ids that are not strings."""
    if not isinstance(geometry_id, str):
        return None
    return gdf[gdf['geometry_id'] == geometry_id]['parish_standardised'].values[0]


# last remaining geometry_id to manually patch. (the vectorized geometry did not have the parcel number transcribed.)
dfp.at[_first_row_of_parcel('AI'), 'geometry_id'] = 16827
# the district of each new transcription is derived from the parish of its geometry
dfp['parish_standardised'] = dfp['geometry_id'].apply(_parish_of_geometry)
dfp['parish_standardised'].value_counts()  # inspection only: not displayed, as it is not the last expression

# NOTE(review): "San Severo" is mapped to "NSP" while the neighbouring entries use "NCS";
# confirm this is intended.
parish_to_district_accronym = {
    "Santa Maria Formosa": "NCS",
    "Santa Giustina": "NCS",
    "San Pietro di Castello": "NCS",
    "Santa Ternita": "NCS",
    "San Martin": "NCS",
    "San Giovanni in Bragora": "NCS",
    "San Marco": "NSM",
    "Sant'Antonino": "NCS",
    "San Ziminian": "NSM",
    "San Lio": "NCS",
    "San Giovanni Elmosinario": "NSP",
    "San Severo": "NSP",
    "San Provolo": "NCS",
    "Santa Maria Nova": "NCN",
    "Santa Marina": "NCS",
    "San Giovanni Grisostomo": "NCN",
    "San Giovanni Nuovo": "NCS",
    "San Cancian": "NCN",
    "San Marcilian": "NCN",
}
dfp['district_acronym'] = dfp['parish_standardised'].apply(
    lambda parish: parish_to_district_accronym.get(parish, None)
)
# the rows with parcel number "CA", "AI" and "DR" receive "NCS" by hand
for patched_parcel in ('CA', 'AI', 'DR'):
    dfp.at[_first_row_of_parcel(patched_parcel), 'district_acronym'] = "NCS"
dfp = dfp.drop(columns=['geometry_type', 'id', 'parish_standardised'])

In [7]:
# The new transcriptions receive consecutive unique ids appended to the existing range, so that
# the identifiers that were already generated keep their value.
# NOTE(review): the range starts AT the current maximum, so the first new row shares its
# unique_id with the existing row holding that maximum. Left unchanged because shifting the
# ids would alter identifiers that may already be referenced elsewhere; confirm.
max_unique_id = int(df['unique_id'].max())
new_unique_ids = range(max_unique_id, max_unique_id + len(dfp))
dfp['unique_id'] = new_unique_ids
dfp['new_transcription'] = True

# columns of `df` that have no counterpart in the new transcriptions: created empty
cols_not_in_new_transc = [
    "austrian_cadaster_correspondance",
    "austro_italian_cadaster_correspondance",
    "is_people",
    "llm_guess",
    "area",
]
for absent_col in cols_not_in_new_transc:
    dfp[absent_col] = None

# anything that is not already a list becomes an empty list
dfp['qualities'] = dfp['qualities'].apply(lambda v: v if isinstance(v, list) else [])
df = pd.concat([df, dfp], ignore_index=True)

In [8]:
def _lower_strip(value):
    """Lower-case and strip a string; any other value is returned unchanged."""
    return value.lower().strip() if isinstance(value, str) else value


for name_col in ('owner_standardised', 'old_entity_standardized'):
    df[name_col] = df[name_col].apply(_lower_strip)

# legacy class codes (key) and the class name that replaces them (value)
replace_vals = {
    "ent_SCR": 'social_care_entities',
    "ent_SCL_MST": 'scuole_mestieri_entities',
    "ent_SCL_REL": 'scuole_religious_entities',
    "ent_REL_UNL": 'religious_entities',  # UNL is for unlinkable, but we have found wikidata for most of those, so not applicable.
    "ent_JEW": 'jew_entities',
    "ent_VNZ": 'venezia_entities',
    "religious_titles_entities": 'religious_entities',
}

for class_col in ('old_entity_standardized_class', 'owner_standardised_class'):
    # values that are not a legacy code are kept as they are
    df[class_col] = df[class_col].apply(lambda x: replace_vals.get(x, x))


# British spelling for the "old entity" columns, consistent with the "owner" ones
rename_cols = {
    "old_entity_standardized": "old_entity_standardised",
    "old_entity_standardized_class": "old_entity_standardised_class",
}
df = df.rename(columns=rename_cols)

# two-letter district (sestiere) acronym -> district name
district_acronym_d = {
    "CN": "Cannaregio",
    "CS": "Castello",
    "SM": "San Marco",
    "DD": "Dorsoduro",
    "SP": "San Polo",
    "SC": "Santa Croce",  # was misspelt "San Croce"
    "CC": "Cannaregio",  # NOTE(review): presumably a variant of "CN" present in the data; confirm
}

def clean_district_acronym_occurrences(s):
    """Remove "<acronym> -", "<acronym> –" and "<acronym>-" prefixes/occurrences from ``s``.

    Missing values are returned unchanged. For every key of ``district_acronym_d``
    contained in ``s``, the three separator variants are removed one after the other
    and the result is stripped after each removal. A string containing none of the
    acronyms is returned exactly as received (not even stripped).
    """
    if pd.isna(s):
        return s
    for acronym in district_acronym_d:
        if acronym not in s:
            continue
        # hyphen with a space, en dash with a space, then a bare hyphen
        for separator in (' -', ' –', '-'):
            s = s.replace(acronym + separator, "").strip()
    return s

# The first character of the acronym is dropped (e.g. "NCS" -> "CS") before looking up the district name.
df['district'] = df['district_acronym'].apply(lambda acronym: district_acronym_d[acronym[1:]])
# the geometry id in the gdf is a string (so that it can hold null values): same type here
df['geometry_id'] = df['geometry_id'].astype(str)
df['place'] = df['place'].apply(clean_district_acronym_occurrences)
df = df.drop(columns=['district_acronym'])


def _name_to_wikidata(name_col, wd_col):
    """Map each standardised name to its Wikidata id, using only the rows where the id is set."""
    linked_rows = df[df[wd_col].notnull()][[name_col, wd_col]]
    return linked_rows.drop_duplicates().set_index(name_col)[wd_col].to_dict()


# Broadcasting the Wikidata ids: upon manual inspection some entries were not linked to any
# Wikidata entity although another entry with the same standardised name was, so every name
# gets the id known for it (from either the owner or the old-entity columns).
chiesa_to_wd_id = _name_to_wikidata('owner_standardised', 'owner_wd')
old_ent_chiesa_to_wd_id = _name_to_wikidata('old_entity_standardised', 'old_entity_wd')
# on conflict the id coming from the old-entity columns wins
chiesa_to_wd_id = {**chiesa_to_wd_id, **old_ent_chiesa_to_wd_id}
for name_col, wd_col in (
    ('owner_standardised', 'owner_wd'),
    ('old_entity_standardised', 'old_entity_wd'),
):
    # names without a known id end up with None
    df[wd_col] = df[name_col].apply(lambda name: chiesa_to_wd_id.get(name))


# translating the standardized values

In [9]:
# Switch on to (re)generate the CSV of standard values that has to be translated by hand.
produce_qualities_dictionnary_for_translation = False
# columns holding standardised (controlled-vocabulary) values
stand_cols = [
    'owner_right_of_use',
    'old_religious_entity_type',
    'old_owner_right_of_use',
    'qualities',
    'ownership_types',
]
if produce_qualities_dictionnary_for_translation:
    all_stand_values_and_cols_of_origin = []
    for col in stand_cols:
        present_values = df[col].dropna()
        if type(df[col].iloc[0]) is list:
            # list-valued column: flatten it to collect the distinct items
            all_stand_values = set()
            for vs in present_values:
                all_stand_values.update(vs)
        else:
            all_stand_values = present_values.unique()
        # one [column, value, empty translation] record per non-empty string value
        all_stand_values_and_cols_of_origin.extend(
            [col, v.strip(), ''] for v in all_stand_values if isinstance(v, str) and v
        )
    to_translate = pd.DataFrame(all_stand_values_and_cols_of_origin, columns=['column', 'value', 'translation'])
    to_translate.to_csv('sommarioni_standard_values_to_translate.csv', index=False, sep=',')

# hand-made translations: standard value -> English translation
dft = pd.read_csv('isabella_manual_correction/sommarioni_standard_values_translated.csv')
translation_values = dft.set_index('value')['translation'].to_dict()

def translate_value(v):
    """Translate a standard value, or each item of a list of them, with ``translation_values``.

    Strings and list items without a known translation are kept as they are;
    values that are neither a string nor a list are returned unchanged.
    """
    if isinstance(v, list):
        return [translation_values.get(item, item) for item in v]
    if isinstance(v, str):
        return translation_values.get(v, v)
    return v
    
# every standardised column gets an English twin suffixed with "_en"
for source_col in stand_cols:
    df[f"{source_col}_en"] = df[source_col].apply(translate_value)


# Fetching back the last manual corrections from Isabella + homogenisation of some fields

In [10]:
from ast import literal_eval
# Latest manually corrected version of the textual entries (the exported index column is dropped).
dfn = pd.read_csv(
    'isabella_manual_correction/venice_1808_landregister_textual_entries_internal_20250604.csv',
    sep=';',
).drop(columns=['Unnamed: 0'])

# standardised names to rename (key -> value)
replace_own_stand = {
    "basilica di san marco": "chiesa di san marco",
    "basilica di san pietro di castello": "chiesa di san pietro di castello",
    "basilica di santa maria della salute": "santa maria della salute",
}
# control value: its number of occurrences is printed before and after the renaming
t_val = 'basilica di san pietro di castello'
print((dfn['old_entity_standardised'] == t_val).sum())
for name_col in ('owner_standardised', 'old_entity_standardised'):
    dfn[name_col] = dfn[name_col].apply(lambda name: replace_own_stand.get(name, name))
print((dfn['old_entity_standardised'] == t_val).sum())

def broadcast_owner_value_to_standard_cols(df:pd.DataFrame, standardised_col:str, old_ent_standardised_col:str, see_change:bool = True) -> pd.DataFrame:
    """Fill two value columns from what is known for the same standardised name.

    A lookup "standardised name -> value" is built from the rows where
    ``standardised_col`` is set (keyed by ``owner_standardised``) and from the rows
    where ``old_ent_standardised_col`` is set (keyed by ``old_entity_standardised``);
    on conflict the old-entity value wins. Both value columns are then entirely
    rewritten from that lookup (names without a known value get ``None``).

    ``df`` is modified in place and also returned. When ``see_change`` is true the
    number of missing values of each column is printed before and after its rewrite.
    """
    def _known_values(name_col: str, value_col: str) -> dict:
        # rows where the value is set, reduced to distinct (name, value) pairs
        filled = df[df[value_col].notnull()][[name_col, value_col]]
        return filled.drop_duplicates().set_index(name_col)[value_col].to_dict()

    def _report_missing(column: str) -> None:
        if see_change:
            print('Missing vals in {}: {}'.format(column, df[df[column].isna()].shape[0]))

    # both lookups are built before any column is rewritten
    std_dict = {
        **_known_values('owner_standardised', standardised_col),
        **_known_values('old_entity_standardised', old_ent_standardised_col),
    }
    for name_col, value_col in (
        ('owner_standardised', standardised_col),
        ('old_entity_standardised', old_ent_standardised_col),
    ):
        _report_missing(value_col)
        df[value_col] = df[name_col].apply(lambda name: std_dict.get(name, None))
        _report_missing(value_col)
    return df

def broadcast_owner_value_to_single_col(df:pd.DataFrame, col:str, standardised_col:str) -> pd.DataFrame:
    """Rewrite ``standardised_col`` from the values known for the same ``col`` value.

    A lookup "``col`` value -> ``standardised_col`` value" is built from the rows where
    ``standardised_col`` is set, then the whole ``standardised_col`` is rewritten from it
    (``None`` when nothing is known). The number of missing values is printed before and
    after. ``df`` is modified in place and also returned.
    """
    def _missing_count() -> int:
        return df[df[standardised_col].isna()].shape[0]

    print('Missing vals in {}: {}'.format(standardised_col, _missing_count()))
    # distinct (key, value) pairs among the rows where the value is set
    filled_pairs = df[df[standardised_col].notnull()][[col, standardised_col]].drop_duplicates()
    col_to_std_id = filled_pairs.set_index(col)[standardised_col].to_dict()
    df[standardised_col] = df[col].apply(lambda key: col_to_std_id.get(key, None))
    print('Missing vals in {}: {}'.format(standardised_col, _missing_count()))
    return df

# Spread the class over every row sharing the same standardised name...
dfn = broadcast_owner_value_to_standard_cols(
    dfn, 'owner_standardised_class', 'old_entity_standardised_class'
)
# ...and do the same for the old religious entity type, in both languages.
for entity_type_col in ('old_religious_entity_type', 'old_religious_entity_type_en'):
    dfn = broadcast_owner_value_to_single_col(dfn, 'old_entity_standardised', entity_type_col)

dfn = dfn.rename(columns={'owner': 'owner_transcription'})
def quick_reg_format(x):
    """Format a "<register>/<page>" string as "<Register>, pg. <page>".

    "reg" is replaced by "Reg " in the register part and the page part is converted
    to an integer (which drops surrounding spaces and leading zeros). Non-string
    values are returned unchanged; a string that does not contain exactly one "/"
    raises ``ValueError``.
    """
    if not isinstance(x, str):
        return x
    reg, numb = x.split('/')
    page_no = int(numb.strip())
    register = reg.replace('reg', 'Reg ')
    return '{}, pg. {}'.format(register, page_no)

def _cite(page):
    """Archival citation built from the page reference; non-string values pass through."""
    if not isinstance(page, str):
        return page
    return 'ASVe, Censo Stabile, Catasto Napoleonico, Sommarioni, ' + quick_reg_format(page)


def _parse_list_literal(v):
    """Evaluate the textual form of a list read from the CSV; other values pass through."""
    return literal_eval(v) if isinstance(v, str) else v


def _without_comune(x):
    """Drop the 'COMUNE' / 'COMMUNE' items (case-insensitively) from a list; other values pass through."""
    if not isinstance(x, list):
        return x
    return [v for v in x if v.upper() != 'COMUNE' and v.upper() != 'COMMUNE']


def _upper(s):
    """Upper-case a string; other values pass through."""
    return s.upper() if isinstance(s, str) else s


dfn['citation'] = dfn['page'].apply(_cite)

for ownership_col in ('ownership_types', 'ownership_types_en'):
    dfn[ownership_col] = dfn[ownership_col].apply(_parse_list_literal)

## Removing "COMUNE" from the ownership_types, as it is not a valid ownership type. (similar information is already in the owner_standardised_class or owner_type)
for ownership_col in ('ownership_types', 'ownership_types_en'):
    dfn[ownership_col] = dfn[ownership_col].apply(_without_comune)

for qualities_col in ('qualities', 'qualities_en'):
    dfn[qualities_col] = dfn[qualities_col].apply(_parse_list_literal)

for entity_type_col in ('old_religious_entity_type', 'old_religious_entity_type_en'):
    dfn[entity_type_col] = dfn[entity_type_col].apply(_upper)


  dfn = pd.read_csv('isabella_manual_correction/venice_1808_landregister_textual_entries_internal_20250604.csv', sep=';').drop(columns=['Unnamed: 0'])


5
0
Missing vals in owner_standardised_class: 20681
Missing vals in owner_standardised_class: 20566
Missing vals in old_entity_standardised_class: 22662
Missing vals in old_entity_standardised_class: 22656
Missing vals in old_religious_entity_type: 23246
Missing vals in old_religious_entity_type: 23229
Missing vals in old_religious_entity_type_en: 23246
Missing vals in old_religious_entity_type_en: 23229


# New modifications to be made (23.06.25)

In [11]:
from functools import reduce
def _collect_distinct(seen, values):
    """Add the items of `values` to the set `seen`; falsy `values` (None, empty list) are skipped."""
    return seen.union(set(values)) if values else seen


# distinct ownership types (not displayed: only the last expression of a cell is)
reduce(_collect_distinct, df['ownership_types'].tolist(), set())

# distinct English ownership types
reduce(_collect_distinct, df['ownership_types_en'].tolist(), set())

{'', 'COMMUNE', 'OWN', 'PUBLIC', 'RENT'}

In [12]:
# Changing these in the translated values doesn't work: at this step of the pipeline the latest
# version of the data is based on Isabella's latest manual corrections, so the English
# qualities are patched directly.
special_qualities_replace = dict(
    [
        ("HOME", "HOUSE"),
        ("PLACES", "ADJACENT AREAS"),
        ("PLACE", "ADJACENT AREA"),
    ]
)

def replace_special_en_qualities(ownership_types: list[str]) -> list[str]:
    """Replace the items listed in ``special_qualities_replace`` by their new wording.

    Only whole items are replaced (exact match). Anything that is not a list is
    returned unchanged.
    """
    if isinstance(ownership_types, list):
        reworded = []
        for item in ownership_types:
            reworded.append(special_qualities_replace.get(item, item))
        return reworded
    return ownership_types

def add_value_if_not_present(lst, val) -> list[str]:
    """Return ``lst`` itself when it already contains ``val``, else a new list with ``val`` appended."""
    return lst if val in lst else lst + [val]

def add_public_to_ownership_types_when_venezia_entities_in_own_standardised_class(own_types: list[str], own_types_en: list[str], own_std_class: str) -> tuple[list[str], list[str]]:
    """Make sure parcels owned by a 'venezia_entities' owner are flagged as public.

    Returns the pair (ownership types, English ownership types):
    - unchanged when ``own_types`` is not a list or the class is not 'venezia_entities';
    - ``(['PUBBLICO'], ['PUBLIC'])`` when ``own_types`` only holds an empty string;
    - otherwise both lists with 'PUBBLICO' / 'PUBLIC' appended when absent.
    """
    concerns_venezia = isinstance(own_types, list) and own_std_class == 'venezia_entities'
    if not concerns_venezia:
        return own_types, own_types_en
    only_blank = len(own_types) == 1 and own_types[0] == ''
    if only_blank:
        return ['PUBBLICO'], ['PUBLIC']
    with_public = add_value_if_not_present(own_types, 'PUBBLICO')
    with_public_en = add_value_if_not_present(own_types_en, 'PUBLIC')
    return with_public, with_public_en

# one (ownership_types, ownership_types_en) pair per row, then split back into the two columns
_ownership_pairs = dfn.apply(
    lambda r: add_public_to_ownership_types_when_venezia_entities_in_own_standardised_class(
        r['ownership_types'], r['ownership_types_en'], r['owner_standardised_class']
    ),
    axis=1,
)
dfn['ownership_types'], dfn['ownership_types_en'] = zip(*_ownership_pairs)


def _strip_then_replace(column, old, new):
    """Strip the strings of `column` and replace `old` by `new` in them; other values are untouched."""
    dfn[column] = dfn[column].apply(lambda v: v.strip().replace(old, new) if isinstance(v, str) else v)


def _blank_out(column, unwanted):
    """Set to None the cells of `column` equal to `unwanted`."""
    dfn[column] = dfn[column].apply(lambda v: None if v == unwanted else v)


dfn['qualities_en'] = dfn['qualities_en'].apply(replace_special_en_qualities)
_strip_then_replace('owner_type', 'ISTITUZIONE PUBBLICA', 'SECULAR')

_strip_then_replace('old_religious_entity_type', 'ISTITUZIONE PUBBLICA', 'SECULAR')

# "BASILICA" is not kept as an entity type: removed from the pair, blanked when alone
_strip_then_replace('old_religious_entity_type', 'BASILICA, CONVENTO', 'CONVENTO')
_blank_out('old_religious_entity_type', "BASILICA")


_strip_then_replace('old_religious_entity_type_en', 'BASILICA, CONVENT', 'CONVENT')
_strip_then_replace('old_religious_entity_type_en', 'MONASTERO', 'MONASTERY')
_blank_out('old_religious_entity_type_en', "BASILICA")

# New geometry ids for the two "bis" parcels, and the geometries (by their `id`) belonging to each.
da_bis_geometry_id = 16869
db_bis_geometry_id = 16870
da_bis_ids = ["way/6125", "way/6126", "way/5319"]
db_bis_ids = ["way/12213", "way/12064", "way/12087", "way/12088"]

# matching "way/6125", "way/6126", "way/5319" to "DA bis" and
# "way/12213", "way/12064", "way/12087", "way/12088" to "DB bis" (each with a new geometry id)
for bis_parcel_number, bis_geometry_id, bis_way_ids in (
    ('DA bis', da_bis_geometry_id, da_bis_ids),
    ('DB bis', db_bis_geometry_id, db_bis_ids),
):
    for way_id in bis_way_ids:
        # geometry ids are stored as strings in the gdf (None when unidentified): writing an int
        # here would leave a column of mixed types.
        gdf.at[gdf[gdf['id'] == way_id].index[0], 'geometry_id'] = str(bis_geometry_id)
    # the textual entries keep a numeric geometry id
    dfn.at[dfn[dfn['parcel_number'] == bis_parcel_number].index[0], 'geometry_id'] = bis_geometry_id

In [13]:
# Parcel numbers of the portionless parcels: 9124-9151, 9172-9186 and 9192-9205.
# 9206 is deliberately left out: it is not portionless, it is the one listing all previous parcels.
portionless_vals = [
    *range(9124, 9152),
    *range(9172, 9187),
    *range(9192, 9206),
]
# for the visualisation in observable to work, adding the "[porzione di casa]" to the quality of the portionless parcels.
for pn in portionless_vals:
    idx = dfn[dfn['parcel_number'] == str(pn)].index[0]
    # `idx` is an index LABEL: it is read with .at (label-based) rather than .iloc (position-based),
    # which only pointed to the same row as long as dfn kept its default 0..n-1 index.
    curr_quality = dfn.at[idx, 'quality']
    dfn.at[idx, 'quality'] = (curr_quality + '[porzione di casa]') if isinstance(curr_quality, str) else '[porzione di casa]'


def clean_list_values(x):
    """Turn a list holding a single empty string into an empty list; return anything else as-is."""
    is_blank_singleton = isinstance(x, list) and len(x) == 1 and x[0] == ''
    return [] if is_blank_singleton else x

# [''] -> [] in every list-valued column
for list_col in ('qualities', 'qualities_en', 'ownership_types', 'ownership_types_en'):
    dfn[list_col] = dfn[list_col].apply(clean_list_values)

In [14]:
dfn = dfn.rename(columns={"page": "page_number"})
# columns that are only kept in the internal version
remove_cols = [
    "austrian_cadaster_correspondance",
    "austro_italian_cadaster_correspondance",
    "new_transcription",
    "page_number",
    "is_people",
    "llm_guess",
    "owner_type_en", # ended up being the same value as the base column.
    "area" # computed in the geometries side instead.
]
# Integer geometry ids that can still be missing. Converting the values one by one with int()
# has no effect as soon as a single NaN is present (pandas turns the column back into floats,
# so ids are exported as 123.0 and grouped under float keys); the nullable "Int64" dtype keeps
# real integers and exports missing ids as null.
dfn['geometry_id'] = pd.to_numeric(dfn['geometry_id']).astype('Int64')
# version with all the columns, for internal use, and for timeAtlas integration
dfn.to_json('../../1808_Sommarioni/venice_1808_landregister_textual_entries_internal_version.json', orient='records', indent=2, force_ascii=False)
df = dfn.drop(columns=remove_cols)
# for consistency with the people dataset.
df = df.rename(columns = {'unique_id': 'parcel_id'})
df.to_json('venice_1808_landregister_textual_entries.json', orient='records', indent=2, force_ascii=False)


# Producing the final and aggregated versions

## Adding back the people to the dataset 


In [15]:

### Generating a single geojson file with all the data for paul
sommarioni_people = pd.read_json('isabella_manual_correction/people_sommarioni_dataset.json').drop(columns=['edited', 'merged_ids'])
for id_col in ('uid', 'nucleus_uid'):
    sommarioni_people[id_col] = sommarioni_people[id_col].astype(int)
# `parcel_ids` is a comma-separated string: wrapped in brackets and evaluated, it gives a list
sommarioni_people['parcel_array'] = sommarioni_people['parcel_ids'].apply(lambda v: literal_eval('[' + v + ']'))

sommarioni_people = sommarioni_people.replace({'': None})


def _one_row_per_parcel(people):
    """Yield a copy of each person's row for every parcel of its `parcel_array`, with `parcel_id` set."""
    for _, person in people.iterrows():
        for parcel_id in person.parcel_array:
            person_for_parcel = person.copy()
            person_for_parcel['parcel_id'] = parcel_id
            yield person_for_parcel


people_duplicated = list(_one_row_per_parcel(sommarioni_people))

people_flattened = pd.DataFrame(people_duplicated).drop(columns=['parcel_array', 'parcel_ids'])
# every people column is prefixed with "own_"
rename_owner_dict = {column: 'own_' + column for column in people_flattened.columns.to_list()}
people_flattened = people_flattened.rename(columns=rename_owner_dict)
# standalone people export: the parsed list replaces the textual `parcel_ids`
people_export = sommarioni_people.drop(columns=['parcel_ids']).rename(columns={'parcel_array': 'parcel_ids'})
people_export.to_json('venice_1808_landregister_standardised_people.json', orient='records', indent=2, force_ascii=False)

In [16]:
# parcel id -> JSON string (list of records) of the people attached to that parcel
parcel_id_people_group = {
    parcel_id: people_of_parcel.drop(columns=['own_parcel_id']).to_json(orient='records', force_ascii=False)
    for parcel_id, people_of_parcel in people_flattened.groupby('own_parcel_id')
}


In [17]:

# The internal version has no differences with the public one; it is saved as well, for
# consistency, under the same name as the one used in the registry.
for geojson_path in (
    'venice_1808_landregister_geometries.geojson',
    '../../1808_Sommarioni/venice_1808_landregister_geometries_internal_version.geojson',
):
    gdf.to_file(geojson_path, driver='GeoJSON')

gdf = gdf.drop(columns=['parcel_number'])
# unidentified geometries are temporarily flagged with -1
gdf['geometry_id'] = gdf['geometry_id'].fillna(-1).astype(int)
max_idx = gdf['geometry_id'].max()

# for all geometry_id == -1, replacing them with a new index starting from the max_idx + 1
non_identified = gdf[gdf['geometry_id'] == -1]
first_free_id = max_idx + 1
gdf.loc[non_identified.index, 'geometry_id'] = list(range(first_free_id, first_free_id + len(non_identified)))


# this makes it so the geometries that are not grouped by parcel_number will be put in their own
# object in the dedicated aggregated JSON structure.
gdf['geometry_id'] = gdf['geometry_id'].astype(int).astype(str)
geometries_group = gdf.groupby('geometry_id')
# geometry id -> GeoJSON string of the geometries sharing it
geojson_per_geometry_id = geometries_group.apply(lambda group: group.drop(columns=['geometry_id']).to_json())
geometries_group_dict = {str(k): v for k, v in geojson_per_geometry_id.to_dict().items()}

In [21]:
import json
def _geometry_key(raw_key):
    """Textual form of a geometry id as used by `geometries_group_dict` ("123", never "123.0")."""
    try:
        return str(int(raw_key))
    except (TypeError, ValueError):
        return str(raw_key)


sommarioni_text_people_group = df.groupby('geometry_id')
# geometry id -> JSON string (list of records) of the textual entries attached to it.
# The groups are iterated instead of going through `groupby.apply`, so that the grouping column
# is always present in the group (its handling by `apply` differs between pandas versions), and
# the keys are normalised because a float key such as 123.0 would never match the "123" of the
# geometries.
sommarioni_text_people_group_dict = {
    _geometry_key(k): entries.drop(columns=['geometry_id']).to_json(orient='records')
    for k, entries in sommarioni_text_people_group
}
new_struct = []
for k, v in geometries_group_dict.items():
    text = json.loads(sommarioni_text_people_group_dict[k]) if k in sommarioni_text_people_group_dict else []
    people = []
    for t in text:
        # the parcel id is only needed to fetch the people: it is removed from EVERY textual
        # entry (it used to be removed from the last entry of each group only).
        parcel_id = t.pop('parcel_id', None)
        if parcel_id in parcel_id_people_group:
            # if the parcel_id is in the people group, we add the people to the text
            people.extend(json.loads(parcel_id_people_group[parcel_id]))
    new_struct.append({'geometries': json.loads(v), 'text': text, 'people': people})

# explicit UTF-8: the file holds non-ASCII characters (ensure_ascii=False) and the platform
# default encoding is not guaranteed to be able to encode them.
with open('venice_1808_landregister_aggregated_data.json', 'w', encoding='utf-8') as f:
    json.dump(new_struct, f, indent=2, ensure_ascii=False)