In [1]:
import json
import geopandas as gpd
import pandas as pd
import pyproj
from shapely.geometry import Point

# Catastici

In [204]:
# Load the raw Catastici 1741 records from the standardised JSON export.
# The encoding is stated explicitly: JSON is exchanged as UTF-8, and without
# it open() falls back to the platform's locale encoding, which can mis-decode
# any non-ASCII characters in the records.
with open('catastici_1741_STD.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Alternative source kept for reference (intermediate GeoJSON export):
# data = gpd.read_file("../data_20240221/raw/20240221_Catastici1741_Intermediate.geojson")

In [205]:
# Wrap the parsed JSON in a DataFrame, then take a first look at it:
# its dimensions, its column labels and three randomly drawn rows.
data = pd.DataFrame(data)
for summary in (data.shape, data.columns):
    print(summary)
display(data.sample(n=3))

(33297, 28)
Index(['uidx', 'id', 'owner_name', 'owner_code', 'owner_count',
       'owner_count_remark', 'owner_entity', 'owner_entity_group',
       'owner_first_name', 'owner_family_name', 'owner_family_group',
       'owner_title', 'owner_title_std', 'owner_mestiere',
       'owner_mestiere_std', 'ten_name', 'function', 'an_rendi', 'id_napo',
       'quantity_income', 'quality_income', 'author', 'place', 'parish',
       'sestiere', 'uid', 'path_img', 'geometry'],
      dtype='object')


Unnamed: 0,uidx,id,owner_name,owner_code,owner_count,owner_count_remark,owner_entity,owner_entity_group,owner_first_name,owner_family_name,...,id_napo,quantity_income,quality_income,author,place,parish,sestiere,uid,path_img,geometry
21152,21152,228,Fratelli Busca,PPL,2,2+,,,_fratelli,BUSCA,...,,,,Davide,Alli Bari,San Giacomo dall'Orio,SC,GDO-0228,438_SCroce/9_SGiacomoDallOrio/SGiacomoDallOrio...,POINT (290803.9453117192 5035349.934663499)
17846,17846,18,Domenico Vaccaro,PPL,1,,,,Domenico,VACCARO,...,,,,Francesca,Al Ponte di mezzo in Rio della Chiesa,San Trovaso,DD,TRV-0018b,439_Dorsoduro/6_STrovaso/STrovaso_2_17.png,POINT (290871.2582336992 5034313.463503406)
13778,13778,721,Antonio Altobello,PPL,1,,,,Antonio,ALTOBELLO,...,,,,Davide,Fondamenta de Tentori,San Raffael,DD,RAF-0721,439_Dorsoduro/2_SRaffael/SRaffael_45_710.png,POINT (290432.0517098375 5034706.454614096)


In [206]:
# Restrict the table to the attributes used by the rest of the notebook.
kept_columns = [
    'owner_code',
    'owner_count',
    'owner_first_name',
    'owner_family_name',
    'owner_mestiere',
    'ten_name',
    'function',
    'an_rendi',
    'place',
    'parish',
    'geometry',
]
data_short = data[kept_columns]

In [208]:
# numeric rent
# Keep a row only when its rent value is exactly a Python int (bools, floats
# and strings are all rejected by the exact type check).
has_int_rent = data_short['an_rendi'].apply(lambda rent: type(rent) is int)
data_short = data_short[has_int_rent]
# For string-typed input the equivalent filter would be:
# data_short = data_short[data_short['an_rendi'].str.isnumeric()]

In [209]:
# select single owners
# Only rows whose owner_count equals the integer 1 survive.
is_single_owner = data_short['owner_count'].eq(1)
data_short = data_short.loc[is_single_owner]
# For string-typed counts the equivalent filter would be:
# data_short = data_short[data_short['owner_count']=='1']
data_short.shape

(24460, 11)

In [210]:
# Select people only
# Rows are kept when owner_code is exactly 'PPL'.
is_person = data_short['owner_code'].eq('PPL')
data_short = data_short.loc[is_person]
data_short.shape

(19070, 11)

In [214]:
# clean function
# Each description is cut at the first standalone preposition "da" (or one of
# its articulated forms: dal, dai, dall', dallo, dalla, dalle, dagli), and what
# remains is split on " e " into the individual functions.
#
# The cut is anchored on word boundaries: a plain substring split on 'da'
# also fires inside ordinary words (e.g. "fondaco" would become "fon").
_DA_CLAUSE = r"(?s)\bda(?:l|i|ll|llo|lla|lle|gli)?\b.*"

function_text = (
    data_short['function']
    .fillna('')      # otherwise astype(str) would produce the literal text 'nan'/'None'
    .astype(str)
    .str.replace(_DA_CLAUSE, '', regex=True)
    .str.strip()
)
function_parts = function_text.str.split(' e ')

# NOTE(review): an empty description still yields one (empty) part, so its
# n_function is 1 — confirm whether 0 would be the better value.
# assign() returns a new frame, so no column is written into a filtered slice
# of the original table (which is what raises SettingWithCopyWarning).
data_short = data_short.assign(
    function=function_parts.str.join(', '),
    n_function=function_parts.str.len(),
)

In [215]:
# remove where | or _ in owner first name
# One character-class pattern covers both markers. It is a raw string because
# '\|' in a normal string literal is an invalid escape sequence.
# na=False treats a missing first name as "no marker": str.contains would
# otherwise return NaN for it, and a mask containing NaN cannot be negated.
has_marker = data_short['owner_first_name'].str.contains(r'[|_]', regex=True, na=False)
# .copy() detaches the result from the frame it was filtered from, so later
# column assignments act on an independent table.
data_short = data_short[~has_marker].copy()
data_short.shape

(18953, 12)

In [225]:
# clean geometry
def _parse_point_wkt(wkt):
    """Turn a ``'POINT (x y)'`` string into ``[x, y]`` as floats.

    The tokens are split on any run of whitespace, so repeated or leading
    spaces do not produce empty coordinates, and the values are converted to
    float so that downstream code receives numbers rather than text.
    """
    tokens = wkt.replace('POINT', '').replace('(', '').replace(')', '').split()
    return [float(token) for token in tokens]


# Build a new frame instead of mutating in place: the geometry column is
# replaced by the parsed pairs, and the two columns that were only needed for
# the earlier row filters are dropped.
data_short = (
    data_short
    .assign(geometry=data_short['geometry'].apply(_parse_point_wkt))
    .drop(columns=['owner_count', 'owner_code'])
)

In [226]:
# Spot-check five random rows after the geometry clean-up.
data_short.sample(n=5)

Unnamed: 0,owner_first_name,owner_family_name,owner_mestiere,ten_name,function,an_rendi,place,parish,geometry,n_function
13104,Alvise,PISANI,,Francesco Boschini,casa,10,Fossa Capera,San Raffael,"[290102.7505837445, 5034453.056417173]",1
10830,Andrea,ERIZZO,,Zuanne Angieli,"Casa, bottega",35,In Fondamenta vicin alla Chiesa,Sant'Antonino,"[292483.0843632889, 5034723.356146083]",2
1615,Elisabetta,ZINELLI,,Bortolo Dagnolo,casa a pepian,5,Calle del Centon,San Geremia,"[290591.3896869204, 5035830.948885622]",1
5327,Paolina,BADOER MOCENIGO,,Stefano Rubelli tentor,"casa in solaro, bottega",60,Calle del ponte del legno,Santa Maria Nova,"[292011.1163428106, 5035237.049365022]",2
31191,Giacomo,CORRER,,Alba Bonati primo,Casa in tre solari,30,Calle del Pestrin,San Mattio,"[291464.7446401383, 5035195.355419341]",1


In [232]:
# Built once and shared by every call. The original constructed both CRS
# objects and the Transformer for each row; the pyproj documentation
# recommends creating a Transformer once and reusing it.
# always_xy=True fixes the axis order to (x, y) in and (lon, lat) out.
_UTM33N_TO_WGS84 = pyproj.Transformer.from_crs(
    pyproj.CRS('EPSG:32633'),
    pyproj.CRS('EPSG:4326'),
    always_xy=True,
)


def convert_coor(geometry):
    """Reproject one coordinate pair from EPSG:32633 to EPSG:4326.

    Parameters
    ----------
    geometry : sequence of two numbers or numeric strings
        The ``[x, y]`` pair in the source CRS.

    Returns
    -------
    list
        ``[lon, lat]`` in EPSG:4326.

    Raises
    ------
    ValueError
        If ``geometry`` does not unpack into exactly two values, or a value
        cannot be converted to float.
    """
    x, y = geometry
    # Explicit float() accepts both numeric and string coordinates.
    lon, lat = _UTM33N_TO_WGS84.transform(float(x), float(y))
    return [lon, lat]


# assign() returns a new frame rather than writing into the existing one.
data_short = data_short.assign(geometry=data_short['geometry'].apply(convert_coor))

In [252]:
# Presentation step: readable column labels, lower-cased text, and the
# coordinate pair split into separate Longitude / Latitude columns.
new_columns = {
    "owner_first_name": "Owner_First_Name",
    "owner_family_name": "Owner_Family_Name",
    "owner_mestiere": "Owner_Profession",
    "ten_name": "Tenant_Name",
    "function": "Property_Functions",
    "an_rendi": "Rent_Price",
    "place": "Location",
    "parish": "Parish",
    "geometry": "Coordinates",
    "n_function": "Property_Functions_Count",
}
data_short = data_short.rename(columns=new_columns)

# Lower-case every string cell; anything that is not a str passes through.
# NOTE(review): DataFrame.applymap is deprecated from pandas 2.1 in favour of
# DataFrame.map — switch once the minimum pandas version allows it.
data_short = data_short.applymap(
    lambda cell: cell.lower() if isinstance(cell, str) else cell
)

# Each Coordinates cell is a two-element sequence: index 0 goes to Longitude,
# index 1 to Latitude, after which the pair itself is no longer needed.
coordinate_pairs = data_short['Coordinates']
data_short = data_short.assign(
    Longitude=coordinate_pairs.apply(lambda pair: pair[0]),
    Latitude=coordinate_pairs.apply(lambda pair: pair[1]),
).drop('Coordinates', axis=1)
data_short.sample(5)

Unnamed: 0,Owner_First_Name,Owner_Family_Name,Owner_Profession,Tenant_Name,Property_Functions,Rent_Price,Location,Parish,Property_Functions_Count,Longitude,Latitude
21249,costantin,franceschi,,francesco luchi,,14,in faccia al balin,san giacomo dall'orio,1,12.325729,45.440921
5092,maria celeste,calegari,,domenico dall'oglio,casa in soler,38,calle per andar in corte a ka' minio,santa maria maddalena,1,12.330736,45.442989
20333,teresa,rossi,,francesco pedretti,inviamento,9,calle della regina,san cassiano,1,12.331084,45.439616
31360,,corner,,francesco zanchi,casa primo soler,10,in calle di cornaro,san polo,1,12.329192,45.438028
6125,nicolo,dona,,piero tacchin,bottega di favro,26,calle di ca' zen,santi apostoli,1,12.337738,45.441838


In [253]:
# Row/column count of the table after renaming and splitting the coordinates.
data_short.shape

(18953, 11)

In [20]:
# List the distinct profession labels to inspect spelling variants.
data_short['Owner_Profession'].unique()

array([nan, 'avvocato', 'medico', 'procurator', 'mercante da legname',
       'orefice', 'illustrissimo medico', 'procuratore', 'fruttarola',
       'luganegher', 'dottor', 'nodaro veneto', 'avocato', 'frutariol',
       'economo', 'mercadante da chiodi', 'cuoco', 'ecconomo',
       'peruchier', 'specier da confetti', "mercante d'oro", 'tutrice',
       'erbarol', 'fabro'], dtype=object)

In [14]:
# Replace missing values with empty strings, then write the table to disk
# without the index column.
properties_csv = 'clean/properties.csv'
data_short = data_short.fillna(value='')
data_short.to_csv(properties_csv, index=False)

In [15]:
# Reload the properties table from the CSV file on disk.
# NOTE(review): with pandas' default NA handling, empty CSV fields presumably
# come back as NaN rather than '' — confirm that this is the intended state.
data_short = pd.read_csv('clean/properties.csv')

In [18]:
# Build point geometries from the Longitude / Latitude columns and export them.
# The CRS is declared explicitly (EPSG:4326, longitude/latitude); without it
# the GeoDataFrame carries no CRS and cannot be reprojected or checked against
# other layers. points_from_xy builds the whole geometry array in one call
# instead of creating Point objects one by one.
gdf = gpd.GeoDataFrame(
    data_short,
    geometry=gpd.points_from_xy(data_short['Longitude'], data_short['Latitude']),
    crs='EPSG:4326',
)
geojson_str = gdf.to_json()
gdf.to_file("clean/properties_geo.geojson", driver="GeoJSON")

# OSM

In [160]:
# Load the OSM export and separate each feature into its tag dictionary and
# its coordinates. The encoding is explicit because the file carries
# multilingual name tags, and open() would otherwise use the platform's
# locale encoding.
with open('export.geojson', 'r', encoding='utf-8') as f:
    data = json.load(f)['features']
data_p = [feature['properties'] for feature in data]
data_g = [feature['geometry']['coordinates'] for feature in data]

In [161]:
# Union of every tag key that appears on at least one feature.
all_keys = {tag for properties in data_p for tag in properties}
print(all_keys)

{'roof:orientation', 'name', 'visible_name', 'building:levels', 'name:sk', 'addr:country', 'svg:bezier', 'loc_name', 'lit', 'roof:material', 'name:cs', 'religion', 'historic', 'addr:postcode', 'roof:height', 'name:ru', 'url', 'email', 'name:fr', 'bicycle', 'roof:shape', 'abandoned', 'name:zh', 'name:he', 'source:date', 'horse', '@id', 'source', 'opening_hours', 'website', 'addr:city', 'addr:street', 'name:vec', 'wikipedia', 'tourism', 'name:pl', 'ele', 'style', 'name:pt', 'name:uk', 'place', 'name:lt', 'start_date', 'building', 'wikipedia:en', 'amenity', 'height', 'wheelchair', 'foot', 'name:it', 'name:ko', 'wikimedia_commons', 'denomination', 'old_name', 'name:da', 'surface', 'check_date', 'area', 'service_times', 'name:es', 'addr:housenumber', 'building:colour', '@geometry', 'roof:colour', 'addr:place', 'name:hu', 'name:ja', 'name:de', 'wikidata', 'name:en', 'layer', 'building:material', 'importance', 'type', 'note', 'alt_name', 'highway'}


In [162]:
# Reduce every feature to the handful of tags of interest (only those the
# feature actually has) and attach its coordinates under 'geometry'.
important_keys = ['name', 'religion', 'building', 'amenity', 'highway', 'place']
data_clean = []
for properties, coordinates in zip(data_p, data_g):
    record = {key: properties[key] for key in important_keys if key in properties}
    record['geometry'] = coordinates
    data_clean.append(record)
    

In [262]:
# One row per feature; tags a feature lacks become missing values.
data_exp = pd.DataFrame(data=data_clean)
data_exp.sample(n=5)

Unnamed: 0,religion,building,amenity,geometry,name,highway,place
173,christian,church,place_of_worship,"[12.3465285, 45.4355804]",Chiesa di Sant'Antonin,,
3,,,,"[12.3319525, 45.4342152]",Campo Sant'Anzolo,pedestrian,square
90,christian,church,place_of_worship,"[12.3327977, 45.4326004]",Santa Maria del Giglio o Zobenigo,,
158,christian,church,place_of_worship,"[12.3421972, 45.4392537]",Basilica dei santi Giovanni e Paolo,,
73,,,,"[12.3390051, 45.43605]",Campo della Guerra,pedestrian,


In [263]:
# Entity clean-up: derive one type label per feature, normalise the name and
# reduce the table to name / type / longitude / latitude.

# Drop features tagged religion == 'jewish'. .copy() makes the result an
# independent frame, so the .loc assignments below do not write into a slice
# of the previous table.
data_exp = data_exp[data_exp['religion'] != 'jewish'].copy()

# These named entities, and every feature tagged building == 'chapel', are
# recorded with building == 'church'.
church_names = [
    'Oratorio della Madre di Dio',
    'Chiesa di San Giovanni Battista ai Catecumeni',
    'Cappella di San Vio',
    "Scuola dell'Angelo custode",
]
is_church = data_exp['name'].isin(church_names) | (data_exp['building'] == 'chapel')
data_exp.loc[is_church, 'building'] = 'church'

# Type: 'square' for anything with a highway or place tag; a building tag, when
# present, takes precedence. (The earlier intermediate 'building' placeholder
# was always overwritten by this final assignment, so it is omitted.)
is_square = data_exp['highway'].notna() | data_exp['place'].notna()
data_exp.loc[is_square, 'type'] = 'square'
has_building = data_exp['building'].notna()
data_exp.loc[has_building, 'type'] = data_exp.loc[has_building, 'building']

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in recent pandas
# and may leave the frame unchanged, which would then break the lower-casing.
data_exp['name'] = data_exp['name'].fillna('').str.lower()

# NOTE(review): indexing [0] / [1] assumes every geometry is a single
# [lon, lat] pair (point features) — confirm the export holds no other types.
data_exp = (
    data_exp
    .assign(
        Longitude=data_exp['geometry'].apply(lambda pair: pair[0]),
        Latitude=data_exp['geometry'].apply(lambda pair: pair[1]),
    )
    .drop(columns=['geometry', 'highway', 'place', 'religion', 'building', 'amenity'])
    .rename(columns={"name": "Entity_Name", "type": "Entity_Type"})
)

In [264]:
# Dimensions of the entity table, followed by five random rows.
entity_shape = data_exp.shape
print(entity_shape)
data_exp.sample(n=5)

(192, 4)


Unnamed: 0,Entity_Name,Entity_Type,Longitude,Latitude
12,campo san salvador,square,12.33615,45.43688
123,santa teresa,church,12.316247,45.433146
63,,square,12.326952,45.437947
152,chiesa dei santi geremia e lucia,church,12.325317,45.442656
183,campo san stin,square,12.326952,45.437947


In [23]:
# Write the entity table to disk, omitting the index column.
data_exp.to_csv('clean/entities.csv', index=False)

In [24]:
# Reload the entity table from disk and export it as GeoJSON.
# NOTE(review): empty Entity_Name fields in the CSV presumably come back as NaN
# and are serialised as null — confirm whether '' is wanted instead.
data_exp = pd.read_csv('clean/entities.csv')

# The CRS is declared explicitly (EPSG:4326, longitude/latitude) so the layer
# is not written without a coordinate reference system. points_from_xy builds
# the geometry array in one call instead of one Point per row.
gdf = gpd.GeoDataFrame(
    data_exp,
    geometry=gpd.points_from_xy(data_exp['Longitude'], data_exp['Latitude']),
    crs='EPSG:4326',
)
geojson_str = gdf.to_json()
gdf.to_file("clean/entities_geo.geojson", driver="GeoJSON")
# Show only the beginning of the serialisation; printing the whole string
# floods the notebook output with every feature.
print(geojson_str[:500])

{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {"Entity_Name": null, "Entity_Type": "church", "Longitude": 12.3328817, "Latitude": 45.4463708}, "geometry": {"type": "Point", "coordinates": [12.3328817, 45.4463708]}}, {"id": "1", "type": "Feature", "properties": {"Entity_Name": "campiello della malvasia", "Entity_Type": "square", "Longitude": 12.3497984, "Latitude": 45.4329065}, "geometry": {"type": "Point", "coordinates": [12.3497984, 45.4329065]}}, {"id": "2", "type": "Feature", "properties": {"Entity_Name": "campo santa margherita", "Entity_Type": "square", "Longitude": 12.3234482, "Latitude": 45.434305}, "geometry": {"type": "Point", "coordinates": [12.3234482, 45.434305]}}, {"id": "3", "type": "Feature", "properties": {"Entity_Name": "campo sant'anzolo", "Entity_Type": "square", "Longitude": 12.3319525, "Latitude": 45.4342152}, "geometry": {"type": "Point", "coordinates": [12.3319525, 45.4342152]}}, {"id": "4", "type": "Feature", "properties