In [48]:
import geopandas as gpd

# Load the raw Catastici GeoJSON export with geopandas.
raw_geojson_path = "./data/raw/20240221_Catastici1741_Intermediate.geojson"
catastici = gpd.read_file(raw_geojson_path)

# Filtering the dataset

In [49]:
# Keep only the records whose owner_code is 'PPL' and whose owner_count is the
# string '1', then narrow the frame to the five columns used downstream.
kept_columns = ['owner_first_name', 'owner_family_name', 'function', 'place', 'an_rendi']
is_ppl_single_owner = catastici['owner_code'].eq('PPL') & catastici['owner_count'].eq('1')
catastici_ppl = catastici[is_ppl_single_owner][kept_columns]

Drop the rows where the owner's first name is not given, so as not to confuse the model further. -> 1273 rows

In [51]:
# Discard the records whose owner_first_name is the empty string.
has_first_name = catastici_ppl['owner_first_name'].ne('')
catastici_ppl = catastici_ppl[has_first_name]

Some first and family names contain a "|" followed by a family relationship, e.g. "angela patella paolo | _moglie". There are 118 such rows, so I am dropping them as well to have a cleaner dataset.

In [52]:
# Drop the rows whose first or family name contains a literal "|".
# regex=False matches the pipe as plain text, so no escaped pattern is needed
# (the previous '\|' in a non-raw string is an invalid escape sequence that
# newer Python versions warn about).
# na=False treats a missing name as "contains no pipe", so the ~ negation
# below always operates on real booleans.
first_name_has_pipe = catastici_ppl['owner_first_name'].str.contains('|', regex=False, na=False)
family_name_has_pipe = catastici_ppl['owner_family_name'].str.contains('|', regex=False, na=False)
# .copy() makes the filtered result an independent frame, so the in-place
# column assignments in the following cells do not raise SettingWithCopyWarning.
catastici_ppl = catastici_ppl[~first_name_has_pipe & ~family_name_has_pipe].copy()

The price is given in ducati if no currency is specified; otherwise a different currency is given explicitly.

In [53]:
# Rents recorded as the string 'nan' become '0'. Purely numeric rents then get
# the default ' ducati' unit appended; every other value is kept as written.
rent_income = catastici_ppl['an_rendi'].replace('nan', '0')
catastici_ppl['an_rendi'] = rent_income.mask(rent_income.str.isnumeric(), rent_income + ' ducati')

Set the nan values in the remaining 2 columns (function and place) to 'not given', so that the model also learns which information is missing.

In [54]:
# Replace the 'nan' placeholder with 'not given' in both descriptive columns.
for column_name in ('function', 'place'):
    is_missing = catastici_ppl[column_name].eq('nan')
    catastici_ppl.loc[is_missing, column_name] = 'not given'

In [55]:
# Lower-case the text of every column in one pass
# (uses the .str accessor, so each column must hold strings).
catastici_ppl = catastici_ppl.apply(lambda column: column.str.lower())

In [69]:
# Give the columns their final names, then export the cleaned table as CSV
# without the index column.
# rename() returns a new frame that is bound back to the same name, which
# avoids inplace=True and states the axis explicitly through columns=.
catastici_ppl = catastici_ppl.rename(columns={
    'owner_first_name': 'Owner_First_Name',
    'owner_family_name': 'Owner_Family_Name',
    'function': 'Property_Type',
    'place': 'Property_Location',
    'an_rendi': 'Rent_Income',
})
# Plain string literal: the path has no placeholders, so no f-string is needed.
catastici_ppl.to_csv('./data/clean/catastici.csv', index=False)