<h1>Retired People and “Retired Places”: Italy, Data Preprocessing<span class="tocSkip"></span></h1>
<div class="toc">
  <ol class="toc-item">
    <li>
      <span>
        <a href="#Loading-input-datasets" data-toc-modified-id="Loading-input-datasets-1">
          <span class="toc-item-num"></span>Loading input datasets
        </a>
      </span>
    </li>
    <li>
      <span>
        <a href="#Processing-housing-data" data-toc-modified-id="Processing-housing-data-2">
          <span class="toc-item-num"></span>Processing housing data
        </a>
      </span>
    </li>
    <li>
      <span>
        <a href="#Processing-population-data" data-toc-modified-id="Processing-population-data-3">
          <span class="toc-item-num"></span>Processing population data
        </a>
      </span>
    </li>
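    <li>
      <span>
        <a href="#Processing-geo-data" data-toc-modified-id="Processing-geo-data-4">
          <span class="toc-item-num"></span>Processing geo data
        </a>
      </span>
    </li>
    <li>
      <span>
        <a href="#Merging-into-final-dataset" data-toc-modified-id="Merging-into-final-dataset-5">
          <span class="toc-item-num"></span>Merging into final dataset
        </a>
      </span>
    </li>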
  </ol>
</div>



## Loading input datasets <a id="Loading-input-datasets"></a>



In [1]:
import pandas as pd
from pathlib import Path
import geopandas as gpd

print('Libraries are downloaded')

Libraries are downloaded


In [2]:
# Project root: resolve ".." (the parent of the current working directory)
# to an absolute path, then step up one more level.
PROJECT_ROOT = Path("..").resolve().parent

# Folder with the cleaned inputs read below and the outputs this notebook writes.
PROCESSED = PROJECT_ROOT.joinpath("data", "processed")

PROCESSED

PosixPath('/Users/eugenia/Desktop/Open Access/project/retired_places/data/processed')

In [3]:
# Tabular inputs: population by region and single year of age, and housing
# stock (occupied / unoccupied / total homes) by region.
df_population_it = pd.read_csv(PROCESSED / "pop_reg_it_clean.csv")
df_housing_it = pd.read_csv(PROCESSED / "homes_it_clean.csv")

# Spatial inputs: settlement points (the "places" layer of the GeoPackage)
# and the regional boundary polygons.
gdf_places_all = gpd.read_file(PROCESSED / "MED1_settlements_italy.gpkg", layer="places")
gdf_regions = gpd.read_file(PROCESSED / "italy_regions.geojson")

In [4]:
df_housing_it.head()

Unnamed: 0,region,homes_occupied,homes_unoccupied,homes_total
0,Piemonte,1964108,827768,2791876
1,Valle d'Aosta / Vallée d'Aoste,59616,75948,135564
2,Liguria,746686,431321,1178007
3,Lombardia,4415364,1184728,5600092
4,Trentino Alto Adige / Südtirol,463305,219888,683193


In [5]:
df_population_it.head(110)

Unnamed: 0,region_code,region,age,pop_male,pop_female,pop_total
0,13,Abruzzo,0.0,3842.0,3577.0,7419.0
1,13,Abruzzo,1.0,4010.0,3653.0,7663.0
2,13,Abruzzo,2.0,4260.0,3873.0,8133.0
3,13,Abruzzo,3.0,4298.0,4163.0,8461.0
4,13,Abruzzo,4.0,4456.0,4163.0,8619.0
...,...,...,...,...,...,...
105,17,Basilicata,3.0,1755.0,1608.0,3363.0
106,17,Basilicata,4.0,1864.0,1674.0,3538.0
107,17,Basilicata,5.0,1929.0,1749.0,3678.0
108,17,Basilicata,6.0,1904.0,1819.0,3723.0


In [6]:
gdf_places_all.head()

Unnamed: 0,osm_id,code,fclass,population,name,geometry
0,59889482,1004,hamlet,0,Poggio,POINT (8.72158 46.0813)
1,59943000,1004,hamlet,0,Virasca,POINT (8.65932 45.94051)
2,60005756,1003,village,0,Folsogno,POINT (8.55015 46.13431)
3,60169984,1003,village,0,Sant'Agata,POINT (8.68572 46.07089)
4,62505581,1001,city,1350000,Milano,POINT (9.18963 45.46419)


In [7]:
gdf_regions.head()

Unnamed: 0,COD_RIP,COD_REG,DEN_REG,Shape_Leng,Shape_Area,geometry
0,1,1,Piemonte,1204253.0,25388840000.0,"POLYGON ((8.44549 46.46371, 8.44654 46.46348, ..."
1,1,2,Valle d'Aosta,306881.1,3260045000.0,"POLYGON ((7.57478 45.98748, 7.57762 45.98646, ..."
2,1,3,Lombardia,1380027.0,23861280000.0,"MULTIPOLYGON (((8.81427 45.01739, 8.81351 45.0..."
3,2,4,Trentino-Alto Adige,781801.8,13606180000.0,"POLYGON ((12.20868 47.08622, 12.21624 47.08546..."
4,2,5,Veneto,1033420.0,18353820000.0,"POLYGON ((12.51368 46.67811, 12.51964 46.67677..."


## Processing housing data <a id="Processing-housing-data"></a>

In [8]:
df_housing_it

Unnamed: 0,region,homes_occupied,homes_unoccupied,homes_total
0,Piemonte,1964108,827768,2791876
1,Valle d'Aosta / Vallée d'Aoste,59616,75948,135564
2,Liguria,746686,431321,1178007
3,Lombardia,4415364,1184728,5600092
4,Trentino Alto Adige / Südtirol,463305,219888,683193
5,Provincia Autonoma Bolzano / Bozen,226675,67100,293775
6,Provincia Autonoma Trento,236630,152788,389418
7,Veneto,2076568,584378,2660946
8,Friuli-Venezia Giulia,557109,173363,730472
9,Emilia-Romagna,1993088,554077,2547165


In [9]:
# normalize region names
def normalize_region_name(s: str) -> str:
    """Return a region label with uniform separators.

    Leading/trailing whitespace is stripped, en dashes become plain hyphens,
    and the spaces around "/" and "-" separators are removed, e.g.
    "Valle d'Aosta / Vallée d'Aoste" -> "Valle d'Aosta/Vallée d'Aoste".

    Parameters
    ----------
    s : str
        Raw region label.

    Returns
    -------
    str
        The normalized label.
    """
    s = s.strip()
    # Convert en dashes first: done last, a spaced en dash ("A – B") would
    # become "A - B" after the " - " collapse had already run, leaving spaces.
    s = s.replace("–", "-")
    s = s.replace(" / ", "/")
    s = s.replace(" - ", "-")
    return s

# Store the cleaned label in a new column; the raw "region" column is kept as-is.
df_housing_it["region_norm"] = df_housing_it["region"].astype(str).map(normalize_region_name)

df_housing_it

Unnamed: 0,region,homes_occupied,homes_unoccupied,homes_total,region_norm
0,Piemonte,1964108,827768,2791876,Piemonte
1,Valle d'Aosta / Vallée d'Aoste,59616,75948,135564,Valle d'Aosta/Vallée d'Aoste
2,Liguria,746686,431321,1178007,Liguria
3,Lombardia,4415364,1184728,5600092,Lombardia
4,Trentino Alto Adige / Südtirol,463305,219888,683193,Trentino Alto Adige/Südtirol
5,Provincia Autonoma Bolzano / Bozen,226675,67100,293775,Provincia Autonoma Bolzano/Bozen
6,Provincia Autonoma Trento,236630,152788,389418,Provincia Autonoma Trento
7,Veneto,2076568,584378,2660946,Veneto
8,Friuli-Venezia Giulia,557109,173363,730472,Friuli-Venezia Giulia
9,Emilia-Romagna,1993088,554077,2547165,Emilia-Romagna


In [10]:
# Lookup from region name to numeric region code, used as the join key between
# datasets. The names are listed in code order, so the code is simply each
# name's 1-based position (Piemonte = 1 ... Sardegna = 20).
region_codes = {
    name: code
    for code, name in enumerate(
        (
            "Piemonte",
            "Valle d'Aosta/Vallée d'Aoste",
            "Lombardia",
            "Trentino-Alto Adige/Südtirol",
            "Veneto",
            "Friuli-Venezia Giulia",
            "Liguria",
            "Emilia-Romagna",
            "Toscana",
            "Umbria",
            "Marche",
            "Lazio",
            "Abruzzo",
            "Molise",
            "Campania",
            "Puglia",
            "Basilicata",
            "Calabria",
            "Sicilia",
            "Sardegna",
        ),
        start=1,
    )
}

# Region code mapping: look up each normalized name in region_codes.
# Names that are not keys of the dictionary come out as NaN.
df_housing_it["region_code"] = df_housing_it["region_norm"].map(region_codes)

In [11]:
# check: rows whose normalized name did not receive a region code
print(df_housing_it.loc[df_housing_it["region_code"].isna(), ["region", "region_norm"]].head())

                                 region                       region_norm
4      Trentino Alto Adige / Südtirol        Trentino Alto Adige/Südtirol
5  Provincia Autonoma Bolzano / Bozen    Provincia Autonoma Bolzano/Bozen
6           Provincia Autonoma Trento           Provincia Autonoma Trento


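The data providers disaggregate Trentino-Alto Adige/Südtirol into three separate rows: the whole region plus its two autonomous provinces, Bolzano/Bozen and Trento. None of the three matched a key in our region_codes dictionary: the regional row because the source spells the name without a hyphen ("Trentino Alto Adige/Südtirol"), and the two provinces because they are not regions. We therefore add these three names to the dictionary manually, all mapped to the region's ISTAT code (4).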

In [12]:
# Register the three labels the housing source uses for Trentino-Alto Adige
# (the region spelled without a hyphen, plus its two autonomous provinces),
# all under region code 4.
region_codes.update(
    dict.fromkeys(
        [
            "Trentino Alto Adige/Südtirol",
            "Provincia Autonoma Bolzano/Bozen",
            "Provincia Autonoma Trento",
        ],
        4,
    )
)

# Redo the lookup now that the dictionary covers every label.
df_housing_it["region_code"] = df_housing_it["region_norm"].map(region_codes)

In [13]:
df_housing_it

Unnamed: 0,region,homes_occupied,homes_unoccupied,homes_total,region_norm,region_code
0,Piemonte,1964108,827768,2791876,Piemonte,1
1,Valle d'Aosta / Vallée d'Aoste,59616,75948,135564,Valle d'Aosta/Vallée d'Aoste,2
2,Liguria,746686,431321,1178007,Liguria,7
3,Lombardia,4415364,1184728,5600092,Lombardia,3
4,Trentino Alto Adige / Südtirol,463305,219888,683193,Trentino Alto Adige/Südtirol,4
5,Provincia Autonoma Bolzano / Bozen,226675,67100,293775,Provincia Autonoma Bolzano/Bozen,4
6,Provincia Autonoma Trento,236630,152788,389418,Provincia Autonoma Trento,4
7,Veneto,2076568,584378,2660946,Veneto,5
8,Friuli-Venezia Giulia,557109,173363,730472,Friuli-Venezia Giulia,6
9,Emilia-Romagna,1993088,554077,2547165,Emilia-Romagna,8


We drop the rows for Provincia Autonoma Bolzano/Bozen and Provincia Autonoma Trento because their values sum exactly to the regional total for Trentino-Alto Adige/Südtirol (for example, 293,775 + 389,418 = 683,193 total homes). Keeping all three rows would double-count the same housing units in our regional-level analysis.

In [14]:
# True for the autonomous-province rows (missing names count as "no match").
mask = df_housing_it["region_norm"].str.contains("Provincia Autonoma", na=False)

# Keep everything else and renumber the rows from 0.
df_housing_it = df_housing_it.loc[~mask].reset_index(drop=True)

df_housing_it

Unnamed: 0,region,homes_occupied,homes_unoccupied,homes_total,region_norm,region_code
0,Piemonte,1964108,827768,2791876,Piemonte,1
1,Valle d'Aosta / Vallée d'Aoste,59616,75948,135564,Valle d'Aosta/Vallée d'Aoste,2
2,Liguria,746686,431321,1178007,Liguria,7
3,Lombardia,4415364,1184728,5600092,Lombardia,3
4,Trentino Alto Adige / Südtirol,463305,219888,683193,Trentino Alto Adige/Südtirol,4
5,Veneto,2076568,584378,2660946,Veneto,5
6,Friuli-Venezia Giulia,557109,173363,730472,Friuli-Venezia Giulia,6
7,Emilia-Romagna,1993088,554077,2547165,Emilia-Romagna,8
8,Toscana,1627013,506892,2133905,Toscana,9
9,Umbria,376747,126922,503669,Umbria,10


In [15]:
# Unoccupied homes as a percentage of all homes in the region.
df_housing_it["share_unoccupied"] = (
    df_housing_it["homes_unoccupied"].div(df_housing_it["homes_total"]).mul(100)
)

df_housing_it

Unnamed: 0,region,homes_occupied,homes_unoccupied,homes_total,region_norm,region_code,share_unoccupied
0,Piemonte,1964108,827768,2791876,Piemonte,1,29.649168
1,Valle d'Aosta / Vallée d'Aoste,59616,75948,135564,Valle d'Aosta/Vallée d'Aoste,2,56.023723
2,Liguria,746686,431321,1178007,Liguria,7,36.614468
3,Lombardia,4415364,1184728,5600092,Lombardia,3,21.15551
4,Trentino Alto Adige / Südtirol,463305,219888,683193,Trentino Alto Adige/Südtirol,4,32.185341
5,Veneto,2076568,584378,2660946,Veneto,5,21.961287
6,Friuli-Venezia Giulia,557109,173363,730472,Friuli-Venezia Giulia,6,23.733011
7,Emilia-Romagna,1993088,554077,2547165,Emilia-Romagna,8,21.752694
8,Toscana,1627013,506892,2133905,Toscana,9,23.754197
9,Umbria,376747,126922,503669,Umbria,10,25.199486


In [16]:
# Write the regional housing table (including share_unoccupied) to the
# processed-data folder; index=False leaves the row index out of the file.
it_free_homes = PROCESSED / "MD1_share_houses_occupation.csv"
df_housing_it.to_csv(it_free_homes, index=False)

print(f"saved to: {it_free_homes}")

saved to: /Users/eugenia/Desktop/Open Access/project/retired_places/data/processed/MD1_share_houses_occupation.csv


## Processing population data <a id="Processing-population-data"></a>


In [17]:
# Rows with age == 999 carry each region's total population rather than a
# single year of age; pull them out into their own table.
it_total_age = df_population_it.loc[df_population_it["age"].eq(999)]
it_total_age

Unnamed: 0,region_code,region,age,pop_male,pop_female,pop_total
101,13,Abruzzo,999.0,622936.0,645494.0,1268430.0
203,17,Basilicata,999.0,262604.0,267293.0,529897.0
305,18,Calabria,999.0,899712.0,932435.0,1832147.0
407,15,Campania,999.0,2726809.0,2848216.0,5575025.0
509,8,Emilia-Romagna,999.0,2194241.0,2271437.0,4465678.0
611,6,Friuli-Venezia Giulia,999.0,584758.0,609337.0,1194095.0
713,12,Lazio,999.0,2771470.0,2938802.0,5710272.0
815,7,Liguria,999.0,731614.0,778294.0,1509908.0
917,3,Lombardia,999.0,4946391.0,5089090.0,10035481.0
1019,11,Marche,999.0,726773.0,754479.0,1481252.0


In [18]:
# Attach each region's total population (taken from the age == 999 rows held
# in it_total_age) to every age row as a separate column.
totals_for_merge = (
    it_total_age[["region_code", "pop_total"]]
    .rename(columns={"pop_total": "pop_total_all_ages"})
)

df_population_it = (
    df_population_it
    # Makes the cell safe to re-run: without this, a second run would create
    # pop_total_all_ages_x / pop_total_all_ages_y instead of a single column.
    .drop(columns=["pop_total_all_ages"], errors="ignore")
    .merge(
        totals_for_merge,
        on="region_code",
        how="left",
        # Fail loudly if a region has more than one totals row; otherwise the
        # join would silently duplicate every age row and inflate later sums.
        validate="many_to_one",
    )
)

df_population_it

Unnamed: 0,region_code,region,age,pop_male,pop_female,pop_total,pop_total_all_ages
0,13,Abruzzo,0.0,3842.0,3577.0,7419.0,1268430.0
1,13,Abruzzo,1.0,4010.0,3653.0,7663.0,1268430.0
2,13,Abruzzo,2.0,4260.0,3873.0,8133.0,1268430.0
3,13,Abruzzo,3.0,4298.0,4163.0,8461.0,1268430.0
4,13,Abruzzo,4.0,4456.0,4163.0,8619.0,1268430.0
...,...,...,...,...,...,...,...
2035,5,Veneto,97.0,601.0,2244.0,2845.0,4851851.0
2036,5,Veneto,98.0,350.0,1524.0,1874.0,4851851.0
2037,5,Veneto,99.0,221.0,1023.0,1244.0,4851851.0
2038,5,Veneto,100.0,267.0,1654.0,1921.0,4851851.0


In [19]:
# Drop the age == 999 totals rows, then add a helper column holding the row's
# population when age >= 65 and 0 otherwise (summed per region further down).
df_population_it = (
    df_population_it
    .loc[df_population_it["age"].ne(999)]
    .assign(
        pop_65plus_tmp=lambda frame: frame["pop_total"].where(frame["age"].ge(65), 0)
    )
)

df_population_it.head(70)

Unnamed: 0,region_code,region,age,pop_male,pop_female,pop_total,pop_total_all_ages,pop_65plus_tmp
0,13,Abruzzo,0.0,3842.0,3577.0,7419.0,1268430.0,0.0
1,13,Abruzzo,1.0,4010.0,3653.0,7663.0,1268430.0,0.0
2,13,Abruzzo,2.0,4260.0,3873.0,8133.0,1268430.0,0.0
3,13,Abruzzo,3.0,4298.0,4163.0,8461.0,1268430.0,0.0
4,13,Abruzzo,4.0,4456.0,4163.0,8619.0,1268430.0,0.0
...,...,...,...,...,...,...,...,...
65,13,Abruzzo,65.0,8625.0,9412.0,18037.0,1268430.0,18037.0
66,13,Abruzzo,66.0,8259.0,9010.0,17269.0,1268430.0,17269.0
67,13,Abruzzo,67.0,8105.0,8812.0,16917.0,1268430.0,16917.0
68,13,Abruzzo,68.0,8100.0,8573.0,16673.0,1268430.0,16673.0


In [20]:
# One row per region: the 65+ population (sum of the helper column) and the
# total population (the same value on every row of a region, so take the first).
df_region_65_it = df_population_it.groupby(
    ["region_code", "region"], as_index=False
).agg(
    pop_65plus=pd.NamedAgg(column="pop_65plus_tmp", aggfunc="sum"),
    tot_pop=pd.NamedAgg(column="pop_total_all_ages", aggfunc="first"),
)

df_region_65_it.head()

Unnamed: 0,region_code,region,pop_65plus,tot_pop
0,1,Piemonte,1142793.0,4255702.0
1,2,Valle d'Aosta/Vallée d'Aoste,31686.0,122714.0
2,3,Lombardia,2394067.0,10035481.0
3,4,Trentino-Alto Adige/Südtirol,244707.0,1086095.0
4,5,Veneto,1208173.0,4851851.0


In [21]:
# Residents aged 65+ as a percentage of the region's total population.
df_region_65_it["share_65plus"] = (
    df_region_65_it["pop_65plus"].div(df_region_65_it["tot_pop"]).mul(100)
)

In [22]:
df_region_65_it.head(20)

Unnamed: 0,region_code,region,pop_65plus,tot_pop,share_65plus
0,1,Piemonte,1142793.0,4255702.0,26.85322
1,2,Valle d'Aosta/Vallée d'Aoste,31686.0,122714.0,25.821015
2,3,Lombardia,2394067.0,10035481.0,23.856026
3,4,Trentino-Alto Adige/Südtirol,244707.0,1086095.0,22.530902
4,5,Veneto,1208173.0,4851851.0,24.90128
5,6,Friuli-Venezia Giulia,328115.0,1194095.0,27.478132
6,7,Liguria,440645.0,1509908.0,29.183566
7,8,Emilia-Romagna,1112536.0,4465678.0,24.913037
8,9,Toscana,977876.0,3660834.0,26.711837
9,10,Umbria,232730.0,851954.0,27.317203


In [23]:
# Write the regional 65+ table to the processed-data folder;
# index=False leaves the row index out of the file.
it_65_path = PROCESSED / "MD2_share_65_plus.csv"
df_region_65_it.to_csv(it_65_path, index=False)

print(f"saved to: {it_65_path}")

saved to: /Users/eugenia/Desktop/Open Access/project/retired_places/data/processed/MD2_share_65_plus.csv


## Processing geo data <a id="Processing-geo-data"></a>


We want to compute a Dispersed Settlements Index. To do so, we retain from the places dataset only the features tagged as place=village and place=hamlet, which correspond to small settlements below 10,000 inhabitants. We explicitly exclude larger centres (city, town), intra-urban units (suburb), and non-settlement features (island, farm, region, county, locality). This keeps the index focused on the fine-grained rural and semi-rural settlement pattern rather than on the higher-order urban hierarchy.

In [24]:
# Keep only villages and hamlets, and only the columns needed downstream.
gdf_small_places = gdf_places_all.loc[
    gdf_places_all["fclass"].isin(["village", "hamlet"]),
    ["fclass", "name", "geometry"],
].copy()

# check: size before/after the filter and the split between the two classes
print("All places:", len(gdf_places_all))
print("Only village + hamlet:", len(gdf_small_places))
print(gdf_small_places["fclass"].value_counts())

All places: 141922
Only village + hamlet: 62849
fclass
hamlet     48131
village    14718
Name: count, dtype: int64


In [25]:
gdf_small_places.head()

Unnamed: 0,fclass,name,geometry
0,hamlet,Poggio,POINT (8.72158 46.0813)
1,hamlet,Virasca,POINT (8.65932 45.94051)
2,village,Folsogno,POINT (8.55015 46.13431)
3,village,Sant'Agata,POINT (8.68572 46.07089)
13,village,Bedulita,POINT (9.55155 45.79165)


In [26]:
gdf_regions.columns

Index(['COD_RIP', 'COD_REG', 'DEN_REG', 'Shape_Leng', 'Shape_Area',
       'geometry'],
      dtype='object')

In [27]:
# Keep only the region code, region name and polygon geometry.
gdf_regions_min = gdf_regions[["COD_REG", "DEN_REG", "geometry"]].copy()

# A spatial join compares raw coordinates, so both layers must be in the same
# coordinate reference system. Reproject the regions onto the settlements' CRS
# when both are defined and they differ; otherwise leave the layers as they are.
if (
    gdf_small_places.crs is not None
    and gdf_regions_min.crs is not None
    and gdf_regions_min.crs != gdf_small_places.crs
):
    gdf_regions_min = gdf_regions_min.to_crs(gdf_small_places.crs)

# Spatial join: give every settlement point the code and name of the region
# polygon it lies within. how="left" keeps points that match no polygon
# (their COD_REG / DEN_REG are missing), so they can be counted afterwards.
gdf_places_with_region = gpd.sjoin(
    gdf_small_places,
    gdf_regions_min,
    how="left",
    predicate="within",
)

# check
gdf_places_with_region.tail()

Unnamed: 0,fclass,name,geometry,index_right,COD_REG,DEN_REG
141838,hamlet,Su Tillio,POINT (9.78144 40.5474),19.0,20.0,Sardegna
141839,hamlet,Santa Lcuia,POINT (8.61482 39.98251),19.0,20.0,Sardegna
141881,hamlet,San Leonardo Sottano,POINT (15.09094 37.35183),18.0,19.0,Sicilia
141885,hamlet,Cicogne,POINT (14.57531 37.0186),18.0,19.0,Sicilia
141895,hamlet,Borgo 84,POINT (15.11854 36.72397),18.0,19.0,Sicilia


In [28]:
# Settlements that were not matched to any region polygon (no region code).
missing_region = gdf_places_with_region.loc[gdf_places_with_region["COD_REG"].isna()]
print("Settlements without a region:", len(missing_region))

Settlements without a region: 179


A small number of settlement points (179) could not be matched to any regional polygon (likely due to minor boundary and coastline mismatches). Since our analysis is conducted at the regional level, we dropped these unmatched points and computed all indicators only for settlements that fall within an official regional boundary.

In [29]:
# One pass over the joined table:
#   1. drop settlements that matched no region,
#   2. rename the join columns to the names used in the other tables,
#   3. cast the region code to integer,
#   4. remove the index_right helper column added by the spatial join.
gdf_places_with_region = (
    gdf_places_with_region
    .dropna(subset=["COD_REG"])
    .rename(columns={"COD_REG": "region_code", "DEN_REG": "region"})
    .assign(region_code=lambda gdf: gdf["region_code"].astype(int))
    .drop(columns=["index_right"])
)

# check
gdf_places_with_region.head()

Unnamed: 0,fclass,name,geometry,region_code,region
0,hamlet,Poggio,POINT (8.72158 46.0813),3,Lombardia
1,hamlet,Virasca,POINT (8.65932 45.94051),3,Lombardia
2,village,Folsogno,POINT (8.55015 46.13431),1,Piemonte
3,village,Sant'Agata,POINT (8.68572 46.07089),1,Piemonte
13,village,Bedulita,POINT (9.55155 45.79165),3,Lombardia


In [30]:
# Number of villages/hamlets per region: group sizes, flattened back into a
# plain table with the count stored in "settlements_count".
df_settlements_region = (
    gdf_places_with_region
    .groupby(["region_code", "region"])
    .size()
    .reset_index(name="settlements_count")
)

df_settlements_region.head()

Unnamed: 0,region_code,region,settlements_count
0,1,Piemonte,9220
1,2,Valle d'Aosta,2901
2,3,Lombardia,8531
3,4,Trentino-Alto Adige,2625
4,5,Veneto,5888


In [31]:
# Save the per-region settlement counts to the processed-data folder;
# index=False leaves the row index out of the file.
settlements_path = PROCESSED / "MD3_settlements_count.csv"
df_settlements_region.to_csv(settlements_path, index=False)

print(f"saved to: {settlements_path}")

saved to: /Users/eugenia/Desktop/Open Access/project/retired_places/data/processed/MD3_settlements_count.csv


## Merging into final dataset <a id="Merging-into-final-dataset"></a>

First, we merge the regional population table with the settlement counts.

In [32]:
# Build the dispersion table in one chain:
#   - left join, so every region in df_region_65_it is kept even if it has
#     no villages/hamlets;
#   - a missing settlements_count becomes 0 and the column is cast to int;
#   - Dispersed Settlements Index = settlements per 1,000 inhabitants;
#   - finally keep only the relevant columns, in presentation order.
df_italy_dispersion = (
    df_region_65_it
    .merge(
        df_settlements_region[["region_code", "settlements_count"]],
        on="region_code",
        how="left",
    )
    .assign(
        settlements_count=lambda frame: frame["settlements_count"].fillna(0).astype(int),
        dispersed_index=lambda frame: frame["settlements_count"] / (frame["tot_pop"] / 1000),
    )
    .loc[
        :,
        [
            "region_code",
            "region",
            "tot_pop",
            "settlements_count",
            "dispersed_index",
            "share_65plus",
        ],
    ]
)

# check
df_italy_dispersion.head()

Unnamed: 0,region_code,region,tot_pop,settlements_count,dispersed_index,share_65plus
0,1,Piemonte,4255702.0,9220,2.166505,26.85322
1,2,Valle d'Aosta/Vallée d'Aoste,122714.0,2901,23.640334,25.821015
2,3,Lombardia,10035481.0,8531,0.850084,23.856026
3,4,Trentino-Alto Adige/Südtirol,1086095.0,2625,2.416916,22.530902
4,5,Veneto,4851851.0,5888,1.213557,24.90128


Next, we add the share of unoccupied homes and the macro-region to the dataframe.

In [33]:
# Only the join key and the vacancy share are needed from the housing table.
df_housing_subset = df_housing_it.loc[:, ["region_code", "share_unoccupied"]].copy()

# Left join on the region code: every region already in the table is kept.
df_italy_dispersion = pd.merge(
    df_italy_dispersion,
    df_housing_subset,
    on="region_code",
    how="left",
)

df_italy_dispersion.head()

Unnamed: 0,region_code,region,tot_pop,settlements_count,dispersed_index,share_65plus,share_unoccupied
0,1,Piemonte,4255702.0,9220,2.166505,26.85322,29.649168
1,2,Valle d'Aosta/Vallée d'Aoste,122714.0,2901,23.640334,25.821015,56.023723
2,3,Lombardia,10035481.0,8531,0.850084,23.856026,21.15551
3,4,Trentino-Alto Adige/Südtirol,1086095.0,2625,2.416916,22.530902,32.185341
4,5,Veneto,4851851.0,5888,1.213557,24.90128,21.961287


In [34]:
# Lookup from region name to macro-region, built from one list of regions
# per macro-region.
macro_map = {
    region: macro
    for macro, regions in (
        (
            "North",
            (
                "Piemonte",
                "Valle d'Aosta/Vallée d'Aoste",
                "Lombardia",
                "Trentino-Alto Adige/Südtirol",
                "Veneto",
                "Friuli-Venezia Giulia",
                "Liguria",
                "Emilia-Romagna",
            ),
        ),
        ("Centre", ("Toscana", "Umbria", "Marche", "Lazio")),
        (
            "South",
            ("Abruzzo", "Molise", "Campania", "Puglia", "Basilicata", "Calabria"),
        ),
        ("Islands", ("Sicilia", "Sardegna")),
    )
    for region in regions
}

# Label each region with its macro-region by looking up the "region" name
# in macro_map.
df_italy_dispersion["macro_region"] = df_italy_dispersion["region"].map(macro_map)

df_italy_dispersion

Unnamed: 0,region_code,region,tot_pop,settlements_count,dispersed_index,share_65plus,share_unoccupied,macro_region
0,1,Piemonte,4255702.0,9220,2.166505,26.85322,29.649168,North
1,2,Valle d'Aosta/Vallée d'Aoste,122714.0,2901,23.640334,25.821015,56.023723,North
2,3,Lombardia,10035481.0,8531,0.850084,23.856026,21.15551,North
3,4,Trentino-Alto Adige/Südtirol,1086095.0,2625,2.416916,22.530902,32.185341,North
4,5,Veneto,4851851.0,5888,1.213557,24.90128,21.961287,North
5,6,Friuli-Venezia Giulia,1194095.0,1966,1.646435,27.478132,23.733011,North
6,7,Liguria,1509908.0,2196,1.454393,29.183566,36.614468,North
7,8,Emilia-Romagna,4465678.0,8052,1.803086,24.913037,21.752694,North
8,9,Toscana,3660834.0,5316,1.452128,26.711837,23.754197,Centre
9,10,Umbria,851954.0,1901,2.231341,27.317203,25.199486,Centre


In [35]:
# Save the dispersion index table to the processed-data folder;
# index=False leaves the row index out of the file.
# NOTE(review): the filename spells "dispertion" — left unchanged here because
# other notebooks may read this exact path; confirm before renaming.
disp_path = PROCESSED / "MD4_dispertion_places.csv"
df_italy_dispersion.to_csv(disp_path, index=False)

print(f"saved to: {disp_path}")

saved to: /Users/eugenia/Desktop/Open Access/project/retired_places/data/processed/MD4_dispertion_places.csv


Next, we create a dataframe that combines the population and housing data.

In [36]:
# Join the housing table onto the regional 65+ table by region code.
# The housing table's own "region" label is dropped before the join, so the
# result keeps a single "region" column: the one from the population table.
df_italy_merged = df_region_65_it.merge(
    df_housing_it.drop(columns=["region"]),
    on="region_code",
    how="left",
)

df_italy_merged

Unnamed: 0,region_code,region,pop_65plus,tot_pop,share_65plus,homes_occupied,homes_unoccupied,homes_total,region_norm,share_unoccupied
0,1,Piemonte,1142793.0,4255702.0,26.85322,1964108,827768,2791876,Piemonte,29.649168
1,2,Valle d'Aosta/Vallée d'Aoste,31686.0,122714.0,25.821015,59616,75948,135564,Valle d'Aosta/Vallée d'Aoste,56.023723
2,3,Lombardia,2394067.0,10035481.0,23.856026,4415364,1184728,5600092,Lombardia,21.15551
3,4,Trentino-Alto Adige/Südtirol,244707.0,1086095.0,22.530902,463305,219888,683193,Trentino Alto Adige/Südtirol,32.185341
4,5,Veneto,1208173.0,4851851.0,24.90128,2076568,584378,2660946,Veneto,21.961287
5,6,Friuli-Venezia Giulia,328115.0,1194095.0,27.478132,557109,173363,730472,Friuli-Venezia Giulia,23.733011
6,7,Liguria,440645.0,1509908.0,29.183566,746686,431321,1178007,Liguria,36.614468
7,8,Emilia-Romagna,1112536.0,4465678.0,24.913037,1993088,554077,2547165,Emilia-Romagna,21.752694
8,9,Toscana,977876.0,3660834.0,26.711837,1627013,506892,2133905,Toscana,23.754197
9,10,Umbria,232730.0,851954.0,27.317203,376747,126922,503669,Umbria,25.199486


In [37]:
# Add the macro-region label by looking up each "region" name in macro_map.
df_italy_merged["macro_region"] = df_italy_merged["region"].map(macro_map)

df_italy_merged

Unnamed: 0,region_code,region,pop_65plus,tot_pop,share_65plus,homes_occupied,homes_unoccupied,homes_total,region_norm,share_unoccupied,macro_region
0,1,Piemonte,1142793.0,4255702.0,26.85322,1964108,827768,2791876,Piemonte,29.649168,North
1,2,Valle d'Aosta/Vallée d'Aoste,31686.0,122714.0,25.821015,59616,75948,135564,Valle d'Aosta/Vallée d'Aoste,56.023723,North
2,3,Lombardia,2394067.0,10035481.0,23.856026,4415364,1184728,5600092,Lombardia,21.15551,North
3,4,Trentino-Alto Adige/Südtirol,244707.0,1086095.0,22.530902,463305,219888,683193,Trentino Alto Adige/Südtirol,32.185341,North
4,5,Veneto,1208173.0,4851851.0,24.90128,2076568,584378,2660946,Veneto,21.961287,North
5,6,Friuli-Venezia Giulia,328115.0,1194095.0,27.478132,557109,173363,730472,Friuli-Venezia Giulia,23.733011,North
6,7,Liguria,440645.0,1509908.0,29.183566,746686,431321,1178007,Liguria,36.614468,North
7,8,Emilia-Romagna,1112536.0,4465678.0,24.913037,1993088,554077,2547165,Emilia-Romagna,21.752694,North
8,9,Toscana,977876.0,3660834.0,26.711837,1627013,506892,2133905,Toscana,23.754197,Centre
9,10,Umbria,232730.0,851954.0,27.317203,376747,126922,503669,Umbria,25.199486,Centre


To move from two continuous indicators to an interpretable typology, we defined binary flags for “high” ageing and “high” vacancy using simple, data-driven thresholds (the median values of share_65plus and share_unoccupied). We then combined these flags into a four-fold categorical variable, category_2x2, which assigns each region to one of four types: “Old & Empty” (high ageing, high vacancy), “Old & Lived-in” (high ageing, low vacancy), “Younger but Emptying” (low ageing, high vacancy), and “Younger & Lived-in” (low ageing, low vacancy). This discrete typology makes it easier to communicate and compare how different regions position themselves along the joint dimensions of demographic ageing and housing under-use.

In [38]:
# Data-driven cut-offs: the median of each indicator across regions.
thr_65 = df_italy_merged["share_65plus"].median()          # cut-off for a "high" share of 65+ residents
thr_vac = df_italy_merged["share_unoccupied"].median()     # cut-off for a "high" share of unoccupied homes

# Boolean flags: True when the region is at or above the corresponding cut-off.
df_italy_merged["high_65"] = df_italy_merged["share_65plus"].ge(thr_65)
df_italy_merged["high_vac"] = df_italy_merged["share_unoccupied"].ge(thr_vac)

# Translate the (high_65, high_vac) flag pair into a human-readable 2×2 category.
def cat(row):
    """Return the 2×2 typology label for one region.

    `row` is any mapping-like object exposing the truthy/falsy flags
    "high_65" (high ageing) and "high_vac" (high vacancy).
    """
    labels = {
        # high ageing, high vacancy: both people and places are "retired"
        (True, True): "Old & Empty",
        # high ageing, low vacancy: many older people, housing still in use
        (True, False): "Old & Lived-in",
        # low ageing, high vacancy: younger population, many empty homes
        (False, True): "Younger but Emptying",
        # low ageing, low vacancy: younger population, housing intensively used
        (False, False): "Younger & Lived-in",
    }
    return labels[bool(row["high_65"]), bool(row["high_vac"])]

# Apply cat to every row (axis=1 passes one row at a time) to assign each
# region to one of the four types.
df_italy_merged["category_2x2"] = df_italy_merged.apply(cat, axis=1)

df_italy_merged

Unnamed: 0,region_code,region,pop_65plus,tot_pop,share_65plus,homes_occupied,homes_unoccupied,homes_total,region_norm,share_unoccupied,macro_region,high_65,high_vac,category_2x2
0,1,Piemonte,1142793.0,4255702.0,26.85322,1964108,827768,2791876,Piemonte,29.649168,North,True,False,Old & Lived-in
1,2,Valle d'Aosta/Vallée d'Aoste,31686.0,122714.0,25.821015,59616,75948,135564,Valle d'Aosta/Vallée d'Aoste,56.023723,North,False,True,Younger but Emptying
2,3,Lombardia,2394067.0,10035481.0,23.856026,4415364,1184728,5600092,Lombardia,21.15551,North,False,False,Younger & Lived-in
3,4,Trentino-Alto Adige/Südtirol,244707.0,1086095.0,22.530902,463305,219888,683193,Trentino Alto Adige/Südtirol,32.185341,North,False,True,Younger but Emptying
4,5,Veneto,1208173.0,4851851.0,24.90128,2076568,584378,2660946,Veneto,21.961287,North,False,False,Younger & Lived-in
5,6,Friuli-Venezia Giulia,328115.0,1194095.0,27.478132,557109,173363,730472,Friuli-Venezia Giulia,23.733011,North,True,False,Old & Lived-in
6,7,Liguria,440645.0,1509908.0,29.183566,746686,431321,1178007,Liguria,36.614468,North,True,True,Old & Empty
7,8,Emilia-Romagna,1112536.0,4465678.0,24.913037,1993088,554077,2547165,Emilia-Romagna,21.752694,North,False,False,Younger & Lived-in
8,9,Toscana,977876.0,3660834.0,26.711837,1627013,506892,2133905,Toscana,23.754197,Centre,True,False,Old & Lived-in
9,10,Umbria,232730.0,851954.0,27.317203,376747,126922,503669,Umbria,25.199486,Centre,True,False,Old & Lived-in


To compare regions on a relative scale, we computed separate rank positions for ageing and vacancy. First, we ranked regions by the share of residents aged 65+ (rank_65), and then by the share of unoccupied homes (rank_vac), with higher ranks indicating higher values of the corresponding indicator. We then calculated a simple divergence measure, rank_diff = rank_vac – rank_65. Positive values of rank_diff identify regions where vacancy is higher than one would expect given their ageing rank (i.e. “emptier than their level of ageing”), while negative values indicate regions that are relatively older than one would expect given their vacancy rank (i.e. “more aged than their level of housing under-use”).


In [39]:
# Rank regions on each indicator so they can be compared on a relative scale.
# Ranks are ascending: a higher rank means a higher value of the indicator
# (more ageing / more vacancy); tied values share their average rank.

# Position of each region by share of 65+ residents
df_italy_merged["rank_65"] = df_italy_merged["share_65plus"].rank(method="average")

# Position of each region by share of unoccupied homes
df_italy_merged["rank_vac"] = df_italy_merged["share_unoccupied"].rank(method="average")

# Vacancy rank minus ageing rank: positive means the region is "emptier" than
# its ageing rank would suggest; negative means it is "older" than its
# vacancy rank would suggest.
df_italy_merged["rank_diff"] = df_italy_merged["rank_vac"].sub(df_italy_merged["rank_65"])

df_italy_merged

Unnamed: 0,region_code,region,pop_65plus,tot_pop,share_65plus,homes_occupied,homes_unoccupied,homes_total,region_norm,share_unoccupied,macro_region,high_65,high_vac,category_2x2,rank_65,rank_vac,rank_diff
0,1,Piemonte,1142793.0,4255702.0,26.85322,1964108,827768,2791876,Piemonte,29.649168,North,True,False,Old & Lived-in,15.0,10.0,-5.0
1,2,Valle d'Aosta/Vallée d'Aoste,31686.0,122714.0,25.821015,59616,75948,135564,Valle d'Aosta/Vallée d'Aoste,56.023723,North,False,True,Younger but Emptying,10.0,20.0,10.0
2,3,Lombardia,2394067.0,10035481.0,23.856026,4415364,1184728,5600092,Lombardia,21.15551,North,False,False,Younger & Lived-in,5.0,2.0,-3.0
3,4,Trentino-Alto Adige/Südtirol,244707.0,1086095.0,22.530902,463305,219888,683193,Trentino Alto Adige/Südtirol,32.185341,North,False,True,Younger but Emptying,2.0,13.0,11.0
4,5,Veneto,1208173.0,4851851.0,24.90128,2076568,584378,2660946,Veneto,21.961287,North,False,False,Younger & Lived-in,8.0,4.0,-4.0
5,6,Friuli-Venezia Giulia,328115.0,1194095.0,27.478132,557109,173363,730472,Friuli-Venezia Giulia,23.733011,North,True,False,Old & Lived-in,19.0,5.0,-14.0
6,7,Liguria,440645.0,1509908.0,29.183566,746686,431321,1178007,Liguria,36.614468,North,True,True,Old & Empty,20.0,16.0,-4.0
7,8,Emilia-Romagna,1112536.0,4465678.0,24.913037,1993088,554077,2547165,Emilia-Romagna,21.752694,North,False,False,Younger & Lived-in,9.0,3.0,-6.0
8,9,Toscana,977876.0,3660834.0,26.711837,1627013,506892,2133905,Toscana,23.754197,Centre,True,False,Old & Lived-in,14.0,6.0,-8.0
9,10,Umbria,232730.0,851954.0,27.317203,376747,126922,503669,Umbria,25.199486,Centre,True,False,Old & Lived-in,17.0,8.0,-9.0


In [40]:
# Save the final merged table (ageing, housing, typology and ranks) to the
# processed-data folder; index=False leaves the row index out of the file.
it_homes_pop_path = PROCESSED / "MD5_age_houses_occupation.csv"
df_italy_merged.to_csv(it_homes_pop_path, index=False)

print(f"saved to: {it_homes_pop_path}")

saved to: /Users/eugenia/Desktop/Open Access/project/retired_places/data/processed/MD5_age_houses_occupation.csv
