# 165: Population densité

In [3]:
import os
import sys

current_dir = os.getcwd()
parent_dir = os.path.dirname(os.getcwd())
while not current_dir.endswith("13_odis"):
    print("changing to parent dir")
    os.chdir(parent_dir)
    current_dir = parent_dir
    parent_dir = os.path.dirname(current_dir)

print(os.getcwd())
sys.path.append(current_dir)

import pandas as pd
import datetime
import json

from common.config import load_config
from common.data_source_model import DataSourceModel
from common.utils.file_handler import FileHandler
from common.utils.interfaces.data_handler import OperationType

/home/jtroadec/projects/playground/13_odis


Columns:
- GEO: code geographique
- TIME_PERIOD: période temporelle ou point dans le temps auquel l'observation se réfère effectivement
- FREQ: intervalle de temps séparant deux observations
- OCS: catégorie de logement (résidence principale, logements vacants, logements neufs mais non occupés, logements occasionnels, résidences secondaires)
    - _T : Total
    - DW_MAIN : Résidences principales
    - DW_SEC_DW_OCC : Résidences secondaires et logements occasionnels
    - DW_VAC : Logements vacants
- RP_MEASURE: Mesure de recensement
    - BRTH : Nombre de naissances cumulées entre deux recensements
    - DEATH : Nombre de décès cumulés entre deux recensements
    - DWELLINGS : Logements
    - DWELLINGS_POPSIZE : Population des ménages
    - POP : Population
    - SUP : Superficie

In [4]:
model_name = "population.population_superficie"
filepath = 'data/imports/population/population_superficie_1.json'

# Initialize common variables
dataframes = {}
artifacts = []

config = load_config("datasources.yaml", response_model=DataSourceModel)
model = config.get_model( model_name = model_name )
start_time = datetime.datetime.now()

# Instantiate File Handler for file loads and dumps
handler = FileHandler()

In [26]:
model

DomainModel(type='MelodiExtractor', description='recuperation de la population et de la superficie pour calcul de densite', format='json', name='population.population_superficie', API='INSEE.Melodi', endpoint='/data/DS_RP_SERIE_HISTORIQUE', headers=HeaderModel(accept='application/json'), extract_params={'maxResult': 10000, 'startPeriod': '2009-01-01', 'endPeriod': '2025-10-01', 'GEO': ['COM', 'DEP', 'REG'], 'RP_MEASURE': ['POP', 'SUP'], 'OCS': '_T'}, response_map={'data': 'observations', 'next': 'paging.next', 'is_last': 'paging.isLast'}, preprocessor=None, load_params=DataLoadParameters(separator=';', header=0, skipfooter=0), notebook_path=None, table_name='population_population_superficie', domain_name='population')

In [27]:
config

DataSourceModel(APIs={'INSEE.Metadonnees': APIModel(name='Metadonnees INSEE', base_url=HttpUrl('https://api.insee.fr/metadonnees/V1'), apidoc=HttpUrl('https://api.insee.fr/catalogue/site/themes/wso2/subthemes/insee/pages/item-info.jag?name=M%C3%A9tadonn%C3%A9es&version=V1&provider=insee'), description='INSEE - API des métadonnées', default_headers=HeaderModel(accept='application/json'), throttle=30), 'INSEE.Melodi': APIModel(name='MELODI', base_url=HttpUrl('https://api.insee.fr/melodi'), apidoc=HttpUrl('https://portail-api.insee.fr/catalog/api/a890b735-159c-4c91-90b7-35159c7c9126/doc?page=ee625968-272a-4637-a259-68272aa63766'), description='INSEE - API de données locales', default_headers=HeaderModel(accept='application/json'), throttle=30), 'INSEE.statistiques': APIModel(name='Statistiques INSEE', base_url=HttpUrl('https://www.insee.fr/fr/statistiques'), apidoc=None, description='URL de base de la section "Statistiques et Rapports" du site de l\'INSEE.\nOn peut y trouver des dataset é

In [5]:
input_path = "data/imports/population/population.population_superficie_1.json"

with open(input_path, "r", encoding="utf-8") as file:
        data = json.load(file)

In [6]:
data

{'identifier': 'DS_RP_SERIE_HISTORIQUE',
 'title': {'fr': 'Série historique du recensement de la population ',
  'en': 'Historical series of population census '},
 'publisher': {'id': 'INSEE',
  'label': [{'lang': 'fr',
    'content': 'Institut national de la statistique et des etudes economiques (INSEE)'},
   {'lang': 'en',
    'content': 'National Institute of Statistics and Economic Studies'}]},
 'observations': [{'dimensions': {'GEO': '2025-COM-03249',
    'FREQ': 'A',
    'TIME_PERIOD': '2022',
    'RP_MEASURE': 'SUP',
    'OCS': '_T'},
   'measures': {'OBS_VALUE_NIVEAU': {'value': 2030.0}}},
  {'dimensions': {'GEO': '2025-COM-03024',
    'FREQ': 'A',
    'TIME_PERIOD': '2022',
    'RP_MEASURE': 'SUP',
    'OCS': '_T'},
   'measures': {'OBS_VALUE_NIVEAU': {'value': 2415.0}}},
  {'dimensions': {'GEO': '2025-COM-07068',
    'FREQ': 'A',
    'TIME_PERIOD': '2022',
    'RP_MEASURE': 'SUP',
    'OCS': '_T'},
   'measures': {'OBS_VALUE_NIVEAU': {'value': 1514.0}}},
  {'dimensions': {'GE

In [7]:
len(data["observations"])

10000

In [37]:
data["identifier"]

'DS_RP_SERIE_HISTORIQUE'

In [38]:
data["title"]

{'fr': 'Série historique du recensement de la population ',
 'en': 'Historical series of population census '}

In [39]:
data["publisher"]

{'id': 'INSEE',
 'label': [{'lang': 'fr',
   'content': 'Institut national de la statistique et des etudes economiques (INSEE)'},
  {'lang': 'en',
   'content': 'National Institute of Statistics and Economic Studies'}]}

In [40]:
data["paging"]

{'first': 'https://api.insee.fr/melodi/data/DS_RP_SERIE_HISTORIQUE?maxResult=10000&startPeriod=2009-01-01&endPeriod=2025-10-01&GEO=COM&GEO=DEP&GEO=REG&RP_MEASURE=POP&RP_MEASURE=SUP&OCS=_T&page=1'}

In [52]:
data["observations"][0]

{'dimensions': {'GEO': '2025-COM-03249',
  'FREQ': 'A',
  'TIME_PERIOD': '2022',
  'RP_MEASURE': 'SUP',
  'OCS': '_T',
  'VALUE': {'value': 2030.0}},
 'measures': {'OBS_VALUE_NIVEAU': {'value': 2030.0}}}

In [9]:
rows = []
for obj in data["observations"]:
    row, value = obj.get("dimensions"), obj.get("measures")["OBS_VALUE_NIVEAU"]["value"]
    row["VALUE"] = value
    rows.append(row)

df = pd.DataFrame(rows)
df["CODEGEO"] = df["GEO"].apply(lambda x: x.split("-")[2])
df["YEARGEO"] = df["GEO"].apply(lambda x: x.split("-")[0])
df["UNKNOWN_GEO"] = df["GEO"].apply(lambda x: x.split("-")[1])

# keep SUP for superficie
cols_to_keep = ["CODEGEO", "TIME_PERIOD", "SUP"]

df_pivot = pd.pivot_table(
    data=df, 
    values="VALUE", 
    columns=["RP_MEASURE"],
    index=["GEO", "TIME_PERIOD", "FREQ", "OCS", "CODEGEO"],
    fill_value=0
).reset_index()

df_pivot

RP_MEASURE,GEO,TIME_PERIOD,FREQ,OCS,CODEGEO,POP,SUP
0,2025-COM-01001,2011,A,_T,01001,780.0,0.0
1,2025-COM-01001,2016,A,_T,01001,767.0,0.0
2,2025-COM-01001,2022,A,_T,01001,859.0,1595.0
3,2025-COM-01002,2011,A,_T,01002,234.0,0.0
4,2025-COM-01002,2016,A,_T,01002,243.0,0.0
...,...,...,...,...,...,...,...
8894,2025-COM-55248,2022,A,_T,55248,0.0,1607.0
8895,2025-COM-55250,2022,A,_T,55250,0.0,646.0
8896,2025-COM-55253,2022,A,_T,55253,0.0,555.0
8897,2025-COM-55255,2022,A,_T,55255,0.0,1744.0


In [12]:
df_pivot["CODEGEO"] = df_pivot["GEO"].apply(lambda d: d.split("-")[2])
df_pivot["LEVEL_CODEGEO"] = df_pivot["GEO"].apply(lambda d: d.split("-")[1])
df_pivot["ANNEE_CODEGEO"] = df_pivot["GEO"].apply(lambda d: d.split("-")[0])
df_pivot.head()

RP_MEASURE,GEO,TIME_PERIOD,FREQ,OCS,CODEGEO,POP,SUP,ANNEE_CODEGEO,LEVEL_CODEGEO
0,2025-COM-01001,2011,A,_T,1001,780.0,0.0,2025,COM
1,2025-COM-01001,2016,A,_T,1001,767.0,0.0,2025,COM
2,2025-COM-01001,2022,A,_T,1001,859.0,1595.0,2025,COM
3,2025-COM-01002,2011,A,_T,1002,234.0,0.0,2025,COM
4,2025-COM-01002,2016,A,_T,1002,243.0,0.0,2025,COM


In [13]:
df_pivot["LEVEL_CODEGEO"].unique()

array(['COM'], dtype=object)

In [65]:
df["TIME_PERIOD"].unique()

array(['2022', '2016', '2011'], dtype=object)

In [60]:
df["FREQ"].unique()

array(['A'], dtype=object)

In [61]:
df["RP_MEASURE"].unique()

array(['SUP', 'POP'], dtype=object)

In [62]:
df["GEO"].unique()

array(['2025-COM-03249', '2025-COM-03024', '2025-COM-07068', ...,
       '2025-COM-06150', '2025-COM-02346', '2025-COM-05014'],
      shape=(6402,), dtype=object)

In [68]:
df["CODEGEO"].nunique()

6402

In [69]:
df["YEARGEO"].unique()

array(['2025'], dtype=object)

In [70]:
df["UNKNOWN_GEO"].unique()

array(['COM'], dtype=object)