# Scraping des préfectures et départements

In [9]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import requests
import io
import re

# Source page: the French Wikipedia list of departments.
url = "https://fr.wikipedia.org/wiki/Liste_des_d%C3%A9partements_fran%C3%A7ais"
headers = {"User-Agent": "Etudiant_DataScience_Immo/1.0"}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to the HTML table parser below.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Only tables whose text matches "Densité" are returned; the first one is used.
# NOTE(review): read_html defaults to thousands=',' while the page uses French
# number formatting, so a decimal such as "116,5" may come back as 1165 --
# confirm against the printed densities before relying on that column.
dfs = pd.read_html(io.StringIO(response.text), match="Densité")
df_dept = dfs[0].copy()

# Columns are selected by position, so this breaks silently if the table
# layout on the page changes; the names below are assigned in the same order.
df_final = df_dept.iloc[:, [0, 1, 2, 11, 12]].copy()
df_final.columns = ['Code_Insee', 'Département', 'Préfecture', 'Population_Dept', 'Densité_Dept']

# FONCTIONS DE NETTOYAGE

def clean_insee(code):
    """Normalize a raw department-code cell to a short code string.

    Rules, in order:
      * a cell containing '2A' or '2B' (any letter case) yields that code;
      * otherwise only the digits of the cell are considered;
      * digits starting with '97' keep their first three digits (e.g. '971');
      * any other code keeps its first two digits, left-padded with '0'.

    Returns '' when the cell contains no code at all (no digits and no
    2A/2B marker), so such rows can be filtered out with ``!= ''``.
    """
    code = str(code).strip().upper()
    for letter_code in ('2A', '2B'):
        if letter_code in code:
            return letter_code

    digits = "".join(filter(str.isdigit, code))
    if not digits:
        # Without this guard ''.zfill(2) would yield the bogus code '00'.
        return ''
    if digits.startswith('97'):
        return digits[:3]
    return digits[:2].zfill(2)

def clean_numbers(val):
    """Convert a scraped numeric cell to a non-negative integer.

    Missing values (NaN/None) and cells without any digit give 0.
    Floats are truncated to their integer part. Strings have bracketed
    footnote markers and spaces removed, lose everything after the first
    comma, and are then reduced to their digits.
    """
    if pd.isna(val):
        return 0
    if isinstance(val, float):
        # A float must not go through str(): the '.' would be dropped by the
        # digit filter below, turning 116.5 into 1165 and 1165.0 into 11650.
        if val in (float('inf'), float('-inf')):
            return 0
        # abs() matches the string path, which also discards the sign.
        return int(abs(val))
    val = str(val)
    # Remove bracketed footnote markers such as [1].
    val = re.sub(r'\[.*?\]', '', val)
    # Remove non-breaking and regular spaces.
    val = val.replace('\xa0', '').replace(' ', '')
    # Keep only the part before the first comma; the rest is discarded.
    val = val.split(',')[0]
    # Keep digits only.
    val = "".join(filter(str.isdigit, val))
    return int(val) if val else 0

def clean_text(text):
    """Return the cell as a string without bracketed or parenthesized parts.

    Missing values give an empty string. Each ``[...]`` and ``(...)`` group
    is removed (shortest match), then surrounding whitespace is trimmed.
    """
    if pd.isna(text):
        return ""
    cleaned = str(text)
    # Square-bracket groups are removed first, then parenthesized groups.
    for pattern in (r'\[.*?\]', r'\(.*?\)'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

# Normalize every column with its dedicated cleaning function.
df_final['Code_Insee'] = df_final['Code_Insee'].apply(clean_insee)
df_final['Population_Dept'] = df_final['Population_Dept'].apply(clean_numbers)
df_final['Densité_Dept'] = df_final['Densité_Dept'].apply(clean_numbers)
df_final['Département'] = df_final['Département'].apply(clean_text)
df_final['Préfecture'] = df_final['Préfecture'].apply(clean_text)

# Drop rows without a department code. The .copy() makes the filtered frame
# independent, so the column assignment below does not write into a slice of
# the previous frame (SettingWithCopyWarning).
df_final = df_final[df_final['Code_Insee'] != ''].copy()

# Drop the last four digits of the cleaned population figure (presumably a
# 4-digit year glued to the number by the digit filter -- confirm against the
# page). The column already holds non-negative ints, so integer division by
# 10_000 is equivalent to cutting the last four characters; values with four
# digits or fewer become 0.
df_final['Population_Dept'] = (df_final['Population_Dept'] // 10_000).astype(int)

print(df_final.head())
print(df_final.dtypes)

  Code_Insee              Département       Préfecture  Population_Dept  \
0         01                      Ain  Bourg-en-Bresse           671289   
1         02                    Aisne             Laon           525558   
2         03                   Allier          Moulins           334715   
3         04  Alpes-de-Haute-Provence  Digne-les-Bains           167179   
4         05             Hautes-Alpes              Gap           141677   

   Densité_Dept  
0          1165  
1           714  
2           456  
3           241  
4           255  
Code_Insee         object
Département        object
Préfecture         object
Population_Dept     int64
Densité_Dept        int64
dtype: object
