# Leveraging Python for Spatial Data Science

## [Spatial Data Science Bootcamp Paris](https://spatial-data-science-conference.com/bootcamps/2023/) October 26th, 2023

[Florian Bayer](https://www.linkedin.com/in/florian-bayer-a4117b30/), PhD in Public Health, MSc in Geography

Health geographer at Agence de la biomédecine, University lecturer at Paris Panthéon Sorbonne and ENSG

# Data processing

In [2]:
import pandas as pd
import numpy as np
import requests
from zipfile import ZipFile
import os

In [3]:
# Download URL of the public health-professional directory extract (ZIP archive).
# The literal is split at the query parameters for readability only; the
# adjacent pieces are concatenated into a single string.
url_doc = (
    "https://annuaire.sante.fr/web/site-pro/extractions-publiques"
    "?p_p_id=abonnementportlet_WAR_Inscriptionportlet_INSTANCE_gGMT6fhOPMYV"
    "&p_p_lifecycle=2"
    "&p_p_state=normal"
    "&p_p_mode=view"
    "&p_p_cacheability=cacheLevelPage"
    "&_abonnementportlet_WAR_Inscriptionportlet_INSTANCE_gGMT6fhOPMYV_nomFichier=PS_LibreAcces_202308300950.zip"
)


In [4]:


def get_data(url, tmp_dir='tmp', zipfile='data_bootcamp.zip', txtfile='PS_LibreAcces_Personne_activite_'):
    """
    Download (when not already cached) a ZIP archive and load one text file from it.

    Args:
        url (str): The URL of the data source. Only used when the archive is
            not already present in 'tmp_dir'.
        tmp_dir (str, optional): Directory where the ZIP file is stored. It is
            created when missing. Default is 'tmp'.
        zipfile (str, optional): File name of the ZIP archive inside 'tmp_dir'.
            Default is 'data_bootcamp.zip'.
        txtfile (str, optional): Prefix used to select the text file inside
            the archive by member name.

    Returns:
        pd.DataFrame or None: The selected columns of the matching text file,
        read as '|'-separated values with the dtypes declared below. None is
        returned (after printing the HTTP status code) when the download does
        not answer with status 200.

    Raises:
        FileNotFoundError: If no archive member name starts with 'txtfile'.

    Example:
        url_doc = 'https://example.com/data_bootcamp.zip'
        perso_files = get_data(url_doc, tmp_dir='my_extraction_folder')

    Detailed Steps:
        1. Create 'tmp_dir' if needed and check whether 'zipfile' exists in it.
        2. If it does not, download it from 'url' and save it in 'tmp_dir'.
        3. Look in the archive for members whose name starts with 'txtfile'.
        4. Read the last matching member directly from the archive (nothing is
           extracted to disk) into a Pandas DataFrame.
        5. Return the resulting DataFrame.

    Note:
        When several members match 'txtfile', only the last one in archive
        order is loaded.
    """
    os.makedirs(tmp_dir, exist_ok=True)

    zip_path = os.path.join(tmp_dir, zipfile)

    # The archive acts as a local cache: download only when it is missing.
    if not os.path.exists(zip_path):
        print("Downloading data")
        response = requests.get(url)
        if response.status_code != 200:
            print("Error during download:", response.status_code)
            return None
        with open(zip_path, "wb") as f:
            f.write(response.content)

    # Columns to load and the dtype each one is read with.
    dtypes = {'Identification nationale PP': 'object',
              "Nom d'exercice": 'object',
              "Prénom d'exercice": 'object',
              'Code savoir-faire': 'object',
              'Libellé savoir-faire': 'category',
              'Code mode exercice': 'category',
              'Libellé mode exercice': 'category',
              'Raison sociale site': 'object',
              'Numéro Voie (coord. structure)': 'object',
              'Indice répétition voie (coord. structure)': 'object',
              'Libellé type de voie (coord. structure)': 'object',
              'Libellé Voie (coord. structure)': 'object',
              'Bureau cedex (coord. structure)': 'object',
              'Code postal (coord. structure)': 'object',
              'Code commune (coord. structure)': 'object',
              "Code secteur d'activité": 'category'}

    cols = list(dtypes)

    # Context managers guarantee the archive and the member handle are closed.
    with ZipFile(zip_path) as archive:
        matches = [name for name in archive.namelist() if name.startswith(txtfile)]
        if not matches:
            raise FileNotFoundError(
                f"No file starting with {txtfile!r} found in {zip_path!r}"
            )
        # Only the last match is parsed, which is the one that is returned.
        with archive.open(matches[-1]) as member:
            return pd.read_csv(member, sep='|', usecols=cols, dtype=dtypes)

# Folder holding the cached ZIP archive; it is reused for the export at the end.
data_dir = "data"

# Load the practitioner activity table (downloaded on the first run only).
perso_files = get_data(
    url_doc,
    tmp_dir=data_dir,
    zipfile='data_bootcamp.zip',
)


Since some department (state) codes are missing, we derive them from the first two characters of the municipality code, falling back to the postal code.

In [5]:
# Department ("state") code: first two characters of the municipality code,
# falling back to the first two characters of the postal code when the
# municipality code is missing.
commune_prefix = perso_files['Code commune (coord. structure)'].str[:2]
postal_prefix = perso_files['Code postal (coord. structure)'].str[:2]

# NaN never compares equal to anything (not even itself), so an `== np.nan`
# test is always False; missing values must be detected with isna().
perso_files['state'] = np.where(commune_prefix.isna(), postal_prefix, commune_prefix)
perso_files['state'] = perso_files['state'].astype("object")

We retain only general practitioners who are self-employed or working in a healthcare center, and keep only those located in Paris and its suburbs.

In [6]:
# Selection criteria.
list_gen = ['SM26', 'SM53', 'SM54']  # general practitioners
list_act = ['SA05', 'SA07', 'SA08', 'SA52']  # self-employed or healthcare center.
list_state = ['75', '77', '78', '91', '92', '93', '94', '95']
# Values excluded when found either as the site name or as the street label.
list_airport = ["CTRE SOINS PREVENTION AEROPORTS PARIS", "ORLY SUD 103"]
# Practitioner identifiers excluded from the selection.
list_badaddress = ['810001593259']

# Row mask: every condition below must hold for a row to be kept.
keep_rows = (
    perso_files['Code savoir-faire'].isin(list_gen)
    & perso_files["Code secteur d'activité"].isin(list_act)
    & perso_files["state"].isin(list_state)
    & ~perso_files["Raison sociale site"].isin(list_airport)
    & ~perso_files["Libellé Voie (coord. structure)"].isin(list_airport)
    & perso_files["Libellé Voie (coord. structure)"].notna()
    & ~perso_files["Identification nationale PP"].isin(list_badaddress)
)

# Independent copy, so later edits do not touch perso_files.
general = perso_files[keep_rows].copy()


Some data corrections to improve geocoding

In [7]:
street_col = 'Libellé Voie (coord. structure)'
cedex_col = 'Bureau cedex (coord. structure)'

# Replace one "postal code + locality" value, matched on the whole cell.
general[cedex_col] = general[cedex_col].replace(
    {'94210 LA VARENNE ST HILAIRE': '94210 SAINT-MAUR-DES-FOSSÉS'}
)

# Street labels (matched on the whole cell value) and their replacement.
newstreets = {
    'AVENUE HECTOR BERLIOZ': 'AVENUE BERLIOZ',
    'PLACE DES MARTYRS DE': "Place des Martyrs de l'Occupation Allemande",
    'RUE PRIMO LEVI': "Rue Primo Levi, L'Étoile",
    'DES DEUX GARES - LAC RUME': "du Lac",
    "DE L AUVERGNE": "d'Auvergne",
    "CLINIQUE DU BLANC MESNIL": "7 Avenue Henri Barbusse",
    "RUE LOUIS RENE VILLERME": "Place des Droits de l'Homme",
    "ROUTE DEPARTEMENTALE 316": "Allée de la Poire Bezy de Chaumontel",
    "RUE DU GENERAL DE GAULLE": "Rue Charles de Gaulle",
    "RUE DE LONGJUMEAU": "Chemin des Grands Champs",
    "ROUTE DE GIRONVILLE": "RUE DE GIRONVILLE",
    "ZONE DE LA PETITE ARCHE": "Avenue Jacques Chirac",
    "CHEMIN DU VEXIN": "Allée de la Brie",
    "PLACE  HENRI DUNANT": "Chaussee Saint Vincent",
    "PLACE HENRI DUNANT": "Chaussee Saint Vincent",
    "ALLEE JEAN MARC FRESC": "Rue du Moulin",
}

general[street_col] = general[street_col].replace(newstreets)

# Corrections applied to single practitioners, selected by national identifier.
general.loc[general["Identification nationale PP"] == '810100275345', street_col] = "Rue de Pontoise"
general.loc[general["Identification nationale PP"] == '810000769223', cedex_col] = "78551 Saint-Germain-en-Laye"




Adding statename

In [8]:
# Department code -> department name.
statename = {
    '75': 'Paris',
    '77': 'Seine-et-Marne',
    '78': 'Yvelines',
    '91': 'Essonne',
    '92': 'Hauts-de-Seine',
    '93': 'Seine-Saint-Denis',
    '94': 'Val-de-Marne',
    '95': "Val-d'Oise",
}

# Exact-value replacement: the codes are plain identifiers, not patterns, so
# regex matching (which also rewrites substrings) is neither needed nor wanted.
# Codes absent from the mapping are left unchanged.
general['statename'] = general['state'].replace(statename)

'q' will be used for geocoding addresses

In [9]:
# Address parts, in the order they are concatenated into the query string.
q_cols = [
    'Numéro Voie (coord. structure)', 'Indice répétition voie (coord. structure)', 'Libellé type de voie (coord. structure)',
    'Libellé Voie (coord. structure)', 'Bureau cedex (coord. structure)', 'statename'
]

# stack() turns each row into a run of (row label, column) -> value entries;
# grouping on the row label and joining rebuilds one string per row.
# Missing parts are dropped explicitly instead of relying on stack() to do it:
# ' '.join would raise on a float NaN if any survived (newer pandas stack
# implementations can keep them).
general['q'] = general[q_cols].stack().dropna().groupby(level=0).agg(' '.join)


Keeping only necessary columns

In [10]:
# Columns kept for the export, in output order.
new_cols = [
    'Identification nationale PP',
    "Nom d'exercice",
    "Prénom d'exercice",
    'Raison sociale site',
    'Code commune (coord. structure)',
    'state',
    'q',
]

general = general.loc[:, new_cols]

Translate column names

In [11]:
# French source column name -> English name used downstream.
# Columns not listed here ('state', 'q') keep their name.
translation_dict = {
    'Identification nationale PP': 'GPid',
    "Nom d'exercice": "lastname",
    "Prénom d'exercice": "firstname",
    'Raison sociale site': 'name',
    'Code commune (coord. structure)': 'citycode',
}

general = general.rename(columns=translation_dict)

Export to pickle

In [12]:
# Renumber rows 0..n-1; the previous index is discarded.
general = general.reset_index(drop=True)

# NOTE(review): `file` is not used by the code visible here.
file = "data_bootcamp.pckl"

# NOTE(review): the output is a pickle although the file name ends in '.csv';
# the name is kept as-is because other code may read this exact path.
path = os.path.join(data_dir, 'GP_adress.csv')
general.to_pickle(path)