<a href="https://colab.research.google.com/github/dataforgoodfr/batch8_mednum/blob/master/notebooks/acces_information.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Accès à l'information 
## Accès à un point physique de service publique

## 1. Liste des médiathèques


In [1]:
# get mediatheques map
import pandas as pd
import numpy as np
from pathlib import Path

external_data = Path('../data/external/')
processed_data = Path('../data/processed/')
raw_data = Path('../data/raw/')
interim_data = Path('../data/interim/')

In [2]:
commune = pd.read_csv(raw_data/'table_insee_libcom_dep.csv')
commune.drop(columns=['Unnamed: 0'], inplace=True)
commune

Unnamed: 0,CODE_INSEE,LIBCOM,DEP
0,01001,L'Abergement-Clémenciat,01
1,01002,L'Abergement-de-Varey,01
2,01004,Ambérieu-en-Bugey,01
3,01005,Ambérieux-en-Dombes,01
4,01006,Ambléon,01
...,...,...,...
35005,97613,M'Tsangamouji,976
35006,97614,Ouangani,976
35007,97615,Pamandzi,976
35008,97616,Sada,976


In [3]:
df = pd.read_csv(raw_data/'adresses-des-bibliotheques-publiques.csv', header=0, sep=';')
#df.rename(columns={'Code postal': 'CODE_INSEE', 'Département': 'DEP'}, inplace=True)
print(df.shape)
df.head()

(15779, 20)


Unnamed: 0,Libelle1,Libelle2,Localisation,voie,CP,CEDEXB,Ville,DEPT,INSEE,coordonnees_ban,coordonnees_insee,coordonnees_finales,code_bib,code_ua,Code region,Statut,surface,type_adresse,amplitude_horaire,pop_com
0,Bibliothèque Intercommunale 'Plaisir de Lire',,,rue de la gare,60490,,Boulogne-la-Grasse,60,60093,"49.609493, 2.705751","49.6080302361, 2.70097863571","49.609493,2.705751",8724.0,8724.0,32,Bibliothèque municipale,,Bibliothèque ouverte au public,,486.0
1,Médiathèque Madeleine Odent,,,13 rue de la Mare du four,60510,,Bresles,60,60103,"49.409555, 2.252608","49.4135603419, 2.25008112215","49.409555,2.252608",8729.0,8729.0,32,Bibliothèque municipale,373.0,Bibliothèque ouverte au public,20.0,4226.0
2,Bibliothèque Municipale De Bury,,,107 rue Pillon Crouzet,60250,,Bury,60,60116,"49.313706, 2.343188","49.3128542225, 2.35865250561","49.313706,2.343188",8732.0,8732.0,32,Bibliothèque municipale,82.0,Bibliothèque ouverte au public,16.0,3033.0
3,Médiathèque Municipale,,,1 AVENUE Henri BESSE,60290,,Cauffry,60,60134,"49.322419, 2.442326","49.3134355162, 2.43026281523","49.322419,2.442326",2584.0,2584.0,32,Bibliothèque municipale,131.0,Bibliothèque ouverte au public,26.0,2535.0
4,Bibliothèque Municipale,,,8 rue de Pont Sainte Maxence,60940,,Cinqueux,60,60154,"49.319259, 2.52798","49.3243592276, 2.53354291409","49.319259,2.52798",8742.0,8742.0,32,Bibliothèque municipale,80.0,Bibliothèque ouverte au public,3.0,1593.0


In [4]:
df['type_adresse'].value_counts()

Bibliothèque ouverte au public    15481
Adresse administrative              298
Name: type_adresse, dtype: int64

In [5]:
df['Statut'].value_counts()

Bibliothèque municipale            15182
Bibliothèque municipale classée      376
Bibliothèque départementale          177
Name: Statut, dtype: int64

In [6]:
# On garde uniquement les bibliothèques ouverte au public
df = df[df['type_adresse'] == 'Bibliothèque ouverte au public']

In [7]:
df = df[['INSEE', 'DEPT', 'Ville']]
df = df.rename(columns={'INSEE': 'CODE_INSEE', 'DEPT': 'DEP'})
df

Unnamed: 0,CODE_INSEE,DEP,Ville
0,60093,60,Boulogne-la-Grasse
1,60103,60,Bresles
2,60116,60,Bury
3,60134,60,Cauffry
4,60154,60,Cinqueux
...,...,...,...
15774,60042,60,Bailleval
15775,60053,60,Beaulieu-les-Fontaines
15776,60057,60,Beauvais
15777,60068,60,Béthisy-Saint-Pierre


In [11]:
mediatheque = commune.merge(df, on=['CODE_INSEE', 'DEP'], how='left').reset_index(drop=True)
mediatheque['ACCES_SERVICE_PUBLIC'] = mediatheque['Ville'].apply(lambda x: 0 if pd.isna(x) else 1)
mediatheque = mediatheque[['CODE_INSEE', 'LIBCOM', 'DEP', 'ACCES_SERVICE_PUBLIC']] 
mediatheque.drop_duplicates(inplace=True)
mediatheque

Unnamed: 0,CODE_INSEE,LIBCOM,DEP,ACCES_SERVICE_PUBLIC
0,01001,L'Abergement-Clémenciat,01,0
1,01002,L'Abergement-de-Varey,01,0
2,01004,Ambérieu-en-Bugey,01,0
3,01005,Ambérieux-en-Dombes,01,1
4,01006,Ambléon,01,0
...,...,...,...,...
36443,97613,M'Tsangamouji,976,0
36444,97614,Ouangani,976,0
36445,97615,Pamandzi,976,1
36446,97616,Sada,976,1


In [32]:
mediatheque['ACCES_SERVICE_PUBLIC'].value_counts()

0    21284
1    13729
Name: ACCES_SERVICE_PUBLIC, dtype: int64

In [33]:
mediatheque.to_csv(f'{interim_data}/acces_point_phys_service_publique.csv', index=False)

## 2. Liste des centres sociaux

In [None]:
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib import request

import time


def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()        
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result    
    return timed

In [None]:
@timeit
def scrap_table_page(url='https://www.senacs.fr/structure/csx?page=', page_number=1):
    """
    Generate a request on senacs page for a given 
    page number to get the name and location of 
    each social center.
    """

    url_page = url + str(page_number)
    request_text = request.urlopen(url_page).read()
    page = BeautifulSoup(request_text, 'html.parser')

    table = page.find('table', {'class': 'table table-striped table-hover'}).findAll('td')

    # fill the values of id, name and city
    ids = [id.getText() for id in table[::4]]
    names = [name.getText() for name in table[1::4]]
    cities = [city.getText() for city in table[2::4]]

    df = pd.DataFrame({'id': ids, 
                      'nom': names,
                      'ville': cities}
                      )
    df['page'] = page_number

    return df


def stack_pages(pages=range(91)):
    """
    Concatenate all empruntis scrapped data per region.
    """

    data = pd.DataFrame()    # empty Dataframe to fill
    for page_number in pages:
        temp = scrap_table_page(page_number=page_number)
        data = pd.concat([data, temp], axis=0)

    return data

In [None]:
data = stack_pages()

In [None]:
data['code_postal'] = data['ville'].apply(lambda x: x.split('-')[0])
data['ville'] = data['ville'].apply(lambda x: x.split('-')[1])

In [None]:
data

Unnamed: 0,id,nom,ville,page,code_postal
0,91,CENTRE SOCIAL DES GRANDES BORNES,Goussainville,0,95190
1,92,CENTRE SOCIAL ARCHIPELIA,PARIS 20EME ARRONDISSEMENT,0,75020
2,93,Espace Socioculturel Val de Charente,Ruffec,0,16700
3,94,CENTRE SOCIAL Maison Des Habitants Champaret,Bourgoin,0,38300
4,95,CENTRE SOCIAL MAISON DE LA CHALLE,Éragny,0,95610
...,...,...,...,...,...
6,16264,CENTRE SOCIOCULTUREL MUNICIPAL DE SAINT-DIZIER,Saint,90,52100
7,16267,ESPACE DE VIE SOCIALE L.E.P.H.A.R.E,Aniche,90,59580
8,16351,ASSOCIATION FERME BEAUREPAIRE,Boulogne,90,62200
9,16355,OFFICE DE LA JEUNESSE,Bruay,90,62700


In [None]:
dir = '/content/drive/My Drive/Colab Notebooks/'
data.to_csv(f'{dir}data_centre_sociaux.csv', index=False)

## 3. Distance des communes à l'accès le plus proche


In [None]:
# copy geoson file with every 'Franceservices' locations

import os

if 'COLAB_GPU' in os.environ:  # this is always set on Colab, the value is 0 or 1 depending on whether a GPU is attached
    from google.colab import auth
    #auth.authenticate_user()

    !rm -rf France-services/ 
    !git clone https://github.com/cget-carto/France-services.git
    !mv France-services/data/france_services.geojson .

Cloning into 'France-services'...
remote: Enumerating objects: 260, done.[K
remote: Counting objects: 100% (260/260), done.[K
remote: Compressing objects: 100% (169/169), done.[K
remote: Total 260 (delta 108), reused 221 (delta 78), pack-reused 0[K
Receiving objects: 100% (260/260), 2.12 MiB | 11.94 MiB/s, done.
Resolving deltas: 100% (108/108), done.


In [None]:
import pandas as pd
import json

with open("france_services.geojson", "r") as read_file: 
    fs = json.load(read_file) 

In [None]:
data_fs = pd.DataFrame()
for n, temp in enumerate(fs['features']): 
 
    df_temp = pd.DataFrame.from_records([{'latitude': temp['properties']['LATITUDE'], 
                            'longitude': temp['properties']['LATITUDE'], 
                            'departement': temp['properties']['DEPARTEMENT'],
                            'insee_com': temp['properties']['insee_com'],
                            'code_postal': temp['properties']['code_postal'], 
                            'lib_france_services': temp['properties']['lib_france_services']
                            }])
    
    data_fs = pd.concat([data_fs, df_temp], axis=0)

In [None]:
data_fs

Unnamed: 0,latitude,longitude,departement,insee_com,code_postal,lib_france_services
0,46.105747,46.105747,1,01033,01200,Valserhône
0,44.031548,44.031548,12,12197,12170,Réquista « Le Bercail »
0,44.475185,44.475185,12,12138,12330,Conques-Marcillac
0,49.291400,49.291400,14,14514,14130,Terre d’Auge
0,49.126169,49.126169,14,14057,14370,Val ès Dunes
...,...,...,...,...,...,...
0,47.617932,47.617932,89,89368,89520,Saint-Sauveur-en-Puisaye
0,48.980007,48.980007,95,95219,95120,Ermont
0,47.181000,47.181000,25,25527,25410,Saint Vit
0,43.208769,43.208769,31,31375,31310,Volvestre – Antenne Montesquieu


### Calculer la distance entre le centre des communes et la prefecture, sous-prefecture ou france-services la plus proche en km. 

- scrapper prefecture / sous-prefecture https://fr.wikipedia.org/wiki/Liste_des_pr%C3%A9fectures_de_France
- obtenir coordonnées des communes: OK via codes postaux + librairie `pgecode`

In [None]:
#data['distance'] = dist.query_postal_code(data[''].values, data[''].values)

# calculer le france-service (ou prefecture, sous-prefecture etc.) le plus proche de chaque commune et bueno 

In [None]:
def df_crossjoin(df1, df2, **kwargs):
    """
    Make a cross join (cartesian product) between two dataframes by using a constant temporary key.
    Also sets a MultiIndex which is the cartesian product of the indices of the input dataframes.
    See: https://github.com/pydata/pandas/issues/5401
    :param df1 dataframe 1
    :param df1 dataframe 2
    :param kwargs keyword arguments that will be passed to pd.merge()
    :return cross join of df1 and df2
    """
    df1['_tmpkey'] = 1
    df2['_tmpkey'] = 1

    res = pd.merge(df1, df2, on='_tmpkey', **kwargs).drop('_tmpkey', axis=1)
    res.index = pd.MultiIndex.from_product((df1.index, df2.index))

    df1.drop('_tmpkey', axis=1, inplace=True)
    df2.drop('_tmpkey', axis=1, inplace=True)

    return res

df = df_crossjoin(commune[['code_postal']], data_fs[['code_postal']], suffixes=('_communes', '_fs')).reset_index()
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,level_0,level_1,code_postal_communes,code_postal_fs
0,0,0,01500,01200
1,0,0,01500,12170
2,0,0,01500,12330
3,0,0,01500,14130
4,0,0,01500,14370
...,...,...,...,...
35586331,39191,0,98799,89520
35586332,39191,0,98799,95120
35586333,39191,0,98799,25410
35586334,39191,0,98799,31310


In [None]:
df.drop_duplicates()

Unnamed: 0,level_0,level_1,code_postal_communes,code_postal_fs
0,0,0,01500,01200
1,0,0,01500,12170
2,0,0,01500,12330
3,0,0,01500,14130
4,0,0,01500,14370
...,...,...,...,...
35586331,39191,0,98799,89520
35586332,39191,0,98799,95120
35586333,39191,0,98799,25410
35586334,39191,0,98799,31310


In [None]:
df[:len(data_fs)].groupby('code_postal_communes').apply(lambda row: dist.query_postal_code(row['code_postal_communes'], row['code_postal_fs']))

code_postal_communes
01500    []
dtype: object

In [None]:
df[:len(data_fs)].apply(lambda row: dist.query_postal_code(row['code_postal_communes'], row['code_postal_fs']), axis=1)

0       42.623548
1      301.434162
2      275.230403
3      527.270586
4      544.313797
          ...    
903    238.162672
904    409.646169
905    143.125315
906    446.226215
907    431.109707
Length: 908, dtype: float64

In [26]:
#df['distance_commune_france_service'] = df.apply(lambda row: dist.query_postal_code(row['code_postal_communes'], row['code_postal_fs']), axis=1)

In [None]:
# take the minimum per commune and OK
# df.groupby('code_postal_communes').agg({'distance_commune_france_service': ['mean', 'min']})

In [27]:
#df