This notebook is dedicated to data collection, cleaning and creation of work files for the study of the public/private distribution of French schools

### 1. Importing librairies

In [271]:
import time
startTime = time.time()

import pandas as pd
import os
import requests

### 2. Creating the relevant folders and paths

In [272]:
# Defining current folder as our main directory
dirname = os.getcwd()

# location folders variables
data_in = dirname + "\\da_data_raw\\"
data_out = dirname + "\\da_data_workfiles\\"
graph_out = dirname + "\\graphs\\"

# Checking if data_in path is not present, then create it
if not os.path.exists(data_in):
    os.makedirs(data_in)
    print(f"Directory Created: {data_in}")
else: 
    print(f"Already existing directory: {data_in}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(data_out):
    os.makedirs(data_out)
    print(f"Directory Created: {data_out}")
else:
    print(f"Already existing directory: {data_out}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(graph_out):
    os.makedirs(graph_out)
    print(f"Directory Created: {graph_out}")
else:
    print(f"Already existing directory: {graph_out}")    

# Printing main directories we will work with
print(f"\nMain directory: {dirname}")
print(f"Raw data folder: {data_in}")
print(f"Workfile folder: {data_out}")

Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_raw\
Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\
Already existing directory: c:\Users\33671\Documents\Python\IPS\graphs\

Main directory: c:\Users\33671\Documents\Python\IPS
Raw data folder: c:\Users\33671\Documents\Python\IPS\da_data_raw\
Workfile folder: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\


Temporary code (to comment later)

In [273]:
# Getting current date
datestr = time.strftime("%Y-%m-%d")

# location folders variables (temporary - to comment later)
data_in_temporary = dirname + "\\da_data_raw\\" + datestr + "\\"
data_out_temporary = dirname + "\\da_data_workfiles\\" + datestr + "\\"

# Checking if data_in path is not present, then create it
if not os.path.exists(data_in_temporary):
    os.makedirs(data_in_temporary)
    print(f"Directory Created: {data_in_temporary}")
else: 
    print(f"Already existing directory: {data_in_temporary}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(data_out_temporary):
    os.makedirs(data_out_temporary)
    print(f"Directory Created: {data_out_temporary}")
else:
    print(f"Already existing directory: {data_out_temporary}")

# Printing main directories we will work with
print(f"\nMain directory: {dirname}")
print(f"Raw data folder: {data_in_temporary}")
print(f"Workfile folder: {data_out_temporary}")

Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_raw\2022-11-10\
Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\2022-11-10\

Main directory: c:\Users\33671\Documents\Python\IPS
Raw data folder: c:\Users\33671\Documents\Python\IPS\da_data_raw\2022-11-10\
Workfile folder: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\2022-11-10\


### 3. Verifying APIs responses

In [274]:
#fr-en-ips_colleges
dataset_id = "fr-en-ips_colleges"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-ips_ecoles
dataset_id = "fr-en-ips_ecoles"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-dnb-par-etablissement
dataset_id = "fr-en-dnb-par-etablissement"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre
dataset_id = "fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#georef-france-commune
# dataset_id = "georef-france-commune"
# format = "json"
# limit = "10"
# r = requests.get(f"https://public.opendatasoft.com/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
#                  timeout=2)
# print(f"{dataset_id}")
# print(f"URL: {r.url}")
# print(f"HTTP Response Status Code: {r.status_code}") 
# print(f"HTTP Error: {r.raise_for_status()}")
# print(f"Encoding: {r.encoding}\n")
# r.close()

fr-en-ips_colleges
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-ips_colleges/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-ips_ecoles
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-ips_ecoles/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-dnb-par-etablissement
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-dnb-par-etablissement/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8



### 4. Data Collection

##### ips_colleges dataset

In [275]:
# Loading data
dataset_id = "fr-en-ips_colleges"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_ips_colleges = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

# Adding column about the type of school
df_raw_ips_colleges['type_etablissement']="COLLEGE"

In [276]:
df_raw_ips_colleges.sample(5)

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
870,2021-2022,STRASBOURG,67,BAS-RHIN,0672254L,COLLEGE LES SEPT ARPENTS,67471,SOUFFELWEYERSHEIM,public,105.0,COLLEGE
5013,2021-2022,MONTPELLIER,66,PYRENEES-ORIENTALES,0660063K,INSTITUTION LA SALLE SAINT JEAN COLLEGE PRIVE,66136,PERPIGNAN,privé sous contrat,122.6,COLLEGE
4845,2021-2022,NANCY-METZ,57,MOSELLE,0572488V,COLLEGE LA CARRIERE,57606,SAINT AVOLD,public,89.9,COLLEGE
5709,2021-2022,ORLEANS-TOURS,18,CHER,0180002E,COLLEGE GERARD PHILIPE,18015,AUBIGNY SUR NERE,public,94.7,COLLEGE
4454,2021-2022,TOULOUSE,31,HAUTE-GARONNE,0311231V,COLLEGE MAURICE BECANNE,31555,TOULOUSE,public,96.5,COLLEGE


##### ips_ecoles dataset

In [277]:
# Loading data
dataset_id = "fr-en-ips_ecoles"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_ips_ecoles = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

# Adding column about the type of school
df_raw_ips_ecoles['type_etablissement']="ECOLE"

In [278]:
df_raw_ips_ecoles.sample(5)

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
22728,2021-2022,LILLE,59,NORD,0593695W,ECOLE PRIMAIRE PRIVEE SAINT ROCH,59122,CAMBRAI,privé sous contrat,96.8,ECOLE
22419,2021-2022,RENNES,56,MORBIHAN,0560761Y,ECOLE ELEMENTAIRE PUBLIQUE JULES VERNE,56036,CAUDAN,public,97.0,ECOLE
5530,2021-2022,LILLE,59,NORD,0593863D,ECOLE PRIMAIRE PRIVEE SAINT LOUIS,59599,TOURCOING,privé sous contrat,100.1,ECOLE
8868,2021-2022,LYON,42,LOIRE,0421402W,ECOLE PRIMAIRE PRIVEE SAINTE MARIE DU LANGONNAND,42302,SORBIERS,privé sous contrat,115.5,ECOLE
31169,2021-2022,NANTES,85,VENDEE,0850267D,ECOLE PRIMAIRE PUBLIQUE AIME CESAIRE,85054,LA CHAPELLE HERMIER,public,93.0,ECOLE


##### dnb-par-etablissement dataset

In [279]:
dataset_id = "fr-en-dnb-par-etablissement"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_dnb_par_etablissement = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

In [280]:
df_raw_dnb_par_etablissement.sample(5) 

Unnamed: 0,session,numero_d_etablissement,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite
81798,2009,0050520N,COLLEGE,SERRES (DE),PUBLIC,5166,SERRES,5,HAUTES-ALPES,2,...,18,PROVENCE-ALPES-COTE D'AZUR,43,42,36,15,11,8,2,"85,70%"
39814,2017,0381694K,COLLEGE,DON BOSCO,PRIVE,38179,GIERES,38,ISERE,8,...,1,AUVERGNE-RHONE-ALPES,132,132,130,10,22,50,48,"98,40%"
124276,2021,0932581A,COLLEGE,INTERNATIONAL,PUBLIC,93051,NOISY-LE-GRAND,93,SEINE-SAINT-DENIS,24,...,10,ILE-DE-FRANCE,104,104,104,1,7,34,62,"100,0%"
13781,2010,0595175E,COLLEGE,FENELON,PUBLIC,59122,CAMBRAI,59,NORD,9,...,9,HAUTS-DE-FRANCE,162,152,106,41,27,21,17,"69,70%"
2329,2011,0940585A,LYCEE,FRANCOIS MANSART,PUBLIC,94068,SAINT-MAUR-DES-FOSSES,94,VAL-DE-MARNE,24,...,10,ILE-DE-FRANCE,24,23,16,14,1,1,0,"69,50%"


In [281]:
# Check school type distribution based on last avaiable year results
df_raw_dnb_par_etablissement[df_raw_dnb_par_etablissement['session'] == 2021]['denomination_principale'].value_counts()

COLLEGE                6942
LYCEE PROFESSIONNEL    1315
LYCEE                   481
EREA                     58
AUTRE                    17
CFA                       3
Name: denomination_principale, dtype: int64

##### fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre dataset

In [282]:
# Loading data
dataset_id = "fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_geolocalisation = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

In [283]:
df_raw_geolocalisation.sample(5)

Unnamed: 0,numero_uai,appellation_officielle,denomination_principale,patronyme_uai,secteur_public_prive_libe,adresse_uai,lieu_dit_uai,boite_postale_uai,code_postal_uai,localite_acheminement_uai,...,code_commune,libelle_departement,libelle_region,libelle_academie,position,secteur_prive_code_type_contrat,secteur_prive_libelle_type_contrat,code_ministere,libelle_ministere,date_ouverture
34818,0260064D,Lycée général privé Chabrillan St Jean-Baptiste,LYCEE GENERAL PRIVE,CHABRILLAN ST JEAN-BAPTISTE,Privé,109 route DE DIEULEFIT,,,26200,MONTELIMAR,...,26198,Drôme,Auvergne-Rhône-Alpes,Grenoble,"{'lon': 4.774189881138587, 'lat': 44.550260062...",30.0,CONTRAT D'ASSOCIATION TOUTES CLASSES,6,MINISTERE DE L'EDUCATION NATIONALE,1967-03-14
63638,0911413S,Ecole maternelle Le Parc,ECOLE MATERNELLE PUBLIQUE,LE PARC,Public,1 avenue Gilbert Fergant,,,91220,LE PLESSIS PATE,...,91494,Essonne,Ile-de-France,Versailles,"{'lon': 2.326310672634872, 'lat': 48.612980059...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1974-06-15
59865,0470173H,Ecole primaire,ECOLE ELEMENTAIRE PUBLIQUE,RENE MUZAS,Public,13 rue de l'Ecole,,,47550,BOE,...,47031,Lot-et-Garonne,Nouvelle-Aquitaine,Bordeaux,"{'lon': 0.631249402385469, 'lat': 44.162150624...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1970-01-13
12564,9720152H,Ecole primaire Eugene Revert,ECOLE PRIMAIRE PUBLIQUE,EUGENE REVERT,Public,Ancienne route de Schoëlcher,,,97200,FORT DE FRANCE,...,97209,Martinique,Martinique,Martinique,"{'lon': -61.068430087802746, 'lat': 14.6061838...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1969-04-17
49105,0850222E,Ecole primaire publique les dolmens,ECOLE PRIMAIRE PUBLIQUE,LES DOLMENS,Public,58 rue de la Frébouchère,,,85560,LE BERNARD,...,85022,Vendée,Pays de la Loire,Nantes,"{'lon': -1.468276915911877, 'lat': 46.44105978...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1970-01-12


##### georef-france-commune

In [284]:
# dataset_id = "georef-france-commune"
# format = "json"
# limit = "-1" # argument to pass to get the full dataset 
# columns = "bv2012_code" + "%2C" + "bv2012_name" + "%2C" + "com_uu2020_status" 
# df_raw_georef = pd.read_json(f"https://public.opendatasoft.com/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}&select={columns}")

##### typo-rur

In [285]:
df_raw_typo_rur = pd.read_csv('./da_data_raw/typo-rur.csv')
df_raw_typo_rur

Unnamed: 0,codgeo,libgeo,zonage_rur
0,01001,L'Abergement-Clémenciat,tr2
1,01002,L'Abergement-de-Varey,tr1
2,01004,Ambérieu-en-Bugey,tr5
3,01005,Ambérieux-en-Dombes,tr3
4,01006,Ambléon,tr1
...,...,...,...
34960,97613,M'Tsangamouji,tr5
34961,97614,Ouangani,tr5
34962,97615,Pamandzi,tr5
34963,97616,Sada,tr5


In [286]:
df_raw_typo_rur['zonage_rur'].value_counts()

tr1    8108
tr2    8096
tr3    7394
tr4    7174
tr5    3419
tr6     774
Name: zonage_rur, dtype: int64

In [287]:
df_raw_typo_rur['zonage_rur_lib'] = df_raw_typo_rur['zonage_rur'].map({'tr1':'rural autonome très peu dense', 
                                                                       'tr2': 'rural autonome peu dense', 
                                                                       'tr3': 'rural sous faible influence d\'un pole', 
                                                                       'tr4': 'rural sous forte influence d\'un pole',
                                                                       'tr5': 'urbain densité intermédiaire',
                                                                       'tr6': 'urbain dense'
                                                                       }
                                                                      )

In [288]:
df_raw_typo_rur['zonage_rur_lib'] = df_raw_typo_rur['zonage_rur'].map({'tr1':'rural autonome très peu dense', 
                                                                       'tr2': 'rural autonome peu dense', 
                                                                       'tr3': 'rural sous faible influence d\'un pole', 
                                                                       'tr4': 'rural sous forte influence d\'un pole',
                                                                       'tr5': 'urbain densité intermédiaire',
                                                                       'tr6': 'urbain dense'
                                                                       }
                                                                      )

In [289]:
df_raw_typo_rur['zonage_rur'] = df_raw_typo_rur['zonage_rur'].map({'tr1': 1, 
                                                                       'tr2': 2, 
                                                                       'tr3': 3, 
                                                                       'tr4': 4,
                                                                       'tr5': 5,
                                                                       'tr6': 6
                                                                       }
                                                                      )

In [290]:
df_raw_typo_rur.head()

Unnamed: 0,codgeo,libgeo,zonage_rur,zonage_rur_lib
0,1001,L'Abergement-Clémenciat,2,rural autonome peu dense
1,1002,L'Abergement-de-Varey,1,rural autonome très peu dense
2,1004,Ambérieu-en-Bugey,5,urbain densité intermédiaire
3,1005,Ambérieux-en-Dombes,3,rural sous faible influence d'un pole
4,1006,Ambléon,1,rural autonome très peu dense


In [291]:
df_raw_typo_rur.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34965 entries, 0 to 34964
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   codgeo          34965 non-null  object
 1   libgeo          34965 non-null  object
 2   zonage_rur      34965 non-null  int64 
 3   zonage_rur_lib  34965 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.1+ MB


##### Niveau_de_vie_2013_a_la_commune dataset

In [292]:
# Loading data
df_raw_revenus_par_commune = pd.read_excel("https://www.data.gouv.fr/fr/datasets/r/d3ce0107-416f-42cf-a335-d71f89b00b21")

### 5. Exporting raw data to CSV

In [293]:
# Raw datasets export
file_name = "ips-colleges" + ".csv"
print(f"file name: {file_name}")
df_raw_ips_colleges.to_csv(data_in + file_name, index = False)

file_name = "ips-ecoles" + ".csv"
print(f"file name: {file_name}")
df_raw_ips_ecoles.to_csv(data_in + file_name, index = False)

file_name = "dnb-par-etablissement" + ".csv"
print(f"file name: {file_name}")
df_raw_dnb_par_etablissement.to_csv(data_in + file_name, index = False)

file_name = "geolocalisation-etablissement" + ".csv"
print(f"file name: {file_name}")
df_raw_geolocalisation.to_csv(data_in + file_name, index = False)

file_name = "revenus-par-commune" + ".csv"
print(f"file name: {file_name}")
df_raw_revenus_par_commune.to_csv(data_in + file_name, index = False)

# file_name = "georef-par-commune" + ".csv"
# print(f"file name: {file_name}")
# df_raw_georef.to_csv(data_in + file_name, index = False)

file name: ips-colleges.csv
file name: ips-ecoles.csv
file name: dnb-par-etablissement.csv
file name: geolocalisation-etablissement.csv
file name: revenus-par-commune.csv


#### Exporting raw data to CSV (temporary code - to comment later)

In [294]:
# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "ips-colleges_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_ips_colleges.to_csv(data_in_temporary + file_name, index = False)


# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "ips-ecoles_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_ips_ecoles.to_csv(data_in_temporary + file_name, index = False)

# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "dnb-par-etablissement_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_dnb_par_etablissement.to_csv(data_in_temporary + file_name, index = False)

# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "geolocalisation-etablissement_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_geolocalisation.to_csv(data_in_temporary + file_name, index = False)

# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "revenus-par-commune_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_revenus_par_commune.to_csv(data_in_temporary + file_name, index = False)

### 6. Merging data into master file

##### ips_ecoles & ips_colleges dataframes

In [295]:
# Adding columns about the type of school
df_raw_ips_colleges['type_etablissement']="COLLEGE"
df_raw_ips_ecoles['type_etablissement']="ECOLE"

In [296]:
df_master = pd.concat([df_raw_ips_colleges, df_raw_ips_ecoles])

In [297]:
#safety check
print(f"df_raw_ips_colleges N = {len(df_raw_ips_colleges)}")
print(f"df_raw_ips_ecoles N = {len(df_raw_ips_ecoles)}")
print(f"df_master N = {len(df_master)}")
print(len(df_raw_ips_colleges) + len(df_raw_ips_ecoles) == len(df_master))
print(f"unique uai = {df_master['uai'].nunique()}")

df_raw_ips_colleges N = 6967
df_raw_ips_ecoles N = 32091
df_master N = 39058
True
unique uai = 39058


##### dnb_par_etablissement

College certificate success rate is calculated based on admissions devived by attendees (not registrants). We will keep this convention to calculate honors rates

In [298]:
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,numero_d_etablissement,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite
46368,2010,0720065E,LYCEE PROFESSIONNEL,GEORGE WASHINGTON,PUBLIC,72181,LE MANS,72,SARTHE,17,...,17,PAYS DE LA LOIRE,49,45,39,13,20,6,0,"86,60%"
33407,2010,0260050N,LYCEE PROFESSIONNEL,LEONARD DE VINCI,PUBLIC,26235,PIERRELATTE,26,DROME,8,...,1,AUVERGNE-RHONE-ALPES,24,24,23,10,13,0,0,"95,80%"
136997,2021,0593234V,COLLEGE,ALBERT BALL,PUBLIC,59011,ANNOEULLIN,59,NORD,9,...,9,HAUTS-DE-FRANCE,167,166,150,28,36,25,61,"90,4%"
13472,2010,0421456E,COLLEGE,JEAN PAPON,PUBLIC,42163,LA PACAUDIERE,42,LOIRE,10,...,1,AUVERGNE-RHONE-ALPES,46,45,43,17,15,8,3,"95,50%"
103617,2012,0340894M,COLLEGE,SAINTE-THERESE,PRIVE,34108,FRONTIGNAN,34,HERAULT,11,...,16,OCCITANIE,73,72,68,20,26,13,9,"94,40%"


In [299]:
# renaming key column
df_raw_dnb_par_etablissement = df_raw_dnb_par_etablissement.rename(columns = {'numero_d_etablissement': 'uai'})

In [300]:
#converting dnb string to float
df_raw_dnb_par_etablissement['taux_de_reussite'] = df_raw_dnb_par_etablissement['taux_de_reussite'].str.strip('%').str.replace(',', '.')
df_raw_dnb_par_etablissement['taux_de_reussite'] = df_raw_dnb_par_etablissement['taux_de_reussite'].astype('float')

In [301]:
df_raw_dnb_par_etablissement['dnb_taux_de_sans_mention'] = df_raw_dnb_par_etablissement['nombre_d_admis_sans_mention'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_ab'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_ab'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_b'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_b'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_tb'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_tb'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,uai,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite,dnb_taux_de_sans_mention,dnb_taux_de_mention_ab,dnb_taux_de_mention_b,dnb_taux_de_mention_tb
129776,2017,0311769E,COLLEGE,GRAND SELVE,PUBLIC,31232,GRENADE,31,HAUTE-GARONNE,16,...,109,29,31,27,22,87.2,0.232,0.248,0.216,0.176
34304,2011,0870048E,COLLEGE,JEAN ROSTAND,PUBLIC,87181,SAINT-SULPICE-LAURIERE,87,HAUTE-VIENNE,22,...,40,19,9,6,6,81.6,0.387755,0.183673,0.122449,0.122449
13505,2010,0440244B,COLLEGE,ST BLAISE,PRIVE,44215,VERTOU,44,LOIRE-ATLANTIQUE,17,...,273,33,136,77,27,99.6,0.120438,0.49635,0.281022,0.09854
4981,2006,0572640K,COLLEGE,METZ-ARSENAL,PUBLIC,57463,METZ,57,MOSELLE,12,...,99,69,16,13,1,82.5,0.575,0.133333,0.108333,0.008333
2714,2012,0230028F,COLLEGE,LOUIS DURAND,PUBLIC,23247,SAINT-VAURY,23,CREUSE,22,...,36,12,12,8,4,81.8,0.272727,0.272727,0.181818,0.090909


In [302]:
# Removing columns
df_raw_dnb_par_etablissement.drop(columns=['nombre_d_admis_sans_mention', 'nombre_d_admis_mention_ab', 'nombre_d_admis_mention_b', 'nombre_d_admis_mention_tb'], 
        errors='ignore', 
        inplace=True)
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,uai,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,taux_de_reussite,dnb_taux_de_sans_mention,dnb_taux_de_mention_ab,dnb_taux_de_mention_b,dnb_taux_de_mention_tb
113782,2015,0772154Y,COLLEGE,LEONARD DE VINCI,PUBLIC,77438,SAINT-THIBAULT-DES-VIGNES,77,SEINE-ET-MARNE,24,...,10,ILE-DE-FRANCE,122,121,108,89.2,0.322314,0.280992,0.165289,0.123967
27373,2009,0110725D,COLLEGE,LA NADIERE,PUBLIC,11266,PORT-LA-NOUVELLE,11,AUDE,11,...,16,OCCITANIE,67,61,47,77.0,0.540984,0.147541,0.04918,0.032787
60060,2016,0340869K,COLLEGE,PIC LA SALLE,PRIVE,34032,BEZIERS,34,HERAULT,11,...,16,OCCITANIE,167,167,167,100.0,0.107784,0.335329,0.299401,0.257485
63545,2020,0781107E,COLLEGE,JEAN PHILIPPE RAMEAU,PUBLIC,78646,VERSAILLES,78,YVELINES,25,...,10,ILE-DE-FRANCE,221,221,206,93.2,0.140271,0.18552,0.248869,0.357466
24971,2016,0951507X,COLLEGE,ST MARTIN DE FRANCE,PRIVE,95500,PONTOISE,95,VAL-D'OISE,25,...,10,ILE-DE-FRANCE,135,134,122,91.0,0.477612,0.246269,0.134328,0.052239


In [303]:
#creating sub-dataframe for college cerficate dataframe based on session year  
dfs_dnb_par_etablissement = {}
for session in df_raw_dnb_par_etablissement['session'].unique():
    dfs_dnb_par_etablissement[session] = pd.DataFrame(df_raw_dnb_par_etablissement[df_raw_dnb_par_etablissement['session'] == session])
    dfs_dnb_par_etablissement[session] = dfs_dnb_par_etablissement[session].rename(columns = 
                                                                                   {'nombre_d_inscrits': f'dnb_nombre_d_inscrits_{session}', 
                                                                                    'nombre_de_presents': f'dnb_nombre_de_presents_{session}',
                                                                                    'nombre_total_d_admis': f'dnb_nombre_de_presents_{session}',
                                                                                    'taux_de_reussite': f'dnb_taux_de_reussite_{session}',
                                                                                    'dnb_taux_de_sans_mention': f'dnb_taux_de_sans_mention_{session}',
                                                                                    'dnb_taux_de_mention_ab': f'dnb_taux_de_mention_ab_{session}',
                                                                                    'dnb_taux_de_mention_b': f'dnb_taux_de_mention_b_{session}',
                                                                                    'dnb_taux_de_mention_tb': f'dnb_taux_de_mention_tb_{session}',
                                                                                    })
    print(f"{session}: {len(dfs_dnb_par_etablissement[session])}")

2007: 8623
2008: 8656
2011: 8696
2012: 8697
2006: 8562
2009: 8672
2010: 8646
2013: 8732
2014: 8746
2019: 8797
2020: 8807
2015: 8752
2017: 8796
2018: 8802
2016: 8780
2021: 8816


In [304]:
df_master.head()

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
0,2021-2022,LYON,1,AIN,0010025X,COLLEGE PAUL SIXDENIER,1185,PLATEAU D HAUTEVILLE,public,110.4,COLLEGE
1,2021-2022,LYON,1,AIN,0010041P,COLLEGE VAUGELAS,1244,MEXIMIEUX,public,111.4,COLLEGE
2,2021-2022,LYON,1,AIN,0010092V,COLLEGE PRIVE SAINT JOSEPH,1283,OYONNAX,privé sous contrat,111.3,COLLEGE
3,2021-2022,LYON,1,AIN,0010896U,COLLEGE INTERNATIONAL,1160,FERNEY VOLTAIRE,public,122.6,COLLEGE
4,2021-2022,LYON,1,AIN,0010938P,COLLEGE LES COTES,1289,PERONNAS,public,102.2,COLLEGE


In [305]:
# merging
for session in df_raw_dnb_par_etablissement['session'].unique():
    # df_master = df_master.join(dfs_dnb_par_etablissement[session].set_index('uai'), on='uai', how='left', rsuffix=session)
    # print(session)
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_nombre_d_inscrits_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_nombre_de_presents_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_reussite_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_sans_mention_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_ab_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_b_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_tb_{session}']], on='uai', how='left')
    
    
df_master.head()

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,...,dnb_taux_de_mention_b_2016,dnb_taux_de_mention_tb_2016,dnb_nombre_d_inscrits_2021,dnb_nombre_de_presents_2021,dnb_nombre_de_presents_2021.1,dnb_taux_de_reussite_2021,dnb_taux_de_sans_mention_2021,dnb_taux_de_mention_ab_2021,dnb_taux_de_mention_b_2021,dnb_taux_de_mention_tb_2021
0,2021-2022,LYON,1,AIN,0010025X,COLLEGE PAUL SIXDENIER,1185,PLATEAU D HAUTEVILLE,public,110.4,...,0.119403,0.059701,61.0,61.0,58.0,95.1,0.114754,0.327869,0.196721,0.311475
1,2021-2022,LYON,1,AIN,0010041P,COLLEGE VAUGELAS,1244,MEXIMIEUX,public,111.4,...,0.209402,0.145299,236.0,233.0,207.0,88.8,0.111588,0.236052,0.227468,0.313305
2,2021-2022,LYON,1,AIN,0010092V,COLLEGE PRIVE SAINT JOSEPH,1283,OYONNAX,privé sous contrat,111.3,...,0.211111,0.133333,120.0,120.0,111.0,92.5,0.166667,0.233333,0.3,0.225
3,2021-2022,LYON,1,AIN,0010896U,COLLEGE INTERNATIONAL,1160,FERNEY VOLTAIRE,public,122.6,...,0.247619,0.271429,242.0,235.0,214.0,91.1,0.119149,0.187234,0.268085,0.33617
4,2021-2022,LYON,1,AIN,0010938P,COLLEGE LES COTES,1289,PERONNAS,public,102.2,...,0.203008,0.157895,141.0,141.0,131.0,92.9,0.156028,0.29078,0.255319,0.22695


In [306]:
df_master.rename(columns = {'code_du_departement':'code_departement', 
                            'nom_de_l_etablissment':'nom_etablissment',
                            'code_insee_de_la_commune':'code_insee_commune',
                            'nom_de_la_commune':'commune',
                            }, inplace = True)

In [307]:
# Moving columns
uai = df_master.pop('uai')
df_master.insert(0, 'uai', uai)
del uai

nom_etablissment = df_master.pop('nom_etablissment')
df_master.insert(1, 'nom_etablissment', nom_etablissment)
del nom_etablissment

type = df_master.pop('type_etablissement')
df_master.insert(3, 'type_etablissement', type)
del type

rentree_scolaire = df_master.pop('rentree_scolaire')
df_master.insert(11, 'rentree_scolaire', rentree_scolaire)
del rentree_scolaire

df_master.sample(5)

  df_master.insert(0, 'uai', uai)
  df_master.insert(1, 'nom_etablissment', nom_etablissment)
  df_master.insert(3, 'type_etablissement', type)
  df_master.insert(11, 'rentree_scolaire', rentree_scolaire)


Unnamed: 0,uai,nom_etablissment,type_etablissement,academie,code_departement,departement,code_insee_commune,commune,secteur,ips,...,dnb_taux_de_mention_b_2016,dnb_taux_de_mention_tb_2016,dnb_nombre_d_inscrits_2021,dnb_nombre_de_presents_2021,dnb_nombre_de_presents_2021.1,dnb_taux_de_reussite_2021,dnb_taux_de_sans_mention_2021,dnb_taux_de_mention_ab_2021,dnb_taux_de_mention_b_2021,dnb_taux_de_mention_tb_2021
2741,0030091H,COLLEGE JEAN JACQUES SOULIER,COLLEGE,CLERMONT-FERRAND,3,ALLIER,3185,MONTLUCON,public,98.5,...,0.198718,0.115385,136.0,133.0,108.0,81.2,0.240602,0.18797,0.195489,0.18797
2376,0761697G,COLLEGE CLAUDE BERNARD,COLLEGE,NORMANDIE,76,SEINE MARITIME,76351,LE HAVRE,public,81.1,...,0.142857,0.059524,94.0,94.0,77.0,81.9,0.265957,0.212766,0.234043,0.106383
1171,0860023H,COLLEGE JOACHIM DU BELLAY,COLLEGE,POITIERS,86,VIENNE,86137,LOUDUN,public,90.0,...,0.156522,0.104348,107.0,107.0,85.0,79.4,0.271028,0.205607,0.168224,0.149533
27871,0860660A,ECOLE PRIMAIRE LA TRIMOUILLE,ECOLE,POITIERS,86,VIENNE,86273,LA TRIMOUILLE,public,94.4,...,,,,,,,,,,
27581,0811190D,ECOLE PRIMAIRE ROULANDOU,ECOLE,TOULOUSE,81,TARN,81065,CASTRES,public,91.4,...,,,,,,,,,,


In [308]:
df_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 39057
Columns: 139 entries, uai to dnb_taux_de_mention_tb_2021
dtypes: float64(129), object(10)
memory usage: 41.7+ MB


##### Merging geolocalisation dataset

In [309]:
df_raw_geolocalisation = df_raw_geolocalisation.rename(columns={'numero_uai': 'uai'})

In [310]:
df_master = df_master.join(df_raw_geolocalisation.set_index('uai'), on='uai', how='left', rsuffix='right')
df_master.sample(5)

Unnamed: 0,uai,nom_etablissment,type_etablissement,academie,code_departement,departement,code_insee_commune,commune,secteur,ips,...,code_commune,libelle_departement,libelle_region,libelle_academie,position,secteur_prive_code_type_contrat,secteur_prive_libelle_type_contrat,code_ministere,libelle_ministere,date_ouverture
19218,0910882P,ECOLE PRIMAIRE PRIVEE JEANNE D ARC,ECOLE,VERSAILLES,91,ESSONNE,91103,BRETIGNY SUR ORGE,privé sous contrat,130.6,...,91103,Essonne,Ile-de-France,Versailles,"{'lon': 2.302470292345319, 'lat': 48.605210133...",30.0,CONTRAT D'ASSOCIATION TOUTES CLASSES,6.0,MINISTERE DE L'EDUCATION NATIONALE,1968-12-20
639,0541579Y,COLLEGE EMILE GALLE,COLLEGE,NANCY-METZ,54,MEURTHE-ET-MOSELLE,54314,LEXY,public,107.0,...,54314,Meurthe-et-Moselle,Grand Est,Nancy-Metz,"{'lon': 5.730229782400035, 'lat': 49.497035731...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1971-01-26
37384,0751213L,E E PU COLONEL MOLL,ECOLE,PARIS,75,PARIS,75117,PARIS 17E ARRONDISSEMENT,public,138.0,...,75117,Paris,Ile-de-France,Paris,"{'lon': 2.289999433257113, 'lat': 48.878100781...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1965-05-01
3265,0500012S,COLLEGE LEON GAMBETTA,COLLEGE,NORMANDIE,50,MANCHE,50099,CARENTAN LES MARAIS,public,91.5,...,50099,Manche,Normandie,Normandie,"{'lon': -1.2512693444861451, 'lat': 49.3009381...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1965-05-01
32350,0952245Z,ECOLE PRIMAIRE SIMONE VEIL,ECOLE,VERSAILLES,95,VAL-D'OISE,95018,ARGENTEUIL,public,80.9,...,95018,Val-d'Oise,Ile-de-France,Versailles,"{'lon': 2.234482388027923, 'lat': 48.933975316...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,2019-06-01


In [311]:
# Removing joined columns not bringing any or enough value
df_master.drop(columns=['appellation_officielle', 'denomination_principale', 'patronyme_uai', 'secteur_public_prive_libe', 'adresse_uai',
                     'boite_postale_uai', 'localite_acheminement_uai', 'libelle_commune', 'localisation', 'nature_uai_libe',
                     'etat_etablissement', 'etat_etablissement_libe', 'code_departementright', 'code_commune', 'libelle_departement', 'libelle_academie', 
                     'secteur_prive_code_type_contrat', 'secteur_prive_libelle_type_contrat', 'code_ministere', 'libelle_ministere', 'nature_uai', 'lieu_dit_uai'], 
        errors='ignore', 
        inplace=True)

In [312]:
# Moving columns
code_academie = df_master.pop('code_academie')
df_master.insert(3, 'code_academie', code_academie)
del code_academie

code_region = df_master.pop('code_region')
df_master.insert(8, 'code_region', code_region)
del code_region

libelle_region = df_master.pop('libelle_region')
df_master.insert(9, 'libelle_region', libelle_region)
del libelle_region

df_master.sample(5)

Unnamed: 0,uai,nom_etablissment,type_etablissement,code_academie,academie,code_departement,departement,code_insee_commune,code_region,libelle_region,...,dnb_taux_de_mention_tb_2021,code_postal_uai,coordonnee_x,coordonnee_y,epsg,latitude,longitude,appariement,position,date_ouverture
32808,0050089V,ECOLE PRIMAIRE,ECOLE,2.0,AIX-MARSEILLE,5,HAUTES-ALPES,5012,93.0,Provence-Alpes-Côte d'Azur,...,,5200.0,977424.0,6388278.0,EPSG:2154,44.53834,6.492999,Correcte,"{'lon': 6.4929991950495, 'lat': 44.53833985905...",1968-10-15
27,0030021G,COLLEGE ANDRE BOUTRY,COLLEGE,6.0,CLERMONT-FERRAND,3,ALLIER,3155,84.0,Auvergne-Rhône-Alpes,...,0.166667,3320.0,695286.2,6625912.0,EPSG:2154,46.733303,2.93827,Parfaite,"{'lon': 2.938270030009145, 'lat': 46.733302753...",1965-05-01
14477,0221093U,ECOLE PRIMAIRE PUBLIQUE DE SAINT JACUT DE LA MER,ECOLE,14.0,RENNES,22,COTES D'ARMOR,22302,53.0,Bretagne,...,,22750.0,317598.0,6845624.1,EPSG:2154,48.597683,-2.189669,Parfaite,"{'lon': -2.189669308325677, 'lat': 48.59768269...",1970-06-12
4001,0941891V,COLLEGE PRIVE POULLART DES PLACES,COLLEGE,24.0,CRETEIL,94,VAL-DE-MARNE,94054,11.0,Ile-de-France,...,0.09375,94310.0,655737.7,6850168.2,EPSG:2154,48.750108,2.39798,Parfaite,"{'lon': 2.397980104082767, 'lat': 48.750108026...",1987-09-01
21567,0332068U,ECOLE PRIMAIRE JACQUES BASTIDE,ECOLE,4.0,BORDEAUX,33,GIRONDE,33425,75.0,Nouvelle-Aquitaine,...,,33240.0,427032.0,6443986.7,EPSG:2154,45.041173,-0.467751,MANUEL,"{'lon': -0.46775069837222605, 'lat': 45.041173...",1971-06-17


In [313]:
df_master = df_master.rename(columns={'libelle_region': 'region'})

In [314]:
df_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 39057
Columns: 151 entries, uai to date_ouverture
dtypes: float64(136), object(15)
memory usage: 45.3+ MB


##### Merging Niveau_de_vie_2013_a_la_commune dataset

In [315]:
df_raw_revenus_par_commune = df_raw_revenus_par_commune.rename(columns={'Code Commune': 'code_insee_commune'})

In [316]:
df_master = df_master.join(df_raw_revenus_par_commune.set_index('code_insee_commune'), on='code_insee_commune', how='left', rsuffix='right')

In [317]:
# Removing joined columns not bringing any or enough value
df_master.drop(columns=['Nom Commune'], 
        errors='ignore', 
        inplace=True)

In [318]:
# Renaming columns
df_master.rename(columns = {'Niveau de vie Commune':'niveau_de_vie_commune', 
                            'Niveau de vie Département':'niveau_de_vie_departement'
                            }, inplace = True)

In [319]:
df_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 39057
Columns: 153 entries, uai to niveau_de_vie_departement
dtypes: float64(138), object(15)
memory usage: 45.9+ MB


##### Merging georef-france-commune dataset

##### typo_rur

In [320]:
df_raw_typo_rur = df_raw_typo_rur.rename(columns={'codgeo': 'code_insee_commune'})

In [321]:
df_master = df_master.join(df_raw_typo_rur.set_index('code_insee_commune'), on='code_insee_commune', how='left', rsuffix='right')

In [322]:
df_master

Unnamed: 0,uai,nom_etablissment,type_etablissement,code_academie,academie,code_departement,departement,code_insee_commune,code_region,region,...,latitude,longitude,appariement,position,date_ouverture,niveau_de_vie_commune,niveau_de_vie_departement,libgeo,zonage_rur,zonage_rur_lib
0,0010025X,COLLEGE PAUL SIXDENIER,COLLEGE,10.0,LYON,001,AIN,01185,84.0,Auvergne-Rhône-Alpes,...,45.976219,5.600967,Correcte,"{'lon': 5.600967267017193, 'lat': 45.976219204...",1965-05-01,20198.148148,22343.574665,Plateau d'Hauteville,2.0,rural autonome peu dense
1,0010041P,COLLEGE VAUGELAS,COLLEGE,10.0,LYON,001,AIN,01244,84.0,Auvergne-Rhône-Alpes,...,45.907542,5.188640,Parfaite,"{'lon': 5.188640403667234, 'lat': 45.907541813...",1971-02-16,21367.619048,22343.574665,Meximieux,5.0,urbain densité intermédiaire
2,0010092V,COLLEGE PRIVE SAINT JOSEPH,COLLEGE,10.0,LYON,001,AIN,01283,84.0,Auvergne-Rhône-Alpes,...,46.259653,5.656330,Parfaite,"{'lon': 5.65632989664765, 'lat': 46.2596533673...",1967-01-19,16590.000000,22343.574665,Oyonnax,5.0,urbain densité intermédiaire
3,0010896U,COLLEGE INTERNATIONAL,COLLEGE,10.0,LYON,001,AIN,01160,84.0,Auvergne-Rhône-Alpes,...,46.264811,6.116650,Parfaite,"{'lon': 6.116650067099341, 'lat': 46.264811148...",1970-02-20,25508.333333,22343.574665,Ferney-Voltaire,5.0,urbain densité intermédiaire
4,0010938P,COLLEGE LES COTES,COLLEGE,10.0,LYON,001,AIN,01289,84.0,Auvergne-Rhône-Alpes,...,46.190009,5.205570,Parfaite,"{'lon': 5.205570413701443, 'lat': 46.190008673...",1972-01-25,21632.000000,22343.574665,Péronnas,5.0,urbain densité intermédiaire
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39053,9760209Y,ECOLE ELEMENTAIRE DE CHICONI 5,ECOLE,43.0,MAYOTTE,976,MAYOTTE,97605,6.0,Mayotte,...,-12.836574,45.115200,Parfaite,"{'lon': 45.11520022047355, 'lat': -12.83657434...",1998-09-01,,,Chiconi,5.0,urbain densité intermédiaire
39054,9760253W,ECOLE ELEMENTAIRE PUBLIQUE MAJICAVO KOROPA 3,ECOLE,43.0,MAYOTTE,976,MAYOTTE,97610,6.0,Mayotte,...,-12.744716,45.218850,Parfaite,"{'lon': 45.218850302397314, 'lat': -12.7447159...",2001-09-01,,,Koungou,5.0,urbain densité intermédiaire
39055,9760301Y,ECOLE PRIMAIRE PUBLIQUE TSOUNDZOU 2,ECOLE,43.0,MAYOTTE,976,MAYOTTE,97611,6.0,Mayotte,...,-12.817093,45.197320,Parfaite,"{'lon': 45.19731954390122, 'lat': -12.81709257...",2005-09-01,,,Mamoudzou,6.0,urbain dense
39056,9760302Z,ECOLE PRIMAIRE PUBLIQUE MROALE,ECOLE,43.0,MAYOTTE,976,MAYOTTE,97617,6.0,Mayotte,...,-12.788367,45.128749,Correcte,"{'lon': 45.12874934018609, 'lat': -12.78836708...",2005-09-01,,,Tsingoni,5.0,urbain densité intermédiaire


In [323]:
df_master['zonage_rur_lib'].isnull().value_counts()

False    37860
True      1198
Name: zonage_rur_lib, dtype: int64

### 6. Tidying up dataframe

In [324]:
# Making all strings lower case
df_master = df_master.applymap(lambda x: x.lower() if type(x) == str else x)

# renaming "privé sous contrat" in "prive"
df_master.loc[df_master["secteur"] == "privé sous contrat", "secteur"] = "prive"

# sorting
df_master.sort_values(by=['secteur', 'type_etablissement'], ascending=False, inplace=True)

In [325]:
df_master.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 6967 to 6962
Data columns (total 156 columns):
 #    Column                         Non-Null Count  Dtype  
---   ------                         --------------  -----  
 0    uai                            39058 non-null  object 
 1    nom_etablissment               39058 non-null  object 
 2    type_etablissement             39058 non-null  object 
 3    code_academie                  38933 non-null  float64
 4    academie                       39058 non-null  object 
 5    code_departement               39058 non-null  object 
 6    departement                    39058 non-null  object 
 7    code_insee_commune             39058 non-null  object 
 8    code_region                    38933 non-null  float64
 9    region                         38933 non-null  object 
 10   commune                        39058 non-null  object 
 11   secteur                        39058 non-null  object 
 12   ips                         

### 7. Exporting workfile data to CSV

In [326]:
file_name = "df_master" + ".csv"
print(f"file name: {file_name}")
df_master.to_csv(data_out + file_name, index = False)


# Code to comment at some point
timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
file_name = "df_master_" + timestr + ".csv"
print(f"file name: {file_name}")
df_master.to_csv(data_out_temporary + file_name, index = False)

file name: df_master.csv
file name: df_master_2022-11-10_20-14-34.csv


#### data-collection notebook execution time

In [327]:
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

Execution time in seconds: 119.05273175239563
