This notebook is dedicated to data collection, cleaning and creation of work files for the study of the public/private distribution of French schools

### 1. Importing librairies

In [203]:
import time
startTime = time.time()

import pandas as pd
import os
import requests

### 2. Creating the relevant folders and paths

In [204]:
# Defining current folder as our main directory
dirname = os.getcwd()

# location folders variables
data_in = dirname + "\\da_data_raw\\"
data_out = dirname + "\\da_data_workfiles\\"
graph_out = dirname + "\\graphs\\"

# Checking if data_in path is not present, then create it
if not os.path.exists(data_in):
    os.makedirs(data_in)
    print(f"Directory Created: {data_in}")
else: 
    print(f"Already existing directory: {data_in}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(data_out):
    os.makedirs(data_out)
    print(f"Directory Created: {data_out}")
else:
    print(f"Already existing directory: {data_out}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(graph_out):
    os.makedirs(graph_out)
    print(f"Directory Created: {graph_out}")
else:
    print(f"Already existing directory: {graph_out}")    

# Printing main directories we will work with
print(f"\nMain directory: {dirname}")
print(f"Raw data folder: {data_in}")
print(f"Workfile folder: {data_out}")

Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_raw\
Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\
Already existing directory: c:\Users\33671\Documents\Python\IPS\graphs\

Main directory: c:\Users\33671\Documents\Python\IPS
Raw data folder: c:\Users\33671\Documents\Python\IPS\da_data_raw\
Workfile folder: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\


Temporary code (to comment later)

In [205]:
# Getting current date
datestr = time.strftime("%Y-%m-%d")

# location folders variables (temporary - to comment later)
data_in_temporary = dirname + "\\da_data_raw\\" + datestr + "\\"
data_out_temporary = dirname + "\\da_data_workfiles\\" + datestr + "\\"

# Checking if data_in path is not present, then create it
if not os.path.exists(data_in_temporary):
    os.makedirs(data_in_temporary)
    print(f"Directory Created: {data_in_temporary}")
else: 
    print(f"Already existing directory: {data_in_temporary}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(data_out_temporary):
    os.makedirs(data_out_temporary)
    print(f"Directory Created: {data_out_temporary}")
else:
    print(f"Already existing directory: {data_out_temporary}")

# Printing main directories we will work with
print(f"\nMain directory: {dirname}")
print(f"Raw data folder: {data_in_temporary}")
print(f"Workfile folder: {data_out_temporary}")

Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_raw\2022-11-09\
Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\2022-11-09\

Main directory: c:\Users\33671\Documents\Python\IPS
Raw data folder: c:\Users\33671\Documents\Python\IPS\da_data_raw\2022-11-09\
Workfile folder: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\2022-11-09\


### 3. Verifying APIs responses

In [206]:
#fr-en-ips_colleges
dataset_id = "fr-en-ips_colleges"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-ips_ecoles
dataset_id = "fr-en-ips_ecoles"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-dnb-par-etablissement
dataset_id = "fr-en-dnb-par-etablissement"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre
dataset_id = "fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

fr-en-ips_colleges
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-ips_colleges/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-ips_ecoles
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-ips_ecoles/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-dnb-par-etablissement
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-dnb-par-etablissement/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8



### 4. Data Collection

##### ips_colleges dataset

In [207]:
# Loading data
dataset_id = "fr-en-ips_colleges"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_ips_colleges = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

# Adding column about the type of school
df_raw_ips_colleges['type_etablissement']="COLLEGE"

In [208]:
df_raw_ips_colleges.sample(5)

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
3660,2021-2022,PARIS,75,PARIS,0753896C,COLLEGE PRIVE SAINTE CLOTILDE,75112,PARIS 12E ARRONDISSEMENT,privé sous contrat,140.3,COLLEGE
6092,2021-2022,BORDEAUX,47,LOT-ET-GARONNE,0470678G,COLLEGE ANDRE CROCHEPIERRE,47323,VILLENEUVE SUR LOT,public,82.0,COLLEGE
6234,2021-2022,LILLE,59,NORD,0592917A,COLLEGE PRIVE JEANNE D ARC,59512,ROUBAIX,privé sous contrat,144.2,COLLEGE
815,2021-2022,CLERMONT-FERRAND,63,PUY-DE-DOME,0631763Z,COLLEGE PIERRE MENDES FRANCE,63300,RIOM,public,108.6,COLLEGE
3229,2021-2022,TOULOUSE,46,LOT,0460006G,COLLEGE GAMBETTA,46042,CAHORS,public,92.7,COLLEGE


##### ips_ecoles dataset

In [209]:
# Loading data
dataset_id = "fr-en-ips_ecoles"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_ips_ecoles = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

# Adding column about the type of school
df_raw_ips_ecoles['type_etablissement']="ECOLE"

In [210]:
df_raw_ips_ecoles.sample(5)

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
8046,2021-2022,MONTPELLIER,30,GARD,0301620Y,ECOLE PRIMAIRE LOUIS LEPRINCE RINGUET,30007,ALES,public,75.0,ECOLE
11702,2021-2022,AMIENS,80,SOMME,0801393E,ECOLE PRIMAIRE PRIVEE NOTRE DAME DU BON CONSEIL,80021,AMIENS,privé sous contrat,92.1,ECOLE
736,2021-2022,GRENOBLE,74,HAUTE SAVOIE,0740946B,ECOLE PRIMAIRE PRIVEE SAINTE ANNE ANNECY LE VI...,74010,ANNECY,privé sous contrat,131.7,ECOLE
24962,2021-2022,BORDEAUX,33,GIRONDE,0332146D,ECOLE PRIMAIRE,33125,CISSAC MEDOC,public,94.0,ECOLE
22768,2021-2022,LILLE,59,NORD,0594180Y,ECOLE PRIMAIRE PIERRE ET MARIE CURIE,59136,LE CATEAU CAMBRESIS,public,76.6,ECOLE


##### dnb-par-etablissement dataset

In [211]:
dataset_id = "fr-en-dnb-par-etablissement"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_dnb_par_etablissement = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

In [212]:
df_raw_dnb_par_etablissement.sample(5) 

Unnamed: 0,session,numero_d_etablissement,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite
10120,2012,9740011U,COLLEGE,HEGESIPPE HOARAU,PUBLIC,97414,SAINT-LOUIS,974,LA REUNION,28,...,11,LA REUNION,243,236,188,68,62,31,27,"79,60%"
81540,2008,0891057R,COLLEGE,PHILIPPE COUSTEAU,PUBLIC,89055,BRIENON-SUR-ARMANCON,89,YONNE,7,...,2,BOURGOGNE-FRANCHE-COMTE,43,43,38,15,14,5,4,"88,30%"
11143,2013,0622112J,COLLEGE,SAINT BERTULPHE,PRIVE,62364,FRUGES,62,PAS-DE-CALAIS,9,...,9,HAUTS-DE-FRANCE,51,51,51,20,17,11,3,"100,00%"
97797,2006,0942119T,COLLEGE,INSTITUT FRANCAIS DE GRIGNON,PRIVE,94073,THIAIS,94,VAL-DE-MARNE,24,...,10,ILE-DE-FRANCE,24,23,14,12,1,1,0,"60,80%"
83634,2011,0291655S,COLLEGE,ST CHARLES,PRIVE,29075,GUIPAVAS,29,FINISTERE,14,...,3,BRETAGNE,92,92,92,16,34,31,11,"100,00%"


In [213]:
# Check school type distribution based on last avaiable year results
df_raw_dnb_par_etablissement[df_raw_dnb_par_etablissement['session'] == 2021]['denomination_principale'].value_counts()

COLLEGE                6942
LYCEE PROFESSIONNEL    1315
LYCEE                   481
EREA                     58
AUTRE                    17
CFA                       3
Name: denomination_principale, dtype: int64

##### fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre dataset

In [214]:
# Loading data
dataset_id = "fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_geolocalisation = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

In [215]:
df_raw_geolocalisation.sample(5)

Unnamed: 0,numero_uai,appellation_officielle,denomination_principale,patronyme_uai,secteur_public_prive_libe,adresse_uai,lieu_dit_uai,boite_postale_uai,code_postal_uai,localite_acheminement_uai,...,code_commune,libelle_departement,libelle_region,libelle_academie,position,secteur_prive_code_type_contrat,secteur_prive_libelle_type_contrat,code_ministere,libelle_ministere,date_ouverture
8808,0720379W,Ecole primaire publique,ECOLE PRIMAIRE PUBLIQUE,,Public,Place de la Mairie,,,72170,VIVOIN,...,72380,Sarthe,Pays de la Loire,Nantes,"{'lon': 0.15667945467946, 'lat': 48.2333431253...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1967-01-24
50477,0910302J,Ecole élémentaire Jean-Baptiste Corot,ECOLE ELEMENTAIRE PUBLIQUE,JEAN BAPTISTE COROT,Public,Avenue du Bois de Place,,,91090,LISSES,...,91340,Essonne,Ile-de-France,Versailles,"{'lon': 2.4278697543724173, 'lat': 48.60283201...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1965-05-01
25643,9711245B,INFORM'IP,INFORM'IP,INFORM'IP,Privé,IMMEUBLE JEFFREY RUE TAH BLOUDY,CONCORDIA,113.0,97150,ST MARTIN,...,97801,Saint-Martin,TOM et Collectivités territoriales,Guadeloupe,"{'lon': -63.07756984531601, 'lat': 18.06791279...",10.0,HORS CONTRAT,6,MINISTERE DE L'EDUCATION NATIONALE,2009-09-01
41207,0762245C,Ecole élémentaire Jacques Prévert,ECOLE ELEMENTAIRE PUBLIQUE,JACQUES PREVERT,Public,20 rue Jacques Prévert,NEUVILLE LES DIEPPE,,76370,DIEPPE,...,76217,Seine-Maritime,Normandie,Normandie,"{'lon': 1.101500249133354, 'lat': 49.918612197...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1976-06-08
7260,0620377Y,Ecole maternelle Léon Blum,ECOLE MATERNELLE PUBLIQUE,LEON BLUM,Public,Rue Albert Camus,,,62410,WINGLES,...,62895,Pas-de-Calais,Hauts-de-France,Lille,"{'lon': 2.866990160667272, 'lat': 50.495931977...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1966-09-01


##### Niveau_de_vie_2013_a_la_commune dataset

In [216]:
# Loading data
df_raw_revenus_par_commune = pd.read_excel("https://www.data.gouv.fr/fr/datasets/r/d3ce0107-416f-42cf-a335-d71f89b00b21")

### 5. Exporting raw data to CSV

In [217]:
# Raw datasets export
file_name = "ips-colleges" + ".csv"
print(f"file name: {file_name}")
df_raw_ips_colleges.to_csv(data_in + file_name, index = False)

file_name = "ips-ecoles" + ".csv"
print(f"file name: {file_name}")
df_raw_ips_ecoles.to_csv(data_in + file_name, index = False)

file_name = "dnb-par-etablissement" + ".csv"
print(f"file name: {file_name}")
df_raw_dnb_par_etablissement.to_csv(data_in + file_name, index = False)

file_name = "geolocalisation-etablissement" + ".csv"
print(f"file name: {file_name}")
df_raw_geolocalisation.to_csv(data_in + file_name, index = False)

file_name = "revenus-par-commune" + ".csv"
print(f"file name: {file_name}")
df_raw_revenus_par_commune.to_csv(data_in + file_name, index = False)

file name: ips-colleges.csv
file name: ips-ecoles.csv
file name: dnb-par-etablissement.csv
file name: geolocalisation-etablissement.csv
file name: revenus-par-commune.csv


#### Exporting raw data to CSV (temporary code - to comment later)

In [218]:
# Generating a raw csv file with timestamp included in the name

# Raw datasets
timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
file_name = "ips-colleges_" + timestr + ".csv"
print(f"file name: {file_name}")
df_raw_ips_colleges.to_csv(data_in_temporary + file_name, index = False)


timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
file_name = "ips-ecoles_" + timestr + ".csv"
print(f"file name: {file_name}")
df_raw_ips_ecoles.to_csv(data_in_temporary + file_name, index = False)

timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
file_name = "dnb-par-etablissement_" + timestr + ".csv"
print(f"file name: {file_name}")
df_raw_dnb_par_etablissement.to_csv(data_in_temporary + file_name, index = False)

timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
file_name = "geolocalisation-etablissement_" + timestr + ".csv"
print(f"file name: {file_name}")
df_raw_geolocalisation.to_csv(data_in_temporary + file_name, index = False)

timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
file_name = "revenus-par-commune_" + timestr + ".csv"
print(f"file name: {file_name}")
df_raw_revenus_par_commune.to_csv(data_in_temporary + file_name, index = False)

file name: ips-colleges_2022-11-09_12-44-02.csv
file name: ips-ecoles_2022-11-09_12-44-03.csv
file name: dnb-par-etablissement_2022-11-09_12-44-03.csv
file name: geolocalisation-etablissement_2022-11-09_12-44-04.csv
file name: revenus-par-commune_2022-11-09_12-44-06.csv


### 6. Merging data into master file

##### ips_ecoles & ips_colleges dataframes

In [219]:
# Adding columns about the type of school
df_raw_ips_colleges['type_etablissement']="COLLEGE"
df_raw_ips_ecoles['type_etablissement']="ECOLE"

In [220]:
df_master = pd.concat([df_raw_ips_colleges, df_raw_ips_ecoles])

In [221]:
#safety check
print(f"df_raw_ips_colleges N = {len(df_raw_ips_colleges)}")
print(f"df_raw_ips_ecoles N = {len(df_raw_ips_ecoles)}")
print(f"df_master N = {len(df_master)}")
print(len(df_raw_ips_colleges) + len(df_raw_ips_ecoles) == len(df_master))
print(f"unique uai = {df_master['uai'].nunique()}")

df_raw_ips_colleges N = 6967
df_raw_ips_ecoles N = 32091
df_master N = 39058
True
unique uai = 39058


##### dnb_par_etablissement

College certificate success rate is calculated based on admissions devived by attendees (not registrants). We will keep this convention to calculate honors rates

In [222]:
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,numero_d_etablissement,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite
111571,2016,0801489J,COLLEGE,DU MARQUENTERRE,PUBLIC,80688,RUE,80,SOMME,20,...,9,HAUTS-DE-FRANCE,69,68,54,18,16,12,8,"79,40%"
31442,2006,0752895P,COLLEGE,CHARLES PEGUY,PRIVE,75111,PARIS 11E ARRONDISSEMENT,75,PARIS,1,...,10,ILE-DE-FRANCE,133,133,133,27,45,40,21,"100,00%"
90338,2010,0441859G,COLLEGE,ERIC TABARLY,PUBLIC,44055,LA BAULE-ESCOUBLAC,44,LOIRE-ATLANTIQUE,17,...,17,PAYS DE LA LOIRE,92,92,85,16,25,36,8,"92,30%"
90990,2010,0610037J,COLLEGE,GASTON LEFAVRAIS,PUBLIC,61339,PUTANGES-LE-LAC,61,ORNE,5,...,14,NORMANDIE,47,46,41,17,10,9,5,"89,10%"
63099,2017,0670076U,COLLEGE,ALBERT CAMUS,PUBLIC,67472,SOUFFLENHEIM,67,BAS-RHIN,15,...,6,GRAND EST,151,147,119,28,28,22,41,"80,90%"


In [223]:
# renaming key column
df_raw_dnb_par_etablissement = df_raw_dnb_par_etablissement.rename(columns = {'numero_d_etablissement': 'uai'})

In [224]:
#converting dnb string to float
df_raw_dnb_par_etablissement['taux_de_reussite'] = df_raw_dnb_par_etablissement['taux_de_reussite'].str.strip('%').str.replace(',', '.')
df_raw_dnb_par_etablissement['taux_de_reussite'] = df_raw_dnb_par_etablissement['taux_de_reussite'].astype('float')

In [225]:
df_raw_dnb_par_etablissement['dnb_taux_de_sans_mention'] = df_raw_dnb_par_etablissement['nombre_d_admis_sans_mention'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_ab'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_ab'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_b'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_b'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_tb'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_tb'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,uai,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite,dnb_taux_de_sans_mention,dnb_taux_de_mention_ab,dnb_taux_de_mention_b,dnb_taux_de_mention_tb
138644,2021,0570108H,LYCEE,LA BRIQUERIE,PUBLIC,57672,THIONVILLE,57,MOSELLE,12,...,49,10,17,18,4,90.7,0.185185,0.314815,0.333333,0.074074
21270,2019,0401070W,COLLEGE,AIME CESAIRE,PUBLIC,40261,SAINT-GEOURS-DE-MAREMNE,40,LANDES,4,...,120,28,28,35,29,87.5,0.20438,0.20438,0.255474,0.211679
3418,2012,0595389M,COLLEGE,NOTRE-DAME DES DUNES,PRIVE,59183,DUNKERQUE,59,NORD,9,...,149,37,50,27,35,99.3,0.246667,0.333333,0.18,0.233333
42208,2012,0941596Z,COLLEGE,NICOLAS DE STAEL,PUBLIC,94046,MAISONS-ALFORT,94,VAL-DE-MARNE,24,...,88,33,30,19,6,87.1,0.326733,0.29703,0.188119,0.059406
107537,2013,0440066H,COLLEGE,RENE GUY CADOU,PUBLIC,44154,SAINT-BREVIN-LES-PINS,44,LOIRE-ATLANTIQUE,17,...,96,27,38,20,11,84.2,0.236842,0.333333,0.175439,0.096491


In [226]:
# Removing columns
df_raw_dnb_par_etablissement.drop(columns=['nombre_d_admis_sans_mention', 'nombre_d_admis_mention_ab', 'nombre_d_admis_mention_b', 'nombre_d_admis_mention_tb'], 
        errors='ignore', 
        inplace=True)
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,uai,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,taux_de_reussite,dnb_taux_de_sans_mention,dnb_taux_de_mention_ab,dnb_taux_de_mention_b,dnb_taux_de_mention_tb
47326,2011,0332344U,LYCEE PROFESSIONNEL,HENRI BRULLE,PUBLIC,33243,LIBOURNE,33,GIRONDE,4,...,15,NOUVELLE-AQUITAINE,24,22,22,100.0,0.727273,0.227273,0.045455,0.0
68347,2019,0271320N,COLLEGE,ROSA PARKS,PUBLIC,27016,LES ANDELYS,27,EURE,21,...,14,NORMANDIE,109,108,82,75.9,0.259259,0.175926,0.157407,0.166667
67567,2019,0081001V,COLLEGE,JULES LEROUX,PUBLIC,8480,VILLERS-SEMEUSE,8,ARDENNES,19,...,6,GRAND EST,81,81,70,86.4,0.209877,0.209877,0.197531,0.246914
128739,2016,0692579D,COLLEGE,MARTIN LUTHER KING,PUBLIC,69283,MIONS,69,RHONE,10,...,1,AUVERGNE-RHONE-ALPES,168,163,138,84.6,0.306748,0.208589,0.220859,0.110429
132274,2012,9740036W,COLLEGE,TERRAIN FLEURY,PUBLIC,97422,LE TAMPON,974,LA REUNION,28,...,11,LA REUNION,252,252,194,76.9,0.22619,0.25,0.178571,0.115079


In [227]:
#creating sub-dataframe for college cerficate dataframe based on session year  
dfs_dnb_par_etablissement = {}
for session in df_raw_dnb_par_etablissement['session'].unique():
    dfs_dnb_par_etablissement[session] = pd.DataFrame(df_raw_dnb_par_etablissement[df_raw_dnb_par_etablissement['session'] == session])
    dfs_dnb_par_etablissement[session] = dfs_dnb_par_etablissement[session].rename(columns = 
                                                                                   {'nombre_d_inscrits': f'dnb_nombre_d_inscrits_{session}', 
                                                                                    'nombre_de_presents': f'dnb_nombre_de_presents_{session}',
                                                                                    'nombre_total_d_admis': f'dnb_nombre_de_presents_{session}',
                                                                                    'taux_de_reussite': f'dnb_taux_de_reussite_{session}',
                                                                                    'dnb_taux_de_sans_mention': f'dnb_taux_de_sans_mention_{session}',
                                                                                    'dnb_taux_de_mention_ab': f'dnb_taux_de_mention_ab_{session}',
                                                                                    'dnb_taux_de_mention_b': f'dnb_taux_de_mention_b_{session}',
                                                                                    'dnb_taux_de_mention_tb': f'dnb_taux_de_mention_tb_{session}',
                                                                                    })
    print(f"{session}: {len(dfs_dnb_par_etablissement[session])}")

2007: 8623
2008: 8656
2011: 8696
2012: 8697
2006: 8562
2009: 8672
2010: 8646
2013: 8732
2014: 8746
2019: 8797
2020: 8807
2015: 8752
2017: 8796
2018: 8802
2016: 8780
2021: 8816


In [228]:
df_master.head()

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
0,2021-2022,LYON,1,AIN,0010025X,COLLEGE PAUL SIXDENIER,1185,PLATEAU D HAUTEVILLE,public,110.4,COLLEGE
1,2021-2022,LYON,1,AIN,0010041P,COLLEGE VAUGELAS,1244,MEXIMIEUX,public,111.4,COLLEGE
2,2021-2022,LYON,1,AIN,0010092V,COLLEGE PRIVE SAINT JOSEPH,1283,OYONNAX,privé sous contrat,111.3,COLLEGE
3,2021-2022,LYON,1,AIN,0010896U,COLLEGE INTERNATIONAL,1160,FERNEY VOLTAIRE,public,122.6,COLLEGE
4,2021-2022,LYON,1,AIN,0010938P,COLLEGE LES COTES,1289,PERONNAS,public,102.2,COLLEGE


In [229]:
# merging
for session in df_raw_dnb_par_etablissement['session'].unique():
    # df_master = df_master.join(dfs_dnb_par_etablissement[session].set_index('uai'), on='uai', how='left', rsuffix=session)
    # print(session)
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_nombre_d_inscrits_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_nombre_de_presents_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_reussite_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_sans_mention_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_ab_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_b_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_tb_{session}']], on='uai', how='left')
    
    
df_master.head()

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,...,dnb_taux_de_mention_b_2016,dnb_taux_de_mention_tb_2016,dnb_nombre_d_inscrits_2021,dnb_nombre_de_presents_2021,dnb_nombre_de_presents_2021.1,dnb_taux_de_reussite_2021,dnb_taux_de_sans_mention_2021,dnb_taux_de_mention_ab_2021,dnb_taux_de_mention_b_2021,dnb_taux_de_mention_tb_2021
0,2021-2022,LYON,1,AIN,0010025X,COLLEGE PAUL SIXDENIER,1185,PLATEAU D HAUTEVILLE,public,110.4,...,0.119403,0.059701,61.0,61.0,58.0,95.1,0.114754,0.327869,0.196721,0.311475
1,2021-2022,LYON,1,AIN,0010041P,COLLEGE VAUGELAS,1244,MEXIMIEUX,public,111.4,...,0.209402,0.145299,236.0,233.0,207.0,88.8,0.111588,0.236052,0.227468,0.313305
2,2021-2022,LYON,1,AIN,0010092V,COLLEGE PRIVE SAINT JOSEPH,1283,OYONNAX,privé sous contrat,111.3,...,0.211111,0.133333,120.0,120.0,111.0,92.5,0.166667,0.233333,0.3,0.225
3,2021-2022,LYON,1,AIN,0010896U,COLLEGE INTERNATIONAL,1160,FERNEY VOLTAIRE,public,122.6,...,0.247619,0.271429,242.0,235.0,214.0,91.1,0.119149,0.187234,0.268085,0.33617
4,2021-2022,LYON,1,AIN,0010938P,COLLEGE LES COTES,1289,PERONNAS,public,102.2,...,0.203008,0.157895,141.0,141.0,131.0,92.9,0.156028,0.29078,0.255319,0.22695


In [230]:
df_master.rename(columns = {'code_du_departement':'code_departement', 
                            'nom_de_l_etablissment':'nom_etablissment',
                            'code_insee_de_la_commune':'code_insee_commune',
                            'nom_de_la_commune':'commune',
                            }, inplace = True)

In [231]:
# Moving columns
uai = df_master.pop('uai')
df_master.insert(0, 'uai', uai)
del uai

nom_etablissment = df_master.pop('nom_etablissment')
df_master.insert(1, 'nom_etablissment', nom_etablissment)
del nom_etablissment

type = df_master.pop('type_etablissement')
df_master.insert(3, 'type_etablissement', type)
del type

rentree_scolaire = df_master.pop('rentree_scolaire')
df_master.insert(11, 'rentree_scolaire', rentree_scolaire)
del rentree_scolaire

df_master.sample(5)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Unnamed: 0,uai,nom_etablissment,type_etablissement,academie,code_departement,departement,code_insee_commune,commune,secteur,ips,...,dnb_taux_de_mention_b_2016,dnb_taux_de_mention_tb_2016,dnb_nombre_d_inscrits_2021,dnb_nombre_de_presents_2021,dnb_nombre_de_presents_2021.1,dnb_taux_de_reussite_2021,dnb_taux_de_sans_mention_2021,dnb_taux_de_mention_ab_2021,dnb_taux_de_mention_b_2021,dnb_taux_de_mention_tb_2021
37706,0780482A,ECOLE ELEMENTAIRE LOUIS PASTEUR,ECOLE,VERSAILLES,78,YVELINES,78138,CHANTELOUP LES VIGNES,public,115.7,...,,,,,,,,,,
10354,0271603W,ECOLE PRIMAIRE,ECOLE,NORMANDIE,27,EURE,27599,SAINT PIERRE LA GARENNE,public,111.3,...,,,,,,,,,,
30617,0100216F,ECOLE ELEMENTAIRE,ECOLE,REIMS,10,AUBE,10113,COUVIGNON,public,103.9,...,,,,,,,,,,
13516,0021604H,ECOLE PRIMAIRE PRIVEE SAINTE MARIE MADELEINE,ECOLE,AMIENS,2,AISNE,2168,CHATEAU THIERRY,privé sous contrat,114.9,...,,,,,,,,,,
7751,0750353B,E P PR SAINTE THERESE,ECOLE,PARIS,75,PARIS,75119,PARIS 19E ARRONDISSEMENT,privé sous contrat,138.6,...,,,,,,,,,,


##### Merging geolocalisation dataset

In [232]:
df_raw_geolocalisation = df_raw_geolocalisation.rename(columns={'numero_uai': 'uai'})

In [233]:
df_master = df_master.join(df_raw_geolocalisation.set_index('uai'), on='uai', how='left', rsuffix='right')
df_master.sample(5)

Unnamed: 0,uai,nom_etablissment,type_etablissement,academie,code_departement,departement,code_insee_commune,commune,secteur,ips,...,code_commune,libelle_departement,libelle_region,libelle_academie,position,secteur_prive_code_type_contrat,secteur_prive_libelle_type_contrat,code_ministere,libelle_ministere,date_ouverture
33957,0271788X,ECOLE ELEMENTAIRE,ECOLE,NORMANDIE,27,EURE,27298,GRAVERON SEMERVILLE,public,113.1,...,27298,Eure,Normandie,Normandie,"{'lon': 0.975590352098275, 'lat': 49.092936097...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,2003-09-01
24068,0680964D,ECOLE ELEMENTAIRE INTERCOMMUNALE,ECOLE,STRASBOURG,68,HAUT-RHIN,68327,STETTEN,public,120.0,...,68327,Haut-Rhin,Grand Est,Strasbourg,"{'lon': 7.428699423861031, 'lat': 47.624874144...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1966-11-17
11877,0501721Z,ECOLE PRIMAIRE JOSEPH BOCHER,ECOLE,NORMANDIE,50,MANCHE,50129,CHERBOURG EN COTENTIN,public,105.6,...,50129,Manche,Normandie,Normandie,"{'lon': -1.6637004637583601, 'lat': 49.6534619...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1981-09-10
32033,0342045N,ECOLE PRIMAIRE LES CROZES,ECOLE,MONTPELLIER,34,HERAULT,34108,FRONTIGNAN,public,116.7,...,34108,Hérault,Occitanie,Montpellier,"{'lon': 3.758719946725911, 'lat': 43.455705398...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,2000-09-01
7820,0754942P,E P PU TANDOU R,ECOLE,PARIS,75,PARIS,75119,PARIS 19E ARRONDISSEMENT,public,91.3,...,75119,Paris,Ile-de-France,Paris,"{'lon': 2.379950298081658, 'lat': 48.886147626...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1999-09-01


In [234]:
# Removing joined columns not bringing any or enough value
df_master.drop(columns=['appellation_officielle', 'denomination_principale', 'patronyme_uai', 'secteur_public_prive_libe', 'adresse_uai',
                     'boite_postale_uai', 'localite_acheminement_uai', 'libelle_commune', 'localisation', 'nature_uai_libe',
                     'etat_etablissement', 'etat_etablissement_libe', 'code_departementright', 'code_commune', 'libelle_departement', 'libelle_academie', 
                     'secteur_prive_code_type_contrat', 'secteur_prive_libelle_type_contrat', 'code_ministere', 'libelle_ministere', 'nature_uai', 'lieu_dit_uai'], 
        errors='ignore', 
        inplace=True)

In [235]:
# Moving columns
code_academie = df_master.pop('code_academie')
df_master.insert(3, 'code_academie', code_academie)
del code_academie

code_region = df_master.pop('code_region')
df_master.insert(8, 'code_region', code_region)
del code_region

libelle_region = df_master.pop('libelle_region')
df_master.insert(9, 'libelle_region', libelle_region)
del libelle_region

df_master.sample(5)

Unnamed: 0,uai,nom_etablissment,type_etablissement,code_academie,academie,code_departement,departement,code_insee_commune,code_region,libelle_region,...,dnb_taux_de_mention_tb_2021,code_postal_uai,coordonnee_x,coordonnee_y,epsg,latitude,longitude,appariement,position,date_ouverture
36435,0623036N,ECOLE PRIMAIRE JULES FERRY,ECOLE,9.0,LILLE,62,PAS-DE-CALAIS,62178,32.0,Hauts-de-France,...,,62700.0,667427.8,7043174.4,EPSG:2154,50.4851,2.541741,Parfaite,"{'lon': 2.5417405828755752, 'lat': 50.48509997...",1975-09-01
21071,0270413C,ECOLE PRIMAIRE LOUIS PERGAUD,ECOLE,70.0,NORMANDIE,27,EURE,27593,28.0,Normandie,...,,27370.0,551429.2,6907529.0,EPSG:2154,49.250151,0.95951,Parfaite,"{'lon': 0.959510080877863, 'lat': 49.250150605...",1965-08-25
29548,0573693E,ECOLE PRIMAIRE BICULTURELLE ROBERT SCHUMAN,ECOLE,12.0,NANCY-METZ,57,MOSELLE,57160,44.0,Grand Est,...,,57150.0,967810.1,6905747.8,EPSG:2154,49.195357,6.675161,Parfaite,"{'lon': 6.675160529708393, 'lat': 49.195356686...",2005-09-01
36514,0630658Y,ECOLE PRIMAIRE,ECOLE,6.0,CLERMONT-FERRAND,63,PUY-DE-DOME,63305,84.0,Auvergne-Rhône-Alpes,...,,63210.0,684957.9,6509394.5,EPSG:2154,45.683948,2.80675,Parfaite,"{'lon': 2.806749606528464, 'lat': 45.683948054...",1965-07-30
32770,0030571E,ECOLE PRIMAIRE,ECOLE,6.0,CLERMONT-FERRAND,3,ALLIER,3121,84.0,Auvergne-Rhône-Alpes,...,,3400.0,731561.3,6615321.0,EPSG:2154,46.637214,3.412585,Parfaite,"{'lon': 3.412585324569879, 'lat': 46.637213764...",1967-05-10


In [236]:
df_master = df_master.rename(columns={'libelle_region': 'region'})

##### Merging Niveau_de_vie_2013_a_la_commune dataset

In [237]:
df_raw_revenus_par_commune = df_raw_revenus_par_commune.rename(columns={'Code Commune': 'code_insee_commune'})

In [238]:
df_master = df_master.join(df_raw_revenus_par_commune.set_index('code_insee_commune'), on='code_insee_commune', how='left', rsuffix='right')

In [239]:
# Removing joined columns not bringing any or enough value
df_master.drop(columns=['Nom Commune'], 
        errors='ignore', 
        inplace=True)

In [240]:
# Renaming columns
df_master.rename(columns = {'Niveau de vie Commune':'niveau_de_vie_commune', 
                            'Niveau de vie Département':'niveau_de_vie_departement'
                            }, inplace = True)

### 6. Tidying up dataframe

In [241]:
# Making all strings lower case
df_master = df_master.applymap(lambda x: x.lower() if type(x) == str else x)

# renaming "privé sous contrat" in "prive"
df_master.loc[df_master["secteur"] == "privé sous contrat", "secteur"] = "prive"

# sorting
df_master.sort_values(by=['secteur', 'type_etablissement'], ascending=False, inplace=True)

### 7. Exporting workfile data to CSV

In [242]:
file_name = "df_master" + ".csv"
print(f"file name: {file_name}")
df_master.to_csv(data_out + file_name, index = False)


# Code to comment at some point
timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
file_name = "df_master_" + timestr + ".csv"
print(f"file name: {file_name}")
df_master.to_csv(data_out_temporary + file_name, index = False)

file name: df_master.csv
file name: df_master_2022-11-09_12-44-24.csv


#### data-collection notebook execution time

In [243]:
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

Execution time in seconds: 147.61149644851685
