This notebook is dedicated to data collection, cleaning and creation of work files for the study of the public/private distribution of French schools

### 1. Importing librairies

In [145]:
import time
startTime = time.time()
 
import pandas as pd
import os
import requests

### 2. Creating the relevant folders and paths

In [1]:
# Defining current folder as our main directory
dirname = os.getcwd()

# location folders variables
data_in = dirname + "\\da_data_raw\\"
data_out = dirname + "\\da_data_workfiles\\"
graph_out = dirname + "\\graphs\\"
html_graph_out = "C:\\Users\\33671\\Documents\\my-website\\html5up-massively\\images\\graphs\\"

# Checking if data_in path is not present, then create it
if not os.path.exists(data_in):
    os.makedirs(data_in)
    print(f"Directory Created: {data_in}")
else: 
    print(f"Already existing directory: {data_in}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(data_out):
    os.makedirs(data_out)
    print(f"Directory Created: {data_out}")
else:
    print(f"Already existing directory: {data_out}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(graph_out):
    os.makedirs(graph_out)
    print(f"Directory Created: {graph_out}")
else:
    print(f"Already existing directory: {graph_out}")    

# Printing main directories we will work with
print(f"\nMain directory: {dirname}")
print(f"Raw data folder: {data_in}")
print(f"Workfile folder: {data_out}")

Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_raw\
Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\
Already existing directory: c:\Users\33671\Documents\Python\IPS\graphs\

Main directory: c:\Users\33671\Documents\Python\IPS
Raw data folder: c:\Users\33671\Documents\Python\IPS\da_data_raw\
Workfile folder: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\


Temporary code (to comment later)

In [147]:
# Getting current date
datestr = time.strftime("%Y-%m-%d")

# location folders variables (temporary - to comment later)
data_in_temporary = dirname + "\\da_data_raw\\" + datestr + "\\"
data_out_temporary = dirname + "\\da_data_workfiles\\" + datestr + "\\"

# Checking if data_in path is not present, then create it
if not os.path.exists(data_in_temporary):
    os.makedirs(data_in_temporary)
    print(f"Directory Created: {data_in_temporary}")
else: 
    print(f"Already existing directory: {data_in_temporary}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(data_out_temporary):
    os.makedirs(data_out_temporary)
    print(f"Directory Created: {data_out_temporary}")
else:
    print(f"Already existing directory: {data_out_temporary}")

# Printing main directories we will work with
print(f"\nMain directory: {dirname}")
print(f"Raw data folder: {data_in_temporary}")
print(f"Workfile folder: {data_out_temporary}")

Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_raw\2022-11-14\
Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\2022-11-14\

Main directory: c:\Users\33671\Documents\Python\IPS
Raw data folder: c:\Users\33671\Documents\Python\IPS\da_data_raw\2022-11-14\
Workfile folder: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\2022-11-14\


### 3. Verifying APIs responses

In [148]:
#fr-en-ips_colleges
dataset_id = "fr-en-ips_colleges"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-ips_ecoles
dataset_id = "fr-en-ips_ecoles"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-dnb-par-etablissement
dataset_id = "fr-en-dnb-par-etablissement"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre
dataset_id = "fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#georef-france-commune
# dataset_id = "georef-france-commune"
# format = "json"
# limit = "10"
# r = requests.get(f"https://public.opendatasoft.com/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
#                  timeout=2)
# print(f"{dataset_id}")
# print(f"URL: {r.url}")
# print(f"HTTP Response Status Code: {r.status_code}") 
# print(f"HTTP Error: {r.raise_for_status()}")
# print(f"Encoding: {r.encoding}\n")
# r.close()

fr-en-ips_colleges
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-ips_colleges/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-ips_ecoles
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-ips_ecoles/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-dnb-par-etablissement
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-dnb-par-etablissement/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8



### 4. Data Collection

##### ips_colleges dataset

In [149]:
# Loading data
dataset_id = "fr-en-ips_colleges"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_ips_colleges = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

# Adding column about the type of school
df_raw_ips_colleges['type_etablissement']="COLLEGE"

In [150]:
df_raw_ips_colleges.sample(5)

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
498,2021-2022,LYON,42,LOIRE,0420978K,COLLEGE PRIVE FRANCOIS D ASSISE,42187,ROANNE,privé sous contrat,120.1,COLLEGE
3919,2021-2022,VERSAILLES,92,HAUTS-DE-SEINE,0920624D,COLLEGE ROMAIN ROLLAND,92060,LE PLESSIS ROBINSON,public,127.5,COLLEGE
1249,2021-2022,VERSAILLES,92,HAUTS-DE-SEINE,0921396T,COLLEGE VICTOR HUGO,92040,ISSY LES MOULINEAUX,public,116.6,COLLEGE
1942,2021-2022,NANTES,49,MAINE-ET-LOIRE,0490955X,COLLEGE SAINT EXUPERY,49063,CHALONNES SUR LOIRE,public,115.4,COLLEGE
4657,2021-2022,NANTES,44,LOIRE-ATLANTIQUE,0440229K,COLLEGE PRIVE SAINT PAUL,44143,REZE,privé sous contrat,127.5,COLLEGE


##### ips_ecoles dataset

In [151]:
# Loading data
dataset_id = "fr-en-ips_ecoles"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_ips_ecoles = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

# Adding column about the type of school
df_raw_ips_ecoles['type_etablissement']="ECOLE"

In [152]:
df_raw_ips_ecoles.sample(5)

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
8696,2021-2022,GRENOBLE,38,ISERE,0383132Y,ECOLE PRIMAIRE,38157,ESTRABLIN,public,112.8,ECOLE
26976,2021-2022,NORMANDIE,27,EURE,0271517C,ECOLE ELEMENTAIRE JEAN BURY,27580,SAINT OUEN DE THOUBERVILLE,public,111.1,ECOLE
13310,2021-2022,MONTPELLIER,11,AUDE,0110532U,ECOLE ELEMENTAIRE JEAN BLANC,11325,ROUFFIAC D AUDE,public,94.0,ECOLE
16001,2021-2022,RENNES,56,MORBIHAN,0560772K,ECOLE PRIMAIRE PUBLIQUE LE TAUREAU BLEU,56043,CONCORET,public,111.2,ECOLE
3603,2021-2022,MONTPELLIER,30,GARD,0301127M,ECOLE ELEMENTAIRE PRIVEE NOTRE DAME DES GARDIANS,30006,AIMARGUES,privé sous contrat,107.0,ECOLE


##### dnb-par-etablissement dataset

In [153]:
dataset_id = "fr-en-dnb-par-etablissement"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_dnb_par_etablissement = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

In [154]:
df_raw_dnb_par_etablissement.sample(5) 

Unnamed: 0,session,numero_d_etablissement,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite
72895,2015,0240050Z,LYCEE PROFESSIONNEL,ARNAUD DANIEL,PUBLIC,24352,RIBERAC,24,DORDOGNE,4,...,15,NOUVELLE-AQUITAINE,23,21,17,13,4,0,0,"80,90%"
89676,2010,0280867R,COLLEGE,FRANCOIS RABELAIS,PUBLIC,28103,CLOYES-LES-TROIS-RIVIERES,28,EURE-ET-LOIR,18,...,4,CENTRE-VAL DE LOIRE,70,68,47,18,16,11,2,"69,10%"
65469,2014,0020067M,COLLEGE,CONDORCET,PUBLIC,2789,VERVINS,2,AISNE,20,...,9,HAUTS-DE-FRANCE,69,64,50,20,10,15,5,"78,10%"
62836,2013,0530779J,COLLEGE,JEAN ROSTAND,PUBLIC,53062,CHATEAU-GONTIER-SUR-MAYENNE,53,MAYENNE,17,...,17,PAYS DE LA LOIRE,91,91,80,24,25,18,13,"87,90%"
122736,2021,0761949F,COLLEGE,FERNAND LEGER,PUBLIC,76498,LE PETIT-QUEVILLY,76,SEINE MARITIME,70,...,14,NORMANDIE,126,123,100,33,21,25,21,"81,3%"


In [155]:
# Check school type distribution based on last avaiable year results
df_raw_dnb_par_etablissement[df_raw_dnb_par_etablissement['session'] == 2021]['denomination_principale'].value_counts()

COLLEGE                6942
LYCEE PROFESSIONNEL    1315
LYCEE                   481
EREA                     58
AUTRE                    17
CFA                       3
Name: denomination_principale, dtype: int64

##### fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre dataset

In [156]:
# Loading data
dataset_id = "fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_geolocalisation = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

In [157]:
df_raw_geolocalisation.sample(5)

Unnamed: 0,numero_uai,appellation_officielle,denomination_principale,patronyme_uai,secteur_public_prive_libe,adresse_uai,lieu_dit_uai,boite_postale_uai,code_postal_uai,localite_acheminement_uai,...,code_commune,libelle_departement,libelle_region,libelle_academie,position,secteur_prive_code_type_contrat,secteur_prive_libelle_type_contrat,code_ministere,libelle_ministere,date_ouverture
39766,0580930W,Ecole primaire d'application Guynemer,ECOLE PRIMAIRE APPLICATION,GEORGES GUYNEMER,Public,16 rue des Tailles,,,58000,NEVERS,...,58194,Nièvre,Bourgogne-Franche-Comté,Dijon,"{'lon': 3.162220552055981, 'lat': 47.002027408...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1997-09-01
17258,0241236N,Section d'enseignement général et professionne...,SEGPA PRIVEE,CLG PR NOTRE DAME,Privé,,LE CLUZEAU,,24240,SIGOULES ET FLAUGEAC,...,24534,Dordogne,Nouvelle-Aquitaine,Bordeaux,"{'lon': 0.41809232528516704, 'lat': 44.7651819...",30.0,CONTRAT D'ASSOCIATION TOUTES CLASSES,6,MINISTERE DE L'EDUCATION NATIONALE,2005-03-01
27349,0561247B,Ecole primaire privée Sainte Thérèse,ECOLE PRIMAIRE PRIVEE,STE THERESE,Privé,20 avenue Porhoët,,,56490,ST MALO DES TROIS FONTAINES,...,56227,Morbihan,Bretagne,Rennes,"{'lon': -2.47223026677767, 'lat': 48.013877396...",30.0,CONTRAT D'ASSOCIATION TOUTES CLASSES,6,MINISTERE DE L'EDUCATION NATIONALE,1971-03-17
7671,0580170V,Ecole élémentaire,ECOLE ELEMENTAIRE PUBLIQUE,GUILLAUME APOLLINAIRE,Public,,LA CROIX,,58330,ST BENIN DES BOIS,...,58233,Nièvre,Bourgogne-Franche-Comté,Dijon,"{'lon': 3.410960120296873, 'lat': 47.120363601...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1966-09-21
51802,9740435E,Ecole primaire publique Antoine Bertin,ECOLE PRIMAIRE PUBLIQUE,ANTOINE BERTIN,Public,3 rue Belle eau,,,97441,STE SUZANNE,...,97420,La Réunion,La Réunion,La Réunion,"{'lon': 55.59770216033512, 'lat': -20.90044341...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1968-09-16


##### georef-france-commune

In [158]:
# dataset_id = "georef-france-commune"
# format = "json"
# limit = "-1" # argument to pass to get the full dataset 
# columns = "bv2012_code" + "%2C" + "bv2012_name" + "%2C" + "com_uu2020_status" 
# df_raw_georef = pd.read_json(f"https://public.opendatasoft.com/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}&select={columns}")

##### typo-rur

In [159]:
df_raw_typo_rur = pd.read_csv('./da_data_raw/typo-rur.csv')
df_raw_typo_rur

Unnamed: 0,codgeo,libgeo,zonage_rur
0,01001,L'Abergement-Clémenciat,tr2
1,01002,L'Abergement-de-Varey,tr1
2,01004,Ambérieu-en-Bugey,tr5
3,01005,Ambérieux-en-Dombes,tr3
4,01006,Ambléon,tr1
...,...,...,...
34960,97613,M'Tsangamouji,tr5
34961,97614,Ouangani,tr5
34962,97615,Pamandzi,tr5
34963,97616,Sada,tr5


In [160]:
df_raw_typo_rur['zonage_rur'].value_counts()

tr1    8108
tr2    8096
tr3    7394
tr4    7174
tr5    3419
tr6     774
Name: zonage_rur, dtype: int64

In [161]:
df_raw_typo_rur['zonage_rur'] = df_raw_typo_rur['zonage_rur'].map({'tr1': 1, 
                                                                       'tr2': 2, 
                                                                       'tr3': 3, 
                                                                       'tr4': 4,
                                                                       'tr5': 5,
                                                                       'tr6': 6
                                                                       }
                                                                      )

In [162]:
df_raw_typo_rur['zonage_rur_lib'] = df_raw_typo_rur['zonage_rur'].map({1:'rural autonome très peu dense', 
                                                                       2: 'rural autonome peu dense', 
                                                                       3: 'rural sous faible influence d\'un pole', 
                                                                       4: 'rural sous forte influence d\'un pole',
                                                                       5: 'urbain densité intermédiaire',
                                                                       6: 'urbain dense'
                                                                       }
                                                                      )

In [163]:
df_raw_typo_rur.head()

Unnamed: 0,codgeo,libgeo,zonage_rur,zonage_rur_lib
0,1001,L'Abergement-Clémenciat,2,rural autonome peu dense
1,1002,L'Abergement-de-Varey,1,rural autonome très peu dense
2,1004,Ambérieu-en-Bugey,5,urbain densité intermédiaire
3,1005,Ambérieux-en-Dombes,3,rural sous faible influence d'un pole
4,1006,Ambléon,1,rural autonome très peu dense


In [164]:
df_raw_typo_rur.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34965 entries, 0 to 34964
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   codgeo          34965 non-null  object
 1   libgeo          34965 non-null  object
 2   zonage_rur      34965 non-null  int64 
 3   zonage_rur_lib  34965 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.1+ MB


##### Niveau_de_vie_2013_a_la_commune dataset

In [165]:
# Loading data
df_raw_revenus_par_commune = pd.read_excel("https://www.data.gouv.fr/fr/datasets/r/d3ce0107-416f-42cf-a335-d71f89b00b21")

### 5. Exporting raw data to CSV

In [166]:
# Raw datasets export
file_name = "ips-colleges" + ".csv"
print(f"file name: {file_name}")
df_raw_ips_colleges.to_csv(data_in + file_name, index = False)

file_name = "ips-ecoles" + ".csv"
print(f"file name: {file_name}")
df_raw_ips_ecoles.to_csv(data_in + file_name, index = False)

file_name = "dnb-par-etablissement" + ".csv"
print(f"file name: {file_name}")
df_raw_dnb_par_etablissement.to_csv(data_in + file_name, index = False)

file_name = "geolocalisation-etablissement" + ".csv"
print(f"file name: {file_name}")
df_raw_geolocalisation.to_csv(data_in + file_name, index = False)

file_name = "revenus-par-commune" + ".csv"
print(f"file name: {file_name}")
df_raw_revenus_par_commune.to_csv(data_in + file_name, index = False)

# file_name = "georef-par-commune" + ".csv"
# print(f"file name: {file_name}")
# df_raw_georef.to_csv(data_in + file_name, index = False)

file name: ips-colleges.csv
file name: ips-ecoles.csv
file name: dnb-par-etablissement.csv
file name: geolocalisation-etablissement.csv
file name: revenus-par-commune.csv


#### Exporting raw data to CSV (temporary code - to comment later)

In [167]:
# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "ips-colleges_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_ips_colleges.to_csv(data_in_temporary + file_name, index = False)


# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "ips-ecoles_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_ips_ecoles.to_csv(data_in_temporary + file_name, index = False)

# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "dnb-par-etablissement_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_dnb_par_etablissement.to_csv(data_in_temporary + file_name, index = False)

# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "geolocalisation-etablissement_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_geolocalisation.to_csv(data_in_temporary + file_name, index = False)

# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "revenus-par-commune_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_revenus_par_commune.to_csv(data_in_temporary + file_name, index = False)

### 6. Merging data into master file

##### ips_ecoles & ips_colleges dataframes

In [168]:
# Adding columns about the type of school
df_raw_ips_colleges['type_etablissement']="COLLEGE"
df_raw_ips_ecoles['type_etablissement']="ECOLE"

In [169]:
df_master = pd.concat([df_raw_ips_colleges, df_raw_ips_ecoles])

In [170]:
#safety check
print(f"df_raw_ips_colleges N = {len(df_raw_ips_colleges)}")
print(f"df_raw_ips_ecoles N = {len(df_raw_ips_ecoles)}")
print(f"df_master N = {len(df_master)}")
print(len(df_raw_ips_colleges) + len(df_raw_ips_ecoles) == len(df_master))
print(f"unique uai = {df_master['uai'].nunique()}")

df_raw_ips_colleges N = 6967
df_raw_ips_ecoles N = 32091
df_master N = 39058
True
unique uai = 39058


##### dnb_par_etablissement

College certificate success rate is calculated based on admissions devived by attendees (not registrants). We will keep this convention to calculate honors rates

In [171]:
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,numero_d_etablissement,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite
43185,2013,0572755K,LYCEE PROFESSIONNEL,LA MALGRANGE,PUBLIC,57672,THIONVILLE,57,MOSELLE,12,...,6,GRAND EST,22,20,12,5,6,1,0,"60,00%"
118557,2020,0831187P,LYCEE PROFESSIONNEL,SAINT JOSEPH,PRIVE,83090,OLLIOULES,83,VAR,23,...,18,PROVENCE-ALPES-COTE D'AZUR,22,22,22,3,2,8,9,"100,00%"
122405,2021,0291103S,COLLEGE,DES ILES DU PONANT,PUBLIC,29019,BREST,29,FINISTERE,14,...,3,BRETAGNE,16,16,15,3,1,4,7,"93,8%"
80081,2010,0410914W,COLLEGE,MARIE CURIE,PUBLIC,41220,SAINT-LAURENT-NOUAN,41,LOIR-ET-CHER,18,...,4,CENTRE-VAL DE LOIRE,85,83,77,27,30,13,7,"92,70%"
52480,2007,0440423W,COLLEGE,ST JOSEPH,PRIVE,44051,DERVAL,44,LOIRE-ATLANTIQUE,17,...,17,PAYS DE LA LOIRE,46,46,44,13,13,15,3,"95,60%"


In [172]:
# renaming key column
df_raw_dnb_par_etablissement = df_raw_dnb_par_etablissement.rename(columns = {'numero_d_etablissement': 'uai'})

In [173]:
#converting dnb string to float
df_raw_dnb_par_etablissement['taux_de_reussite'] = df_raw_dnb_par_etablissement['taux_de_reussite'].str.strip('%').str.replace(',', '.')
df_raw_dnb_par_etablissement['taux_de_reussite'] = df_raw_dnb_par_etablissement['taux_de_reussite'].astype('float')

In [174]:
df_raw_dnb_par_etablissement['dnb_taux_de_sans_mention'] = df_raw_dnb_par_etablissement['nombre_d_admis_sans_mention'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_ab'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_ab'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_b'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_b'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_tb'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_tb'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,uai,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite,dnb_taux_de_sans_mention,dnb_taux_de_mention_ab,dnb_taux_de_mention_b,dnb_taux_de_mention_tb
54698,2008,0671907J,COLLEGE,HANS ARP,PUBLIC,67482,STRASBOURG,67,BAS-RHIN,15,...,79,39,28,5,7,73.8,0.364486,0.261682,0.046729,0.065421
4107,2006,0011073L,LYCEE PROFESSIONNEL,ORMFREO PERONNAS,PRIVE,1289,PERONNAS,1,AIN,10,...,37,16,17,4,0,82.2,0.355556,0.377778,0.088889,0.0
5053,2006,0594644C,COLLEGE,LAVOISIER,PUBLIC,59544,SAINT-SAULVE,59,NORD,9,...,81,46,18,12,5,81.0,0.46,0.18,0.12,0.05
33071,2009,9720072W,COLLEGE,LISETTE MOUTACHY,PRIVE,97213,LE LAMENTIN,972,MARTINIQUE,31,...,73,40,17,13,3,93.5,0.512821,0.217949,0.166667,0.038462
107066,2013,0141163N,COLLEGE,SAINT PAUL,PRIVE,14118,CAEN,14,CALVADOS,5,...,73,33,19,17,4,76.8,0.347368,0.2,0.178947,0.042105


In [175]:
# Removing columns
df_raw_dnb_par_etablissement.drop(columns=['nombre_d_admis_sans_mention', 'nombre_d_admis_mention_ab', 'nombre_d_admis_mention_b', 'nombre_d_admis_mention_tb'], 
        errors='ignore', 
        inplace=True)
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,uai,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,taux_de_reussite,dnb_taux_de_sans_mention,dnb_taux_de_mention_ab,dnb_taux_de_mention_b,dnb_taux_de_mention_tb
102905,2011,0911343R,LYCEE PROFESSIONNEL,AUGUSTE PERRET,PUBLIC,91228,EVRY-COURCOURONNES,91,ESSONNE,25,...,10,ILE-DE-FRANCE,37,37,31,83.7,0.621622,0.189189,0.027027,0.0
32145,2007,0221509W,COLLEGE,DES LIVAUDIERES,PUBLIC,22136,LOUDEAC,22,COTES D'ARMOR,14,...,3,BRETAGNE,111,111,99,89.1,0.36036,0.351351,0.117117,0.063063
132537,2013,0210014B,COLLEGE,FONTAINE DES DUCS,PUBLIC,21154,CHATILLON-SUR-SEINE,21,COTE D'OR,7,...,2,BOURGOGNE-FRANCHE-COMTE,97,97,84,86.5,0.247423,0.391753,0.113402,0.113402
17895,2015,0220166L,COLLEGE,NOTRE DAME DES FONTAINES,PRIVE,22250,PONTRIEUX,22,COTES D'ARMOR,14,...,3,BRETAGNE,24,24,23,95.8,0.333333,0.375,0.125,0.125
98880,2015,0290314J,COLLEGE,ST STANISLAS,PRIVE,29260,SAINT-RENAN,29,FINISTERE,14,...,3,BRETAGNE,178,178,161,90.4,0.179775,0.280899,0.320225,0.123596


In [176]:
#creating sub-dataframe for college cerficate dataframe based on session year  
dfs_dnb_par_etablissement = {}
for session in df_raw_dnb_par_etablissement['session'].unique():
    dfs_dnb_par_etablissement[session] = pd.DataFrame(df_raw_dnb_par_etablissement[df_raw_dnb_par_etablissement['session'] == session])
    dfs_dnb_par_etablissement[session] = dfs_dnb_par_etablissement[session].rename(columns = 
                                                                                   {'nombre_d_inscrits': f'dnb_nombre_d_inscrits_{session}', 
                                                                                    'nombre_de_presents': f'dnb_nombre_de_presents_{session}',
                                                                                    'nombre_total_d_admis': f'dnb_nombre_de_presents_{session}',
                                                                                    'taux_de_reussite': f'dnb_taux_de_reussite_{session}',
                                                                                    'dnb_taux_de_sans_mention': f'dnb_taux_de_sans_mention_{session}',
                                                                                    'dnb_taux_de_mention_ab': f'dnb_taux_de_mention_ab_{session}',
                                                                                    'dnb_taux_de_mention_b': f'dnb_taux_de_mention_b_{session}',
                                                                                    'dnb_taux_de_mention_tb': f'dnb_taux_de_mention_tb_{session}',
                                                                                    })
    print(f"{session}: {len(dfs_dnb_par_etablissement[session])}")

2007: 8623
2008: 8656
2011: 8696
2012: 8697
2006: 8562
2009: 8672
2010: 8646
2013: 8732
2014: 8746
2019: 8797
2020: 8807
2015: 8752
2017: 8796
2018: 8802
2016: 8780
2021: 8816


In [177]:
df_master.head()

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
0,2021-2022,LYON,1,AIN,0010025X,COLLEGE PAUL SIXDENIER,1185,PLATEAU D HAUTEVILLE,public,110.4,COLLEGE
1,2021-2022,LYON,1,AIN,0010041P,COLLEGE VAUGELAS,1244,MEXIMIEUX,public,111.4,COLLEGE
2,2021-2022,LYON,1,AIN,0010092V,COLLEGE PRIVE SAINT JOSEPH,1283,OYONNAX,privé sous contrat,111.3,COLLEGE
3,2021-2022,LYON,1,AIN,0010896U,COLLEGE INTERNATIONAL,1160,FERNEY VOLTAIRE,public,122.6,COLLEGE
4,2021-2022,LYON,1,AIN,0010938P,COLLEGE LES COTES,1289,PERONNAS,public,102.2,COLLEGE


In [178]:
# merging
for session in df_raw_dnb_par_etablissement['session'].unique():
    # df_master = df_master.join(dfs_dnb_par_etablissement[session].set_index('uai'), on='uai', how='left', rsuffix=session)
    # print(session)
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_nombre_d_inscrits_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_nombre_de_presents_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_reussite_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_sans_mention_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_ab_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_b_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_tb_{session}']], on='uai', how='left')
    
    
df_master.head()

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,...,dnb_taux_de_mention_b_2016,dnb_taux_de_mention_tb_2016,dnb_nombre_d_inscrits_2021,dnb_nombre_de_presents_2021,dnb_nombre_de_presents_2021.1,dnb_taux_de_reussite_2021,dnb_taux_de_sans_mention_2021,dnb_taux_de_mention_ab_2021,dnb_taux_de_mention_b_2021,dnb_taux_de_mention_tb_2021
0,2021-2022,LYON,1,AIN,0010025X,COLLEGE PAUL SIXDENIER,1185,PLATEAU D HAUTEVILLE,public,110.4,...,0.119403,0.059701,61.0,61.0,58.0,95.1,0.114754,0.327869,0.196721,0.311475
1,2021-2022,LYON,1,AIN,0010041P,COLLEGE VAUGELAS,1244,MEXIMIEUX,public,111.4,...,0.209402,0.145299,236.0,233.0,207.0,88.8,0.111588,0.236052,0.227468,0.313305
2,2021-2022,LYON,1,AIN,0010092V,COLLEGE PRIVE SAINT JOSEPH,1283,OYONNAX,privé sous contrat,111.3,...,0.211111,0.133333,120.0,120.0,111.0,92.5,0.166667,0.233333,0.3,0.225
3,2021-2022,LYON,1,AIN,0010896U,COLLEGE INTERNATIONAL,1160,FERNEY VOLTAIRE,public,122.6,...,0.247619,0.271429,242.0,235.0,214.0,91.1,0.119149,0.187234,0.268085,0.33617
4,2021-2022,LYON,1,AIN,0010938P,COLLEGE LES COTES,1289,PERONNAS,public,102.2,...,0.203008,0.157895,141.0,141.0,131.0,92.9,0.156028,0.29078,0.255319,0.22695


In [179]:
df_master.rename(columns = {'code_du_departement':'code_departement', 
                            'nom_de_l_etablissment':'nom_etablissment',
                            'code_insee_de_la_commune':'code_insee_commune',
                            'nom_de_la_commune':'commune',
                            }, inplace = True)

In [180]:
# Moving columns
uai = df_master.pop('uai')
df_master.insert(0, 'uai', uai)
del uai

nom_etablissment = df_master.pop('nom_etablissment')
df_master.insert(1, 'nom_etablissment', nom_etablissment)
del nom_etablissment

type = df_master.pop('type_etablissement')
df_master.insert(3, 'type_etablissement', type)
del type

rentree_scolaire = df_master.pop('rentree_scolaire')
df_master.insert(11, 'rentree_scolaire', rentree_scolaire)
del rentree_scolaire

df_master.sample(5)

  df_master.insert(0, 'uai', uai)
  df_master.insert(1, 'nom_etablissment', nom_etablissment)
  df_master.insert(3, 'type_etablissement', type)
  df_master.insert(11, 'rentree_scolaire', rentree_scolaire)


Unnamed: 0,uai,nom_etablissment,type_etablissement,academie,code_departement,departement,code_insee_commune,commune,secteur,ips,...,dnb_taux_de_mention_b_2016,dnb_taux_de_mention_tb_2016,dnb_nombre_d_inscrits_2021,dnb_nombre_de_presents_2021,dnb_nombre_de_presents_2021.1,dnb_taux_de_reussite_2021,dnb_taux_de_sans_mention_2021,dnb_taux_de_mention_ab_2021,dnb_taux_de_mention_b_2021,dnb_taux_de_mention_tb_2021
22148,0420229W,ECOLE PRIMAIRE DE LA TOUR,ECOLE,LYON,42,LOIRE,42076,CREMEAUX,public,111.6,...,,,,,,,,,,
9021,0010384M,ECOLE PRIMAIRE ROBERT DOISNEAU,ECOLE,LYON,1,AIN,1424,TRAMOYES,public,133.7,...,,,,,,,,,,
16144,0480208R,ECOLE PRIMAIRE GUILHEM ADEMAR,ECOLE,MONTPELLIER,48,LOZERE,48096,MEYRUEIS,public,105.1,...,,,,,,,,,,
12023,0540433C,ECOLE ELEMENTAIRE JULES RENARD,ECOLE,NANCY-METZ,54,MEURTHE-ET-MOSELLE,54197,FLEVILLE DEVANT NANCY,public,134.3,...,,,,,,,,,,
516,0421852K,COLLEGE LOUIS ARAGON,COLLEGE,LYON,42,LOIRE,42127,MABLY,public,88.6,...,0.171875,0.117188,105.0,101.0,87.0,86.1,0.207921,0.148515,0.217822,0.287129


In [181]:
df_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 39057
Columns: 139 entries, uai to dnb_taux_de_mention_tb_2021
dtypes: float64(129), object(10)
memory usage: 41.7+ MB


##### Merging geolocalisation dataset

In [182]:
df_raw_geolocalisation = df_raw_geolocalisation.rename(columns={'numero_uai': 'uai'})

In [183]:
df_master = df_master.join(df_raw_geolocalisation.set_index('uai'), on='uai', how='left', rsuffix='right')
df_master.sample(5)

Unnamed: 0,uai,nom_etablissment,type_etablissement,academie,code_departement,departement,code_insee_commune,commune,secteur,ips,...,code_commune,libelle_departement,libelle_region,libelle_academie,position,secteur_prive_code_type_contrat,secteur_prive_libelle_type_contrat,code_ministere,libelle_ministere,date_ouverture
18234,0751175V,E P PU PAUL VALERY,ECOLE,PARIS,75,PARIS,75116,PARIS 16E ARRONDISSEMENT,public,125.1,...,75116,Paris,Ile-de-France,Paris,"{'lon': 2.289990585963959, 'lat': 48.870348393...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1965-05-01
30383,0030511P,ECOLE ELEMENTAIRE,ECOLE,CLERMONT-FERRAND,3,ALLIER,3071,CHAVROCHES,public,98.7,...,3071,Allier,Auvergne-Rhône-Alpes,Clermont-Ferrand,"{'lon': 3.587079489696487, 'lat': 46.354622425...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1993-09-01
29113,0501367P,ECOLE PRIMAIRE PRIVEE IMMACULEE CONCEPTION,ECOLE,NORMANDIE,50,MANCHE,50487,SAINT JAMES,privé sous contrat,103.2,...,50487,Manche,Normandie,Normandie,"{'lon': -1.32766961222067, 'lat': 48.522785810...",30.0,CONTRAT D'ASSOCIATION TOUTES CLASSES,6.0,MINISTERE DE L'EDUCATION NATIONALE,1968-09-26
5526,9740069G,COLLEGE L ETANG SAINT PAUL,COLLEGE,LA REUNION,974,LA REUNION,97415,SAINT PAUL,public,85.8,...,97415,La Réunion,La Réunion,La Réunion,"{'lon': 55.28471003195644, 'lat': -20.98162557...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1991-09-01
33323,0140693C,ECOLE PRIMAIRE,ECOLE,NORMANDIE,14,CALVADOS,14452,MORTEAUX COULIBOEUF,public,94.2,...,14452,Calvados,Normandie,Normandie,"{'lon': -0.07514939649711501, 'lat': 48.924619...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1965-08-13


In [184]:
# Removing joined columns not bringing any or enough value
df_master.drop(columns=['appellation_officielle', 'denomination_principale', 'patronyme_uai', 'secteur_public_prive_libe', 'adresse_uai',
                     'boite_postale_uai', 'localite_acheminement_uai', 'libelle_commune', 'localisation', 'nature_uai_libe',
                     'etat_etablissement', 'etat_etablissement_libe', 'code_departementright', 'code_commune', 'libelle_departement', 'libelle_academie', 
                     'secteur_prive_code_type_contrat', 'secteur_prive_libelle_type_contrat', 'code_ministere', 'libelle_ministere', 'nature_uai', 'lieu_dit_uai'], 
        errors='ignore', 
        inplace=True)

In [185]:
# Moving columns
code_academie = df_master.pop('code_academie')
df_master.insert(3, 'code_academie', code_academie)
del code_academie

code_region = df_master.pop('code_region')
df_master.insert(8, 'code_region', code_region)
del code_region

libelle_region = df_master.pop('libelle_region')
df_master.insert(9, 'libelle_region', libelle_region)
del libelle_region

df_master.sample(5)

Unnamed: 0,uai,nom_etablissment,type_etablissement,code_academie,academie,code_departement,departement,code_insee_commune,code_region,libelle_region,...,dnb_taux_de_mention_tb_2021,code_postal_uai,coordonnee_x,coordonnee_y,epsg,latitude,longitude,appariement,position,date_ouverture
1655,0271317K,COLLEGE JEAN DE LA FONTAINE,COLLEGE,70.0,NORMANDIE,027,EURE,27105,28.0,Normandie,...,0.181287,27520.0,544966.5,6913497.8,EPSG:2154,49.302254,0.86858,Parfaite,"{'lon': 0.8685804853209761, 'lat': 49.30225420...",1975-03-18
4020,0951617S,COLLEGE GERARD PHILIPE,COLLEGE,25.0,VERSAILLES,095,VAL-D'OISE,95127,11.0,Ile-de-France,...,0.116438,95800.0,630055.4,6883352.6,EPSG:2154,49.046237,2.043229,Parfaite,"{'lon': 2.043229439724775, 'lat': 49.046237263...",1983-09-01
14232,0160417S,ECOLE ELEMENTAIRE COURBILLAC,ECOLE,13.0,POITIERS,016,CHARENTE,16109,75.0,Nouvelle-Aquitaine,...,,16200.0,453014.1,6523791.7,EPSG:2154,45.768985,-0.178841,Parfaite,"{'lon': -0.178840544742306, 'lat': 45.76898464...",1970-01-12
32369,7200136K,ECOLE PRIMAIRE VEZZANI LCC BILINGUE,ECOLE,27.0,CORSE,02B,HAUTE-CORSE,2B347,94.0,Corse,...,,20242.0,1216514.4,6139961.5,EPSG:2154,42.175953,9.246519,Parfaite,"{'lon': 9.246519480656108, 'lat': 42.175952963...",1985-09-16
13507,0021385V,ECOLE PRIMAIRE,ECOLE,20.0,AMIENS,002,AISNE,02532,32.0,Hauts-de-France,...,,2610.0,726164.5,6961497.7,EPSG:2154,49.7519,3.362831,Parfaite,"{'lon': 3.362830602192527, 'lat': 49.751900147...",1966-11-08


In [186]:
df_master = df_master.rename(columns={'libelle_region': 'region'})

In [187]:
df_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 39057
Columns: 151 entries, uai to date_ouverture
dtypes: float64(136), object(15)
memory usage: 45.3+ MB


##### Merging Niveau_de_vie_2013_a_la_commune dataset

In [188]:
df_raw_revenus_par_commune = df_raw_revenus_par_commune.rename(columns={'Code Commune': 'code_insee_commune'})

In [189]:
df_master = df_master.join(df_raw_revenus_par_commune.set_index('code_insee_commune'), on='code_insee_commune', how='left', rsuffix='right')

In [190]:
# Removing joined columns not bringing any or enough value
df_master.drop(columns=['Nom Commune'], 
        errors='ignore', 
        inplace=True)

In [191]:
# Renaming columns
df_master.rename(columns = {'Niveau de vie Commune':'niveau_de_vie_commune', 
                            'Niveau de vie Département':'niveau_de_vie_departement'
                            }, inplace = True)

In [192]:
df_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 39057
Columns: 153 entries, uai to niveau_de_vie_departement
dtypes: float64(138), object(15)
memory usage: 45.9+ MB


##### Merging georef-france-commune dataset

##### typo_rur

In [193]:
df_raw_typo_rur = df_raw_typo_rur.rename(columns={'codgeo': 'code_insee_commune'})

In [194]:
df_master = df_master.join(df_raw_typo_rur.set_index('code_insee_commune'), on='code_insee_commune', how='left', rsuffix='right')

In [195]:
df_master

Unnamed: 0,uai,nom_etablissment,type_etablissement,code_academie,academie,code_departement,departement,code_insee_commune,code_region,region,...,latitude,longitude,appariement,position,date_ouverture,niveau_de_vie_commune,niveau_de_vie_departement,libgeo,zonage_rur,zonage_rur_lib
0,0010025X,COLLEGE PAUL SIXDENIER,COLLEGE,10.0,LYON,001,AIN,01185,84.0,Auvergne-Rhône-Alpes,...,45.976219,5.600967,Correcte,"{'lon': 5.600967267017193, 'lat': 45.976219204...",1965-05-01,20198.148148,22343.574665,Plateau d'Hauteville,2.0,rural autonome peu dense
1,0010041P,COLLEGE VAUGELAS,COLLEGE,10.0,LYON,001,AIN,01244,84.0,Auvergne-Rhône-Alpes,...,45.907542,5.188640,Parfaite,"{'lon': 5.188640403667234, 'lat': 45.907541813...",1971-02-16,21367.619048,22343.574665,Meximieux,5.0,urbain densité intermédiaire
2,0010092V,COLLEGE PRIVE SAINT JOSEPH,COLLEGE,10.0,LYON,001,AIN,01283,84.0,Auvergne-Rhône-Alpes,...,46.259653,5.656330,Parfaite,"{'lon': 5.65632989664765, 'lat': 46.2596533673...",1967-01-19,16590.000000,22343.574665,Oyonnax,5.0,urbain densité intermédiaire
3,0010896U,COLLEGE INTERNATIONAL,COLLEGE,10.0,LYON,001,AIN,01160,84.0,Auvergne-Rhône-Alpes,...,46.264811,6.116650,Parfaite,"{'lon': 6.116650067099341, 'lat': 46.264811148...",1970-02-20,25508.333333,22343.574665,Ferney-Voltaire,5.0,urbain densité intermédiaire
4,0010938P,COLLEGE LES COTES,COLLEGE,10.0,LYON,001,AIN,01289,84.0,Auvergne-Rhône-Alpes,...,46.190009,5.205570,Parfaite,"{'lon': 5.205570413701443, 'lat': 46.190008673...",1972-01-25,21632.000000,22343.574665,Péronnas,5.0,urbain densité intermédiaire
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39053,9760209Y,ECOLE ELEMENTAIRE DE CHICONI 5,ECOLE,43.0,MAYOTTE,976,MAYOTTE,97605,6.0,Mayotte,...,-12.836574,45.115200,Parfaite,"{'lon': 45.11520022047355, 'lat': -12.83657434...",1998-09-01,,,Chiconi,5.0,urbain densité intermédiaire
39054,9760253W,ECOLE ELEMENTAIRE PUBLIQUE MAJICAVO KOROPA 3,ECOLE,43.0,MAYOTTE,976,MAYOTTE,97610,6.0,Mayotte,...,-12.744716,45.218850,Parfaite,"{'lon': 45.218850302397314, 'lat': -12.7447159...",2001-09-01,,,Koungou,5.0,urbain densité intermédiaire
39055,9760301Y,ECOLE PRIMAIRE PUBLIQUE TSOUNDZOU 2,ECOLE,43.0,MAYOTTE,976,MAYOTTE,97611,6.0,Mayotte,...,-12.817093,45.197320,Parfaite,"{'lon': 45.19731954390122, 'lat': -12.81709257...",2005-09-01,,,Mamoudzou,6.0,urbain dense
39056,9760302Z,ECOLE PRIMAIRE PUBLIQUE MROALE,ECOLE,43.0,MAYOTTE,976,MAYOTTE,97617,6.0,Mayotte,...,-12.788367,45.128749,Correcte,"{'lon': 45.12874934018609, 'lat': -12.78836708...",2005-09-01,,,Tsingoni,5.0,urbain densité intermédiaire


In [196]:
df_master['zonage_rur_lib'].isnull().value_counts()

False    37860
True      1198
Name: zonage_rur_lib, dtype: int64

In [197]:
df_master[df_master['zonage_rur_lib'].isnull()]['commune'].value_counts()

PARIS 20E ARRONDISSEMENT        65
PARIS 19E ARRONDISSEMENT        62
PARIS 18E ARRONDISSEMENT        58
PARIS 13E ARRONDISSEMENT        57
PARIS 15E ARRONDISSEMENT        50
PARIS 17E ARRONDISSEMENT        45
PARIS 16E ARRONDISSEMENT        43
MARSEILLE 13E ARRONDISSEMENT    40
MARSEILLE 15E ARRONDISSEMENT    39
PARIS 12E ARRONDISSEMENT        37
PARIS 11E ARRONDISSEMENT        35
MARSEILLE 9E ARRONDISSEMENT     33
PARIS 14E ARRONDISSEMENT        32
MARSEILLE 14E ARRONDISSEMENT    31
MARSEILLE 8E ARRONDISSEMENT     30
LYON 3E ARRONDISSEMENT          28
PARIS 10E ARRONDISSEMENT        26
LYON 8E ARRONDISSEMENT          26
LYON 7E ARRONDISSEMENT          25
MARSEILLE 11E ARRONDISSEMENT    25
MARSEILLE 12E ARRONDISSEMENT    25
LYON 5E ARRONDISSEMENT          23
MARSEILLE 4E ARRONDISSEMENT     23
MARSEILLE 3E ARRONDISSEMENT     22
MARSEILLE 10E ARRONDISSEMENT    21
PARIS 5E ARRONDISSEMENT         21
LYON 9E ARRONDISSEMENT          21
MARSEILLE 6E ARRONDISSEMENT     21
LYON 6E ARRONDISSEME

In [198]:
# allocating zone 3 to saint martin
df_master['zonage_rur'][df_master['code_insee_commune']=='97801'] = 3.00

# allocating zone 3 to saint barthelemy
df_master['zonage_rur'][df_master['code_insee_commune']=='97701'] = 3.00

# allocating the rest (Paris, Marseille, Lyon) to zone 6
df_master['zonage_rur'][df_master['zonage_rur'].isnull()] = 6.00

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_master['zonage_rur'][df_master['code_insee_commune']=='97801'] = 3.00
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_master['zonage_rur'][df_master['code_insee_commune']=='97701'] = 3.00
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_master['zonage_rur'][df_master['zonage_rur'].isnull()] = 6.00


In [199]:
df_master['zonage_rur_lib'] = df_master['zonage_rur'].map({1:'rural autonome très peu dense', 
                                                                       2: 'rural autonome peu dense', 
                                                                       3: 'rural sous faible influence d\'un pole', 
                                                                       4: 'rural sous forte influence d\'un pole',
                                                                       5: 'urbain densité intermédiaire',
                                                                       6: 'urbain dense'
                                                                       }
                                                                      )

In [200]:
df_master[df_master['zonage_rur_lib'].isnull()]['commune'].value_counts()

Series([], Name: commune, dtype: int64)

### 6. Tidying up dataframe

In [201]:
# Making all strings lower case
df_master = df_master.applymap(lambda x: x.lower() if type(x) == str else x)

# renaming "privé sous contrat" in "prive"
df_master.loc[df_master["secteur"] == "privé sous contrat", "secteur"] = "prive"

# sorting
df_master.sort_values(by=['secteur', 'type_etablissement'], ascending=False, inplace=True)

In [202]:
df_master.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 6967 to 6962
Data columns (total 156 columns):
 #    Column                         Non-Null Count  Dtype  
---   ------                         --------------  -----  
 0    uai                            39058 non-null  object 
 1    nom_etablissment               39058 non-null  object 
 2    type_etablissement             39058 non-null  object 
 3    code_academie                  38932 non-null  float64
 4    academie                       39058 non-null  object 
 5    code_departement               39058 non-null  object 
 6    departement                    39058 non-null  object 
 7    code_insee_commune             39058 non-null  object 
 8    code_region                    38932 non-null  float64
 9    region                         38932 non-null  object 
 10   commune                        39058 non-null  object 
 11   secteur                        39058 non-null  object 
 12   ips                         

### 7. Exporting workfile data to CSV

In [203]:
file_name = "df_master" + ".csv"
print(f"file name: {file_name}")
df_master.to_csv(data_out + file_name, index = False)


# Code to comment at some point
timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
file_name = "df_master_" + timestr + ".csv"
print(f"file name: {file_name}")
df_master.to_csv(data_out_temporary + file_name, index = False)

file name: df_master.csv
file name: df_master_2022-11-14_09-24-13.csv


#### data-collection notebook execution time

In [204]:
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

Execution time in seconds: 111.60529947280884
