This notebook is dedicated to data collection, cleaning and creation of work files for the study of the public/private distribution of French schools

### 1. Importing librairies

In [1]:
import time
startTime = time.time()

import pandas as pd
import os
import requests

### 2. Creating the relevant folders and paths

In [2]:
# Defining current folder as our main directory
dirname = os.getcwd()

# location folders variables
data_in = dirname + "\\da_data_raw\\"
data_out = dirname + "\\da_data_workfiles\\"
graph_out = dirname + "\\graphs\\"

# Checking if data_in path is not present, then create it
if not os.path.exists(data_in):
    os.makedirs(data_in)
    print(f"Directory Created: {data_in}")
else: 
    print(f"Already existing directory: {data_in}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(data_out):
    os.makedirs(data_out)
    print(f"Directory Created: {data_out}")
else:
    print(f"Already existing directory: {data_out}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(graph_out):
    os.makedirs(graph_out)
    print(f"Directory Created: {graph_out}")
else:
    print(f"Already existing directory: {graph_out}")    

# Printing main directories we will work with
print(f"\nMain directory: {dirname}")
print(f"Raw data folder: {data_in}")
print(f"Workfile folder: {data_out}")

Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_raw\
Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\
Already existing directory: c:\Users\33671\Documents\Python\IPS\graphs\

Main directory: c:\Users\33671\Documents\Python\IPS
Raw data folder: c:\Users\33671\Documents\Python\IPS\da_data_raw\
Workfile folder: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\


Temporary code (to comment later)

In [3]:
# Getting current date
datestr = time.strftime("%Y-%m-%d")

# location folders variables (temporary - to comment later)
data_in_temporary = dirname + "\\da_data_raw\\" + datestr + "\\"
data_out_temporary = dirname + "\\da_data_workfiles\\" + datestr + "\\"

# Checking if data_in path is not present, then create it
if not os.path.exists(data_in_temporary):
    os.makedirs(data_in_temporary)
    print(f"Directory Created: {data_in_temporary}")
else: 
    print(f"Already existing directory: {data_in_temporary}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(data_out_temporary):
    os.makedirs(data_out_temporary)
    print(f"Directory Created: {data_out_temporary}")
else:
    print(f"Already existing directory: {data_out_temporary}")

# Printing main directories we will work with
print(f"\nMain directory: {dirname}")
print(f"Raw data folder: {data_in_temporary}")
print(f"Workfile folder: {data_out_temporary}")

Directory Created: c:\Users\33671\Documents\Python\IPS\da_data_raw\2022-11-12\
Directory Created: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\2022-11-12\

Main directory: c:\Users\33671\Documents\Python\IPS
Raw data folder: c:\Users\33671\Documents\Python\IPS\da_data_raw\2022-11-12\
Workfile folder: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\2022-11-12\


### 3. Verifying APIs responses

In [4]:
#fr-en-ips_colleges
dataset_id = "fr-en-ips_colleges"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-ips_ecoles
dataset_id = "fr-en-ips_ecoles"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-dnb-par-etablissement
dataset_id = "fr-en-dnb-par-etablissement"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre
dataset_id = "fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#georef-france-commune
# dataset_id = "georef-france-commune"
# format = "json"
# limit = "10"
# r = requests.get(f"https://public.opendatasoft.com/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
#                  timeout=2)
# print(f"{dataset_id}")
# print(f"URL: {r.url}")
# print(f"HTTP Response Status Code: {r.status_code}") 
# print(f"HTTP Error: {r.raise_for_status()}")
# print(f"Encoding: {r.encoding}\n")
# r.close()

fr-en-ips_colleges
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-ips_colleges/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-ips_ecoles
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-ips_ecoles/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-dnb-par-etablissement
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-dnb-par-etablissement/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8



### 4. Data Collection

##### ips_colleges dataset

In [5]:
# Loading data
dataset_id = "fr-en-ips_colleges"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_ips_colleges = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

# Adding column about the type of school
df_raw_ips_colleges['type_etablissement']="COLLEGE"

In [6]:
df_raw_ips_colleges.sample(5)

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
1584,2021-2022,ORLEANS-TOURS,18,CHER,0180710Z,COLLEGE CLAUDE DEBUSSY,18108,LA GUERCHE SUR L AUBOIS,public,92.3,COLLEGE
6359,2021-2022,CLERMONT-FERRAND,63,PUY-DE-DOME,0631411S,COLLEGE BLAISE PASCAL,63113,CLERMONT FERRAND,public,101.9,COLLEGE
1709,2021-2022,TOULOUSE,31,HAUTE-GARONNE,0310031R,COLLEGE FRANCOIS CAZES,31471,SAINT BEAT LEZ,public,109.1,COLLEGE
1004,2021-2022,PARIS,75,PARIS,0754706H,COLLEGE MARIE CURIE,75118,PARIS 18E ARRONDISSEMENT,public,103.9,COLLEGE
3210,2021-2022,NANTES,44,LOIRE-ATLANTIQUE,0442835T,COLLEGE SIMONE VEIL,44109,NANTES,public,106.4,COLLEGE


##### ips_ecoles dataset

In [7]:
# Loading data
dataset_id = "fr-en-ips_ecoles"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_ips_ecoles = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

# Adding column about the type of school
df_raw_ips_ecoles['type_etablissement']="ECOLE"

In [8]:
df_raw_ips_ecoles.sample(5)

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
20584,2021-2022,TOULOUSE,81,TARN,0810360B,ECOLE PRIMAIRE,81285,SERENAC,public,83.9,ECOLE
31364,2021-2022,NANCY-METZ,88,VOSGES,0881648N,ECOLE PRIMAIRE DU CENTRE,88481,URIMENIL,public,109.4,ECOLE
31155,2021-2022,AIX-MARSEILLE,84,VAUCLUSE,0840656G,ECOLE PRIMAIRE GABRIEL PERI,84019,BOLLENE,public,81.2,ECOLE
20648,2021-2022,TOULOUSE,82,TARN-ET-GARONNE,0820810K,ECOLE ELEMENTAIRE PIERRE BONHOURE,82124,MONTBETON,public,105.4,ECOLE
3253,2021-2022,BESANCON,25,DOUBS,0251692D,ECOLE ELEMENTAIRE CENTRE,25321,VILLERS LE LAC,public,101.3,ECOLE


##### dnb-par-etablissement dataset

In [9]:
dataset_id = "fr-en-dnb-par-etablissement"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_dnb_par_etablissement = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

In [10]:
df_raw_dnb_par_etablissement.sample(5) 

Unnamed: 0,session,numero_d_etablissement,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite
14041,2019,0840066R,COLLEGE,NOTRE DAME DU BON ACCUEIL,PRIVE,84080,MONTEUX,84,VAUCLUSE,2,...,18,PROVENCE-ALPES-COTE D'AZUR,121,120,118,22,40,35,21,"98,30%"
27853,2009,0400012W,COLLEGE,VAL D ADOUR,PUBLIC,40117,GRENADE-SUR-L'ADOUR,40,LANDES,4,...,15,NOUVELLE-AQUITAINE,80,80,74,25,30,15,4,"92,50%"
88590,2009,0911491B,COLLEGE,CONDORCET,PUBLIC,91200,DOURDAN,91,ESSONNE,25,...,10,ILE-DE-FRANCE,186,183,158,71,49,25,13,"86,30%"
56879,2006,0511216V,COLLEGE,JEAN MOULIN,PUBLIC,51506,SAINT-MEMMIE,51,MARNE,19,...,6,GRAND EST,108,106,77,42,20,11,4,"72,60%"
114948,2017,0640168K,COLLEGE,ETCHECOPAR,PRIVE,64493,SAINT-PALAIS,64,PYRENEES-ATLANTIQUES,4,...,15,NOUVELLE-AQUITAINE,45,45,40,1,13,13,13,"88,80%"


In [11]:
# Check school type distribution based on last avaiable year results
df_raw_dnb_par_etablissement[df_raw_dnb_par_etablissement['session'] == 2021]['denomination_principale'].value_counts()

COLLEGE                6942
LYCEE PROFESSIONNEL    1315
LYCEE                   481
EREA                     58
AUTRE                    17
CFA                       3
Name: denomination_principale, dtype: int64

##### fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre dataset

In [12]:
# Loading data
dataset_id = "fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_geolocalisation = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

In [13]:
df_raw_geolocalisation.sample(5)

Unnamed: 0,numero_uai,appellation_officielle,denomination_principale,patronyme_uai,secteur_public_prive_libe,adresse_uai,lieu_dit_uai,boite_postale_uai,code_postal_uai,localite_acheminement_uai,...,code_commune,libelle_departement,libelle_region,libelle_academie,position,secteur_prive_code_type_contrat,secteur_prive_libelle_type_contrat,code_ministere,libelle_ministere,date_ouverture
46836,0750663N,Lycée général et technologique Chaptal,LYCEE GENERAL ET TECHNOLOGIQUE,CHAPTAL,Public,45 boulevard des Batignolles,,,75008,PARIS,...,75108,Paris,Ile-de-France,Paris,"{'lon': 2.319669465196226, 'lat': 48.881734614...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1965-05-01
41444,0440748Z,Ecole primaire publique Astrolabe,ECOLE PRIMAIRE PUBLIQUE,ASTROLABE,Public,3 rue Prosper Mérimée,,,44330,LE PALLET,...,44117,Loire-Atlantique,Pays de la Loire,Nantes,"{'lon': -1.336939440396343, 'lat': 47.14018417...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1970-01-21
27303,0630924M,Ecole primaire,ECOLE PRIMAIRE,,Public,Rue marechal de turenne,,,63350,JOZE,...,63180,Puy-de-Dôme,Auvergne-Rhône-Alpes,Clermont-Ferrand,"{'lon': 3.301650282544672, 'lat': 45.863866656...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1966-08-08
63741,0851353J,Ecole primaire publique LA MER,ECOLE PRIMAIRE PUBLIQUE,LA MER,Public,50 avenue de la Plage,,,85360,LA TRANCHE SUR MER,...,85294,Vendée,Pays de la Loire,Nantes,"{'lon': -1.438599619043155, 'lat': 46.34101801...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1983-09-01
13057,0631587H,Ecole maternelle Philippe Arbos,ECOLE MATERNELLE PUBLIQUE,PHILIPPE ARBOS,Public,Rue des Hauts de Chanturgue,,,63100,CLERMONT FERRAND,...,63113,Puy-de-Dôme,Auvergne-Rhône-Alpes,Clermont-Ferrand,"{'lon': 3.10156948642659, 'lat': 45.8121021731...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1977-05-23


##### georef-france-commune

In [14]:
# dataset_id = "georef-france-commune"
# format = "json"
# limit = "-1" # argument to pass to get the full dataset 
# columns = "bv2012_code" + "%2C" + "bv2012_name" + "%2C" + "com_uu2020_status" 
# df_raw_georef = pd.read_json(f"https://public.opendatasoft.com/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}&select={columns}")

##### typo-rur

In [15]:
df_raw_typo_rur = pd.read_csv('./da_data_raw/typo-rur.csv')
df_raw_typo_rur

Unnamed: 0,codgeo,libgeo,zonage_rur
0,01001,L'Abergement-Clémenciat,tr2
1,01002,L'Abergement-de-Varey,tr1
2,01004,Ambérieu-en-Bugey,tr5
3,01005,Ambérieux-en-Dombes,tr3
4,01006,Ambléon,tr1
...,...,...,...
34960,97613,M'Tsangamouji,tr5
34961,97614,Ouangani,tr5
34962,97615,Pamandzi,tr5
34963,97616,Sada,tr5


In [16]:
df_raw_typo_rur['zonage_rur'].value_counts()

tr1    8108
tr2    8096
tr3    7394
tr4    7174
tr5    3419
tr6     774
Name: zonage_rur, dtype: int64

In [17]:
df_raw_typo_rur['zonage_rur_lib'] = df_raw_typo_rur['zonage_rur'].map({'tr1':'rural autonome très peu dense', 
                                                                       'tr2': 'rural autonome peu dense', 
                                                                       'tr3': 'rural sous faible influence d\'un pole', 
                                                                       'tr4': 'rural sous forte influence d\'un pole',
                                                                       'tr5': 'urbain densité intermédiaire',
                                                                       'tr6': 'urbain dense'
                                                                       }
                                                                      )

In [18]:
df_raw_typo_rur['zonage_rur_lib'] = df_raw_typo_rur['zonage_rur'].map({'tr1':'rural autonome très peu dense', 
                                                                       'tr2': 'rural autonome peu dense', 
                                                                       'tr3': 'rural sous faible influence d\'un pole', 
                                                                       'tr4': 'rural sous forte influence d\'un pole',
                                                                       'tr5': 'urbain densité intermédiaire',
                                                                       'tr6': 'urbain dense'
                                                                       }
                                                                      )

In [19]:
df_raw_typo_rur['zonage_rur'] = df_raw_typo_rur['zonage_rur'].map({'tr1': 1, 
                                                                       'tr2': 2, 
                                                                       'tr3': 3, 
                                                                       'tr4': 4,
                                                                       'tr5': 5,
                                                                       'tr6': 6
                                                                       }
                                                                      )

In [20]:
df_raw_typo_rur.head()

Unnamed: 0,codgeo,libgeo,zonage_rur,zonage_rur_lib
0,1001,L'Abergement-Clémenciat,2,rural autonome peu dense
1,1002,L'Abergement-de-Varey,1,rural autonome très peu dense
2,1004,Ambérieu-en-Bugey,5,urbain densité intermédiaire
3,1005,Ambérieux-en-Dombes,3,rural sous faible influence d'un pole
4,1006,Ambléon,1,rural autonome très peu dense


In [21]:
df_raw_typo_rur.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34965 entries, 0 to 34964
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   codgeo          34965 non-null  object
 1   libgeo          34965 non-null  object
 2   zonage_rur      34965 non-null  int64 
 3   zonage_rur_lib  34965 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.1+ MB


##### Niveau_de_vie_2013_a_la_commune dataset

In [22]:
# Loading data
df_raw_revenus_par_commune = pd.read_excel("https://www.data.gouv.fr/fr/datasets/r/d3ce0107-416f-42cf-a335-d71f89b00b21")

### 5. Exporting raw data to CSV

In [23]:
# Raw datasets export
file_name = "ips-colleges" + ".csv"
print(f"file name: {file_name}")
df_raw_ips_colleges.to_csv(data_in + file_name, index = False)

file_name = "ips-ecoles" + ".csv"
print(f"file name: {file_name}")
df_raw_ips_ecoles.to_csv(data_in + file_name, index = False)

file_name = "dnb-par-etablissement" + ".csv"
print(f"file name: {file_name}")
df_raw_dnb_par_etablissement.to_csv(data_in + file_name, index = False)

file_name = "geolocalisation-etablissement" + ".csv"
print(f"file name: {file_name}")
df_raw_geolocalisation.to_csv(data_in + file_name, index = False)

file_name = "revenus-par-commune" + ".csv"
print(f"file name: {file_name}")
df_raw_revenus_par_commune.to_csv(data_in + file_name, index = False)

# file_name = "georef-par-commune" + ".csv"
# print(f"file name: {file_name}")
# df_raw_georef.to_csv(data_in + file_name, index = False)

file name: ips-colleges.csv
file name: ips-ecoles.csv
file name: dnb-par-etablissement.csv
file name: geolocalisation-etablissement.csv
file name: revenus-par-commune.csv


#### Exporting raw data to CSV (temporary code - to comment later)

In [24]:
# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "ips-colleges_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_ips_colleges.to_csv(data_in_temporary + file_name, index = False)


# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "ips-ecoles_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_ips_ecoles.to_csv(data_in_temporary + file_name, index = False)

# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "dnb-par-etablissement_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_dnb_par_etablissement.to_csv(data_in_temporary + file_name, index = False)

# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "geolocalisation-etablissement_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_geolocalisation.to_csv(data_in_temporary + file_name, index = False)

# timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
# file_name = "revenus-par-commune_" + timestr + ".csv"
# print(f"file name: {file_name}")
# df_raw_revenus_par_commune.to_csv(data_in_temporary + file_name, index = False)

### 6. Merging data into master file

##### ips_ecoles & ips_colleges dataframes

In [25]:
# Adding columns about the type of school
df_raw_ips_colleges['type_etablissement']="COLLEGE"
df_raw_ips_ecoles['type_etablissement']="ECOLE"

In [26]:
df_master = pd.concat([df_raw_ips_colleges, df_raw_ips_ecoles])

In [27]:
#safety check
print(f"df_raw_ips_colleges N = {len(df_raw_ips_colleges)}")
print(f"df_raw_ips_ecoles N = {len(df_raw_ips_ecoles)}")
print(f"df_master N = {len(df_master)}")
print(len(df_raw_ips_colleges) + len(df_raw_ips_ecoles) == len(df_master))
print(f"unique uai = {df_master['uai'].nunique()}")

df_raw_ips_colleges N = 6967
df_raw_ips_ecoles N = 32091
df_master N = 39058
True
unique uai = 39058


##### dnb_par_etablissement

College certificate success rate is calculated based on admissions devived by attendees (not registrants). We will keep this convention to calculate honors rates

In [28]:
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,numero_d_etablissement,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite
8927,2011,0180568V,COLLEGE,ST JEAN BAPTISTE DE LA SALLE,PRIVE,18033,BOURGES,18,CHER,18,...,4,CENTRE-VAL DE LOIRE,21,21,19,9,5,5,0,"90,40%"
91246,2010,0672477D,CFA,RATTACHE AU LEGTA,PUBLIC,67348,OBERNAI,67,BAS-RHIN,15,...,6,GRAND EST,17,17,15,7,5,3,0,"88,20%"
43781,2013,0820017Y,COLLEGE,FRANCOIS MITTERRAND,PUBLIC,82112,MOISSAC,82,TARN-ET-GARONNE,16,...,16,OCCITANIE,122,119,101,45,21,24,11,"84,80%"
81613,2008,0931489N,COLLEGE,FEDERICO GARCIA LORCA,PUBLIC,93066,SAINT-DENIS,93,SEINE-SAINT-DENIS,24,...,10,ILE-DE-FRANCE,118,110,79,47,23,7,2,"71,80%"
36738,2007,0590157A,COLLEGE,SAINT EXUPERY,PUBLIC,59447,ONNAING,59,NORD,9,...,9,HAUTS-DE-FRANCE,144,134,99,42,28,24,5,"73,80%"


In [29]:
# renaming key column
df_raw_dnb_par_etablissement = df_raw_dnb_par_etablissement.rename(columns = {'numero_d_etablissement': 'uai'})

In [30]:
#converting dnb string to float
df_raw_dnb_par_etablissement['taux_de_reussite'] = df_raw_dnb_par_etablissement['taux_de_reussite'].str.strip('%').str.replace(',', '.')
df_raw_dnb_par_etablissement['taux_de_reussite'] = df_raw_dnb_par_etablissement['taux_de_reussite'].astype('float')

In [31]:
df_raw_dnb_par_etablissement['dnb_taux_de_sans_mention'] = df_raw_dnb_par_etablissement['nombre_d_admis_sans_mention'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_ab'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_ab'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_b'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_b'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_tb'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_tb'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,uai,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite,dnb_taux_de_sans_mention,dnb_taux_de_mention_ab,dnb_taux_de_mention_b,dnb_taux_de_mention_tb
5848,2007,0022042J,LYCEE,JEAN BOUIN,PUBLIC,2691,SAINT-QUENTIN,2,AISNE,20,...,21,17,4,0,0,100.0,0.809524,0.190476,0.0,0.0
12441,2009,0760030V,LYCEE,FERDINAND BUISSON,PUBLIC,76231,ELBEUF,76,SEINE MARITIME,21,...,35,28,5,2,0,94.5,0.756757,0.135135,0.054054,0.0
5953,2007,0130063Y,LYCEE PROFESSIONNEL,LEAU (BD),PUBLIC,13208,MARSEILLE 8E ARRONDISSEMENT,13,BOUCHES-DU-RHONE,2,...,28,22,4,2,0,71.7,0.564103,0.102564,0.051282,0.0
48229,2014,0131931D,COLLEGE,THIERS,PUBLIC,13201,MARSEILLE 1ER ARRONDISSEMENT,13,BOUCHES-DU-RHONE,2,...,92,22,29,29,12,82.8,0.198198,0.261261,0.261261,0.108108
17824,2015,0150061K,COLLEGE,NOTRE DAME DES MIRACLES,PRIVE,15120,MAURIAC,15,CANTAL,6,...,44,15,14,10,5,100.0,0.340909,0.318182,0.227273,0.113636


In [32]:
# Removing columns
df_raw_dnb_par_etablissement.drop(columns=['nombre_d_admis_sans_mention', 'nombre_d_admis_mention_ab', 'nombre_d_admis_mention_b', 'nombre_d_admis_mention_tb'], 
        errors='ignore', 
        inplace=True)
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,uai,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,taux_de_reussite,dnb_taux_de_sans_mention,dnb_taux_de_mention_ab,dnb_taux_de_mention_b,dnb_taux_de_mention_tb
88891,2006,0831116M,COLLEGE,L'ESTEREL,PUBLIC,83118,SAINT-RAPHAEL,83,VAR,23,...,18,PROVENCE-ALPES-COTE D'AZUR,164,161,142,88.1,0.428571,0.267081,0.130435,0.055901
121850,2021,0580038B,COLLEGE,LES AMOGNES,PUBLIC,58232,SAINT-BENIN-D'AZY,58,NIEVRE,7,...,2,BOURGOGNE-FRANCHE-COMTE,47,46,41,89.1,0.23913,0.173913,0.152174,0.326087
90562,2010,0520822L,COLLEGE,CRESSOT,PUBLIC,52250,JOINVILLE,52,HAUTE-MARNE,19,...,6,GRAND EST,84,81,66,81.4,0.45679,0.197531,0.098765,0.061728
95159,2006,0280869T,COLLEGE,JEAN MONNET,PUBLIC,28214,LA LOUPE,28,EURE-ET-LOIR,18,...,4,CENTRE-VAL DE LOIRE,74,73,50,68.4,0.30137,0.232877,0.109589,0.041096
84660,2011,0830956N,COLLEGE,EMILE THOMAS,PUBLIC,83050,DRAGUIGNAN,83,VAR,23,...,18,PROVENCE-ALPES-COTE D'AZUR,147,139,118,84.8,0.489209,0.158273,0.086331,0.115108


In [33]:
#creating sub-dataframe for college cerficate dataframe based on session year  
dfs_dnb_par_etablissement = {}
for session in df_raw_dnb_par_etablissement['session'].unique():
    dfs_dnb_par_etablissement[session] = pd.DataFrame(df_raw_dnb_par_etablissement[df_raw_dnb_par_etablissement['session'] == session])
    dfs_dnb_par_etablissement[session] = dfs_dnb_par_etablissement[session].rename(columns = 
                                                                                   {'nombre_d_inscrits': f'dnb_nombre_d_inscrits_{session}', 
                                                                                    'nombre_de_presents': f'dnb_nombre_de_presents_{session}',
                                                                                    'nombre_total_d_admis': f'dnb_nombre_de_presents_{session}',
                                                                                    'taux_de_reussite': f'dnb_taux_de_reussite_{session}',
                                                                                    'dnb_taux_de_sans_mention': f'dnb_taux_de_sans_mention_{session}',
                                                                                    'dnb_taux_de_mention_ab': f'dnb_taux_de_mention_ab_{session}',
                                                                                    'dnb_taux_de_mention_b': f'dnb_taux_de_mention_b_{session}',
                                                                                    'dnb_taux_de_mention_tb': f'dnb_taux_de_mention_tb_{session}',
                                                                                    })
    print(f"{session}: {len(dfs_dnb_par_etablissement[session])}")

2007: 8623
2008: 8656
2011: 8696
2012: 8697
2006: 8562
2009: 8672
2010: 8646
2013: 8732
2014: 8746
2019: 8797
2020: 8807
2015: 8752
2017: 8796
2018: 8802
2016: 8780
2021: 8816


In [34]:
df_master.head()

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
0,2021-2022,LYON,1,AIN,0010025X,COLLEGE PAUL SIXDENIER,1185,PLATEAU D HAUTEVILLE,public,110.4,COLLEGE
1,2021-2022,LYON,1,AIN,0010041P,COLLEGE VAUGELAS,1244,MEXIMIEUX,public,111.4,COLLEGE
2,2021-2022,LYON,1,AIN,0010092V,COLLEGE PRIVE SAINT JOSEPH,1283,OYONNAX,privé sous contrat,111.3,COLLEGE
3,2021-2022,LYON,1,AIN,0010896U,COLLEGE INTERNATIONAL,1160,FERNEY VOLTAIRE,public,122.6,COLLEGE
4,2021-2022,LYON,1,AIN,0010938P,COLLEGE LES COTES,1289,PERONNAS,public,102.2,COLLEGE


In [35]:
# merging
for session in df_raw_dnb_par_etablissement['session'].unique():
    # df_master = df_master.join(dfs_dnb_par_etablissement[session].set_index('uai'), on='uai', how='left', rsuffix=session)
    # print(session)
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_nombre_d_inscrits_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_nombre_de_presents_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_reussite_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_sans_mention_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_ab_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_b_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_tb_{session}']], on='uai', how='left')
    
    
df_master.head()

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,...,dnb_taux_de_mention_b_2016,dnb_taux_de_mention_tb_2016,dnb_nombre_d_inscrits_2021,dnb_nombre_de_presents_2021,dnb_nombre_de_presents_2021.1,dnb_taux_de_reussite_2021,dnb_taux_de_sans_mention_2021,dnb_taux_de_mention_ab_2021,dnb_taux_de_mention_b_2021,dnb_taux_de_mention_tb_2021
0,2021-2022,LYON,1,AIN,0010025X,COLLEGE PAUL SIXDENIER,1185,PLATEAU D HAUTEVILLE,public,110.4,...,0.119403,0.059701,61.0,61.0,58.0,95.1,0.114754,0.327869,0.196721,0.311475
1,2021-2022,LYON,1,AIN,0010041P,COLLEGE VAUGELAS,1244,MEXIMIEUX,public,111.4,...,0.209402,0.145299,236.0,233.0,207.0,88.8,0.111588,0.236052,0.227468,0.313305
2,2021-2022,LYON,1,AIN,0010092V,COLLEGE PRIVE SAINT JOSEPH,1283,OYONNAX,privé sous contrat,111.3,...,0.211111,0.133333,120.0,120.0,111.0,92.5,0.166667,0.233333,0.3,0.225
3,2021-2022,LYON,1,AIN,0010896U,COLLEGE INTERNATIONAL,1160,FERNEY VOLTAIRE,public,122.6,...,0.247619,0.271429,242.0,235.0,214.0,91.1,0.119149,0.187234,0.268085,0.33617
4,2021-2022,LYON,1,AIN,0010938P,COLLEGE LES COTES,1289,PERONNAS,public,102.2,...,0.203008,0.157895,141.0,141.0,131.0,92.9,0.156028,0.29078,0.255319,0.22695


In [36]:
df_master.rename(columns = {'code_du_departement':'code_departement', 
                            'nom_de_l_etablissment':'nom_etablissment',
                            'code_insee_de_la_commune':'code_insee_commune',
                            'nom_de_la_commune':'commune',
                            }, inplace = True)

In [37]:
# Moving columns
uai = df_master.pop('uai')
df_master.insert(0, 'uai', uai)
del uai

nom_etablissment = df_master.pop('nom_etablissment')
df_master.insert(1, 'nom_etablissment', nom_etablissment)
del nom_etablissment

type = df_master.pop('type_etablissement')
df_master.insert(3, 'type_etablissement', type)
del type

rentree_scolaire = df_master.pop('rentree_scolaire')
df_master.insert(11, 'rentree_scolaire', rentree_scolaire)
del rentree_scolaire

df_master.sample(5)

  df_master.insert(0, 'uai', uai)
  df_master.insert(1, 'nom_etablissment', nom_etablissment)
  df_master.insert(3, 'type_etablissement', type)
  df_master.insert(11, 'rentree_scolaire', rentree_scolaire)


Unnamed: 0,uai,nom_etablissment,type_etablissement,academie,code_departement,departement,code_insee_commune,commune,secteur,ips,...,dnb_taux_de_mention_b_2016,dnb_taux_de_mention_tb_2016,dnb_nombre_d_inscrits_2021,dnb_nombre_de_presents_2021,dnb_nombre_de_presents_2021.1,dnb_taux_de_reussite_2021,dnb_taux_de_sans_mention_2021,dnb_taux_de_mention_ab_2021,dnb_taux_de_mention_b_2021,dnb_taux_de_mention_tb_2021
8247,0790684P,ECOLE PRIMAIRE,ECOLE,POITIERS,79,DEUX-SEVRES,79029,BEAULIEU SOUS PARTHENAY,public,109.1,...,,,,,,,,,,
13025,0932568L,ECOLE ELEMENTAIRE ANGELA DAVIS,ECOLE,CRETEIL,93,SEINE-SAINT-DENIS,93027,LA COURNEUVE,public,80.4,...,,,,,,,,,,
32899,0070126Z,ECOLE PRIMAIRE SAINT EXUPERY,ECOLE,GRENOBLE,7,ARDECHE,7292,SAINT ROMAIN D AY,public,101.2,...,,,,,,,,,,
37603,0770560R,ECOLE ELEMENTAIRE EMILE ZOLA,ECOLE,CRETEIL,77,SEINE-ET-MARNE,77513,VILLENOY,public,103.2,...,,,,,,,,,,
32974,0080629R,GROUPE SCOLAIRE ABEL DECOPONS,ECOLE,REIMS,8,ARDENNES,8081,BOGNY SUR MEUSE,public,84.0,...,,,,,,,,,,


In [38]:
df_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 39057
Columns: 139 entries, uai to dnb_taux_de_mention_tb_2021
dtypes: float64(129), object(10)
memory usage: 41.7+ MB


##### Merging geolocalisation dataset

In [39]:
df_raw_geolocalisation = df_raw_geolocalisation.rename(columns={'numero_uai': 'uai'})

In [40]:
df_master = df_master.join(df_raw_geolocalisation.set_index('uai'), on='uai', how='left', rsuffix='right')
df_master.sample(5)

Unnamed: 0,uai,nom_etablissment,type_etablissement,academie,code_departement,departement,code_insee_commune,commune,secteur,ips,...,code_commune,libelle_departement,libelle_region,libelle_academie,position,secteur_prive_code_type_contrat,secteur_prive_libelle_type_contrat,code_ministere,libelle_ministere,date_ouverture
13585,0030647M,ECOLE ELEMENTAIRE,ECOLE,CLERMONT-FERRAND,3,ALLIER,3108,ECHASSIERES,public,93.3,...,3108,Allier,Auvergne-Rhône-Alpes,Clermont-Ferrand,"{'lon': 2.934629434199442, 'lat': 46.182957446...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1967-05-08
61,0061459D,COLLEGE PRIVE FENELON,COLLEGE,NICE,6,ALPES-MARITIMES,6069,GRASSE,privé sous contrat,137.2,...,6069,Alpes-Maritimes,Provence-Alpes-Côte d'Azur,Nice,"{'lon': 6.92750389959421, 'lat': 43.6551547224...",30.0,CONTRAT D'ASSOCIATION TOUTES CLASSES,6.0,MINISTERE DE L'EDUCATION NATIONALE,1980-09-01
30767,0130414E,ECOLE PRIMAIRE LA CRAU,ECOLE,AIX-MARSEILLE,13,BOUCHES-DU-RHONE,13027,CHATEAURENARD,public,107.3,...,13027,Bouches-du-Rhône,Provence-Alpes-Côte d'Azur,Aix-Marseille,"{'lon': 4.835840589495739, 'lat': 43.868618978...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1965-07-10
38363,0890610E,ECOLE ELEMENTAIRE MUSCADET,ECOLE,DIJON,89,YONNE,89206,JOIGNY,public,107.7,...,89206,Yonne,Bourgogne-Franche-Comté,Dijon,"{'lon': 3.417429706169467, 'lat': 47.983642250...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1966-09-30
30049,0622833T,ECOLE PRIMAIRE,ECOLE,LILLE,62,PAS-DE-CALAIS,62569,MERCK SAINT LIEVIN,public,100.8,...,62569,Pas-de-Calais,Hauts-de-France,Lille,"{'lon': 2.113870693773724, 'lat': 50.627465965...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1972-09-01


In [41]:
# Removing joined columns not bringing any or enough value
df_master.drop(columns=['appellation_officielle', 'denomination_principale', 'patronyme_uai', 'secteur_public_prive_libe', 'adresse_uai',
                     'boite_postale_uai', 'localite_acheminement_uai', 'libelle_commune', 'localisation', 'nature_uai_libe',
                     'etat_etablissement', 'etat_etablissement_libe', 'code_departementright', 'code_commune', 'libelle_departement', 'libelle_academie', 
                     'secteur_prive_code_type_contrat', 'secteur_prive_libelle_type_contrat', 'code_ministere', 'libelle_ministere', 'nature_uai', 'lieu_dit_uai'], 
        errors='ignore', 
        inplace=True)

In [42]:
# Moving columns
code_academie = df_master.pop('code_academie')
df_master.insert(3, 'code_academie', code_academie)
del code_academie

code_region = df_master.pop('code_region')
df_master.insert(8, 'code_region', code_region)
del code_region

libelle_region = df_master.pop('libelle_region')
df_master.insert(9, 'libelle_region', libelle_region)
del libelle_region

df_master.sample(5)

Unnamed: 0,uai,nom_etablissment,type_etablissement,code_academie,academie,code_departement,departement,code_insee_commune,code_region,libelle_region,...,dnb_taux_de_mention_tb_2021,code_postal_uai,coordonnee_x,coordonnee_y,epsg,latitude,longitude,appariement,position,date_ouverture
5007,0650835E,COLLEGE VICTOR HUGO,COLLEGE,16.0,TOULOUSE,65,HAUTES-PYRENEES,65440,76.0,Occitanie,...,0.336134,65000.0,461604.8,6239965.7,EPSG:2154,43.21905,0.06723,Parfaite,"{'lon': 0.067229784054182, 'lat': 43.219049733...",1977-03-09
14994,0301260G,ECOLE ELEMENTAIRE ROBERT LAVESQUE,ECOLE,11.0,MONTPELLIER,30,GARD,30269,76.0,Occitanie,...,,30270.0,771264.5,6334861.7,EPSG:2154,44.109144,3.89024,Parfaite,"{'lon': 3.890239611621374, 'lat': 44.109143753...",1974-05-15
872,0672304R,COLLEGE PRIVE ECOLE AQUIBA,COLLEGE,15.0,STRASBOURG,67,BAS-RHIN,67482,44.0,Grand Est,...,0.414634,67000.0,1050338.2,6842716.2,EPSG:2154,48.589724,7.75325,Parfaite,"{'lon': 7.7532499241539306, 'lat': 48.58972419...",1966-09-01
10511,0291990F,ECOLE PRIMAIRE PRIVEE NOTRE DAME DE LOURDES,ECOLE,14.0,RENNES,29,FINISTERE,29151,53.0,Bretagne,...,,29600.0,196239.1,6852259.6,EPSG:2154,48.574141,-3.83721,Parfaite,"{'lon': -3.837210373954562, 'lat': 48.57414098...",1984-07-04
37428,0760443U,ECOLE ELEMENTAIRE FRANCOIS CODET,ECOLE,70.0,NORMANDIE,76,SEINE MARITIME,76108,28.0,Normandie,...,,76230.0,563656.6,6931828.0,EPSG:2154,49.47123,1.11943,Parfaite,"{'lon': 1.1194301513227891, 'lat': 49.47122982...",1966-11-24


In [43]:
df_master = df_master.rename(columns={'libelle_region': 'region'})

In [44]:
df_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 39057
Columns: 151 entries, uai to date_ouverture
dtypes: float64(136), object(15)
memory usage: 45.3+ MB


##### Merging Niveau_de_vie_2013_a_la_commune dataset

In [45]:
df_raw_revenus_par_commune = df_raw_revenus_par_commune.rename(columns={'Code Commune': 'code_insee_commune'})

In [46]:
df_master = df_master.join(df_raw_revenus_par_commune.set_index('code_insee_commune'), on='code_insee_commune', how='left', rsuffix='right')

In [47]:
# Removing joined columns not bringing any or enough value
df_master.drop(columns=['Nom Commune'], 
        errors='ignore', 
        inplace=True)

In [48]:
# Renaming columns
df_master.rename(columns = {'Niveau de vie Commune':'niveau_de_vie_commune', 
                            'Niveau de vie Département':'niveau_de_vie_departement'
                            }, inplace = True)

In [49]:
df_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 39057
Columns: 153 entries, uai to niveau_de_vie_departement
dtypes: float64(138), object(15)
memory usage: 45.9+ MB


##### Merging georef-france-commune dataset

##### typo_rur

In [50]:
df_raw_typo_rur = df_raw_typo_rur.rename(columns={'codgeo': 'code_insee_commune'})

In [51]:
df_master = df_master.join(df_raw_typo_rur.set_index('code_insee_commune'), on='code_insee_commune', how='left', rsuffix='right')

In [52]:
df_master

Unnamed: 0,uai,nom_etablissment,type_etablissement,code_academie,academie,code_departement,departement,code_insee_commune,code_region,region,...,latitude,longitude,appariement,position,date_ouverture,niveau_de_vie_commune,niveau_de_vie_departement,libgeo,zonage_rur,zonage_rur_lib
0,0010025X,COLLEGE PAUL SIXDENIER,COLLEGE,10.0,LYON,001,AIN,01185,84.0,Auvergne-Rhône-Alpes,...,45.976219,5.600967,Correcte,"{'lon': 5.600967267017193, 'lat': 45.976219204...",1965-05-01,20198.148148,22343.574665,Plateau d'Hauteville,2.0,rural autonome peu dense
1,0010041P,COLLEGE VAUGELAS,COLLEGE,10.0,LYON,001,AIN,01244,84.0,Auvergne-Rhône-Alpes,...,45.907542,5.188640,Parfaite,"{'lon': 5.188640403667234, 'lat': 45.907541813...",1971-02-16,21367.619048,22343.574665,Meximieux,5.0,urbain densité intermédiaire
2,0010092V,COLLEGE PRIVE SAINT JOSEPH,COLLEGE,10.0,LYON,001,AIN,01283,84.0,Auvergne-Rhône-Alpes,...,46.259653,5.656330,Parfaite,"{'lon': 5.65632989664765, 'lat': 46.2596533673...",1967-01-19,16590.000000,22343.574665,Oyonnax,5.0,urbain densité intermédiaire
3,0010896U,COLLEGE INTERNATIONAL,COLLEGE,10.0,LYON,001,AIN,01160,84.0,Auvergne-Rhône-Alpes,...,46.264811,6.116650,Parfaite,"{'lon': 6.116650067099341, 'lat': 46.264811148...",1970-02-20,25508.333333,22343.574665,Ferney-Voltaire,5.0,urbain densité intermédiaire
4,0010938P,COLLEGE LES COTES,COLLEGE,10.0,LYON,001,AIN,01289,84.0,Auvergne-Rhône-Alpes,...,46.190009,5.205570,Parfaite,"{'lon': 5.205570413701443, 'lat': 46.190008673...",1972-01-25,21632.000000,22343.574665,Péronnas,5.0,urbain densité intermédiaire
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39053,9760209Y,ECOLE ELEMENTAIRE DE CHICONI 5,ECOLE,43.0,MAYOTTE,976,MAYOTTE,97605,6.0,Mayotte,...,-12.836574,45.115200,Parfaite,"{'lon': 45.11520022047355, 'lat': -12.83657434...",1998-09-01,,,Chiconi,5.0,urbain densité intermédiaire
39054,9760253W,ECOLE ELEMENTAIRE PUBLIQUE MAJICAVO KOROPA 3,ECOLE,43.0,MAYOTTE,976,MAYOTTE,97610,6.0,Mayotte,...,-12.744716,45.218850,Parfaite,"{'lon': 45.218850302397314, 'lat': -12.7447159...",2001-09-01,,,Koungou,5.0,urbain densité intermédiaire
39055,9760301Y,ECOLE PRIMAIRE PUBLIQUE TSOUNDZOU 2,ECOLE,43.0,MAYOTTE,976,MAYOTTE,97611,6.0,Mayotte,...,-12.817093,45.197320,Parfaite,"{'lon': 45.19731954390122, 'lat': -12.81709257...",2005-09-01,,,Mamoudzou,6.0,urbain dense
39056,9760302Z,ECOLE PRIMAIRE PUBLIQUE MROALE,ECOLE,43.0,MAYOTTE,976,MAYOTTE,97617,6.0,Mayotte,...,-12.788367,45.128749,Correcte,"{'lon': 45.12874934018609, 'lat': -12.78836708...",2005-09-01,,,Tsingoni,5.0,urbain densité intermédiaire


In [53]:
df_master['zonage_rur_lib'].isnull().value_counts()

False    37860
True      1198
Name: zonage_rur_lib, dtype: int64

### 6. Tidying up dataframe

In [54]:
# Making all strings lower case
df_master = df_master.applymap(lambda x: x.lower() if type(x) == str else x)

# renaming "privé sous contrat" in "prive"
df_master.loc[df_master["secteur"] == "privé sous contrat", "secteur"] = "prive"

# sorting
df_master.sort_values(by=['secteur', 'type_etablissement'], ascending=False, inplace=True)

In [55]:
df_master.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 6967 to 6962
Data columns (total 156 columns):
 #    Column                         Non-Null Count  Dtype  
---   ------                         --------------  -----  
 0    uai                            39058 non-null  object 
 1    nom_etablissment               39058 non-null  object 
 2    type_etablissement             39058 non-null  object 
 3    code_academie                  38932 non-null  float64
 4    academie                       39058 non-null  object 
 5    code_departement               39058 non-null  object 
 6    departement                    39058 non-null  object 
 7    code_insee_commune             39058 non-null  object 
 8    code_region                    38932 non-null  float64
 9    region                         38932 non-null  object 
 10   commune                        39058 non-null  object 
 11   secteur                        39058 non-null  object 
 12   ips                         

### 7. Exporting workfile data to CSV

In [56]:
file_name = "df_master" + ".csv"
print(f"file name: {file_name}")
df_master.to_csv(data_out + file_name, index = False)


# Code to comment at some point
timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
file_name = "df_master_" + timestr + ".csv"
print(f"file name: {file_name}")
df_master.to_csv(data_out_temporary + file_name, index = False)

file name: df_master.csv
file name: df_master_2022-11-12_13-13-35.csv


#### data-collection notebook execution time

In [57]:
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

Execution time in seconds: 104.7528932094574
