This notebook is dedicated to data collection, cleaning and creation of work files for the study of the public/private distribution of French schools

### 1. Importing librairies

In [1]:
import time
startTime = time.time()
 
import pandas as pd
import os
import requests

### 2. Creating the relevant folders and paths

In [2]:
# Defining current folder as our main directory
dirname = os.getcwd()

# location folders variables
data_in = dirname + "\\da_data_raw\\" #raw data
data_out = dirname + "\\da_data_workfiles\\" #clean data
graph_out = dirname + "\\graphs\\" #graphs
html_graph_out = "C:\\Users\\33671\\Documents\\my-website\\html5up-massively\\images\\graphs\\" #graphs to be used for html report

# Checking if data_in path is not present, then create it
if not os.path.exists(data_in):
    os.makedirs(data_in)
    print(f"Directory Created: {data_in}")
else: 
    print(f"Already existing directory: {data_in}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(data_out):
    os.makedirs(data_out)
    print(f"Directory Created: {data_out}")
else:
    print(f"Already existing directory: {data_out}")
    
# Checking if data_out path is not present then create it
if not os.path.exists(graph_out):
    os.makedirs(graph_out)
    print(f"Directory Created: {graph_out}")
else:
    print(f"Already existing directory: {graph_out}")    

# Printing main directories we will work with
print(f"\nMain directory: {dirname}")
print(f"Raw data folder: {data_in}")
print(f"Workfile folder: {data_out}")

Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_raw\
Already existing directory: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\
Already existing directory: c:\Users\33671\Documents\Python\IPS\graphs\

Main directory: c:\Users\33671\Documents\Python\IPS
Raw data folder: c:\Users\33671\Documents\Python\IPS\da_data_raw\
Workfile folder: c:\Users\33671\Documents\Python\IPS\da_data_workfiles\


### 3. Verifying APIs responses

In [3]:
#fr-en-ips_colleges
dataset_id = "fr-en-ips_colleges"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-ips_ecoles
dataset_id = "fr-en-ips_ecoles"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-dnb-par-etablissement
dataset_id = "fr-en-dnb-par-etablissement"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre
dataset_id = "fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre"
format = "json"
limit = "10"
r = requests.get(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
                 timeout=2)
print(f"{dataset_id}")
print(f"URL: {r.url}")
print(f"HTTP Response Status Code: {r.status_code}") 
print(f"HTTP Error: {r.raise_for_status()}")
print(f"Encoding: {r.encoding}\n")
r.close()

#georef-france-commune
# dataset_id = "georef-france-commune"
# format = "json"
# limit = "10"
# r = requests.get(f"https://public.opendatasoft.com/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}", 
#                  timeout=2)
# print(f"{dataset_id}")
# print(f"URL: {r.url}")
# print(f"HTTP Response Status Code: {r.status_code}") 
# print(f"HTTP Error: {r.raise_for_status()}")
# print(f"Encoding: {r.encoding}\n")
# r.close()

fr-en-ips_colleges
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-ips_colleges/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-ips_ecoles
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-ips_ecoles/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-dnb-par-etablissement
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-dnb-par-etablissement/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8

fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre
URL: https://data.education.gouv.fr/api/v2/catalog/datasets/fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre/exports/json?limit=10
HTTP Response Status Code: 200
HTTP Error: None
Encoding: utf-8



### 4. Data Collection

##### ips_colleges dataset

In [4]:
# Loading data
dataset_id = "fr-en-ips_colleges"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_ips_colleges = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

In [5]:
df_raw_ips_colleges.sample(5)

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips
2313,2021-2022,GRENOBLE,73,SAVOIE,0730791N,COLLEGE PRIVE CATHOLIQUE LAMARTINE,73008,AIX LES BAINS,privé sous contrat,117.0
2473,2021-2022,NICE,83,VAR,0830069Z,COLLEGE PIERRE PUGET,83137,TOULON,public,81.5
3331,2021-2022,RENNES,56,MORBIHAN,0561546B,COLLEGE PRIVE SAINTE JEANNE D ARC,56066,GOURIN,privé sous contrat,92.9
4002,2021-2022,CRETEIL,94,VAL-DE-MARNE,0941971G,COLLEGE DULCIE SEPTEMBER,94003,ARCUEIL,public,100.4
101,2021-2022,TOULOUSE,12,AVEYRON,0120055V,COLLEGE PRIVE DOMINIQUE SAVIO,12198,RIEUPEYROUX,privé sous contrat,99.6


##### ips_ecoles dataset

In [6]:
# Loading data
dataset_id = "fr-en-ips_ecoles"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_ips_ecoles = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

In [7]:
df_raw_ips_ecoles.sample(5)

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips
14388,2021-2022,TOULOUSE,31,HAUTE-GARONNE,0310446S,ECOLE ELEMENTAIRE PUBLIQUE ROGER ANDRE DELUC,31161,DEYME,public,129.3
20178,2021-2022,NORMANDIE,76,SEINE MARITIME,0762844D,ECOLE ELEMENTAIRE JACQUES PREVERT,76164,RIVES EN SEINE,public,98.3
20181,2021-2022,NORMANDIE,76,SEINE MARITIME,0762882V,ECOLE PRIMAIRE LE PRE VERT,76563,SAINT AUBIN ROUTOT,public,103.2
31855,2021-2022,VERSAILLES,95,VAL-D'OISE,0951478R,ECOLE PRIMAIRE LES BOURGUIGNONS,95229,EZANVILLE,public,112.7
17620,2021-2022,PARIS,75,PARIS,0750972Z,E E PU POUCHET,75117,PARIS 17E ARRONDISSEMENT,public,108.0


##### dnb-par-etablissement dataset

In [8]:
dataset_id = "fr-en-dnb-par-etablissement"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_dnb_par_etablissement = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

In [9]:
df_raw_dnb_par_etablissement.sample(5) 

Unnamed: 0,session,numero_d_etablissement,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite
22340,2015,0491825T,COLLEGE,DAVID D ANGERS,PUBLIC,49007,ANGERS,49,MAINE-ET-LOIRE,17,...,17,PAYS DE LA LOIRE,146,146,125,35,33,31,26,"85,60%"
113671,2015,0750507U,COLLEGE,GEORGES BRASSENS,PUBLIC,75119,PARIS 19E ARRONDISSEMENT,75,PARIS,1,...,10,ILE-DE-FRANCE,129,127,97,29,23,31,14,"76,30%"
91670,2010,0770920G,LYCEE,LA FAYETTE,PUBLIC,77079,CHAMPAGNE-SUR-SEINE,77,SEINE-ET-MARNE,24,...,10,ILE-DE-FRANCE,18,17,13,9,2,2,0,"76,40%"
101364,2020,0542349K,COLLEGE,PAUL VERLAINE,PUBLIC,54322,LONGUYON,54,MEURTHE-ET-MOSELLE,12,...,6,GRAND EST,90,90,83,15,23,30,15,"92,20%"
57054,2006,0595773E,LYCEE PROFESSIONNEL,-,PRIVE,59289,HAUSSY,59,NORD,9,...,9,HAUTS-DE-FRANCE,32,30,18,14,4,0,0,"60,00%"


In [10]:
# Check school type distribution based on last avaiable year results
df_raw_dnb_par_etablissement[df_raw_dnb_par_etablissement['session'] == 2021]['denomination_principale'].value_counts()

COLLEGE                6942
LYCEE PROFESSIONNEL    1315
LYCEE                   481
EREA                     58
AUTRE                    17
CFA                       3
Name: denomination_principale, dtype: int64

##### fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre dataset

In [11]:
# Loading data
dataset_id = "fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre"
format = "json"
limit = "-1" # argument to pass to get the full dataset 
df_raw_geolocalisation = pd.read_json(f"https://data.education.gouv.fr/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}")

In [12]:
df_raw_geolocalisation.sample(5)

Unnamed: 0,numero_uai,appellation_officielle,denomination_principale,patronyme_uai,secteur_public_prive_libe,adresse_uai,lieu_dit_uai,boite_postale_uai,code_postal_uai,localite_acheminement_uai,...,code_commune,libelle_departement,libelle_region,libelle_academie,position,secteur_prive_code_type_contrat,secteur_prive_libelle_type_contrat,code_ministere,libelle_ministere,date_ouverture
43777,0290927A,Ecole primaire publique Le Tourous,ECOLE PRIMAIRE,LE TOUROUS,Public,8 rue Aristide Briand,,,29800,LANDERNEAU,...,29103,Finistère,Bretagne,Rennes,"{'lon': -4.263430134689353, 'lat': 48.45442319...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1969-02-21
13361,0502013S,SAMUEL BECKETT,ECOLE PRIMAIRE PUBLIQUE,SAMUEL BECKETT,Public,Impasse du docteur Schweitzer,du docteur Schweitzer,,50000,ST LO,...,50502,Manche,Normandie,Normandie,"{'lon': -1.072711104294381, 'lat': 49.11471822...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,2018-09-01
55491,0530176D,Ecole primaire publique Le Vieux Tilleul,ECOLE PRIMAIRE PUBLIQUE,LE VIEUX TILLEUL,Public,1 rue André Royne,,,53800,CONGRIER,...,53073,Mayenne,Pays de la Loire,Nantes,"{'lon': -1.116329922885049, 'lat': 47.80981696...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1969-01-15
54804,0080947L,Section d'enseignement général et professionne...,SEGPA,ANNEXEE AU CLG VOUZIERS,Public,Rue DE LA FUSION,,52.0,8400,VOUZIERS,...,8490,Ardennes,Grand Est,Reims,"{'lon': 4.6949001970022834, 'lat': 49.40235281...",99.0,SANS OBJET,6,MINISTERE DE L'EDUCATION NATIONALE,1977-03-18
40937,0592922F,Lycée général privé Thérèse d'Avila,LYCEE EUROPEEN PRIVE,THERESE D'AVILA,Privé,254 rue Nationale,,41077.0,59011,LILLE CEDEX,...,59350,Nord,Hauts-de-France,Lille,"{'lon': 3.04769987326439, 'lat': 50.6312144398...",30.0,CONTRAT D'ASSOCIATION TOUTES CLASSES,6,MINISTERE DE L'EDUCATION NATIONALE,1919-09-01


##### georef-france-commune

In [13]:
# dataset_id = "georef-france-commune"
# format = "json"
# limit = "-1" # argument to pass to get the full dataset 
# columns = "bv2012_code" + "%2C" + "bv2012_name" + "%2C" + "com_uu2020_status" 
# df_raw_georef = pd.read_json(f"https://public.opendatasoft.com/api/v2/catalog/datasets/{dataset_id}/exports/{format}?limit={limit}&select={columns}")

##### typo-rur

In [14]:
df_raw_typo_rur = pd.read_csv('./da_data_raw/typo-rur.csv')
df_raw_typo_rur

Unnamed: 0,codgeo,libgeo,zonage_rur
0,01001,L'Abergement-Clémenciat,tr2
1,01002,L'Abergement-de-Varey,tr1
2,01004,Ambérieu-en-Bugey,tr5
3,01005,Ambérieux-en-Dombes,tr3
4,01006,Ambléon,tr1
...,...,...,...
34960,97613,M'Tsangamouji,tr5
34961,97614,Ouangani,tr5
34962,97615,Pamandzi,tr5
34963,97616,Sada,tr5


In [15]:
df_raw_typo_rur['zonage_rur'].value_counts()

tr1    8108
tr2    8096
tr3    7394
tr4    7174
tr5    3419
tr6     774
Name: zonage_rur, dtype: int64

In [16]:
df_raw_typo_rur['zonage_rur'] = df_raw_typo_rur['zonage_rur'].map({'tr1': 1, 
                                                                       'tr2': 2, 
                                                                       'tr3': 3, 
                                                                       'tr4': 4,
                                                                       'tr5': 5,
                                                                       'tr6': 6
                                                                       }
                                                                      )

In [17]:
df_raw_typo_rur['zonage_rur_lib'] = df_raw_typo_rur['zonage_rur'].map({1:'rural autonome très peu dense', 
                                                                       2: 'rural autonome peu dense', 
                                                                       3: 'rural sous faible influence d\'un pole', 
                                                                       4: 'rural sous forte influence d\'un pole',
                                                                       5: 'urbain densité intermédiaire',
                                                                       6: 'urbain dense'
                                                                       }
                                                                      )

In [18]:
df_raw_typo_rur.head()

Unnamed: 0,codgeo,libgeo,zonage_rur,zonage_rur_lib
0,1001,L'Abergement-Clémenciat,2,rural autonome peu dense
1,1002,L'Abergement-de-Varey,1,rural autonome très peu dense
2,1004,Ambérieu-en-Bugey,5,urbain densité intermédiaire
3,1005,Ambérieux-en-Dombes,3,rural sous faible influence d'un pole
4,1006,Ambléon,1,rural autonome très peu dense


In [19]:
df_raw_typo_rur.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34965 entries, 0 to 34964
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   codgeo          34965 non-null  object
 1   libgeo          34965 non-null  object
 2   zonage_rur      34965 non-null  int64 
 3   zonage_rur_lib  34965 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.1+ MB


##### Niveau_de_vie_2013_a_la_commune dataset

In [20]:
# Loading data
df_raw_revenus_par_commune = pd.read_excel("https://www.data.gouv.fr/fr/datasets/r/d3ce0107-416f-42cf-a335-d71f89b00b21")

### 5. Exporting raw data to CSV

In [21]:
# Raw datasets export
file_name = "ips-colleges" + ".csv"
print(f"file name: {file_name}")
df_raw_ips_colleges.to_csv(data_in + file_name, index = False)

file_name = "ips-ecoles" + ".csv"
print(f"file name: {file_name}")
df_raw_ips_ecoles.to_csv(data_in + file_name, index = False)

file_name = "dnb-par-etablissement" + ".csv"
print(f"file name: {file_name}")
df_raw_dnb_par_etablissement.to_csv(data_in + file_name, index = False)

file_name = "geolocalisation-etablissement" + ".csv"
print(f"file name: {file_name}")
df_raw_geolocalisation.to_csv(data_in + file_name, index = False)

file_name = "revenus-par-commune" + ".csv"
print(f"file name: {file_name}")
df_raw_revenus_par_commune.to_csv(data_in + file_name, index = False)

# file_name = "georef-par-commune" + ".csv"
# print(f"file name: {file_name}")
# df_raw_georef.to_csv(data_in + file_name, index = False)

file name: ips-colleges.csv
file name: ips-ecoles.csv
file name: dnb-par-etablissement.csv
file name: geolocalisation-etablissement.csv
file name: revenus-par-commune.csv


### 6. Merging data into master file

##### ips_ecoles & ips_colleges dataframes

In [22]:
# Adding columns about the type of school
df_raw_ips_colleges['type_etablissement']="COLLEGE"
df_raw_ips_ecoles['type_etablissement']="ECOLE"

In [23]:
df_master = pd.concat([df_raw_ips_colleges, df_raw_ips_ecoles])

In [24]:
#safety check
print(f"df_raw_ips_colleges N = {len(df_raw_ips_colleges)}")
print(f"df_raw_ips_ecoles N = {len(df_raw_ips_ecoles)}")
print(f"df_master N = {len(df_master)}")
print(len(df_raw_ips_colleges) + len(df_raw_ips_ecoles) == len(df_master))
print(f"unique uai = {df_master['uai'].nunique()}")

df_raw_ips_colleges N = 6967
df_raw_ips_ecoles N = 32091
df_master N = 39058
True
unique uai = 39058


##### dnb_par_etablissement

College certificate success rate is calculated based on admissions devived by attendees (not registrants). We will keep this convention to calculate honors rates

In [25]:
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,numero_d_etablissement,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite
101717,2008,0131705H,COLLEGE,FERNAND LEGER,PUBLIC,13014,BERRE-L'ETANG,13,BOUCHES-DU-RHONE,2,...,18,PROVENCE-ALPES-COTE D'AZUR,144,139,92,57,23,8,4,"66,10%"
76220,2019,0681366R,COLLEGE,JEAN MOULIN,PUBLIC,68287,ROUFFACH,68,HAUT-RHIN,15,...,6,GRAND EST,113,112,104,21,15,34,34,"92,80%"
100829,2007,0801478X,COLLEGE,SAINTE COLETTE,PRIVE,80212,CORBIE,80,SOMME,20,...,9,HAUTS-DE-FRANCE,61,61,56,17,18,15,6,"91,80%"
29547,2011,0520032C,LYCEE PROFESSIONNEL,EMILE BAUDOT,PUBLIC,52550,WASSY,52,HAUTE-MARNE,19,...,6,GRAND EST,23,20,18,15,3,0,0,"90,00%"
109170,2018,0861072Y,COLLEGE,FRANCE BLOCH-SERAZIN,PUBLIC,86194,POITIERS,86,VIENNE,13,...,15,NOUVELLE-AQUITAINE,148,147,139,21,41,32,45,"94,50%"


In [26]:
# renaming key column
df_raw_dnb_par_etablissement = df_raw_dnb_par_etablissement.rename(columns = {'numero_d_etablissement': 'uai'})

In [27]:
#converting dnb string to float
df_raw_dnb_par_etablissement['taux_de_reussite'] = df_raw_dnb_par_etablissement['taux_de_reussite'].str.strip('%').str.replace(',', '.')
df_raw_dnb_par_etablissement['taux_de_reussite'] = df_raw_dnb_par_etablissement['taux_de_reussite'].astype('float')

In [28]:
df_raw_dnb_par_etablissement['dnb_taux_de_sans_mention'] = df_raw_dnb_par_etablissement['nombre_d_admis_sans_mention'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_ab'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_ab'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_b'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_b'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement['dnb_taux_de_mention_tb'] = df_raw_dnb_par_etablissement['nombre_d_admis_mention_tb'] / df_raw_dnb_par_etablissement['nombre_de_presents']
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,uai,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,nombre_total_d_admis,nombre_d_admis_sans_mention,nombre_d_admis_mention_ab,nombre_d_admis_mention_b,nombre_d_admis_mention_tb,taux_de_reussite,dnb_taux_de_sans_mention,dnb_taux_de_mention_ab,dnb_taux_de_mention_b,dnb_taux_de_mention_tb
3243,2012,0530818B,LYCEE,ROCHEFEUILLE,PRIVE,53147,MAYENNE,53,MAYENNE,17,...,37,19,16,2,0,86.0,0.44186,0.372093,0.046512,0.0
136470,2021,0290330B,COLLEGE,BOIS DE LOCQUERAN,PUBLIC,29197,PLOUHINEC,29,FINISTERE,14,...,52,10,15,14,13,85.2,0.163934,0.245902,0.229508,0.213115
111948,2017,0030049M,COLLEGE,LES CELESTINS,PUBLIC,3310,VICHY,3,ALLIER,6,...,63,20,6,20,17,80.7,0.25641,0.076923,0.25641,0.217949
116430,2018,0511214T,COLLEGE,TROIS FONTAINES,PUBLIC,51454,REIMS,51,MARNE,19,...,50,19,11,11,9,66.6,0.253333,0.146667,0.146667,0.12
93755,2008,0350875L,COLLEGE,ST JOSEPH,PRIVE,35219,PIPRIAC,35,ILLE-ET-VILAINE,14,...,72,23,26,18,5,86.7,0.277108,0.313253,0.216867,0.060241


In [29]:
# Removing columns
df_raw_dnb_par_etablissement.drop(columns=['nombre_d_admis_sans_mention', 'nombre_d_admis_mention_ab', 'nombre_d_admis_mention_b', 'nombre_d_admis_mention_tb'], 
        errors='ignore', 
        inplace=True)
df_raw_dnb_par_etablissement.sample(5)

Unnamed: 0,session,uai,denomination_principale,patronyme,secteur_d_enseignement,commune_et_arrondissement,commune_et_arrondissement_lib_l,departement,departement_libelle,academie,...,region,region_libelle,nombre_d_inscrits,nombre_de_presents,nombre_total_d_admis,taux_de_reussite,dnb_taux_de_sans_mention,dnb_taux_de_mention_ab,dnb_taux_de_mention_b,dnb_taux_de_mention_tb
11188,2013,0640133X,COLLEGE,STE URSULE,PRIVE,64445,PAU,64,PYRENEES-ATLANTIQUES,4,...,15,NOUVELLE-AQUITAINE,134,134,131,97.7,0.208955,0.291045,0.350746,0.126866
116556,2018,0593134L,COLLEGE,DOMINIQUE SAVIO,PRIVE,59328,LAMBERSART,59,NORD,9,...,9,HAUTS-DE-FRANCE,157,157,156,99.3,0.082803,0.254777,0.33121,0.324841
26341,2008,0561384A,COLLEGE,JOSEPH KERBELLEC,PUBLIC,56185,QUEVEN,56,MORBIHAN,14,...,3,BRETAGNE,173,165,143,86.6,0.369697,0.266667,0.133333,0.09697
48310,2014,0180849A,LYCEE PROFESSIONNEL,AUBIGNY SUR NERE,PRIVE,18015,AUBIGNY-SUR-NERE,18,CHER,18,...,4,CENTRE-VAL DE LOIRE,30,30,26,86.6,0.5,0.333333,0.033333,0.0
118190,2020,0642114A,COLLEGE,BERNAT ETXEPARE,PRIVE,64102,BAYONNE,64,PYRENEES-ATLANTIQUES,4,...,15,NOUVELLE-AQUITAINE,21,21,21,100.0,0.142857,0.095238,0.142857,0.619048


In [30]:
#creating sub-dataframe for college cerficate dataframe based on session year  
dfs_dnb_par_etablissement = {}
for session in df_raw_dnb_par_etablissement['session'].unique():
    dfs_dnb_par_etablissement[session] = pd.DataFrame(df_raw_dnb_par_etablissement[df_raw_dnb_par_etablissement['session'] == session])
    dfs_dnb_par_etablissement[session] = dfs_dnb_par_etablissement[session].rename(columns = 
                                                                                   {'nombre_d_inscrits': f'dnb_nombre_d_inscrits_{session}', 
                                                                                    'nombre_de_presents': f'dnb_nombre_de_presents_{session}',
                                                                                    'nombre_total_d_admis': f'dnb_nombre_de_presents_{session}',
                                                                                    'taux_de_reussite': f'dnb_taux_de_reussite_{session}',
                                                                                    'dnb_taux_de_sans_mention': f'dnb_taux_de_sans_mention_{session}',
                                                                                    'dnb_taux_de_mention_ab': f'dnb_taux_de_mention_ab_{session}',
                                                                                    'dnb_taux_de_mention_b': f'dnb_taux_de_mention_b_{session}',
                                                                                    'dnb_taux_de_mention_tb': f'dnb_taux_de_mention_tb_{session}',
                                                                                    })
    print(f"{session}: {len(dfs_dnb_par_etablissement[session])}")

2007: 8623
2008: 8656
2011: 8696
2012: 8697
2006: 8562
2009: 8672
2010: 8646
2013: 8732
2014: 8746
2019: 8797
2020: 8807
2015: 8752
2017: 8796
2018: 8802
2016: 8780
2021: 8816


In [31]:
df_master.head()

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,type_etablissement
0,2021-2022,LYON,1,AIN,0010025X,COLLEGE PAUL SIXDENIER,1185,PLATEAU D HAUTEVILLE,public,110.4,COLLEGE
1,2021-2022,LYON,1,AIN,0010041P,COLLEGE VAUGELAS,1244,MEXIMIEUX,public,111.4,COLLEGE
2,2021-2022,LYON,1,AIN,0010092V,COLLEGE PRIVE SAINT JOSEPH,1283,OYONNAX,privé sous contrat,111.3,COLLEGE
3,2021-2022,LYON,1,AIN,0010896U,COLLEGE INTERNATIONAL,1160,FERNEY VOLTAIRE,public,122.6,COLLEGE
4,2021-2022,LYON,1,AIN,0010938P,COLLEGE LES COTES,1289,PERONNAS,public,102.2,COLLEGE


In [32]:
# merging
for session in df_raw_dnb_par_etablissement['session'].unique():
    # df_master = df_master.join(dfs_dnb_par_etablissement[session].set_index('uai'), on='uai', how='left', rsuffix=session)
    # print(session)
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_nombre_d_inscrits_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_nombre_de_presents_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_reussite_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_sans_mention_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_ab_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_b_{session}']], on='uai', how='left')
    df_master = pd.merge(df_master,dfs_dnb_par_etablissement[session][['uai', f'dnb_taux_de_mention_tb_{session}']], on='uai', how='left')
    
    
df_master.head()

Unnamed: 0,rentree_scolaire,academie,code_du_departement,departement,uai,nom_de_l_etablissment,code_insee_de_la_commune,nom_de_la_commune,secteur,ips,...,dnb_taux_de_mention_b_2016,dnb_taux_de_mention_tb_2016,dnb_nombre_d_inscrits_2021,dnb_nombre_de_presents_2021,dnb_nombre_de_presents_2021.1,dnb_taux_de_reussite_2021,dnb_taux_de_sans_mention_2021,dnb_taux_de_mention_ab_2021,dnb_taux_de_mention_b_2021,dnb_taux_de_mention_tb_2021
0,2021-2022,LYON,1,AIN,0010025X,COLLEGE PAUL SIXDENIER,1185,PLATEAU D HAUTEVILLE,public,110.4,...,0.119403,0.059701,61.0,61.0,58.0,95.1,0.114754,0.327869,0.196721,0.311475
1,2021-2022,LYON,1,AIN,0010041P,COLLEGE VAUGELAS,1244,MEXIMIEUX,public,111.4,...,0.209402,0.145299,236.0,233.0,207.0,88.8,0.111588,0.236052,0.227468,0.313305
2,2021-2022,LYON,1,AIN,0010092V,COLLEGE PRIVE SAINT JOSEPH,1283,OYONNAX,privé sous contrat,111.3,...,0.211111,0.133333,120.0,120.0,111.0,92.5,0.166667,0.233333,0.3,0.225
3,2021-2022,LYON,1,AIN,0010896U,COLLEGE INTERNATIONAL,1160,FERNEY VOLTAIRE,public,122.6,...,0.247619,0.271429,242.0,235.0,214.0,91.1,0.119149,0.187234,0.268085,0.33617
4,2021-2022,LYON,1,AIN,0010938P,COLLEGE LES COTES,1289,PERONNAS,public,102.2,...,0.203008,0.157895,141.0,141.0,131.0,92.9,0.156028,0.29078,0.255319,0.22695


In [33]:
df_master.rename(columns = {'code_du_departement':'code_departement', 
                            'nom_de_l_etablissment':'nom_etablissment',
                            'code_insee_de_la_commune':'code_insee_commune',
                            'nom_de_la_commune':'commune',
                            }, inplace = True)

In [34]:
# Moving columns
uai = df_master.pop('uai')
df_master.insert(0, 'uai', uai)
del uai

nom_etablissment = df_master.pop('nom_etablissment')
df_master.insert(1, 'nom_etablissment', nom_etablissment)
del nom_etablissment

type = df_master.pop('type_etablissement')
df_master.insert(3, 'type_etablissement', type)
del type

rentree_scolaire = df_master.pop('rentree_scolaire')
df_master.insert(11, 'rentree_scolaire', rentree_scolaire)
del rentree_scolaire

df_master.sample(5)

  df_master.insert(0, 'uai', uai)
  df_master.insert(1, 'nom_etablissment', nom_etablissment)
  df_master.insert(3, 'type_etablissement', type)
  df_master.insert(11, 'rentree_scolaire', rentree_scolaire)


Unnamed: 0,uai,nom_etablissment,type_etablissement,academie,code_departement,departement,code_insee_commune,commune,secteur,ips,...,dnb_taux_de_mention_b_2016,dnb_taux_de_mention_tb_2016,dnb_nombre_d_inscrits_2021,dnb_nombre_de_presents_2021,dnb_nombre_de_presents_2021.1,dnb_taux_de_reussite_2021,dnb_taux_de_sans_mention_2021,dnb_taux_de_mention_ab_2021,dnb_taux_de_mention_b_2021,dnb_taux_de_mention_tb_2021
38872,7200594H,ECOLE PRIMAIRE ARENA VESCOVATO,ECOLE,CORSE,02B,HAUTE-CORSE,2B346,VESCOVATO,public,84.8,...,,,,,,,,,,
24890,0780091A,ECOLE ELEMENTAIRE PRIVEE MERCIER SAINT PAUL,ECOLE,VERSAILLES,078,YVELINES,78401,MEULAN EN YVELINES,privé sous contrat,126.1,...,,,,,,,,,,
1177,0860764N,COLLEGE PRIVE SACRE COEUR LA SALLE,COLLEGE,POITIERS,086,VIENNE,86115,JAUNAY MARIGNY,privé sous contrat,118.5,...,0.232877,0.205479,99.0,99.0,92.0,92.9,0.141414,0.151515,0.363636,0.272727
2069,0590164H,COLLEGE JEAN ZAY,COLLEGE,LILLE,059,NORD,59183,DUNKERQUE,public,92.8,...,0.220588,0.132353,78.0,77.0,62.0,80.5,0.324675,0.142857,0.194805,0.142857
34250,0311503R,ECOLE PRIMAIRE PRIVEE SAINT THOMAS D AQUIN,ECOLE,TOULOUSE,031,HAUTE-GARONNE,31555,TOULOUSE,privé sous contrat,130.8,...,,,,,,,,,,


In [35]:
df_master.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 39057
Data columns (total 139 columns):
 #    Column                         Dtype  
---   ------                         -----  
 0    uai                            object 
 1    nom_etablissment               object 
 2    type_etablissement             object 
 3    academie                       object 
 4    code_departement               object 
 5    departement                    object 
 6    code_insee_commune             object 
 7    commune                        object 
 8    secteur                        object 
 9    ips                            float64
 10   dnb_nombre_d_inscrits_2007     float64
 11   rentree_scolaire               object 
 12   dnb_nombre_de_presents_2007    float64
 13   dnb_nombre_de_presents_2007    float64
 14   dnb_taux_de_reussite_2007      float64
 15   dnb_taux_de_sans_mention_2007  float64
 16   dnb_taux_de_mention_ab_2007    float64
 17   dnb_taux_de_mention_b_2007   

##### Merging geolocalisation dataset

In [36]:
df_raw_geolocalisation = df_raw_geolocalisation.rename(columns={'numero_uai': 'uai'})

In [37]:
df_master = df_master.join(df_raw_geolocalisation.set_index('uai'), on='uai', how='left', rsuffix='right')
df_master.sample(5)

Unnamed: 0,uai,nom_etablissment,type_etablissement,academie,code_departement,departement,code_insee_commune,commune,secteur,ips,...,code_commune,libelle_departement,libelle_region,libelle_academie,position,secteur_prive_code_type_contrat,secteur_prive_libelle_type_contrat,code_ministere,libelle_ministere,date_ouverture
34736,0380472G,ECOLE ELEMENTAIRE CLAUDE COHEN TANNOUDJI,ECOLE,GRENOBLE,38,ISERE,38162,FAVERGES DE LA TOUR,public,109.4,...,38162,Isère,Auvergne-Rhône-Alpes,Grenoble,"{'lon': 5.524979409610382, 'lat': 45.590617154...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1965-07-28
34845,0390335C,ECOLE PRIMAIRE CONCORDIA,ECOLE,BESANCON,39,JURA,39451,RANCHOT,public,107.7,...,39451,Jura,Bourgogne-Franche-Comté,Besançon,"{'lon': 5.7225296313507386, 'lat': 47.14792884...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1966-10-10
37921,0810166R,GROUPE SCOLAIRE PASTEUR,ECOLE,TOULOUSE,81,TARN,81120,LABRUGUIERE,public,94.9,...,81120,Tarn,Occitanie,Toulouse,"{'lon': 2.263699409960678, 'lat': 43.535727857...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1968-12-15
33991,0280769J,ECOLE PRIMAIRE THEREZIA ET ROGER BRETON,ECOLE,ORLEANS-TOURS,28,EURE-ET-LOIR,28064,BU,public,122.0,...,28064,Eure-et-Loir,Centre-Val de Loire,Orléans-Tours,"{'lon': 1.493489574004492, 'lat': 48.795506035...",99.0,SANS OBJET,6.0,MINISTERE DE L'EDUCATION NATIONALE,1970-06-04
32983,0080876J,ECOLE PRIMAIRE PRIVEE SAINTE ANNE,ECOLE,REIMS,8,ARDENNES,8105,CHARLEVILLE MEZIERES,privé sous contrat,124.6,...,8105,Ardennes,Grand Est,Reims,"{'lon': 4.722910249563768, 'lat': 49.772970799...",30.0,CONTRAT D'ASSOCIATION TOUTES CLASSES,6.0,MINISTERE DE L'EDUCATION NATIONALE,1971-03-24


In [38]:
# Removing joined columns not bringing any or enough value
df_master.drop(columns=['appellation_officielle', 'denomination_principale', 'patronyme_uai', 'secteur_public_prive_libe', 'adresse_uai',
                     'boite_postale_uai', 'localite_acheminement_uai', 'libelle_commune', 'localisation', 'nature_uai_libe',
                     'etat_etablissement', 'etat_etablissement_libe', 'code_departementright', 'code_commune', 'libelle_departement', 'libelle_academie', 
                     'secteur_prive_code_type_contrat', 'secteur_prive_libelle_type_contrat', 'code_ministere', 'libelle_ministere', 'nature_uai', 'lieu_dit_uai'], 
        errors='ignore', 
        inplace=True)

In [39]:
# Moving columns
code_academie = df_master.pop('code_academie')
df_master.insert(3, 'code_academie', code_academie)
del code_academie

code_region = df_master.pop('code_region')
df_master.insert(8, 'code_region', code_region)
del code_region

libelle_region = df_master.pop('libelle_region')
df_master.insert(9, 'libelle_region', libelle_region)
del libelle_region

df_master.sample(5)

Unnamed: 0,uai,nom_etablissment,type_etablissement,code_academie,academie,code_departement,departement,code_insee_commune,code_region,libelle_region,...,dnb_taux_de_mention_tb_2021,code_postal_uai,coordonnee_x,coordonnee_y,epsg,latitude,longitude,appariement,position,date_ouverture
16180,0490569C,ECOLE PRIMAIRE PUBLIQUE VAL DE SUINE,ECOLE,17.0,NANTES,049,MAINE-ET-LOIRE,49330,52.0,Pays de la Loire,...,,49330.0,429123.2,6730721.9,EPSG:2154,47.62118,-0.60753,Parfaite,"{'lon': -0.607529639166077, 'lat': 47.62117984...",1965-10-15
32368,7200129C,ECOLE PRIMAIRE RIVENTOSA,ECOLE,27.0,CORSE,02B,HAUTE-CORSE,2B260,94.0,Corse,...,,20250.0,1210562.6,6147956.2,EPSG:2154,42.251781,9.18239,Moyenne,"{'lon': 9.182390444982547, 'lat': 42.251780610...",1994-09-08
10944,0351172J,ECOLE PRIMAIRE PRIVEE ST JOSEPH,ECOLE,14.0,RENNES,035,ILLE-ET-VILAINE,35013,53.0,Bretagne,...,,35600.0,320170.6,6745748.7,EPSG:2154,47.702388,-2.067981,Parfaite,"{'lon': -2.06798054023334, 'lat': 47.702388209...",1971-03-18
13721,0061047F,ECOLE PRIMAIRE PRIVEE SAINT JOSEPH CARNOLES,ECOLE,23.0,NICE,006,ALPES-MARITIMES,06104,93.0,Provence-Alpes-Côte d'Azur,...,,6190.0,1060838.8,6306503.2,EPSG:2154,43.76534,7.48285,Parfaite,"{'lon': 7.482850259648129, 'lat': 43.765339810...",1971-03-17
17777,0691335B,ECOLE PRIMAIRE MARCEL PAGNOL,ECOLE,10.0,LYON,069,RHONE,69066,84.0,Auvergne-Rhône-Alpes,...,,69470.0,803614.7,6558620.6,EPSG:2154,46.119462,4.34179,Correcte,"{'lon': 4.3417904435402015, 'lat': 46.11946248...",1966-12-09


In [40]:
df_master = df_master.rename(columns={'libelle_region': 'region'})

In [41]:
df_master.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 39057
Data columns (total 151 columns):
 #    Column                         Dtype  
---   ------                         -----  
 0    uai                            object 
 1    nom_etablissment               object 
 2    type_etablissement             object 
 3    code_academie                  float64
 4    academie                       object 
 5    code_departement               object 
 6    departement                    object 
 7    code_insee_commune             object 
 8    code_region                    float64
 9    region                         object 
 10   commune                        object 
 11   secteur                        object 
 12   ips                            float64
 13   dnb_nombre_d_inscrits_2007     float64
 14   rentree_scolaire               object 
 15   dnb_nombre_de_presents_2007    float64
 16   dnb_nombre_de_presents_2007    float64
 17   dnb_taux_de_reussite_2007    

##### Merging Niveau_de_vie_2013_a_la_commune dataset

In [42]:
df_raw_revenus_par_commune = df_raw_revenus_par_commune.rename(columns={'Code Commune': 'code_insee_commune'})

In [43]:
df_master = df_master.join(df_raw_revenus_par_commune.set_index('code_insee_commune'), on='code_insee_commune', how='left', rsuffix='right')

In [44]:
# Removing joined columns not bringing any or enough value
df_master.drop(columns=['Nom Commune'], 
        errors='ignore', 
        inplace=True)

In [45]:
# Renaming columns
df_master.rename(columns = {'Niveau de vie Commune':'niveau_de_vie_commune', 
                            'Niveau de vie Département':'niveau_de_vie_departement'
                            }, inplace = True)

In [46]:
df_master.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 0 to 39057
Data columns (total 153 columns):
 #    Column                         Dtype  
---   ------                         -----  
 0    uai                            object 
 1    nom_etablissment               object 
 2    type_etablissement             object 
 3    code_academie                  float64
 4    academie                       object 
 5    code_departement               object 
 6    departement                    object 
 7    code_insee_commune             object 
 8    code_region                    float64
 9    region                         object 
 10   commune                        object 
 11   secteur                        object 
 12   ips                            float64
 13   dnb_nombre_d_inscrits_2007     float64
 14   rentree_scolaire               object 
 15   dnb_nombre_de_presents_2007    float64
 16   dnb_nombre_de_presents_2007    float64
 17   dnb_taux_de_reussite_2007    

##### Merging georef-france-commune dataset

##### typo_rur

In [47]:
df_raw_typo_rur = df_raw_typo_rur.rename(columns={'codgeo': 'code_insee_commune'})

In [48]:
df_master = df_master.join(df_raw_typo_rur.set_index('code_insee_commune'), on='code_insee_commune', how='left', rsuffix='right')

In [49]:
df_master['zonage_rur_lib'].isnull().value_counts()

False    37860
True      1198
Name: zonage_rur_lib, dtype: int64

In [50]:
df_master[df_master['zonage_rur_lib'].isnull()]['commune'].value_counts()

PARIS 20E ARRONDISSEMENT        65
PARIS 19E ARRONDISSEMENT        62
PARIS 18E ARRONDISSEMENT        58
PARIS 13E ARRONDISSEMENT        57
PARIS 15E ARRONDISSEMENT        50
PARIS 17E ARRONDISSEMENT        45
PARIS 16E ARRONDISSEMENT        43
MARSEILLE 13E ARRONDISSEMENT    40
MARSEILLE 15E ARRONDISSEMENT    39
PARIS 12E ARRONDISSEMENT        37
PARIS 11E ARRONDISSEMENT        35
MARSEILLE 9E ARRONDISSEMENT     33
PARIS 14E ARRONDISSEMENT        32
MARSEILLE 14E ARRONDISSEMENT    31
MARSEILLE 8E ARRONDISSEMENT     30
LYON 3E ARRONDISSEMENT          28
PARIS 10E ARRONDISSEMENT        26
LYON 8E ARRONDISSEMENT          26
LYON 7E ARRONDISSEMENT          25
MARSEILLE 11E ARRONDISSEMENT    25
MARSEILLE 12E ARRONDISSEMENT    25
LYON 5E ARRONDISSEMENT          23
MARSEILLE 4E ARRONDISSEMENT     23
MARSEILLE 3E ARRONDISSEMENT     22
MARSEILLE 10E ARRONDISSEMENT    21
PARIS 5E ARRONDISSEMENT         21
LYON 9E ARRONDISSEMENT          21
MARSEILLE 6E ARRONDISSEMENT     21
LYON 6E ARRONDISSEME

In [51]:
# allocating zone 3 to saint martin
df_master['zonage_rur'][df_master['code_insee_commune']=='97801'] = 3.00

# allocating zone 3 to saint barthelemy
df_master['zonage_rur'][df_master['code_insee_commune']=='97701'] = 3.00

# allocating the rest (Paris, Marseille, Lyon) to zone 6
df_master['zonage_rur'][df_master['zonage_rur'].isnull()] = 6.00

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_master['zonage_rur'][df_master['code_insee_commune']=='97801'] = 3.00
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_master['zonage_rur'][df_master['code_insee_commune']=='97701'] = 3.00
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_master['zonage_rur'][df_master['zonage_rur'].isnull()] = 6.00


In [52]:
df_master['zonage_rur_lib'] = df_master['zonage_rur'].map({1:'rural autonome très peu dense', 
                                                                       2: 'rural autonome peu dense', 
                                                                       3: 'rural sous faible influence d\'un pole', 
                                                                       4: 'rural sous forte influence d\'un pole',
                                                                       5: 'urbain densité intermédiaire',
                                                                       6: 'urbain dense'
                                                                       }
                                                                      )

In [53]:
df_master[df_master['zonage_rur_lib'].isnull()]['commune'].value_counts()

Series([], Name: commune, dtype: int64)

In [54]:
#Simplifying our classification with only two categories
df_master['zone_rur_simple']=df_master['zonage_rur'].map({1: 'rural',
                                            2: 'rural',
                                            3: 'rural',
                                            4: 'urbain',
                                            5: 'urbain',
                                            6: 'urbain'
                                            }
                                           )

### 6. Tidying up dataframe

In [55]:
# Making all strings lower case
df_master = df_master.applymap(lambda x: x.lower() if type(x) == str else x)

# renaming "privé sous contrat" in "prive"
df_master.loc[df_master["secteur"] == "privé sous contrat", "secteur"] = "prive"

# sorting
df_master.sort_values(by=['secteur', 'type_etablissement'], ascending=False, inplace=True)

In [56]:
df_master.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39058 entries, 6967 to 6962
Data columns (total 157 columns):
 #    Column                         Non-Null Count  Dtype  
---   ------                         --------------  -----  
 0    uai                            39058 non-null  object 
 1    nom_etablissment               39058 non-null  object 
 2    type_etablissement             39058 non-null  object 
 3    code_academie                  38932 non-null  float64
 4    academie                       39058 non-null  object 
 5    code_departement               39058 non-null  object 
 6    departement                    39058 non-null  object 
 7    code_insee_commune             39058 non-null  object 
 8    code_region                    38932 non-null  float64
 9    region                         38932 non-null  object 
 10   commune                        39058 non-null  object 
 11   secteur                        39058 non-null  object 
 12   ips                         

### 7. Exporting workfile data to CSV

In [57]:
file_name = "df_master" + ".csv"
print(f"file name: {file_name}")
df_master.to_csv(data_out + file_name, index = False)

file name: df_master.csv


#### data-collection notebook execution time

In [58]:
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

Execution time in seconds: 136.45366597175598
