In [2]:
import pandas as pd
import numpy as np
import glob

In [20]:
fields = ['nom_dept', 'nom_poll', 'unite', 'valeur']

In [21]:
# Auvergne-Rhone Alpes
path = 'aura' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, usecols=fields)
    li.append(df)

aura_df = pd.concat(li, axis=0, ignore_index=True, sort=True)
aura_df.sample(5)

Unnamed: 0,nom_dept,nom_poll,unite,valeur
239,Isère,Particules PM10,µg/m3,18.8
445,Loire,Monoxyde d'azote,µg/m3,13.0
60,Haute-Savoie,Ozone,µg/m3,5.0
444,Loire,Monoxyde d'azote,µg/m3,7.0
253,Rhône,Particules PM10,µg/m3,22.4


In [22]:
# Paris Ile de France
path = 'idf' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, usecols=fields)
    li.append(df)

idf_df = pd.concat(li, axis=0, ignore_index=True, sort=True)
idf_df.sample(5)

Unnamed: 0,nom_dept,nom_poll,unite,valeur
98,SEINE-ET-MARNE,NO2,ug.m-3,27.0
213,SEINE-ET-MARNE,O3,ug.m-3,43.0
228,YVELINES,O3,ug.m-3,48.0
224,YVELINES,O3,ug.m-3,43.0
4,PARIS,PM10,ug.m-3,28.2


In [23]:
# Grand est
est_df = pd.read_csv('mes_atmo_grand_est_annuel_poll_princ.csv', index_col=None, header=0, usecols=fields)
est_df.sample(5)

Unnamed: 0,nom_dept,nom_poll,valeur,unite
140,Meurthe-et-Moselle,Oxydes d'azote,50.0,µg/m3
1322,Aube,Ozone,44.0,µg/m3
1299,Aube,Particules PM10,,µg/m3
925,Vosges,Dioxyde d'azote,16.0,µg/m3
1440,Bas-Rhin,Particules PM10,,µg/m3


In [24]:
# Haut de France
hdf_df = pd.read_csv('mes_hdf_annuel_poll_princ.csv', index_col=None, header=0, usecols=fields)
hdf_df.sample(5)

Unnamed: 0,nom_dept,nom_poll,valeur,unite
596,OISE,Dioxyde d'azote,23.0,ug.m-3
368,OISE,Particules fines PM2.5,13.0,ug.m-3
190,NORD,Particules PM10,16.2,ug.m-3
30,NORD,Ozone,50.0,ug.m-3
455,NORD,Particules PM10,20.4,ug.m-3


In [25]:
# PACA
paca_df = pd.read_csv('mes_sudpaca_annuelle.csv', index_col=None, header=0, usecols=fields)
paca_df.sample(5)

Unnamed: 0,nom_dept,nom_poll,valeur,unite
341,BOUCHES-DU-RHONE,Ozone,60.9,ug.m-3
509,VAUCLUSE,Dioxyde d'azote,19.0,ug.m-3
649,HAUTES-ALPES,Oxydes d'azote,32.3,ug.m-3
704,ALPES-MARITIMES,Particules PM10,,ug.m-3
316,BOUCHES-DU-RHONE,Particules PM10,,ug.m-3


In [49]:
# Super merging
merged_df = pd.concat([aura_df, idf_df, est_df, hdf_df, paca_df], axis=0, ignore_index=True, sort=True)

# We need to clean the polluant name here:
pol_cleaner = {"Dioxyde d'azote": "NO2", "Monoxyde d'azote": "NO", "Oxydes d'azote": "NO2", "Ozone": "O3", \
              "Particules PM10": "PM10", "Particules PM2,5": "PM2.5", "Particules fines PM2,5": "PM2.5", "Particules fines PM2.5": "PM2.5"}

merged_df = merged_df.replace({"nom_poll": pol_cleaner})


# Aggregate the values to have a global average over the years:
agg_df = merged_df.groupby(['nom_dept', 'nom_poll'],as_index=False).agg({'valeur': 'mean'})

# Pivot to extract the features of pollution
FR_pol_df = agg_df.pivot(index='nom_dept', columns='nom_poll', values='valeur')
FR_pol_df = FR_pol_df[['NO2', 'O3', 'PM10', 'PM2.5']]
FR_pol_df.dropna()

nom_poll,NO2,O3,PM10,PM2.5
nom_dept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AISNE,20.714286,48.75,17.6,14.0
ALPES-DE-HAUTE-PROVENCE,13.675,73.27,11.08,7.42
ALPES-MARITIMES,40.071429,62.155882,24.973913,13.383333
Allier,11.5875,56.233333,12.166667,7.8
Ardennes,12.945946,50.571429,18.789474,9.5
BOUCHES-DU-RHONE,36.666364,61.508696,22.393617,12.245455
Bas-Rhin,36.015873,50.64,22.285714,14.636364
Drôme,11.8125,58.2,16.775,11.416667
ESSONNE,27.433333,53.08,14.62,8.48
HAUTES-ALPES,40.683333,51.94,18.9,12.37


In [43]:
len(FR_pol_df)

42