# Traitement des données avec pandas

- types **NumPy**
- renommer des colonnes : `df.rename(columns={...})`
- trouver/supprimer les données dupliquées : `df.duplicated()` / `df.drop_duplicates()`
- trouver les NaN : `df.isna()` / `df.notna()` / `df.dropna()`
- travail sur les chaînes : `series.str.extract()`, `series.str.contains()`, `series.str.get_dummies()`
- mapping : `series.map()`
- changer le type d'une série (cast) : `df.astype(type)` / `pd.to_numeric()` / `pd.to_datetime()`
- remplacer n'importe quelle valeur : `df.replace({...})`
- remplacer les NaN : `df.fillna()`, `series.combine_first()`

In [1]:
# imports
import re

import numpy as np
import pandas as pd

#### Chargement et analyse des données

In [2]:
# IPython help syntax: show the documentation of pd.read_csv
pd.read_csv?

In [3]:
# Read the CSV once and keep a pristine copy (df0) next to the working frame (df)
df0 = pd.read_csv('people.csv')
df = df0.copy()
df

Unnamed: 0,id,first_name,last_name,email address,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,Female,63,$55.18,18.284100,49.632552,0136319724,2019/04/16,False,1.559566e+09,"Palkovice, Czech Republic",entrée/plat/dessert
1,27570,Ruthi,Ross,rross3w@sohu.com,Female,57,$20.37,19.230220,50.466575,,2018/10/23,False,1.567165e+09,"Siewierz, Poland",entrée/plat/dessert
2,27572,Silas,Stourton,silas.stourton3y@answers.com,Male,22,"€32,99",118.831081,24.984813,,2018/12/30,False,1.553692e+09,"Longbo, China",entrée/plat/dessert
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64,"£98,93",121.648987,-8.844744,,2019/03/12,False,1.552349e+09,"Potulando, Indonesia",entrée/plat
4,27558,Margaux,Gowanson,nope@thankyou.,Female,54,$13.30,14.772557,45.160472,,2018/08/13,False,1.543383e+09,"Bribir, Croatia",entrée/plat/dessert
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,27465,Adelind,Christin,achristinz@blogs.com,Female,30,"€81,84",41.427853,52.415968,,2019/06/04,False,1.559606e+09,"Znamenka, Russia",entrée/plat/dessert
205,27476,Petronella,Pickance,ppickance1a@uiuc.edu,Female,28,€37.15,132.702111,33.762296,,2017/11/22,False,1.532870e+09,"Iyo, Japan",entrée/plat/dessert
206,27521,Moritz,Issacof,missacof2j@wired.com,Male,21,$91.97,72.311099,31.263396,0298949280,2018/08/20,False,1.549817e+09,"Jhang Sadr, Pakistan",entrée/plat/dessert
207,27441,Quintana,Foulstone,qfoulstoneb@newsvine.com,F,32,"€55,64",111.138442,-6.783653,0734595126,2018/10/25,False,1.540426e+09,"Mantingantengah, Indonesia",entrée/plat/dessert


In [4]:
# Summary of the frame: index range, per-column non-null counts, dtypes, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             209 non-null    int64  
 1   first_name     207 non-null    object 
 2   last_name      207 non-null    object 
 3   email address  203 non-null    object 
 4   gender         207 non-null    object 
 5   age            207 non-null    object 
 6   money          190 non-null    object 
 7   lon            207 non-null    float64
 8   lat            207 non-null    float64
 9   phone          83 non-null     object 
 10  registration   207 non-null    object 
 11  inactive       207 non-null    object 
 12  last_seen      190 non-null    float64
 13  address        207 non-null    object 
 14  preference     207 non-null    object 
dtypes: float64(3), int64(1), object(11)
memory usage: 24.6+ KB


**Attention** : par défaut, **pandas** choisit des types numériques sur 64 bits (`int64`, `float64`), c'est-à-dire les plus gourmands en mémoire.

In [5]:
# Print the representable range of each signed NumPy integer width
for bits in (8, 16, 32, 64):
    print(np.iinfo(f"int{bits}"))

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------



In [6]:
# Print the representable range of each unsigned NumPy integer width
for bits in (8, 16, 32, 64):
    print(np.iinfo(f"uint{bits}"))

Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------

Machine parameters for uint16
---------------------------------------------------------------
min = 0
max = 65535
---------------------------------------------------------------

Machine parameters for uint32
---------------------------------------------------------------
min = 0
max = 4294967295
---------------------------------------------------------------

Machine parameters for uint64
---------------------------------------------------------------
min = 0
max = 18446744073709551615
---------------------------------------------------------------



In [7]:
# Print precision and range information for each NumPy float width
for bits in (16, 32, 64):
    print(np.finfo(f"float{bits}"))

Machine parameters for float16
---------------------------------------------------------------
precision =   3   resolution = 1.00040e-03
machep =    -10   eps =        9.76562e-04
negep =     -11   epsneg =     4.88281e-04
minexp =    -14   tiny =       6.10352e-05
maxexp =     16   max =        6.55040e+04
nexp =        5   min =        -max
---------------------------------------------------------------

Machine parameters for float32
---------------------------------------------------------------
precision =   6   resolution = 1.0000000e-06
machep =    -23   eps =        1.1920929e-07
negep =     -24   epsneg =     5.9604645e-08
minexp =   -126   tiny =       1.1754944e-38
maxexp =    128   max =        3.4028235e+38
nexp =        8   min =        -max
---------------------------------------------------------------

Machine parameters for float64
---------------------------------------------------------------
precision =  15   resolution = 1.0000000000000001e-15
machep =    -52   e

In [8]:
# Reload the file with narrower numeric dtypes to shrink the memory footprint.
# NOTE(review): float16 keeps ~3 significant digits and float32 ~6 (see the
# np.finfo output above), so lon/lat and the epoch seconds in last_seen are
# rounded on load -- confirm this loss is acceptable before reusing df.
df = pd.read_csv('people.csv', dtype={"id":"uint16", "lon":"float16", "lat":"float16", "last_seen":"float32"})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             209 non-null    uint16 
 1   first_name     207 non-null    object 
 2   last_name      207 non-null    object 
 3   email address  203 non-null    object 
 4   gender         207 non-null    object 
 5   age            207 non-null    object 
 6   money          190 non-null    object 
 7   lon            207 non-null    float16
 8   lat            207 non-null    float16
 9   phone          83 non-null     object 
 10  registration   207 non-null    object 
 11  inactive       207 non-null    object 
 12  last_seen      190 non-null    float32
 13  address        207 non-null    object 
 14  preference     207 non-null    object 
dtypes: float16(2), float32(1), object(11), uint16(1)
memory usage: 20.1+ KB


In [10]:
# uint8 arithmetic silently wraps modulo 256 (e.g. 7**3 = 343 is stored as 87)
s = pd.Series(range(10), dtype=np.uint8)
s.pow(3)

0      0
1      1
2      8
3     27
4     64
5    125
6    216
7     87
8      0
9    217
dtype: uint8

In [11]:
# Memory footprint (bytes) of the four numeric columns with pandas' default dtypes
df1 = pd.read_csv('people.csv', usecols=["id", "lon", "lat", "last_seen"])
df1.memory_usage(deep=True).sum()

6816

In [12]:
# Same four columns, loaded with narrower dtypes to compare the footprint
df1 = pd.read_csv(
    'people.csv',
    usecols=["id", "lon", "lat", "last_seen"],
    dtype={
        "id": "uint16",
        "lon": "float16",
        "lat": "float16",
        "last_seen": "float32",
    },
)
df1.memory_usage(deep=True).sum()

2218

In [None]:
# speeds up loading
# e.g. dtype={"id":str}

#### Renommage de la colonne 'email address'

In [13]:
# Give the 'email address' column a shorter, space-free name
df = df.rename({'email address': 'email'}, axis='columns')
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,Female,63,$55.18,18.28125,49.625,136319724.0,2019/04/16,False,1559566000.0,"Palkovice, Czech Republic",entrée/plat/dessert
1,27570,Ruthi,Ross,rross3w@sohu.com,Female,57,$20.37,19.234375,50.46875,,2018/10/23,False,1567165000.0,"Siewierz, Poland",entrée/plat/dessert
2,27572,Silas,Stourton,silas.stourton3y@answers.com,Male,22,"€32,99",118.8125,24.984375,,2018/12/30,False,1553692000.0,"Longbo, China",entrée/plat/dessert
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64,"£98,93",121.625,-8.84375,,2019/03/12,False,1552349000.0,"Potulando, Indonesia",entrée/plat
4,27558,Margaux,Gowanson,nope@thankyou.,Female,54,$13.30,14.773438,45.15625,,2018/08/13,False,1543383000.0,"Bribir, Croatia",entrée/plat/dessert


#### Suppression des lignes dupliquées

- `duplicated()` : `True` ou `False` selon si une ligne est dupliquée
- `drop_duplicates()` : suppression des lignes dupliquées

In [14]:
# Count rows flagged as a repeat of an earlier row (True) versus the others (False)
df.duplicated().value_counts()

False    205
True       4
dtype: int64

In [18]:
# IPython help syntax: show the documentation of DataFrame.duplicated
df.duplicated?

In [15]:
# keep=False flags every occurrence of a duplicated row, not only the repeats;
# sorting by id puts the identical rows next to each other
df.loc[df.duplicated(keep=False)].sort_values('id')

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference
51,27445,Gardener,Brosi,gbrosif@flickr.com,M,47,€35.62,-39.0625,-11.257812,321573819.0,2017/03/30,False,1557537000.0,"Araci, Brazil",entrée/plat/dessert
148,27445,Gardener,Brosi,gbrosif@flickr.com,M,47,€35.62,-39.0625,-11.257812,321573819.0,2017/03/30,False,1557537000.0,"Araci, Brazil",entrée/plat/dessert
140,27472,Kennedy,Matzl,kmatzl16@prweb.com,Male,24,€66.59,118.6875,27.453125,,2017/03/14,False,1527821000.0,"Xinpu, China",entrée/plat/dessert
202,27472,Kennedy,Matzl,kmatzl16@prweb.com,Male,24,€66.59,118.6875,27.453125,,2017/03/14,False,1527821000.0,"Xinpu, China",entrée/plat/dessert
5,27497,Gordie,Bodicum,gbodicum1v@apache.org,Male,60,$47.26,13.976562,49.78125,146943857.0,2017/04/29,False,1545810000.0,"Jince, Czech Republic",entrée/plat/dessert
79,27497,Gordie,Bodicum,gbodicum1v@apache.org,Male,60,$47.26,13.976562,49.78125,146943857.0,2017/04/29,False,1545810000.0,"Jince, Czech Republic",entrée/plat/dessert
57,27547,Vasily,Moggach,vmoggach39@g.co,Male,39,"€47,01",20.25,50.75,133026726.0,2018/01/07,False,1538875000.0,"Złotniki, Poland",entrée/plat/dessert
93,27547,Vasily,Moggach,vmoggach39@g.co,Male,39,"€47,01",20.25,50.75,133026726.0,2018/01/07,False,1538875000.0,"Złotniki, Poland",entrée/plat/dessert


In [16]:
# Remove the duplicated rows, then count the rows that remain
df = df.drop_duplicates()
len(df)

205

In [17]:
# IPython help syntax: show the documentation of DataFrame.drop_duplicates
df.drop_duplicates?

#### Analyse des données manquantes

`numpy.nan` est utilisé dans **pandas** pour représenter des valeurs manquantes.

In [19]:
# NaN stands for "Not a Number"
np.nan

nan

In [20]:
# NaN is a regular Python float
type(np.nan)

float

In [21]:
# NaN is absorbing: arithmetic involving NaN yields NaN
np.nan + 1

nan

In [22]:
# NaN is absorbing: functions applied to NaN return NaN as well
np.sqrt(np.nan)

nan

In [23]:
# NaN never compares equal, not even to itself
np.nan == np.nan

False

In [24]:
# Ordering comparisons involving NaN evaluate to False
np.nan > 1

False

In [25]:
# By the way: NumPy infinities.
# np.NINF was removed in NumPy 2.0; -np.inf is the portable spelling.
-np.inf, np.inf

(-inf, inf)

In [26]:
# inf is larger than a huge finite float
np.inf > 1e100

True

In [27]:
# Adding a finite value to inf leaves it unchanged
np.inf + 1e100

inf

In [28]:
# Unlike NaN, inf compares equal to itself
np.inf == np.inf

True

In [29]:
# inf + (-inf) is undefined and yields NaN.
# np.NINF was removed in NumPy 2.0; -np.inf is the portable spelling.
np.inf + (-np.inf)

nan

In [30]:
# inf + inf is still inf, hence not strictly greater than inf
np.inf + np.inf > np.inf

False

#### Tests sur les données manquantes

- `isna()` ou `isnull()`
- `notna()` ou `notnull()`

In [31]:
# Does not work: == never matches NaN, so this selection comes back empty
df.loc[df['first_name']==np.nan]

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference


In [32]:
# Select the rows whose first_name is NaN
df.loc[df['first_name'].isna()]

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference
96,27565,,,,,,,,,,,,,,
200,27630,,,,,,,,,,,,,,


In [33]:
# Element-wise NaN test over the whole DataFrame
df.isna()

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
205,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
206,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
207,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [34]:
# Select every row that contains at least one NaN
df.loc[df.isna().any(axis=1)]

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference
1,27570,Ruthi,Ross,rross3w@sohu.com,Female,57,$20.37,19.234375,50.468750,,2018/10/23,False,1.567165e+09,"Siewierz, Poland",entrée/plat/dessert
2,27572,Silas,Stourton,silas.stourton3y@answers.com,Male,22,"€32,99",118.812500,24.984375,,2018/12/30,False,1.553692e+09,"Longbo, China",entrée/plat/dessert
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64,"£98,93",121.625000,-8.843750,,2019/03/12,False,1.552349e+09,"Potulando, Indonesia",entrée/plat
4,27558,Margaux,Gowanson,nope@thankyou.,Female,54,$13.30,14.773438,45.156250,,2018/08/13,False,1.543383e+09,"Bribir, Croatia",entrée/plat/dessert
6,27539,Dulcine,Stopforth,dstopforth31@forbes.com,Female,22,€12.60,-35.593750,-6.253906,,2017/12/13,False,1.547567e+09,"Serrinha, Brazil",entrée/plat/dessert
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,27630,,,,,,,,,,,,,,
201,27519,Leonora,Guild,lguild2h@census.gov,Female,54,€54.57,122.187500,7.550781,,2018/12/17,False,1.545005e+09,"Siraway, Philippines",entrée/plat/dessert
203,27612,Torrance,Sivills,tsivills52@mapquest.com,Male,24,$36.43,-91.125000,15.023438,,2018/04/03,False,1.549641e+09,"Santa Cruz del Quiché, Guatemala",entrée/plat/dessert
204,27465,Adelind,Christin,achristinz@blogs.com,Female,30,"€81,84",41.437500,52.406250,,2019/06/04,False,1.559606e+09,"Znamenka, Russia",entrée/plat/dessert


In [35]:
# Rows without any NaN (the result is only displayed; df is not reassigned here)
df.dropna()

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,Female,63,$55.18,18.281250,49.625000,0136319724,2019/04/16,False,1.559566e+09,"Palkovice, Czech Republic",entrée/plat/dessert
5,27497,Gordie,Bodicum,gbodicum1v@apache.org,Male,60,$47.26,13.976562,49.781250,0146943857,2017/04/29,False,1.545810e+09,"Jince, Czech Republic",entrée/plat/dessert
15,27619,Shelley,Skeemor,sskeemor59@princeton.edu,Female,44,$41.50,111.875000,-7.585938,0547556034,2019/01/24,False,1.556976e+09,"Guyangan, Indonesia",entrée/plat/dessert
20,27462,Graig,Brownsea,gbrownseaw@arstechnica.com,M,54,$80.23,110.375000,-7.214844,0262855559,2018/05/22,False,1.569677e+09,"Sidomukti, Indonesia",entrée/plat/dessert
21,27466,Baron,Spurrett,bspurrett10@psu.edu,Male,36,$57.56,16.578125,-18.359375,06etcetera,2018/07/20,False,1.549132e+09,"Omuthiya, Namibia",entrée/plat/dessert
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,27607,Mel,Pelz,mpelz4x@cbslocal.com,Male,57,€22.55,28.671875,7.265625,01.75.20.82.23,2018/10/16,False,1.563696e+09,"Tonj, South Sudan",entrée/plat/dessert
198,27624,Dur,Hallaways,dhallaways5e@delicious.com,Male,25,"€69,16",106.750000,28.734375,0317009357,2019/07/27,False,1.569215e+09,"Ganshui, China",entrée/plat/dessert
206,27521,Moritz,Issacof,missacof2j@wired.com,Male,21,$91.97,72.312500,31.265625,0298949280,2018/08/20,False,1.549817e+09,"Jhang Sadr, Pakistan",entrée/plat/dessert
207,27441,Quintana,Foulstone,qfoulstoneb@newsvine.com,F,32,"€55,64",111.125000,-6.785156,0734595126,2018/10/25,False,1.540426e+09,"Mantingantengah, Indonesia",entrée/plat/dessert


In [36]:
# Drop only the rows whose first_name is NaN, then count the remaining rows
df = df.dropna(subset=['first_name'])
len(df)

203

#### Ajout d'une colonne 'full_name'

In [37]:
# full_name = first_name, a single space, then last_name
df['full_name'] = df['first_name'].str.cat(df['last_name'], sep=' ')
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,Female,63,$55.18,18.28125,49.625,136319724.0,2019/04/16,False,1559566000.0,"Palkovice, Czech Republic",entrée/plat/dessert,Leandra Pabelik
1,27570,Ruthi,Ross,rross3w@sohu.com,Female,57,$20.37,19.234375,50.46875,,2018/10/23,False,1567165000.0,"Siewierz, Poland",entrée/plat/dessert,Ruthi Ross
2,27572,Silas,Stourton,silas.stourton3y@answers.com,Male,22,"€32,99",118.8125,24.984375,,2018/12/30,False,1553692000.0,"Longbo, China",entrée/plat/dessert,Silas Stourton
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64,"£98,93",121.625,-8.84375,,2019/03/12,False,1552349000.0,"Potulando, Indonesia",entrée/plat,Roxine Pettecrew
4,27558,Margaux,Gowanson,nope@thankyou.,Female,54,$13.30,14.773438,45.15625,,2018/08/13,False,1543383000.0,"Bribir, Croatia",entrée/plat/dessert,Margaux Gowanson


#### Analyse de la colonne 'address'

In [38]:
# Frequency of each distinct address
df['address'].value_counts()

Turka, Ukraine          3
Nova Odesa, Ukraine     2
Baroh, Indonesia        1
Salam, Indonesia        1
Potulando, Indonesia    1
                       ..
Chico, United States    1
Azogues, Ecuador        1
Ganshui, China          1
Sanjie, China           1
Yinchuan, China         1
Name: address, Length: 200, dtype: int64

#### Ajout des colonnes 'city' et 'country'

In [39]:
# Derive city and country from address: the two regex capture groups around
# the ", " separator become the two new columns
df[['city', 'country']] = df['address'].str.extract('(.*), (.*)')
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name,city,country
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,Female,63,$55.18,18.28125,49.625,136319724.0,2019/04/16,False,1559566000.0,"Palkovice, Czech Republic",entrée/plat/dessert,Leandra Pabelik,Palkovice,Czech Republic
1,27570,Ruthi,Ross,rross3w@sohu.com,Female,57,$20.37,19.234375,50.46875,,2018/10/23,False,1567165000.0,"Siewierz, Poland",entrée/plat/dessert,Ruthi Ross,Siewierz,Poland
2,27572,Silas,Stourton,silas.stourton3y@answers.com,Male,22,"€32,99",118.8125,24.984375,,2018/12/30,False,1553692000.0,"Longbo, China",entrée/plat/dessert,Silas Stourton,Longbo,China
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64,"£98,93",121.625,-8.84375,,2019/03/12,False,1552349000.0,"Potulando, Indonesia",entrée/plat,Roxine Pettecrew,Potulando,Indonesia
4,27558,Margaux,Gowanson,nope@thankyou.,Female,54,$13.30,14.773438,45.15625,,2018/08/13,False,1543383000.0,"Bribir, Croatia",entrée/plat/dessert,Margaux Gowanson,Bribir,Croatia


In [41]:
# Regex with named capture groups: the group names become the column names
df['address'].str.extract('(?P<city>.*), (?P<country>.*)')

Unnamed: 0,city,country
0,Palkovice,Czech Republic
1,Siewierz,Poland
2,Longbo,China
3,Potulando,Indonesia
4,Bribir,Croatia
...,...,...
204,Znamenka,Russia
205,Iyo,Japan
206,Jhang Sadr,Pakistan
207,Mantingantengah,Indonesia


#### Analyse des données

In [42]:
# nunique: number of distinct values per column
df.nunique()

id              201
first_name      195
last_name       201
email           197
gender            4
age              54
money           184
lon             188
lat             191
phone            80
registration    183
inactive          1
last_seen       184
address         200
preference        5
full_name       201
city            200
country          60
dtype: int64

In [43]:
# Number of non-missing values per column
df.notna().sum()

id              203
first_name      203
last_name       203
email           199
gender          203
age             203
money           186
lon             203
lat             203
phone            80
registration    203
inactive        203
last_seen       186
address         203
preference      203
full_name       203
city            203
country         203
dtype: int64

#### Mapping du genre

In [44]:
# Distinct gender labels present in the data
df['gender'].unique()

array(['Female', 'Male', 'F', 'M'], dtype=object)

In [45]:
# Frequency of each gender label
df['gender'].value_counts()

Male      89
Female    79
M         22
F         13
Name: gender, dtype: int64

In [46]:
# Normalise gender to 'F' / 'M'; labels already in short form map to themselves
mapping = dict(Female='F', Male='M', F='F', M='M')
df['gender'] = df['gender'].map(mapping)
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name,city,country
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,F,63,$55.18,18.28125,49.625,136319724.0,2019/04/16,False,1559566000.0,"Palkovice, Czech Republic",entrée/plat/dessert,Leandra Pabelik,Palkovice,Czech Republic
1,27570,Ruthi,Ross,rross3w@sohu.com,F,57,$20.37,19.234375,50.46875,,2018/10/23,False,1567165000.0,"Siewierz, Poland",entrée/plat/dessert,Ruthi Ross,Siewierz,Poland
2,27572,Silas,Stourton,silas.stourton3y@answers.com,M,22,"€32,99",118.8125,24.984375,,2018/12/30,False,1553692000.0,"Longbo, China",entrée/plat/dessert,Silas Stourton,Longbo,China
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64,"£98,93",121.625,-8.84375,,2019/03/12,False,1552349000.0,"Potulando, Indonesia",entrée/plat,Roxine Pettecrew,Potulando,Indonesia
4,27558,Margaux,Gowanson,nope@thankyou.,F,54,$13.30,14.773438,45.15625,,2018/08/13,False,1543383000.0,"Bribir, Croatia",entrée/plat/dessert,Margaux Gowanson,Bribir,Croatia


In [47]:
# Incomplete dictionary: map() turns every label absent from it into NaN
mapping0 = {'Female': 'F', 'Male': 'M'}
s = df0['gender'].map(mapping0)
# non-null counts before and after the mapping
df0['gender'].count(), s.count()

(207, 171)

In [48]:
# Frequencies after the incomplete mapping, NaN included
s.value_counts(dropna=False)

M      92
F      79
NaN    38
Name: gender, dtype: int64

In [49]:
# With an incomplete dictionary: fall back to the original label when it has no entry
df0['gender'].map(lambda label: mapping0.get(label, label))

0      F
1      F
2      M
3      F
4      F
      ..
204    F
205    F
206    M
207    F
208    M
Name: gender, Length: 209, dtype: object

In [50]:
# Gender clean-up: map() also accepts a Series (index = old label, value = new label)
mapping = pd.Series(['F', 'M', 'F', 'M'], index=['Female', 'Male', 'F', 'M'])
mapping

Female    F
Male      M
F         F
M         M
dtype: object

In [51]:
# Gender clean-up, this time mapping through the Series
df['gender'] = df['gender'].map(mapping)

In [52]:
# Final distribution of the cleaned gender column
df['gender'].value_counts()

M    111
F     92
Name: gender, dtype: int64

#### Analyse du genre

In [53]:
# Number of rows of the raw data (df0) whose gender is NaN
len(df0.loc[df0['gender'].isna()])

2

In [54]:
# Same count, obtained by summing the boolean mask
df0['gender'].isna().sum()

2

In [55]:
# Number of distinct first names among the rows with a NaN gender
df0.loc[df0['gender'].isna(), 'first_name'].nunique()

0

In [56]:
# The first names themselves: unique() keeps NaN in its result
df0.loc[df0['gender'].isna(), 'first_name'].unique()

array([nan], dtype=object)

Compléter le genre :

1. Autocomplétion avec le fichier people.csv (mais très peu de cas)
2. Gender API : https://gender-api.com/fr (simple mais API payante si gros volumes + de 500/mois)
3. US SSA baby names : https://www.ssa.gov/oact/babynames/limits.html ("gratuit", stats à produire, éventuellement affiner par année de naissance)

#### Analyse de l'âge

`pandas.Series.astype()` : types

`pandas.to_numeric()` : data avec gestion des erreurs

In [58]:
# Current dtype of each column
df.dtypes

id               uint16
first_name       object
last_name        object
email            object
gender           object
age              object
money            object
lon             float16
lat             float16
phone            object
registration     object
inactive         object
last_seen       float32
address          object
preference       object
full_name        object
city             object
country          object
dtype: object

In [59]:
# '[^0-9]' matches the first character that is not a digit
# (re is imported with the other modules in the imports cell at the top)
re.search('[^0-9]', 'toto')

<re.Match object; span=(0, 1), match='t'>

In [62]:
# \D is shorthand for "any non-digit"; the raw string keeps the backslash
# without relying on an invalid escape sequence in a normal string literal
re.search(r'\D', 'toto')

<re.Match object; span=(0, 1), match='t'>

In [61]:
# All maximal runs of non-digit characters
re.findall('[^0-9]+', 'toto')

['toto']

In [63]:
# Age analysis: list the values containing anything other than digits or a dot.
# The pattern is a raw string so that '\.' is not an invalid string escape.
df.loc[df['age'].astype(str).str.contains(r'[^0-9\.]'), 'age'].value_counts()

no       4
nope     2
sorry    2
Name: age, dtype: int64

In [66]:
# IPython help syntax: show the documentation of pd.to_numeric
pd.to_numeric?

In [65]:
# Without errors='coerce', to_numeric raises on the first non-numeric value.
# The error is caught and printed so that "Run All" can continue past this cell.
try:
    pd.to_numeric(df['age'])
except ValueError as exc:
    print(f"ValueError: {exc}")

ValueError: Unable to parse string "no" at position 19

In [67]:
# Convert age to numbers; errors='coerce' replaces unparseable values with NaN
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name,city,country
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,F,63.0,$55.18,18.281250,49.625000,0136319724,2019/04/16,False,1.559566e+09,"Palkovice, Czech Republic",entrée/plat/dessert,Leandra Pabelik,Palkovice,Czech Republic
1,27570,Ruthi,Ross,rross3w@sohu.com,F,57.0,$20.37,19.234375,50.468750,,2018/10/23,False,1.567165e+09,"Siewierz, Poland",entrée/plat/dessert,Ruthi Ross,Siewierz,Poland
2,27572,Silas,Stourton,silas.stourton3y@answers.com,M,22.0,"€32,99",118.812500,24.984375,,2018/12/30,False,1.553692e+09,"Longbo, China",entrée/plat/dessert,Silas Stourton,Longbo,China
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64.0,"£98,93",121.625000,-8.843750,,2019/03/12,False,1.552349e+09,"Potulando, Indonesia",entrée/plat,Roxine Pettecrew,Potulando,Indonesia
4,27558,Margaux,Gowanson,nope@thankyou.,F,54.0,$13.30,14.773438,45.156250,,2018/08/13,False,1.543383e+09,"Bribir, Croatia",entrée/plat/dessert,Margaux Gowanson,Bribir,Croatia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,27465,Adelind,Christin,achristinz@blogs.com,F,30.0,"€81,84",41.437500,52.406250,,2019/06/04,False,1.559606e+09,"Znamenka, Russia",entrée/plat/dessert,Adelind Christin,Znamenka,Russia
205,27476,Petronella,Pickance,ppickance1a@uiuc.edu,F,28.0,€37.15,132.750000,33.750000,,2017/11/22,False,1.532870e+09,"Iyo, Japan",entrée/plat/dessert,Petronella Pickance,Iyo,Japan
206,27521,Moritz,Issacof,missacof2j@wired.com,M,21.0,$91.97,72.312500,31.265625,0298949280,2018/08/20,False,1.549817e+09,"Jhang Sadr, Pakistan",entrée/plat/dessert,Moritz Issacof,Jhang Sadr,Pakistan
207,27441,Quintana,Foulstone,qfoulstoneb@newsvine.com,F,32.0,"€55,64",111.125000,-6.785156,0734595126,2018/10/25,False,1.540426e+09,"Mantingantengah, Indonesia",entrée/plat/dessert,Quintana Foulstone,Mantingantengah,Indonesia


In [68]:
# Distinct ages after the conversion (floats, plus NaN for the coerced values)
df["age"].unique()

array([63., 57., 22., 64., 54., 60., 47., 23., 29., 49., 65., 70., 55.,
       44., 59., 67., 56., nan, 36., 24., 30., 41., 20., 43., 52., 61.,
       35., 27., 33., 48., 42., 46., 26., 66., 39., 58., 68., 62., 37.,
       45., 38., 40., 31., 28., 21., 25., 34., 50., 32., 51., 69., 53.])

In [69]:
# Check the dtypes again after converting age
df.dtypes

id               uint16
first_name       object
last_name        object
email            object
gender           object
age             float64
money            object
lon             float16
lat             float16
phone            object
registration     object
inactive         object
last_seen       float32
address          object
preference       object
full_name        object
city             object
country          object
dtype: object

In [71]:
# Integer version of age: the NaN rows are dropped before casting to int
df["age"].dropna().astype(int)

0      63
1      57
2      22
3      64
4      54
       ..
204    30
205    28
206    21
207    32
208    34
Name: age, Length: 195, dtype: int32

#### Traitement des dates


`pandas.to_datetime()` : data, gestion des formats et des erreurs

`pandas.Series.combine_first()` : équivalent à `fillna()`

In [72]:
# The two time-related columns
cols_time = ['registration', 'last_seen']
df[cols_time]

Unnamed: 0,registration,last_seen
0,2019/04/16,1.559566e+09
1,2018/10/23,1.567165e+09
2,2018/12/30,1.553692e+09
3,2019/03/12,1.552349e+09
4,2018/08/13,1.543383e+09
...,...,...
204,2019/06/04,1.559606e+09
205,2017/11/22,1.532870e+09
206,2018/08/20,1.549817e+09
207,2018/10/25,1.540426e+09


In [73]:
# dtypes of the time columns before conversion
df[cols_time].dtypes

registration     object
last_seen       float32
dtype: object

In [79]:
# IPython help syntax: show the documentation of pd.to_datetime
pd.to_datetime?

In [75]:
# Rows of the raw data (df0) that have no last_seen value
df0.loc[df0['last_seen'].isna()]

Unnamed: 0,id,first_name,last_name,email address,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference
12,27548,Stanley,McCullen,smccullen3a@timesonline.co.uk,Male,60,,119.141473,28.074649,875503094.0,2017/06/05,False,,"Longquan, China",entrée/plat/dessert
16,27442,Mort,Midford,mmidfordc@yale.edu,M,59,,6.848501,5.959253,811321968.0,2019/06/25,False,,"Ozubulu, Nigeria",entrée/plat
23,27498,Pattie,Egiloff,pegiloff1w@liveinternet.ru,Male,44,,16.931279,53.680826,,2019/06/06,False,,"Czarne, Poland",entrée/plat/dessert
46,27475,Early,Stubley,estubley19@qq.com,Male,46,,-8.287188,38.988242,,2018/07/17,False,,"Couço, Portugal",entrée/plat/dessert
55,27576,Kial,Gooble,kgooble42@parallels.com,Female,66,,122.018799,47.233746,,2018/03/05,False,,"Haduohe, China",entrée/plat/dessert
61,27543,Ogden,Jertz,ojertz35@aboutads.info,Male,20,,15.559672,46.073321,228944761.0,2017/07/10,False,,"Kozje, Slovenia",entrée/plat/dessert
96,27565,,,,,,,,,,,,,,
114,27583,Petr,Swann,pSwann49@msn.com,Male,44,,73.921228,31.81679,,2019/01/02,False,,"Jāndiāla Sher Khān, Pakistan",entrée/plat/dessert
120,27581,Aldus,Dreghorn,adreghorn47@thetimes.co.uk,Male,38,,113.016085,29.39441,222360265.0,2018/10/16,False,,"Gangdong, China",entrée/plat/dessert
132,27598,Sharai,Winchcum,swinchcum4o@apple.com,Female,34,,113.247509,23.124399,,2018/12/30,False,,"Xinbu, China",entrée/plat/dessert


In [76]:
# Parse both time columns: registration from text, last_seen from epoch seconds
registered = pd.to_datetime(df['registration'])
seen = pd.to_datetime(df['last_seen'], unit='s')
df['registration'] = registered
# When last_seen is missing, fall back to the registration date
df['last_seen'] = seen.fillna(registered)
# Same operation expressed with combine_first()
df['last_seen'] = df['last_seen'].combine_first(df['registration'])

In [77]:
# Check the first rows after the date conversion
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name,city,country
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,F,63.0,$55.18,18.28125,49.625,136319724.0,2019-04-16,False,2019-06-03 12:39:28,"Palkovice, Czech Republic",entrée/plat/dessert,Leandra Pabelik,Palkovice,Czech Republic
1,27570,Ruthi,Ross,rross3w@sohu.com,F,57.0,$20.37,19.234375,50.46875,,2018-10-23,False,2019-08-30 11:41:52,"Siewierz, Poland",entrée/plat/dessert,Ruthi Ross,Siewierz,Poland
2,27572,Silas,Stourton,silas.stourton3y@answers.com,M,22.0,"€32,99",118.8125,24.984375,,2018-12-30,False,2019-03-27 13:11:28,"Longbo, China",entrée/plat/dessert,Silas Stourton,Longbo,China
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64.0,"£98,93",121.625,-8.84375,,2019-03-12,False,2019-03-12 00:00:00,"Potulando, Indonesia",entrée/plat,Roxine Pettecrew,Potulando,Indonesia
4,27558,Margaux,Gowanson,nope@thankyou.,F,54.0,$13.30,14.773438,45.15625,,2018-08-13,False,2018-11-28 05:24:16,"Bribir, Croatia",entrée/plat/dessert,Margaux Gowanson,Bribir,Croatia


In [78]:
# Column summary after the date conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203 entries, 0 to 208
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            203 non-null    uint16        
 1   first_name    203 non-null    object        
 2   last_name     203 non-null    object        
 3   email         199 non-null    object        
 4   gender        203 non-null    object        
 5   age           195 non-null    float64       
 6   money         186 non-null    object        
 7   lon           203 non-null    float16       
 8   lat           203 non-null    float16       
 9   phone         80 non-null     object        
 10  registration  203 non-null    datetime64[ns]
 11  inactive      203 non-null    object        
 12  last_seen     203 non-null    datetime64[ns]
 13  address       203 non-null    object        
 14  preference    203 non-null    object        
 15  full_name     203 non-null    object    

#### applymap() et apply() pour DataFrames

In [80]:
# longueur de chaque élément passé en string
df.astype(str).applymap(len)

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name,city,country
0,5,7,7,19,1,4,6,5,5,10,10,5,19,25,19,15,9,14
1,5,5,4,16,1,4,6,5,5,3,10,5,19,16,19,10,8,6
2,5,5,8,28,1,4,6,5,5,3,10,5,19,13,19,14,6,5
3,5,6,9,24,1,4,6,5,5,3,10,5,19,20,11,16,9,9
4,5,7,8,14,1,4,6,5,5,3,10,5,19,15,19,16,6,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,5,7,8,20,1,4,6,5,4,3,10,5,19,16,19,16,8,6
205,5,10,8,20,1,4,6,5,5,3,10,5,19,10,19,19,3,5
206,5,6,7,20,1,4,6,4,5,10,10,5,19,20,19,14,10,8
207,5,8,9,24,1,4,6,5,6,10,10,5,19,26,19,18,15,9


In [81]:
# par défaut, la fonction s'applique aux colonnes => résultat = ligne
df.apply(len, axis=0)

id              203
first_name      203
last_name       203
email           203
gender          203
age             203
money           203
lon             203
lat             203
phone           203
registration    203
inactive        203
last_seen       203
address         203
preference      203
full_name       203
city            203
country         203
dtype: int64

In [82]:
# exemple de ligne
df.iloc[0]

id                                  27625
first_name                        Leandra
last_name                         Pabelik
email                 lpabelik5f@yale.edu
gender                                  F
age                                  63.0
money                              $55.18
lon                              18.28125
lat                                49.625
phone                          0136319724
registration          2019-04-16 00:00:00
inactive                            False
last_seen             2019-06-03 12:39:28
address         Palkovice, Czech Republic
preference            entrée/plat/dessert
full_name                 Leandra Pabelik
city                            Palkovice
country                    Czech Republic
Name: 0, dtype: object

In [83]:
# la fonction s'applique aux lignes => résultat = colonne
df.apply(len, axis=1)

0      18
1      18
2      18
3      18
4      18
       ..
204    18
205    18
206    18
207    18
208    18
Length: 203, dtype: int64

In [84]:
# exemple de colonne
df.iloc[:,0]

0      27625
1      27570
2      27572
3      27435
4      27558
       ...  
204    27465
205    27476
206    27521
207    27441
208    27597
Name: id, Length: 203, dtype: uint16

#### Analyse de currency

In [109]:
# échantillon
np.random.seed(0)
df.sample(10)

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name,city,country,currency,money_eur
18,27513,Dominick,Hryncewicz,dhryncewicz2b@amazonaws.com,M,56.0,$80.71,68.1875,38.53125,,2017-08-04,False,2019-07-26 02:33:36,"Tursunzoda, Tajikistan",entrée/plat/dessert,Dominick Hryncewicz,Tursunzoda,Tajikistan,USD,81.8
45,27432,Gardener,Kempson,gkempson2@furl.net,M,33.0,$41.30,25.90625,49.09375,774378059.0,2017-09-08,False,2019-03-18 18:59:12,"Kopychyntsi, Ukraine",entrée/plat/dessert,Gardener Kempson,Kopychyntsi,Ukraine,USD,41.86
33,27463,Caesar,Ferrarotti,cferrarottix@qq.com,M,59.0,€60.84,112.875,-6.921875,346857462.0,2018-12-10,False,2019-10-02 02:44:16,"Trogan Barat, Indonesia",entrée/plat/dessert,Caesar Ferrarotti,Trogan Barat,Indonesia,EUR,60.84
37,27617,Smitty,Richter,srichter57@pinterest.com,M,54.0,$19.07,37.15625,56.75,789.0,2018-10-20,False,2019-04-14 11:56:48,"Zarya, Russia",entrée/plat/dessert,Smitty Richter,Zarya,Russia,USD,19.33
112,27544,Cristiano,Midghall,cmidghall36@npr.org,M,47.0,€43.94,-75.625,4.828125,574769298.0,2018-01-25,False,2019-06-15 14:02:40,"Dos Quebradas, Colombia",entrée/plat/dessert,Cristiano Midghall,Dos Quebradas,Colombia,EUR,43.94
91,27618,Murielle,Beston,mbeston58@youku.com,F,38.0,"€99,47",173.125,1.386719,,2017-02-19,False,2018-09-10 16:17:04,"Bonriki Village, Kiribati",entrée/plat/dessert,Murielle Beston,Bonriki Village,Kiribati,EUR,99.47
5,27497,Gordie,Bodicum,gbodicum1v@apache.org,M,60.0,$47.26,13.976562,49.78125,146943857.0,2017-04-29,False,2018-12-26 07:40:48,"Jince, Czech Republic",entrée/plat/dessert,Gordie Bodicum,Jince,Czech Republic,USD,47.9
127,27431,Sinclair,Witherow,switherow1@joomla.org,M,41.0,€55.95,22.0,56.6875,298040087.0,2018-12-26,False,2018-12-26 00:00:00,"Skrunda, Latvia",entrée/plat/dessert,Sinclair Witherow,Skrunda,Latvia,EUR,55.95
12,27548,Stanley,McCullen,smccullen3a@timesonline.co.uk,M,60.0,,119.125,28.078125,875503094.0,2017-06-05,False,2017-06-05 00:00:00,"Longquan, China",entrée/plat/dessert,Stanley McCullen,Longquan,China,,
157,27526,Sarajane,Fidler,sfidler2o@sitemeter.com,F,70.0,$90.09,95.3125,32.59375,511530696.0,2017-04-16,False,2018-08-09 11:29:04,"Jieduo, China",entrée/plat/dessert,Sarajane Fidler,Jieduo,China,USD,91.3


#### Traitement de 'currency'

Produire une nouvelle colonne numérique 'money_eur'.

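Pour la conversion USD/EUR, on utilise l'API https://open.er-api.com/v6/latest/EUR : les taux renvoyés sont exprimés par rapport à l'euro (1 EUR = `rate` unités de la devise), il faut donc **diviser** un montant par son taux pour l'obtenir en euros.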

In [92]:
# API
import json

import requests

# Fetch the latest exchange rates quoted against EUR.
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of trying to parse
# an error page as JSON.
response = requests.get('https://open.er-api.com/v6/latest/EUR', timeout=10)
response.raise_for_status()
rates = json.loads(response.content)
rates

{'result': 'success',
 'provider': 'https://www.exchangerate-api.com',
 'documentation': 'https://www.exchangerate-api.com/docs/free',
 'terms_of_use': 'https://www.exchangerate-api.com/terms',
 'time_last_update_unix': 1663804952,
 'time_last_update_utc': 'Thu, 22 Sep 2022 00:02:32 +0000',
 'time_next_update_unix': 1663892382,
 'time_next_update_utc': 'Fri, 23 Sep 2022 00:19:42 +0000',
 'time_eol_unix': 0,
 'base_code': 'EUR',
 'rates': {'EUR': 1,
  'AED': 3.623927,
  'AFN': 88.446428,
  'ALL': 116.322217,
  'AMD': 414.762796,
  'ANG': 1.766325,
  'AOA': 436.794,
  'ARS': 143.648492,
  'AUD': 1.485208,
  'AWG': 1.766325,
  'AZN': 1.684667,
  'BAM': 1.95583,
  'BBD': 1.973547,
  'BDT': 101.03545,
  'BGN': 1.95583,
  'BHD': 0.371027,
  'BIF': 2010.521,
  'BMD': 0.986774,
  'BND': 1.398002,
  'BOB': 6.845888,
  'BRL': 5.10279,
  'BSD': 0.986774,
  'BTN': 78.824549,
  'BWP': 13.052149,
  'BYN': 2.506968,
  'BZD': 1.973547,
  'CAD': 1.324456,
  'CDF': 1996.356469,
  'CHF': 0.953107,
  'CLP

In [93]:
df['money'].value_counts()

$42.44    3
€46.01    1
€31,30    1
$23.07    1
€46.75    1
         ..
$69.55    1
€77,50    1
€61.93    1
€20,59    1
$53.52    1
Name: money, Length: 184, dtype: int64

In [95]:
df['money'].str[0].value_counts()

€    94
$    91
£     1
Name: money, dtype: int64

In [96]:
df['currency'] = df['money'].str[0].map({'€': 'EUR', '$': 'USD', '£': 'GBP'})
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name,city,country,currency
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,F,63.0,$55.18,18.28125,49.625,136319724.0,2019-04-16,False,2019-06-03 12:39:28,"Palkovice, Czech Republic",entrée/plat/dessert,Leandra Pabelik,Palkovice,Czech Republic,USD
1,27570,Ruthi,Ross,rross3w@sohu.com,F,57.0,$20.37,19.234375,50.46875,,2018-10-23,False,2019-08-30 11:41:52,"Siewierz, Poland",entrée/plat/dessert,Ruthi Ross,Siewierz,Poland,USD
2,27572,Silas,Stourton,silas.stourton3y@answers.com,M,22.0,"€32,99",118.8125,24.984375,,2018-12-30,False,2019-03-27 13:11:28,"Longbo, China",entrée/plat/dessert,Silas Stourton,Longbo,China,EUR
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64.0,"£98,93",121.625,-8.84375,,2019-03-12,False,2019-03-12 00:00:00,"Potulando, Indonesia",entrée/plat,Roxine Pettecrew,Potulando,Indonesia,GBP
4,27558,Margaux,Gowanson,nope@thankyou.,F,54.0,$13.30,14.773438,45.15625,,2018-08-13,False,2018-11-28 05:24:16,"Bribir, Croatia",entrée/plat/dessert,Margaux Gowanson,Bribir,Croatia,USD


In [97]:
rates['rates']

{'EUR': 1,
 'AED': 3.623927,
 'AFN': 88.446428,
 'ALL': 116.322217,
 'AMD': 414.762796,
 'ANG': 1.766325,
 'AOA': 436.794,
 'ARS': 143.648492,
 'AUD': 1.485208,
 'AWG': 1.766325,
 'AZN': 1.684667,
 'BAM': 1.95583,
 'BBD': 1.973547,
 'BDT': 101.03545,
 'BGN': 1.95583,
 'BHD': 0.371027,
 'BIF': 2010.521,
 'BMD': 0.986774,
 'BND': 1.398002,
 'BOB': 6.845888,
 'BRL': 5.10279,
 'BSD': 0.986774,
 'BTN': 78.824549,
 'BWP': 13.052149,
 'BYN': 2.506968,
 'BZD': 1.973547,
 'CAD': 1.324456,
 'CDF': 1996.356469,
 'CHF': 0.953107,
 'CLP': 924.667344,
 'CNY': 6.962707,
 'COP': 4396.804254,
 'CRC': 628.154723,
 'CUP': 23.682569,
 'CVE': 110.265,
 'CZK': 24.663407,
 'DJF': 175.370414,
 'DKK': 7.46038,
 'DOP': 53.286713,
 'DZD': 139.974565,
 'EGP': 19.269337,
 'ERN': 14.801606,
 'ETB': 53.09985,
 'FJD': 2.230641,
 'FKP': 0.873721,
 'FOK': 7.46038,
 'GBP': 0.873774,
 'GEL': 2.828808,
 'GGP': 0.873721,
 'GHS': 10.354113,
 'GIP': 0.873721,
 'GMD': 55.477418,
 'GNF': 8471.82228,
 'GTQ': 7.726989,
 'GYD': 2

In [98]:
df0['money'].str[0].value_counts()

€    97
$    92
£     1
Name: money, dtype: int64

In [101]:
df['money'].str[1:].str.replace(',', '.').astype(float)

0      55.18
1      20.37
2      32.99
3      98.93
4      13.30
       ...  
204    81.84
205    37.15
206    91.97
207    55.64
208    81.54
Name: money, Length: 203, dtype: float64

In [106]:
df['currency']

0      USD
1      USD
2      EUR
3      GBP
4      USD
      ... 
204    EUR
205    EUR
206    USD
207    EUR
208    USD
Name: currency, Length: 203, dtype: object

In [107]:
rates['rates']

{'EUR': 1,
 'AED': 3.623927,
 'AFN': 88.446428,
 'ALL': 116.322217,
 'AMD': 414.762796,
 'ANG': 1.766325,
 'AOA': 436.794,
 'ARS': 143.648492,
 'AUD': 1.485208,
 'AWG': 1.766325,
 'AZN': 1.684667,
 'BAM': 1.95583,
 'BBD': 1.973547,
 'BDT': 101.03545,
 'BGN': 1.95583,
 'BHD': 0.371027,
 'BIF': 2010.521,
 'BMD': 0.986774,
 'BND': 1.398002,
 'BOB': 6.845888,
 'BRL': 5.10279,
 'BSD': 0.986774,
 'BTN': 78.824549,
 'BWP': 13.052149,
 'BYN': 2.506968,
 'BZD': 1.973547,
 'CAD': 1.324456,
 'CDF': 1996.356469,
 'CHF': 0.953107,
 'CLP': 924.667344,
 'CNY': 6.962707,
 'COP': 4396.804254,
 'CRC': 628.154723,
 'CUP': 23.682569,
 'CVE': 110.265,
 'CZK': 24.663407,
 'DJF': 175.370414,
 'DKK': 7.46038,
 'DOP': 53.286713,
 'DZD': 139.974565,
 'EGP': 19.269337,
 'ERN': 14.801606,
 'ETB': 53.09985,
 'FJD': 2.230641,
 'FKP': 0.873721,
 'FOK': 7.46038,
 'GBP': 0.873774,
 'GEL': 2.828808,
 'GGP': 0.873721,
 'GHS': 10.354113,
 'GIP': 0.873721,
 'GMD': 55.477418,
 'GNF': 8471.82228,
 'GTQ': 7.726989,
 'GYD': 2

In [105]:
df['currency'].map(rates['rates'])

0      0.986733
1      0.986733
2      1.000000
3      0.873774
4      0.986733
         ...   
204    1.000000
205    1.000000
206    0.986733
207    1.000000
208    0.986733
Name: currency, Length: 203, dtype: float64

In [102]:
# Currency code derived from the leading symbol of the raw 'money' string
currency_by_symbol = {'€': 'EUR', '$': 'USD', '£': 'GBP'}
df['currency'] = df['money'].str[0].map(currency_by_symbol)

# Numeric amount: drop the symbol, use '.' as decimal separator, then parse
amount = pd.to_numeric(df['money'].str[1:].str.replace(',', '.'))

# Rates are quoted per 1 EUR, so dividing by the rate yields the amount in euros
df['money_eur'] = amount / df['currency'].map(rates['rates'])
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name,city,country,currency,money_eur
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,F,63.0,$55.18,18.28125,49.625,136319724.0,2019-04-16,False,2019-06-03 12:39:28,"Palkovice, Czech Republic",entrée/plat/dessert,Leandra Pabelik,Palkovice,Czech Republic,USD,55.921916
1,27570,Ruthi,Ross,rross3w@sohu.com,F,57.0,$20.37,19.234375,50.46875,,2018-10-23,False,2019-08-30 11:41:52,"Siewierz, Poland",entrée/plat/dessert,Ruthi Ross,Siewierz,Poland,USD,20.643882
2,27572,Silas,Stourton,silas.stourton3y@answers.com,M,22.0,"€32,99",118.8125,24.984375,,2018-12-30,False,2019-03-27 13:11:28,"Longbo, China",entrée/plat/dessert,Silas Stourton,Longbo,China,EUR,32.99
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64.0,"£98,93",121.625,-8.84375,,2019-03-12,False,2019-03-12 00:00:00,"Potulando, Indonesia",entrée/plat,Roxine Pettecrew,Potulando,Indonesia,GBP,113.221497
4,27558,Margaux,Gowanson,nope@thankyou.,F,54.0,$13.30,14.773438,45.15625,,2018-08-13,False,2018-11-28 05:24:16,"Bribir, Croatia",entrée/plat/dessert,Margaux Gowanson,Bribir,Croatia,USD,13.478824


In [103]:
df['money_eur'] = df['money_eur'].round(2)
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name,city,country,currency,money_eur
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,F,63.0,$55.18,18.28125,49.625,136319724.0,2019-04-16,False,2019-06-03 12:39:28,"Palkovice, Czech Republic",entrée/plat/dessert,Leandra Pabelik,Palkovice,Czech Republic,USD,55.92
1,27570,Ruthi,Ross,rross3w@sohu.com,F,57.0,$20.37,19.234375,50.46875,,2018-10-23,False,2019-08-30 11:41:52,"Siewierz, Poland",entrée/plat/dessert,Ruthi Ross,Siewierz,Poland,USD,20.64
2,27572,Silas,Stourton,silas.stourton3y@answers.com,M,22.0,"€32,99",118.8125,24.984375,,2018-12-30,False,2019-03-27 13:11:28,"Longbo, China",entrée/plat/dessert,Silas Stourton,Longbo,China,EUR,32.99
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64.0,"£98,93",121.625,-8.84375,,2019-03-12,False,2019-03-12 00:00:00,"Potulando, Indonesia",entrée/plat,Roxine Pettecrew,Potulando,Indonesia,GBP,113.22
4,27558,Margaux,Gowanson,nope@thankyou.,F,54.0,$13.30,14.773438,45.15625,,2018-08-13,False,2018-11-28 05:24:16,"Bribir, Croatia",entrée/plat/dessert,Margaux Gowanson,Bribir,Croatia,USD,13.48


In [114]:
dico ={"x":[1, 2, 3], "y":[4, 5, 6]}
pd.DataFrame(dico)

Unnamed: 0,x,y
0,1,4
1,2,5
2,3,6


#### Analyse des emails

On va utiliser des regex pour nettoyer les emails mais mieux vaut utiliser une librairie spécialisée. Par exemple, https://github.com/syrusakbary/validate_email

In [115]:
# email NaN
df['email'].isna().sum()

4

In [116]:
# suppression des emails absents
df = df.dropna(subset=['email'])
df

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name,city,country,currency,money_eur
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,F,63.0,$55.18,18.281250,49.625000,0136319724,2019-04-16,False,2019-06-03 12:39:28,"Palkovice, Czech Republic",entrée/plat/dessert,Leandra Pabelik,Palkovice,Czech Republic,USD,55.92
1,27570,Ruthi,Ross,rross3w@sohu.com,F,57.0,$20.37,19.234375,50.468750,,2018-10-23,False,2019-08-30 11:41:52,"Siewierz, Poland",entrée/plat/dessert,Ruthi Ross,Siewierz,Poland,USD,20.64
2,27572,Silas,Stourton,silas.stourton3y@answers.com,M,22.0,"€32,99",118.812500,24.984375,,2018-12-30,False,2019-03-27 13:11:28,"Longbo, China",entrée/plat/dessert,Silas Stourton,Longbo,China,EUR,32.99
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64.0,"£98,93",121.625000,-8.843750,,2019-03-12,False,2019-03-12 00:00:00,"Potulando, Indonesia",entrée/plat,Roxine Pettecrew,Potulando,Indonesia,GBP,113.22
4,27558,Margaux,Gowanson,nope@thankyou.,F,54.0,$13.30,14.773438,45.156250,,2018-08-13,False,2018-11-28 05:24:16,"Bribir, Croatia",entrée/plat/dessert,Margaux Gowanson,Bribir,Croatia,USD,13.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,27465,Adelind,Christin,achristinz@blogs.com,F,30.0,"€81,84",41.437500,52.406250,,2019-06-04,False,2019-06-04 00:00:00,"Znamenka, Russia",entrée/plat/dessert,Adelind Christin,Znamenka,Russia,EUR,81.84
205,27476,Petronella,Pickance,ppickance1a@uiuc.edu,F,28.0,€37.15,132.750000,33.750000,,2017-11-22,False,2018-07-29 13:17:52,"Iyo, Japan",entrée/plat/dessert,Petronella Pickance,Iyo,Japan,EUR,37.15
206,27521,Moritz,Issacof,missacof2j@wired.com,M,21.0,$91.97,72.312500,31.265625,0298949280,2018-08-20,False,2019-02-10 16:38:24,"Jhang Sadr, Pakistan",entrée/plat/dessert,Moritz Issacof,Jhang Sadr,Pakistan,USD,93.21
207,27441,Quintana,Foulstone,qfoulstoneb@newsvine.com,F,32.0,"€55,64",111.125000,-6.785156,0734595126,2018-10-25,False,2018-10-25 00:00:00,"Mantingantengah, Indonesia",entrée/plat/dessert,Quintana Foulstone,Mantingantengah,Indonesia,EUR,55.64


In [117]:
# emails containing at least one character outside the accepted set
# (raw string: '\-' is not a valid Python string escape)
df.loc[df['email'].str.contains(r'[^A-Za-z0-9_\-%+.@]'), 'email'].values

array(['ycurston4a@nifty.com  ', '  agilmartin4b@un.org ',
       '  swinchcum4o@apple.com', '  erowcliffe43@chronoengine.com',
       'rbellenie54@topsy.com ', 'dbrindle1p@trellian.com ',
       ' efeatonby4i@hc360.com', ' mlowcock4u@nba.com'], dtype=object)

In [127]:
# Strip leading/trailing whitespace from the emails
# (lstrip() / rstrip() would strip one side only).
# Work on an explicit copy so the assignment below does not target a slice
# of a previous DataFrame.
df = df.copy()
df['email'] = df['email'].str.strip()
# Check again for forbidden characters (raw string: '\-' is not a valid Python string escape)
df.loc[df['email'].str.contains(r'[^A-Za-z0-9_\-%+.@]'), 'email']

Series([], Name: email, dtype: object)

In [122]:
d = pd.DataFrame([{"a":1, "b":1, "c":1}, {"a":2, "b":3, "c":4}, {"a":3, "b":3, "c":3}])
d

Unnamed: 0,a,b,c
0,1,1,1
1,2,3,4
2,3,3,3


In [123]:
e = d.loc[d["b"]==3]
e

Unnamed: 0,a,b,c
1,2,3,4
2,3,3,3


In [124]:
e["a"]=0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e["a"]=0


In [125]:
f = d.loc[d["b"]==3].copy()
f

Unnamed: 0,a,b,c
1,2,3,4
2,3,3,3


In [126]:
f["a"]=0
f

Unnamed: 0,a,b,c
1,0,3,4
2,0,3,3


In [128]:
# rows whose email does NOT end with "<something>@<something>.<2+ letters>"
# (raw string: '\.' is not a valid Python string escape)
df.loc[~df['email'].str.contains(r'.+@.+\.[A-Za-z]{2,}$')]

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name,city,country,currency,money_eur
4,27558,Margaux,Gowanson,nope@thankyou.,F,54.0,$13.30,14.773438,45.15625,,2018-08-13,False,2018-11-28 05:24:16,"Bribir, Croatia",entrée/plat/dessert,Margaux Gowanson,Bribir,Croatia,USD,13.48
35,27560,Vinnie,Stansfield,non@merci,F,35.0,$21.85,-64.6875,10.1875,,2018-04-16,False,2019-06-27 13:30:40,"Lecherías, Venezuela",entrée/plat/dessert,Vinnie Stansfield,Lecherías,Venezuela,USD,22.14
43,27575,Kaycee,Geillier,k.geillier@gmail,F,48.0,$53.88,96.0,5.246094,,2018-05-18,False,2018-07-10 23:57:52,"Baroh, Indonesia",entrée/plat/dessert,Kaycee Geillier,Baroh,Indonesia,USD,54.6
78,27561,Peggy,Scuffham,nein@danke,F,38.0,$43.86,125.5,11.5,511039244.0,2017-04-08,False,2018-08-20 16:40:32,"Maydolong, Philippines",entrée/plat/dessert,Peggy Scuffham,Maydolong,Philippines,USD,44.45
113,27562,Maure,Cronk,no@thanks,F,41.0,"€57,41",111.875,37.25,385826731.0,2017-04-26,False,2018-12-21 12:45:52,"Sanxi, China",entrée/plat/dessert,Maure Cronk,Sanxi,China,EUR,57.41


In [129]:
# keep only the rows whose email has a valid-looking domain name
# (raw string: '\.' is not a valid Python string escape)
df = df.loc[df['email'].str.contains(r'.+@.+\.[A-Za-z]{2,}$')]

In [130]:
# emails that carry an alias (a '+' character)
# (raw string: '\+' is not a valid Python string escape)
df.loc[df['email'].str.contains(r'\+'), 'email']

186    a.gorz+alias@gmail.com
Name: email, dtype: object

In [131]:
import re
re.sub(r'([^+]+)(?:\+.*)?(@.+)', r'\1\2', "a.gorz+alias@gmail.com")

'a.gorz@gmail.com'

In [140]:
re.sub(r'([a-z]+)([0-9]+)', r'\1', "abc123")

'abc'

In [141]:
re.sub(r'([a-z]+)([0-9]+)', r'\2', "abc123")

'123'

In [142]:
# suppression des aliases (char +)
s = df['email'].str.replace(r'([^+]+)(?:\+.*)?(@.+)', r'\1\2', regex=True)
s.loc[186]

'a.gorz@gmail.com'

In [None]:
# [abc]
# [a-zA-Z]
# [^abc] : pas a, b ou c
# [^0-9] : pas digit
# '<[^>]+>' : tag HTML
# '<.+>'

In [None]:
# caractères parenthèses
# \([0-9]+\)

In [None]:
# parenthèses non capturantes
# (?:regex)

In [None]:
# back references
# (.*)@(.*)  \1  \2

In [143]:
# suppression des aliases (char +)
df['email'] = df['email'].str.replace(r'([^+]+)(?:\+.*)?(@.+)', r'\1\2', regex=True)

In [144]:
# suppression des emails en double, on conserve la première ligne
df = df.drop_duplicates(subset=['email'])
df.sort_values('email')

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,registration,inactive,last_seen,address,preference,full_name,city,country,currency,money_eur
89,27579,Izzy,Clemence,Izzy.Clemence45@hhs.gov,M,48.0,€46.75,97.500000,40.312500,,2019-08-10,False,2019-08-10 00:00:00,"Xigebi, China",entrée/plat/dessert,Izzy Clemence,Xigebi,China,EUR,46.75
185,27596,André,Gorz,a.gorz@gmail.com,M,43.0,€17.20,88.187500,39.031250,,2019-05-23,False,2019-10-11 20:05:20,"Vosnon, France",entrée/plat/dessert,André Gorz,Vosnon,France,EUR,17.20
40,27566,Amalie,Matthisson,aMatthisson3s@ted.com,F,63.0,"€13,89",16.296875,58.000000,0409272683,2019-01-20,False,2019-01-20 00:00:00,"Linköping, Sweden",entrée/plat/dessert,Amalie Matthisson,Linköping,Sweden,EUR,13.89
39,27505,Alvis,Bonar,abonar23@wufoo.com,M,27.0,$90.88,14.429688,49.656250,,2017-03-09,False,2019-03-09 13:05:04,"Sedlčany, Czech Republic",entrée/plat/dessert,Alvis Bonar,Sedlčany,Czech Republic,USD,92.10
204,27465,Adelind,Christin,achristinz@blogs.com,F,30.0,"€81,84",41.437500,52.406250,,2019-06-04,False,2019-06-04 00:00:00,"Znamenka, Russia",entrée/plat/dessert,Adelind Christin,Znamenka,Russia,EUR,81.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,27523,Wolfgang,Concannon,wconcannon2l@merriam-webster.com,M,59.0,€51.68,120.187500,30.390625,,2017-07-10,False,2019-10-06 16:02:08,"Chongxian, China",entrée/plat/dessert,Wolfgang Concannon,Chongxian,China,EUR,51.68
53,27467,Worden,Shewery,wshewery11@smugmug.com,M,26.0,$15.67,19.890625,40.906250,0517301167,2018-08-22,False,2018-11-22 07:42:56,"Kajan, Albania",entrée/plat/dessert,Worden Shewery,Kajan,Albania,USD,15.88
74,27584,Yehudit,Curston,ycurston4a@nifty.com,M,63.0,$86.99,-106.437500,31.968750,0157438239,2019-01-09,False,2019-01-09 00:00:00,"El Paso, United States",entrée/plat/dessert,Yehudit Curston,El Paso,United States,USD,88.16
80,27517,Zack,Reinhard,zreinhard2f@github.com,M,35.0,$91.20,-68.125000,-38.968750,0056565079,2017-10-20,False,2019-04-21 06:56:00,"Las Lajas, Argentina",entrée/plat/dessert,Zack Reinhard,Las Lajas,Argentina,USD,92.43


#### Analyse de la colonne 'preference'

In [145]:
# analyse de preference
df['preference'].nunique()

5

In [146]:
# analyse de preference
df['preference'].value_counts()

entrée/plat/dessert    180
plat/dessert             4
plat                     3
entrée/plat              3
boisson                  1
Name: preference, dtype: int64

In [147]:
# Distinct modalities found in 'preference' (values are '/'-separated lists).
# A set comprehension states the intent directly instead of calling apply()
# only for its side effect on an external set.
s = {modality for pref in df['preference'] for modality in pref.split('/')}
s

{'boisson', 'dessert', 'entrée', 'plat'}

In [148]:
# Add one boolean column per preference modality.
# regex=False: the modality is a literal label, not a regular expression.
# NOTE: this is still a substring test; str.get_dummies(sep='/') matches
# whole '/'-separated items.
for x in sorted(s):
    df[x] = df['preference'].str.contains(x, regex=False)

df

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,...,preference,full_name,city,country,currency,money_eur,boisson,dessert,entrée,plat
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,F,63.0,$55.18,18.281250,49.625000,0136319724,...,entrée/plat/dessert,Leandra Pabelik,Palkovice,Czech Republic,USD,55.92,False,True,True,True
1,27570,Ruthi,Ross,rross3w@sohu.com,F,57.0,$20.37,19.234375,50.468750,,...,entrée/plat/dessert,Ruthi Ross,Siewierz,Poland,USD,20.64,False,True,True,True
2,27572,Silas,Stourton,silas.stourton3y@answers.com,M,22.0,"€32,99",118.812500,24.984375,,...,entrée/plat/dessert,Silas Stourton,Longbo,China,EUR,32.99,False,True,True,True
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64.0,"£98,93",121.625000,-8.843750,,...,entrée/plat,Roxine Pettecrew,Potulando,Indonesia,GBP,113.22,False,False,True,True
5,27497,Gordie,Bodicum,gbodicum1v@apache.org,M,60.0,$47.26,13.976562,49.781250,0146943857,...,entrée/plat/dessert,Gordie Bodicum,Jince,Czech Republic,USD,47.90,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,27465,Adelind,Christin,achristinz@blogs.com,F,30.0,"€81,84",41.437500,52.406250,,...,entrée/plat/dessert,Adelind Christin,Znamenka,Russia,EUR,81.84,False,True,True,True
205,27476,Petronella,Pickance,ppickance1a@uiuc.edu,F,28.0,€37.15,132.750000,33.750000,,...,entrée/plat/dessert,Petronella Pickance,Iyo,Japan,EUR,37.15,False,True,True,True
206,27521,Moritz,Issacof,missacof2j@wired.com,M,21.0,$91.97,72.312500,31.265625,0298949280,...,entrée/plat/dessert,Moritz Issacof,Jhang Sadr,Pakistan,USD,93.21,False,True,True,True
207,27441,Quintana,Foulstone,qfoulstoneb@newsvine.com,F,32.0,"€55,64",111.125000,-6.785156,0734595126,...,entrée/plat/dessert,Quintana Foulstone,Mantingantengah,Indonesia,EUR,55.64,False,True,True,True


In [149]:
# autre façon avec get_dummies
df['preference'].str.get_dummies(sep='/')

Unnamed: 0,boisson,dessert,entrée,plat
0,0,1,1,1
1,0,1,1,1
2,0,1,1,1
3,0,0,1,1
5,0,1,1,1
...,...,...,...,...
204,0,1,1,1
205,0,1,1,1
206,0,1,1,1
207,0,1,1,1


In [150]:
# assignation des préférences
tab_preference = df['preference'].str.get_dummies(sep='/')
df[tab_preference.columns] = tab_preference.astype(bool)
df

Unnamed: 0,id,first_name,last_name,email,gender,age,money,lon,lat,phone,...,preference,full_name,city,country,currency,money_eur,boisson,dessert,entrée,plat
0,27625,Leandra,Pabelik,lpabelik5f@yale.edu,F,63.0,$55.18,18.281250,49.625000,0136319724,...,entrée/plat/dessert,Leandra Pabelik,Palkovice,Czech Republic,USD,55.92,False,True,True,True
1,27570,Ruthi,Ross,rross3w@sohu.com,F,57.0,$20.37,19.234375,50.468750,,...,entrée/plat/dessert,Ruthi Ross,Siewierz,Poland,USD,20.64,False,True,True,True
2,27572,Silas,Stourton,silas.stourton3y@answers.com,M,22.0,"€32,99",118.812500,24.984375,,...,entrée/plat/dessert,Silas Stourton,Longbo,China,EUR,32.99,False,True,True,True
3,27435,Roxine,Pettecrew,rpettecrew5@gravatar.com,F,64.0,"£98,93",121.625000,-8.843750,,...,entrée/plat,Roxine Pettecrew,Potulando,Indonesia,GBP,113.22,False,False,True,True
5,27497,Gordie,Bodicum,gbodicum1v@apache.org,M,60.0,$47.26,13.976562,49.781250,0146943857,...,entrée/plat/dessert,Gordie Bodicum,Jince,Czech Republic,USD,47.90,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,27465,Adelind,Christin,achristinz@blogs.com,F,30.0,"€81,84",41.437500,52.406250,,...,entrée/plat/dessert,Adelind Christin,Znamenka,Russia,EUR,81.84,False,True,True,True
205,27476,Petronella,Pickance,ppickance1a@uiuc.edu,F,28.0,€37.15,132.750000,33.750000,,...,entrée/plat/dessert,Petronella Pickance,Iyo,Japan,EUR,37.15,False,True,True,True
206,27521,Moritz,Issacof,missacof2j@wired.com,M,21.0,$91.97,72.312500,31.265625,0298949280,...,entrée/plat/dessert,Moritz Issacof,Jhang Sadr,Pakistan,USD,93.21,False,True,True,True
207,27441,Quintana,Foulstone,qfoulstoneb@newsvine.com,F,32.0,"€55,64",111.125000,-6.785156,0734595126,...,entrée/plat/dessert,Quintana Foulstone,Mantingantengah,Indonesia,EUR,55.64,False,True,True,True


In [151]:
# OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
var = encoder.fit_transform(df[['preference']])
var.toarray()

array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0.

In [152]:
# Column labels of the one-hot matrix.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use the new method when the installed version has it.
get_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
pd.DataFrame(var.toarray(), columns=get_names())

Unnamed: 0,x0_boisson,x0_entrée/plat,x0_entrée/plat/dessert,x0_plat,x0_plat/dessert
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
186,0.0,0.0,1.0,0.0,0.0
187,0.0,0.0,1.0,0.0,0.0
188,0.0,0.0,1.0,0.0,0.0
189,0.0,0.0,1.0,0.0,0.0


In [153]:
import sklearn
sklearn.__version__

'0.23.2'

#### clean

In [154]:
import json

import requests
import pandas as pd

def clean_people(df, rates=None):
    """Clean the raw 'people' table and return a new, enriched DataFrame.

    The input frame is not modified. Steps, in order: drop fully duplicated
    rows, rename 'email address' to 'email', drop rows without first_name,
    add full_name / city / country, normalise gender to 'F'/'M', parse age
    and dates, convert 'money' to euros, clean and de-duplicate emails, and
    add one boolean column per preference modality.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table as read from people.csv.
    rates : dict, optional
        Mapping currency code -> units of that currency per 1 EUR
        (the 'rates' entry of the exchange-rate API response). When omitted,
        the rates are downloaded from https://open.er-api.com/v6/latest/EUR.
        The mapping is never mutated.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with the extra columns full_name, city, country,
        currency, money_eur and one boolean column per preference.

    Raises
    ------
    requests.HTTPError
        If rates have to be downloaded and the API answers with an HTTP error.
    """
    if rates is None:
        response = requests.get('https://open.er-api.com/v6/latest/EUR', timeout=10)
        response.raise_for_status()
        rates = json.loads(response.content)['rates']
    # Private copy with EUR forced to 1.0 so that map() also resolves amounts
    # that are already in euros.
    rates = {**rates, 'EUR': 1.0}

    # Row-level cleaning; the final copy() makes the column assignments below
    # operate on an independent frame (no SettingWithCopyWarning).
    df = (
        df.drop_duplicates()
        .rename(columns={'email address': 'email'})
        .dropna(subset=['first_name'])
        .copy()
    )

    df['full_name'] = df['first_name'] + ' ' + df['last_name']

    # 'address' is "<city>, <country>"
    df[['city', 'country']] = df['address'].str.extract('(.*), (.*)')

    # replace() (unlike map()) leaves values that are already 'F' / 'M' untouched
    df['gender'] = df['gender'].replace({'Female': 'F', 'Male': 'M'})

    # Non-numeric ages become NaN
    df['age'] = pd.to_numeric(df['age'], errors='coerce')

    # Dates: last_seen is an epoch in seconds; fall back to registration when missing
    df['registration'] = pd.to_datetime(df['registration'])
    df['last_seen'] = pd.to_datetime(df['last_seen'], unit='s').fillna(df['registration'])

    # Money: the first character is the currency symbol, the rest is the amount
    # (either ',' or '.' as decimal separator).
    df['currency'] = df['money'].str[0].map({'€': 'EUR', '$': 'USD', '£': 'GBP'})
    amount = pd.to_numeric(df['money'].str[1:].str.replace(',', '.', regex=False))
    # Rates are quoted per 1 EUR, so dividing by the rate gives the amount in euros
    df['money_eur'] = amount / df['currency'].map(rates)

    # Emails: drop missing, strip blanks, keep only valid-looking domains
    df = df.dropna(subset=['email']).copy()
    df['email'] = df['email'].str.strip()
    valid_email = df['email'].str.contains(r'.+@[A-Za-z0-9_\-.]+\.[A-Za-z]{2,}$')
    df = df.loc[valid_email].copy()

    # Remove "+alias" suffixes from the local part, then keep the first row per email
    df['email'] = df['email'].str.replace(r'([^+]+)(?:\+.*)?(@.+)', r'\1\2', regex=True)
    df = df.drop_duplicates(subset=['email']).copy()

    # One boolean column per '/'-separated preference modality
    tab_preference = df['preference'].str.get_dummies(sep='/')
    df[tab_preference.columns] = tab_preference.astype(bool)

    return df

In [155]:
# run
import pandas as pd

df0 = pd.read_csv('people.csv')
print(df0.shape)

df = clean_people(df0)
print(df.shape)

(209, 15)
(191, 24)


In [156]:
# method chaining

def clean_people2(df, rates=None):
    """Method-chaining variant of the 'people' cleaning pipeline.

    Every step of the chain receives the intermediate frame as ``df_``; the
    lambdas must never read the outer ``df`` (the raw input), otherwise they
    would silently work on unconverted columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table as read from people.csv. It is not modified.
    rates : dict, optional
        Mapping currency code -> units of that currency per 1 EUR
        (the 'rates' entry of the exchange-rate API response). When omitted,
        the rates are downloaded from https://open.er-api.com/v6/latest/EUR.
        The mapping is never mutated.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with the extra columns full_name, city, country,
        currency, money_eur and one boolean column per preference.

    Raises
    ------
    requests.HTTPError
        If rates have to be downloaded and the API answers with an HTTP error.
    """
    if rates is None:
        response = requests.get('https://open.er-api.com/v6/latest/EUR', timeout=10)
        response.raise_for_status()
        rates = json.loads(response.content)['rates']
    # Private copy with EUR forced to 1.0 so that map() also resolves amounts
    # that are already in euros.
    rates = {**rates, 'EUR': 1.0}

    df = (df
          .drop_duplicates()
          .rename(columns={'email address': 'email'})
          .dropna(subset=['first_name'])
          .assign(full_name=lambda df_: df_.first_name + ' ' + df_.last_name,
                  # replace() keeps values that are already 'F' / 'M'
                  gender=lambda df_: df_.gender.replace({'Female': 'F', 'Male': 'M'}),
                  age=lambda df_: pd.to_numeric(df_.age, errors='coerce'),
                  registration=lambda df_: pd.to_datetime(df_.registration),
                  # last_seen is an epoch expressed in seconds
                  last_seen=lambda df_: pd.to_datetime(df_.last_seen, unit='s'))
          # missing last_seen falls back to the (already converted) registration date
          .assign(last_seen=lambda df_: df_.last_seen.fillna(df_.registration))
          # named groups become the 'city' and 'country' columns
          .pipe(lambda df_: df_.assign(**df_.address.str.extract('(?P<city>.*), (?P<country>.*)')))
          .assign(currency=lambda df_: df_.money.str[0].map({'€': 'EUR', '$': 'USD', '£': 'GBP'}),
                  money_eur=lambda df_: df_.money.str[1:].str.replace(',', '.', regex=False))
          # rates are quoted per 1 EUR: divide to obtain euros
          .assign(money_eur=lambda df_: pd.to_numeric(df_.money_eur) / df_.currency.map(rates))
          .dropna(subset=['email'])
          .assign(email=lambda df_: df_.email.str.strip())
          # remove "+alias" suffixes from the local part
          .assign(email=lambda df_: df_.email.str.replace(r'([^+]+)(?:\+.*)?(@.+)', r'\1\2', regex=True))
          # keep only emails with a valid-looking domain
          .loc[lambda df_: df_.email.str.contains(r'.+@[A-Za-z0-9_\-.]+\.[A-Za-z]{2,}$')]
          .drop_duplicates(subset=['email'])
          # one boolean column per '/'-separated preference modality
          .pipe(lambda df_: df_.assign(**df_.preference.str.get_dummies(sep='/').astype(bool)))
         )

    return df

In [157]:
# Run the method-chaining variant clean_people2() on a freshly loaded frame
# and print the shape before and after cleaning.
import pandas as pd

# df0 holds the frame as loaded from disk; df is what clean_people2() returns.
df0 = pd.read_csv('people.csv')
print(df0.shape)

df = clean_people2(df0)
print(df.shape)

(209, 15)
(191, 24)


#### Visualisation des tables

Voir : https://pandas.pydata.org/docs/user_guide/style.html

Taux de remplissage d'un dataframe en bar chart

In [165]:
# Reload the raw file so the counts describe the uncleaned data.
df = pd.read_csv('people.csv')

# Number of non-null values per column, drawn as in-cell bars.
(
    df.count()
    .to_frame(name="completion")
    .style
    .bar(color='lightgreen')  # alternative colour: '#FF0000'
)

Unnamed: 0,completion
id,209
first_name,207
last_name,207
email address,203
gender,207
age,207
money,190
lon,207
lat,207
phone,83


Taux de remplissage d'un dataframe en color map

In [177]:
# Percentage of non-null values per column, shown as a red-to-green gradient.
(
    df.count()
    .mul(100 / len(df))
    .to_frame(name="completion")
    .style
    .background_gradient(cmap="RdYlGn", vmin=0, vmax=100)
    .format("{:.1f}%")
)

Unnamed: 0,completion
id,100.0%
first_name,99.0%
last_name,99.0%
email address,97.1%
gender,99.0%
age,99.0%
money,90.9%
lon,99.0%
lat,99.0%
phone,39.7%


**Exercice**

1. Téléchargez le fichier Excel "FranceTHD_Open_Data_Observatoire_2017_T2.xlsx" sur le niveau des débits sur les réseaux d'accès à Internet : ADSL, câble, Fibre FttH (T2 2015 - T2 2017) de la page : https://www.data.gouv.fr/fr/datasets/niveau-des-debits-sur-les-reseaux-dacces-a-internet-adsl-cable-fibre-ftth-t2-2015-t2-2017/

2. Chargez avec pd.read_excel() dans un DataFrame le dernier onglet "2017 T2" en mesurant le temps avec %%time en première instruction de cellule.

3. Modifiez le nom des 4 premières colonnes en : 'code INSEE', 'commune', 'département', 'nb locaux' par exemple.

4. Sauvegardez le DataFrame avec pd.to_pickle().

5. Rechargez le DataFrame à partir du fichier pickle en mesurant le temps avec %%time en première instruction de cellule et comparez.

6. Effectuez une opération de sélection sur les communes : par exemple, les communes dont le nom commence par "Saint".

7. Diagnostiquez le message d'erreur.

8. Corrigez le DataFrame.

In [2]:
%%time
# Load the workbook tab requested by the exercise: sheet_name=-1 is meant to
# pick the last sheet ("2017 T2") and header=1 takes the column labels from
# the second row. The first four columns come out as 'Unnamed: N'.
df = pd.read_excel('FranceTHD_Open_Data_Observatoire_2017_T2.xlsx',
               sheet_name=-1,
               header=1)
df

Wall time: 23 s


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,éligibles,3M et +,8M et +,30M et +,100M et +,éligibles.1,...,éligibles.2,3M et +.2,8M et +.2,30M et +.2,100M et +.2,éligibles.3,3M et +.3,8M et +.3,30M et +.3,100M et +.3
0,01001,L'Abergement-Clémenciat,01,361,1.000,0.456,0.052,0.000,0.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000
1,01002,L'Abergement-de-Varey,01,168,0.288,0.076,0.024,0.024,0.024,0.276,...,0.0,0.0,0.0,0.0,0.0,0.024,0.024,0.024,0.024,0.024
2,01004,Ambérieu-en-Bugey,01,7501,0.998,0.974,0.898,0.430,0.198,0.998,...,0.0,0.0,0.0,0.0,0.0,0.198,0.198,0.198,0.198,0.198
3,01005,Ambérieux-en-Dombes,01,725,1.000,0.993,0.945,0.667,0.036,1.000,...,0.0,0.0,0.0,0.0,0.0,0.036,0.036,0.036,0.036,0.036
4,01006,Ambléon,01,73,1.000,1.000,1.000,1.000,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,1.000,1.000,1.000,1.000,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36641,97615,Pamandzi,976,2670,1.000,0.998,0.994,0.564,0.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000
36642,97616,Sada,976,2690,0.991,0.965,0.839,0.153,0.000,0.991,...,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000
36643,97617,Tsingoni,976,3099,0.987,0.708,0.706,0.475,0.000,0.987,...,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000
36644,97701,Saint-Barthélemy,977,4693,1.000,0.936,0.762,0.146,0.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000


In [None]:
# Excel et Python
# xlwings, openpyxl

In [3]:
# Give the four unnamed leading columns explicit labels.
df = df.rename(
    columns={
        'Unnamed: 0': 'Code INSEE',
        'Unnamed: 1': 'Commune',
        'Unnamed: 2': 'Département',
        'Unnamed: 3': 'nb locaux',
    }
)
df.head()

Unnamed: 0,Code INSEE,Commune,Département,nb locaux,éligibles,3M et +,8M et +,30M et +,100M et +,éligibles.1,...,éligibles.2,3M et +.2,8M et +.2,30M et +.2,100M et +.2,éligibles.3,3M et +.3,8M et +.3,30M et +.3,100M et +.3
0,1001,L'Abergement-Clémenciat,1,361,1.0,0.456,0.052,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1002,L'Abergement-de-Varey,1,168,0.288,0.076,0.024,0.024,0.024,0.276,...,0.0,0.0,0.0,0.0,0.0,0.024,0.024,0.024,0.024,0.024
2,1004,Ambérieu-en-Bugey,1,7501,0.998,0.974,0.898,0.43,0.198,0.998,...,0.0,0.0,0.0,0.0,0.0,0.198,0.198,0.198,0.198,0.198
3,1005,Ambérieux-en-Dombes,1,725,1.0,0.993,0.945,0.667,0.036,1.0,...,0.0,0.0,0.0,0.0,0.0,0.036,0.036,0.036,0.036,0.036
4,1006,Ambléon,1,73,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Save the frame as a pickle so it can be reloaded without re-parsing the Excel file.
df.to_pickle('THD.pkl')

In [5]:
%%time
# Reload the frame from the pickle file and compare the wall time with the
# Excel load. Only unpickle files from a trusted source.
df = pd.read_pickle('THD.pkl')
df.shape

Wall time: 70 ms


(36646, 24)

In [6]:
# Column dtypes and non-null counts; 'Commune' is reported as object with no nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36646 entries, 0 to 36645
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Code INSEE   36646 non-null  object 
 1   Commune      36646 non-null  object 
 2   Département  36646 non-null  object 
 3   nb locaux    36646 non-null  int64  
 4   éligibles    36646 non-null  float64
 5   3M et +      36646 non-null  float64
 6   8M et +      36646 non-null  float64
 7   30M et +     36646 non-null  float64
 8   100M et +    36646 non-null  float64
 9   éligibles.1  36646 non-null  float64
 10  3M et +.1    36646 non-null  float64
 11  8M et +.1    36646 non-null  float64
 12  30M et +.1   36646 non-null  float64
 13  100M et +.1  36646 non-null  int64  
 14  éligibles.2  36646 non-null  float64
 15  3M et +.2    36646 non-null  float64
 16  8M et +.2    36646 non-null  float64
 17  30M et +.2   36646 non-null  float64
 18  100M et +.2  36646 non-null  float64
 19  élig

In [7]:
# Expected to fail (exercise step 7): the boolean mask contains NaN, which
# .loc rejects. The type counts further down show two non-string 'Commune' values.
df.loc[df['Commune'].str.startswith('Saint')]

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [8]:
# Workaround: na=False turns the NaN mask entries into False, so those rows are excluded.
df.loc[df['Commune'].str.startswith('Saint', na=False)]

Unnamed: 0,Code INSEE,Commune,Département,nb locaux,éligibles,3M et +,8M et +,30M et +,100M et +,éligibles.1,...,éligibles.2,3M et +.2,8M et +.2,30M et +.2,100M et +.2,éligibles.3,3M et +.3,8M et +.3,30M et +.3,100M et +.3
298,01331,Saint-Alban,01,101,1.000,0.733,0.554,0.000,0.000,1.000,...,0.000,0.000,0.000,0.000,0.0,0.000,0.000,0.000,0.000,0.000
299,01332,Saint-André-de-Bâgé,01,289,1.000,1.000,0.882,0.071,0.021,1.000,...,0.000,0.000,0.000,0.000,0.0,0.021,0.021,0.021,0.021,0.021
300,01333,Saint-André-de-Corcy,01,1453,1.000,0.987,0.975,0.522,0.048,1.000,...,0.000,0.000,0.000,0.000,0.0,0.048,0.048,0.048,0.048,0.048
301,01334,Saint-André-d'Huiriat,01,256,1.000,0.942,0.881,0.819,0.819,1.000,...,0.000,0.000,0.000,0.000,0.0,0.819,0.819,0.819,0.819,0.819
302,01335,Saint-André-le-Bouchoux,01,143,0.986,0.007,0.007,0.007,0.007,0.986,...,0.000,0.000,0.000,0.000,0.0,0.007,0.007,0.007,0.007,0.007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36619,97419,Sainte-Rose,974,2723,0.999,0.960,0.838,0.396,0.000,0.999,...,0.000,0.000,0.000,0.000,0.0,0.000,0.000,0.000,0.000,0.000
36620,97420,Sainte-Suzanne,974,8901,0.995,0.958,0.936,0.776,0.722,0.995,...,0.000,0.000,0.000,0.000,0.0,0.722,0.722,0.722,0.722,0.722
36626,97502,Saint-Pierre,975,2653,0.991,0.991,0.991,0.991,0.000,0.000,...,0.991,0.991,0.991,0.991,0.0,0.000,0.000,0.000,0.000,0.000
36644,97701,Saint-Barthélemy,977,4693,1.000,0.936,0.762,0.146,0.000,1.000,...,0.000,0.000,0.000,0.000,0.0,0.000,0.000,0.000,0.000,0.000


In [10]:
# Diagnose: count the Python type of every 'Commune' value to spot non-string entries.
df['Commune'].apply(type).value_counts()

<class 'str'>     36644
<class 'bool'>        2
Name: Commune, dtype: int64

In [11]:
# Show the rows whose 'Commune' value is a bool instead of a string.
df.loc[df['Commune'].apply(lambda x: isinstance(x, bool))]

Unnamed: 0,Code INSEE,Commune,Département,nb locaux,éligibles,3M et +,8M et +,30M et +,100M et +,éligibles.1,...,éligibles.2,3M et +.2,8M et +.2,30M et +.2,100M et +.2,éligibles.3,3M et +.3,8M et +.3,30M et +.3,100M et +.3
2579,8165,False,8,26,1.0,0.8,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8439,24177,False,24,384,1.0,1.0,0.991,0.617,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Attempt to force 'Commune' to str while loading.
# NOTE(review): at load time the column is still labelled 'Unnamed: 1' (see
# the output header), so the 'Commune' key presumably matches no column --
# consistent with the bool still found at row 2579 further down.
df2 = pd.read_excel('FranceTHD_Open_Data_Observatoire_2017_T2.xlsx',
               sheet_name=-1,
               header=1,
               dtype={"Commune":str})
df2

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,éligibles,3M et +,8M et +,30M et +,100M et +,éligibles.1,...,éligibles.2,3M et +.2,8M et +.2,30M et +.2,100M et +.2,éligibles.3,3M et +.3,8M et +.3,30M et +.3,100M et +.3
0,01001,L'Abergement-Clémenciat,01,361,1.000,0.456,0.052,0.000,0.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000
1,01002,L'Abergement-de-Varey,01,168,0.288,0.076,0.024,0.024,0.024,0.276,...,0.0,0.0,0.0,0.0,0.0,0.024,0.024,0.024,0.024,0.024
2,01004,Ambérieu-en-Bugey,01,7501,0.998,0.974,0.898,0.430,0.198,0.998,...,0.0,0.0,0.0,0.0,0.0,0.198,0.198,0.198,0.198,0.198
3,01005,Ambérieux-en-Dombes,01,725,1.000,0.993,0.945,0.667,0.036,1.000,...,0.0,0.0,0.0,0.0,0.0,0.036,0.036,0.036,0.036,0.036
4,01006,Ambléon,01,73,1.000,1.000,1.000,1.000,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,1.000,1.000,1.000,1.000,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36641,97615,Pamandzi,976,2670,1.000,0.998,0.994,0.564,0.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000
36642,97616,Sada,976,2690,0.991,0.965,0.839,0.153,0.000,0.991,...,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000
36643,97617,Tsingoni,976,3099,0.987,0.708,0.706,0.475,0.000,0.987,...,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000
36644,97701,Saint-Barthélemy,977,4693,1.000,0.936,0.762,0.146,0.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.000,0.000


In [14]:
# Give the four unnamed leading columns of the reloaded frame explicit labels.
df2 = df2.rename(
    columns={
        'Unnamed: 0': 'Code INSEE',
        'Unnamed: 1': 'Commune',
        'Unnamed: 2': 'Département',
        'Unnamed: 3': 'nb locaux',
    }
)
df2.head()

Unnamed: 0,Code INSEE,Commune,Département,nb locaux,éligibles,3M et +,8M et +,30M et +,100M et +,éligibles.1,...,éligibles.2,3M et +.2,8M et +.2,30M et +.2,100M et +.2,éligibles.3,3M et +.3,8M et +.3,30M et +.3,100M et +.3
0,1001,L'Abergement-Clémenciat,1,361,1.0,0.456,0.052,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1002,L'Abergement-de-Varey,1,168,0.288,0.076,0.024,0.024,0.024,0.276,...,0.0,0.0,0.0,0.0,0.0,0.024,0.024,0.024,0.024,0.024
2,1004,Ambérieu-en-Bugey,1,7501,0.998,0.974,0.898,0.43,0.198,0.998,...,0.0,0.0,0.0,0.0,0.0,0.198,0.198,0.198,0.198,0.198
3,1005,Ambérieux-en-Dombes,1,725,1.0,0.993,0.945,0.667,0.036,1.0,...,0.0,0.0,0.0,0.0,0.0,0.036,0.036,0.036,0.036,0.036
4,1006,Ambléon,1,73,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Check one of the rows identified earlier as bool: the value is still a bool after the reload.
type(df2.loc[2579,"Commune"])

bool

In [18]:
# Fix (exercise step 8): replace the boolean False values with the string 'Faux'.
# NOTE(review): presumably the commune name "Faux" was read from Excel as a
# boolean -- confirm against the workbook.
df.loc[df['Commune']==False, 'Commune'] = 'Faux'

In [19]:
# The original selection now runs without na=False.
df.loc[df['Commune'].str.startswith('Saint')]

Unnamed: 0,Code INSEE,Commune,Département,nb locaux,éligibles,3M et +,8M et +,30M et +,100M et +,éligibles.1,...,éligibles.2,3M et +.2,8M et +.2,30M et +.2,100M et +.2,éligibles.3,3M et +.3,8M et +.3,30M et +.3,100M et +.3
298,01331,Saint-Alban,01,101,1.000,0.733,0.554,0.000,0.000,1.000,...,0.000,0.000,0.000,0.000,0.0,0.000,0.000,0.000,0.000,0.000
299,01332,Saint-André-de-Bâgé,01,289,1.000,1.000,0.882,0.071,0.021,1.000,...,0.000,0.000,0.000,0.000,0.0,0.021,0.021,0.021,0.021,0.021
300,01333,Saint-André-de-Corcy,01,1453,1.000,0.987,0.975,0.522,0.048,1.000,...,0.000,0.000,0.000,0.000,0.0,0.048,0.048,0.048,0.048,0.048
301,01334,Saint-André-d'Huiriat,01,256,1.000,0.942,0.881,0.819,0.819,1.000,...,0.000,0.000,0.000,0.000,0.0,0.819,0.819,0.819,0.819,0.819
302,01335,Saint-André-le-Bouchoux,01,143,0.986,0.007,0.007,0.007,0.007,0.986,...,0.000,0.000,0.000,0.000,0.0,0.007,0.007,0.007,0.007,0.007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36619,97419,Sainte-Rose,974,2723,0.999,0.960,0.838,0.396,0.000,0.999,...,0.000,0.000,0.000,0.000,0.0,0.000,0.000,0.000,0.000,0.000
36620,97420,Sainte-Suzanne,974,8901,0.995,0.958,0.936,0.776,0.722,0.995,...,0.000,0.000,0.000,0.000,0.0,0.722,0.722,0.722,0.722,0.722
36626,97502,Saint-Pierre,975,2653,0.991,0.991,0.991,0.991,0.000,0.000,...,0.991,0.991,0.991,0.991,0.0,0.000,0.000,0.000,0.000,0.000
36644,97701,Saint-Barthélemy,977,4693,1.000,0.936,0.762,0.146,0.000,1.000,...,0.000,0.000,0.000,0.000,0.0,0.000,0.000,0.000,0.000,0.000


#### Étudier les multiples options de read_csv()

En particulier:

<pre>
pd.read_csv(
    <strong>filepath_or_buffer: Union[str, pathlib.Path, IO[~AnyStr]],</strong>
    <strong>sep=',',</strong>
    delimiter=None,
    <strong>header='infer',</strong>
    <strong>names=None,</strong>
    <strong>index_col=None,</strong>
    <strong>usecols=None,</strong>
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    <strong>dtype=None,</strong>
    <strong>engine=None,</strong>
    <strong>converters=None,</strong>
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    <strong>skiprows=None,</strong>
    <strong>skipfooter=0,</strong>
    <strong>nrows=None,</strong>
    <strong>na_values=None,</strong>
    <strong>keep_default_na=True,</strong>
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    <strong>parse_dates=False,</strong>
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    iterator=False,
    <strong>chunksize=None,</strong>
    compression='infer',
    <strong>thousands=None,</strong>
    <strong>decimal='.',</strong>
    lineterminator=None,
    quotechar='"',
    quoting=0,
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    encoding_errors=None,
    dialect=None,
    error_bad_lines=True,
    warn_bad_lines=True,
    on_bad_lines=None,
    delim_whitespace=False,
    low_memory=True,
    memory_map=False,
    float_precision=None,
    storage_options=None
)
</pre>

#### Analyse automatique avec pandas_profiling

https://github.com/pandas-profiling/pandas-profiling

**ATTENTION, il vaut mieux installer `pandas_profiling` dans un nouvel environnement**

<pre>
conda create --name profiling

activate profiling OU conda activate profiling

conda install -c conda-forge pandas-profiling
</pre>

In [None]:
# Profile the raw people data and write the report to an HTML file.
# NOTE(review): pandas-profiling has since been renamed ydata-profiling --
# confirm which package the target environment provides.
from pandas_profiling import ProfileReport

df = pd.read_csv('people.csv')

profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)

profile.to_file("people.html")

In [None]:
# Profile the cleaned people data (output of clean_people()) and write the
# report to a separate HTML file.
profile = ProfileReport(clean_people(df), title='Pandas Profiling Report', explorative=True)

profile.to_file("clean_people.html")