In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
""" Cleaning Data """

' Cleaning Data '

In [4]:
""" 7.1 Handling Missing Data """

' 7.1 Handling Missing Data '

In [5]:
# Sample Series of strings whose third entry is missing (NaN).
string_data = pd.Series(
    data=['aardvark', 'artichoke', np.nan, 'avocado'],
)

In [6]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [7]:
# Boolean mask: True at the positions where the value is missing.
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [8]:
# Overwrite the entry at label 0 with Python's None.
string_data[0] = None

In [9]:
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [10]:
# Re-check the mask after the assignment: None is reported as missing, just like NaN.
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [11]:
""" Filtrer les données manquantes"""

' Filtrer les données manquantes'

In [12]:
# dropna() leaves NA (missing) values out of the result.
# NA is a short alias for NumPy's nan, used to build the examples.
from numpy import nan as NA

In [13]:
# Numeric Series whose second entry is missing.
data = pd.Series(data=[1, NA, 3.5, 7])

In [14]:
# dropna() returns the Series without its NA entries.
data.dropna()

0    1.0
2    3.5
3    7.0
dtype: float64

In [15]:
# Equivalent to dropna(): index the Series with the boolean notnull() mask.
data[data.notnull()]

0    1.0
2    3.5
3    7.0
dtype: float64

In [16]:
# DataFrames are more involved: you may want to drop the rows or columns that contain at least one NA, or only those that are entirely NA.

In [17]:
# By default, dropna() on a DataFrame removes every row that contains an NA.
data = pd.DataFrame(
    data=[
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.],
    ]
)

In [18]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [19]:
# Default dropna(): keep only the rows that have no NA at all.
cleaned = data.dropna()

In [20]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [21]:
# Drop only the rows in which every value is NA.
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [22]:
# Add a column labelled 4 that is NA in every row.
data[4] = NA

In [23]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [24]:
# Drop only the columns in which every value is NA.
data.dropna(axis=1,how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [25]:
# Goal of this section: keep only the rows holding a minimum number of
# observations, using the `thresh` argument of dropna().
# Seed the global NumPy RNG so the random frame is the same on every
# Restart & Run All (the unseeded version produced new values each run).
np.random.seed(0)
df = pd.DataFrame(np.random.randn(7,3))

In [26]:
# Set the first four rows of the column at position 1 to NA.
df.iloc[:4,1] = NA

In [27]:
df

Unnamed: 0,0,1,2
0,-1.384245,,1.680229
1,-0.647481,,0.2097
2,-0.815958,,-0.660107
3,1.358233,,-0.903007
4,2.528107,-0.284595,0.849475
5,-0.659756,-2.39682,-0.046299
6,-0.499475,0.671998,-1.17364


In [28]:
# Set the first two rows of the column at position 2 to NA.
df.iloc[:2,2] = NA

In [29]:
df

Unnamed: 0,0,1,2
0,-1.384245,,
1,-0.647481,,
2,-0.815958,,-0.660107
3,1.358233,,-0.903007
4,2.528107,-0.284595,0.849475
5,-0.659756,-2.39682,-0.046299
6,-0.499475,0.671998,-1.17364


In [30]:
# Default dropna(): every row containing an NA is removed.
df.dropna()

Unnamed: 0,0,1,2
4,2.528107,-0.284595,0.849475
5,-0.659756,-2.39682,-0.046299
6,-0.499475,0.671998,-1.17364


In [31]:
df.dropna(thresh=2)  # keep the rows that have at least 2 non-NA observations

Unnamed: 0,0,1,2
2,-0.815958,,-0.660107
3,1.358233,,-0.903007
4,2.528107,-0.284595,0.849475
5,-0.659756,-2.39682,-0.046299
6,-0.499475,0.671998,-1.17364


In [32]:
""" Filling in Missing Data """

' Filling in Missing Data '

In [33]:
# Replace the missing values with 0.
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.384245,0.0,0.0
1,-0.647481,0.0,0.0
2,-0.815958,0.0,-0.660107
3,1.358233,0.0,-0.903007
4,2.528107,-0.284595,0.849475
5,-0.659756,-2.39682,-0.046299
6,-0.499475,0.671998,-1.17364


In [34]:
# Use a different fill value per column: the dict maps column label -> replacement.
df.fillna({1:0.5,2:0})

Unnamed: 0,0,1,2
0,-1.384245,0.5,0.0
1,-0.647481,0.5,0.0
2,-0.815958,0.5,-0.660107
3,1.358233,0.5,-0.903007
4,2.528107,-0.284595,0.849475
5,-0.659756,-2.39682,-0.046299
6,-0.499475,0.671998,-1.17364


In [35]:
# fillna() returns a new object by default; inplace=True modifies df itself instead.
# Binding the call's result to `_` keeps the cell from displaying anything.
_ = df.fillna(0,inplace=True)

In [36]:
df

Unnamed: 0,0,1,2
0,-1.384245,0.0,0.0
1,-0.647481,0.0,0.0
2,-0.815958,0.0,-0.660107
3,1.358233,0.0,-0.903007
4,2.528107,-0.284595,0.849475
5,-0.659756,-2.39682,-0.046299
6,-0.499475,0.671998,-1.17364


In [37]:
# Goal of this section: fill NA values with the last valid observation of
# their column (forward fill).
# Seed the global NumPy RNG so the random frame is the same on every
# Restart & Run All (the unseeded version produced new values each run).
np.random.seed(1)
df = pd.DataFrame(np.random.randn(6,3))

In [38]:
# Set the column at position 1 to NA from row position 2 onward.
df.iloc[2:,1] = NA

In [39]:
# Set the column at position 2 to NA from row position 4 onward.
df.iloc[4:,2] = NA

In [40]:
df

Unnamed: 0,0,1,2
0,-0.142701,0.252024,1.252054
1,-0.679211,-1.340939,0.140821
2,-2.057029,,0.759409
3,1.033584,,-0.268919
4,0.474293,,
5,0.929122,,


In [41]:
# Forward fill: propagate the last valid observation down each column.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna() is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,-0.142701,0.252024,1.252054
1,-0.679211,-1.340939,0.140821
2,-2.057029,-1.340939,0.759409
3,1.033584,-1.340939,-0.268919
4,0.474293,-1.340939,-0.268919
5,0.929122,-1.340939,-0.268919


In [42]:
# Forward fill, but cap the number of consecutive NA values filled per gap at 2.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna() is deprecated in recent pandas releases.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.142701,0.252024,1.252054
1,-0.679211,-1.340939,0.140821
2,-2.057029,-1.340939,0.759409
3,1.033584,-1.340939,-0.268919
4,0.474293,,-0.268919
5,0.929122,,-0.268919


In [43]:
# Numeric Series with two missing entries, used for the mean-fill example.
data = pd.Series(data=[1.0, NA, 3.5, NA, 7])

In [44]:
# Fill the missing values with the mean of the Series.
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64