# Lección 14,15 - Datos Perdidos y Faltantes

In [1]:
import numpy as np
import pandas as pd

## Handling Missing Data

In [3]:
# Numeric Series with a single missing entry (np.nan) at position 2.
float_data = pd.Series([1.2, -3.5, np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

El método **isna()** devuelve una Series booleana con True en las posiciones donde el valor es nulo (NaN o None).

In [4]:
# Boolean mask: True where the value is missing (NaN).
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# isnull() is an alias of isna(); it yields the same boolean mask.
float_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

Datos String

In [7]:
# Series of strings containing two kinds of missing marker: np.nan and None.
string_data = pd.Series(["aadvark", np.nan, None, "avocado"])
string_data

0    aadvark
1        NaN
2       None
3    avocado
dtype: object

In [8]:
# Both np.nan and None are reported as missing.
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [10]:
# Same result through the isnull() alias.
string_data.isnull()

0    False
1     True
2     True
3    False
dtype: bool

Datos Float

In [12]:
# With an explicit float64 dtype, None is stored as NaN.
float_data = pd.Series([1, 2, None], dtype="float64")
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [13]:
# The None passed at construction is detected as missing.
float_data.isna()

0    False
1    False
2     True
dtype: bool

## Filtering Out Missing Data

El método **dropna()** elimina los valores faltantes (**NaN**).

In [14]:
# Series with missing values at positions 1 and 3.
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

In [15]:
# Return a new Series without the NaN entries; the original index labels are kept.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [16]:
# Equivalent to dropna(): boolean indexing with the notna() mask.
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

Al aplicar el método **dropna()** a una matriz (DataFrame), se eliminan las filas que contengan al menos un **nan**

In [24]:
# 4x3 frame: row 0 is complete, rows 1 and 3 are partially missing,
# and row 2 is entirely NaN.
data = pd.DataFrame(
    [
        [1, 6.5, 5],
        [1, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 4],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,5.0
1,1.0,,
2,,,
3,,6.5,4.0


In [20]:
# Default dropna(): drop every row that contains at least one NaN.
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,5.0


El parámetro **how='all'** elimina sólo las filas donde todas sus celdas sean de tipo **nan**

In [21]:
# how="all": drop only the rows in which every value is NaN.
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,5.0
1,1.0,,
3,,6.5,4.0


La combinación de **axis="columns"** con **how="all"** elimina sólo las columnas donde todas sus celdas sean de tipo **nan**

In [26]:
# Add a new column labeled 4 whose values are all NaN.
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,5.0,
1,1.0,,,
2,,,,
3,,6.5,4.0,


In [27]:
# Drop only the columns whose values are all NaN (column 4 here).
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,5.0
1,1.0,,
2,,,
3,,6.5,4.0


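El parámetro **thresh** fija un umbral: el número mínimo de valores no nulos que debe tener una fila para conservarse (por ejemplo, **thresh=2** conserva las filas con al menos dos valores válidos).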

In [47]:
# 7x3 frame of draws from the standard normal distribution (mean 0, std 1).
# The (7, 3) argument is the output shape, not the distribution parameters.
# NOTE: no seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.standard_normal((7,3)))
df

Unnamed: 0,0,1,2
0,0.672225,-1.187833,-0.481827
1,-1.002762,1.549488,1.583276
2,-0.76163,-0.73794,1.002093
3,-1.334756,-1.31288,-0.985875
4,0.710548,-0.601707,1.127818
5,-0.268375,-0.179387,0.392851
6,0.928893,-0.103183,0.594739


In [48]:
# Set column 1 to NaN in the first four rows (positions 0-3).
df.iloc[:4,1] = np.nan

In [49]:
# Set column 2 to NaN in the first two rows (positions 0-1).
df.iloc[:2,2] = np.nan

In [50]:
# Show the frame after injecting the missing values.
df

Unnamed: 0,0,1,2
0,0.672225,,
1,-1.002762,,
2,-0.76163,,1.002093
3,-1.334756,,-0.985875
4,0.710548,-0.601707,1.127818
5,-0.268375,-0.179387,0.392851
6,0.928893,-0.103183,0.594739


In [35]:
# Keep only the rows that have no missing values.
df.dropna()

Unnamed: 0,0,1,2
4,1.522882,1.031779,3.10504
5,-1.334365,1.132382,-1.153891
6,0.537908,-0.21037,-1.344169


In [43]:
# thresh=2: keep the rows that have at least two non-NaN values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.22436,,-0.470608
3,0.168214,,0.653652
4,1.522882,1.031779,3.10504
5,-1.334365,1.132382,-1.153891
6,0.537908,-0.21037,-1.344169


## Filling In Missing Data

In [44]:
# Show the frame with its missing values before filling them.
df

Unnamed: 0,0,1,2
0,0.361866,,
1,0.091063,,
2,-1.22436,,-0.470608
3,0.168214,,0.653652
4,1.522882,1.031779,3.10504
5,-1.334365,1.132382,-1.153891
6,0.537908,-0.21037,-1.344169


In [45]:
# Replace every NaN with the constant 0.
df.fillna(0)

Unnamed: 0,0,1,2
0,0.361866,0.0,0.0
1,0.091063,0.0,0.0
2,-1.22436,0.0,-0.470608
3,0.168214,0.0,0.653652
4,1.522882,1.031779,3.10504
5,-1.334365,1.132382,-1.153891
6,0.537908,-0.21037,-1.344169


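El relleno hacia adelante (**ffill**) propaga el último valor válido anterior hacia las celdas faltantes que le siguen. En este ejemplo los **NaN** están al inicio de las columnas, por lo que no existe un valor previo y permanecen sin rellenar. El relleno hacia atrás (**bfill**) sí los completa con el siguiente valor válido: **-0.601707** y **1.002093**.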

In [51]:
# Forward fill: propagate the last valid value down each column.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` keyword is
# deprecated in pandas >= 2.1. NaN at the top of a column have no earlier
# value to copy and therefore stay NaN.
df.ffill()

Unnamed: 0,0,1,2
0,0.672225,,
1,-1.002762,,
2,-0.76163,,1.002093
3,-1.334756,,-0.985875
4,0.710548,-0.601707,1.127818
5,-0.268375,-0.179387,0.392851
6,0.928893,-0.103183,0.594739


In [52]:
# Backward fill: replace each NaN with the next valid value below it in the column.
df.bfill()

Unnamed: 0,0,1,2
0,0.672225,-0.601707,1.002093
1,-1.002762,-0.601707,1.002093
2,-0.76163,-0.601707,1.002093
3,-1.334756,-0.601707,-0.985875
4,0.710548,-0.601707,1.127818
5,-0.268375,-0.179387,0.392851
6,0.928893,-0.103183,0.594739


In [54]:
# limit=2: fill at most two consecutive NaN per gap; the remaining ones stay NaN.
df.bfill(limit=2)

Unnamed: 0,0,1,2
0,0.672225,,1.002093
1,-1.002762,,1.002093
2,-0.76163,-0.601707,1.002093
3,-1.334756,-0.601707,-0.985875
4,0.710548,-0.601707,1.127818
5,-0.268375,-0.179387,0.392851
6,0.928893,-0.103183,0.594739


Rellenar con la **media** de los datos en la columna

In [56]:
# Recreate the Series with missing values at positions 1 and 3.
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [57]:
# Fill NaN with the Series mean; mean() skips NaN, so it is (1 + 3.5 + 7) / 3.
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64