## Tratamiento de valores faltantes

**Librerías**

In [1]:
import pandas as pd, numpy as np

**Importación de datos**

In [2]:
import base64
def create_onedrive_directdownload(onedrive_link: str) -> str:
    """Build a direct-download URL for a OneDrive sharing link.

    The sharing link is UTF-8 encoded, converted to URL-safe base64 with the
    trailing ``=`` padding removed, prefixed with ``u!`` and embedded in the
    OneDrive ``shares`` content endpoint.

    Parameters
    ----------
    onedrive_link : str
        The OneDrive sharing URL (for example a ``https://1drv.ms/...`` link).

    Returns
    -------
    str
        A URL of the form
        ``https://api.onedrive.com/v1.0/shares/u!<token>/root/content``.
    """
    # urlsafe_b64encode already maps '+' -> '-' and '/' -> '_', so only the
    # '=' padding still has to be stripped by hand.
    encoded = base64.urlsafe_b64encode(bytes(onedrive_link, "utf-8"))
    share_token = encoded.decode("utf-8").rstrip("=")
    return f"https://api.onedrive.com/v1.0/shares/u!{share_token}/root/content"

In [3]:
# Turn the OneDrive share link into a direct-download URL, then read the CSV
# it points to into a DataFrame.
onedrive_link = 'https://1drv.ms/u/s!AneKqxx3Qjofh8A2IFtajXgZdo5ucw?e=DKb3Tn'
onedrive_direct_link = create_onedrive_directdownload(onedrive_link)
df = pd.read_csv(onedrive_direct_link)

In [4]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,


### 1. Identificación de valores faltantes (null - NaN)

El método `info()` proporciona un resumen del Dataset completo. Con él podemos consultar la cantidad de valores no nulos de cada una de las variables disponibles.

In [10]:
# There are missing values: in the recorded output, the non-null count that
# info() prints for some columns is lower than the total number of entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35549 entries, 0 to 35548
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   record_id        35549 non-null  int64  
 1   month            35549 non-null  int64  
 2   day              35549 non-null  int64  
 3   year             35549 non-null  int64  
 4   plot_id          35549 non-null  int64  
 5   species_id       34786 non-null  object 
 6   sex              33038 non-null  object 
 7   hindfoot_length  31438 non-null  float64
 8   weight           32283 non-null  float64
dtypes: float64(2), int64(5), object(2)
memory usage: 2.4+ MB


In [95]:
# Total number of missing values in the dataset: sum every True entry of the
# boolean null mask at once.
df.isnull().to_numpy().sum()

10651

In [16]:
# Direct check: a True result means the dataset holds at least one missing value.
# This first result is computed but not shown; only the last expression of the
# cell is displayed.
df.isnull().to_numpy().any()
# isna() is an equivalent function for isnull()
df.isna().to_numpy().any()

True

In [21]:
# True values can be summed because each one counts as 1, which gives the
# number of missing values per column.
# In the recorded output, hindfoot_length has the largest number of nulls.
(
    df.isnull()
    .sum()
    .sort_values(ascending=False)
)

hindfoot_length    4111
weight             3266
sex                2511
species_id          763
record_id             0
month                 0
day                   0
year                  0
plot_id               0
dtype: int64

Podemos aislar las columnas con valores faltantes

In [97]:
# Isolate the variables (columns) that contain null values.
# Reducing along the index yields one boolean per column.
mask = df.isnull().any(axis="index")
print(mask)
# Keep every row, but only the columns flagged by the mask
df.loc[:, mask].head()

record_id          False
month              False
day                False
year               False
plot_id            False
species_id          True
sex                 True
hindfoot_length     True
weight              True
dtype: bool


Unnamed: 0,species_id,sex,hindfoot_length,weight
0,NL,M,32.0,
1,NL,M,33.0,
2,DM,F,37.0,
3,DM,M,36.0,
4,DM,M,35.0,


In [98]:
# Select the rows in which every value is null.
# The recorded result is empty: the dataset has no fully empty rows.
df.loc[df.isnull().all(axis="columns")]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight


In [100]:
# Select the rows that have at least one missing value
df.loc[df.isnull().any(axis="columns")]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,
...,...,...,...,...,...,...,...,...,...
35530,35531,12,31,2002,13,PB,F,27.0,
35543,35544,12,31,2002,15,US,,,
35544,35545,12,31,2002,15,AH,,,
35545,35546,12,31,2002,15,AH,,,


### 2. Identificación de valores **no** faltantes 

In [23]:
# There are non-missing values: info() prints the non-null count of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35549 entries, 0 to 35548
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   record_id        35549 non-null  int64  
 1   month            35549 non-null  int64  
 2   day              35549 non-null  int64  
 3   year             35549 non-null  int64  
 4   plot_id          35549 non-null  int64  
 5   species_id       34786 non-null  object 
 6   sex              33038 non-null  object 
 7   hindfoot_length  31438 non-null  float64
 8   weight           32283 non-null  float64
dtypes: float64(2), int64(5), object(2)
memory usage: 2.4+ MB


In [27]:
# Direct check: a True result means the dataset holds at least one non-missing value.
# This first result is computed but not shown; only the last expression of the
# cell is displayed.
df.notnull().to_numpy().any()
# notna() is an equivalent function for notnull()
df.notna().to_numpy().any()

True

In [33]:
# Per-column count of non-missing values; in the recorded outputs these match
# the "Non-Null Count" column printed by .info().
df.notnull().sum()

record_id          35549
month              35549
day                35549
year               35549
plot_id            35549
species_id         34786
sex                33038
hindfoot_length    31438
weight             32283
dtype: int64

In [104]:
# Select the rows that have no empty values, i.e. every column is populated
df.loc[df.notnull().all(axis="columns")]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
62,63,8,19,1977,3,DM,M,35.0,40.0
63,64,8,19,1977,7,DM,M,37.0,48.0
64,65,8,19,1977,4,DM,F,34.0,29.0
65,66,8,19,1977,4,DM,F,35.0,46.0
66,67,8,19,1977,7,DM,M,35.0,36.0
...,...,...,...,...,...,...,...,...,...
35540,35541,12,31,2002,15,PB,F,24.0,31.0
35541,35542,12,31,2002,15,PB,F,26.0,29.0
35542,35543,12,31,2002,15,PB,F,27.0,34.0
35546,35547,12,31,2002,10,RM,F,15.0,14.0
