# Curso de Manejo de Datos Faltantes: Detección y Exploración

## Configuración de ambiente de trabajo

```bash
pip install --upgrade pip
```

```bash
pip install pyjanitor matplotlib missingno numpy pandas pyreadr seaborn session-info upsetplot

# or

pip install -r requirements.txt
```

## Importar librerías

In [2]:
import janitor
import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pd
import pyreadr
import seaborn as sns
import session_info
import upsetplot


## Operar con valores faltantes

### Python

In [11]:
# How Python's built-in None behaves with boolean operators, equality and
# identity checks. Arithmetic such as `None + None` or `None / None` is left
# out on purpose: it raises TypeError.
print(
    *(
        None or True,   # `or` skips the falsy None -> True
        None or False,  # both operands are falsy -> the last one, False
        None == None,   # equality comparison -> True
        None is None,   # identity comparison -> True
        type(None),     # <class 'NoneType'>
    ),
    sep="\n",
)

True
False
True
True
<class 'NoneType'>


### NumPy

In [17]:
# How NumPy's NaN behaves: it is a truthy float that never compares equal to
# itself, and it propagates through arithmetic.
print(
    *(
        np.nan or True,    # NaN is truthy, so `or` returns nan
        np.nan == np.nan,  # equality with itself -> False
        np.nan is np.nan,  # same object -> True
        np.nan / 2,        # arithmetic propagates -> nan
        type(np.nan),      # <class 'float'>
        np.isnan(np.nan),  # the dedicated check -> True
    ),
    sep="\n",
)

nan
False
True
nan
<class 'float'>
True


### Pandas

In [20]:
# Small frame mixing the missing-value markers np.nan, None and pd.NA,
# used by the following cells to see how pandas detects them.
test_missing_df = pd.DataFrame(
    {
        "x": [0, 1, np.nan, np.nan, None],
        "y": [0, 1, pd.NA, np.nan, None],
    }
)

test_missing_df

Unnamed: 0,x,y
0,0.0,0.0
1,1.0,1.0
2,,
3,,
4,,


In [36]:
# Element-wise mask: True wherever a cell of the frame is missing.
test_missing_df.isna()


Unnamed: 0,x,y
0,False,False
1,False,False
2,True,True
3,True,True
4,True,True


In [38]:
# Same element-wise check through `isnull()`; the mask matches the one from `isna()`.
test_missing_df.isnull()

Unnamed: 0,x,y
0,False,False
1,False,False
2,True,True
3,True,True
4,True,True


In [39]:
# The check also works on a single column and returns a boolean Series.
test_missing_df.x.isnull()

0    False
1    False
2     True
3     True
4     True
Name: x, dtype: bool

In [44]:
# NaN is a float, so the integer 1 is stored as 1.0 and the Series dtype is float64.
pd.Series([1, np.nan])

0    1.0
1    NaN
dtype: float64

In [45]:
# Next to a timestamp, NaN is shown as NaT and the Series dtype is datetime64[ns].
pd.Series([pd.to_datetime("2022-01-01"), np.nan])

0   2022-01-01
1          NaT
dtype: datetime64[ns]

In [47]:
# A sentinel such as -1 is an ordinary value to pandas: isnull() reports False for it.
pd.Series([-1]).isnull()

0    False
dtype: bool

## Cargar los conjuntos de datos

### Pima Indians Diabetes

In [49]:
# Remote location of the Pima Indians Diabetes CSV file used in this section.
pima_indians_diabetes_url = "https://nrvis.com/data/mldata/pima-indians-diabetes.csv"

In [51]:
# Fetch the CSV with wget: -O sets the destination file and -q silences the progress output.
# IPython replaces the {…} placeholder with the value of the Python variable.
# NOTE(review): this assumes the ../data directory already exists — confirm before running.
!wget -O ../data/pima-indians-diabetes.csv { pima_indians_diabetes_url } -q

In [61]:
# Read the downloaded CSV, labelling its columns through `names`.
# `pima_indians_diabetes_url` can be passed instead of the local path.
diabetes_df = pd.read_csv(
    filepath_or_buffer="../data/pima-indians-diabetes.csv",
    sep=",",
    names=[
        "pregnancies",
        "glucose",
        "blood_pressure",
        "skin_thickness",
        "insulin",
        "bmi",
        "diabetes_pedigree_function",
        "age",
        "outcome",
    ],
)

diabetes_df

Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree_function,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


### naniar (oceanbuoys, pedestrian, riskfactors)

#### Crear unidades de información de los conjuntos de datos

In [65]:
# Pieces combined later into each dataset's download URL and local file name.
base_url = "https://github.com/njtierney/naniar/raw/master/data/"  # remote folder
extension = ".rda"  # suffix appended to every dataset name
datasets_names = (
    "oceanbuoys",
    "pedestrian",
    "riskfactors",
)

#### Descargar y cargar los conjuntos de datos

In [70]:
datasets_dfs = {}

for dataset_name in datasets_names:

    dataset_file = f"{ dataset_name }{ extension }"
    dataset_output_file = f"../data/{ dataset_file }"
    dataset_url = f"{ base_url }{dataset_file}"

    !wget -q -O  { dataset_output_file } { dataset_url }

    datasets_dfs[f"{ dataset_name }_df"] = pyreadr.read_r(dataset_output_file).get(dataset_name)

datasets_dfs.keys()

dict_keys(['oceanbuoys_df', 'pedestrian_df', 'riskfactors_df'])