# Curso de Manejo de Datos Faltantes: Imputación

## Configuración de ambiente de trabajo

```bash
pip install --upgrade pip
```

```bash
pip install pyjanitor matplotlib missingno nhanes pandas scipy seaborn session-info scikit-learn statsmodels upsetplot
```

o bien:

```bash
pip install -r requirements.txt
```

## Importar librerías

In [1]:
import janitor  #limpieza de datos
import matplotlib.pyplot as plt #visualizar datos
import missingno    #explorar missings
import nhanes.load  #dataframe del cual importaremos los datos
import numpy as np  #calculos numerios con df
import pandas as pd #manejo de df
import scipy.stats  #para realizar estadisticos
import seaborn as sns   #visualizacion estadistica
import session_info #ver que paquetes tenemos
import sklearn.compose  #cotiene todos los modelos y submodulos para hacer imputacion sencilla a missings
import sklearn.impute   
import sklearn.preprocessing
import statsmodels.api as sm    #crearemos modelos y accederemos a conjunto de datos de prueba
import statsmodels.datasets
import statsmodels.formula.api as smf

from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import BayesianRidge, Ridge
from sklearn.neighbors import KNeighborsRegressor
from statsmodels.graphics.mosaicplot import mosaic

  @_expand_grid.register(pd.arrays.PandasArray)


## Importar funciones personalizadas

In [2]:
%run pandas-missing-extension.ipynb

## Configurar el aspecto general de las gráficas del proyecto

In [5]:
%matplotlib inline

sns.set(
    rc={
        "figure.figsize": (8, 6)
    }
)

sns.set_style("whitegrid")
sns.set_style("whitegrid")

## El problema de trabajar con valores faltantes

In [10]:
# Fetch the "airquality" R dataset through statsmodels and keep only the
# DataFrame stored on its `.data` attribute.
airquality_df = sm.datasets.get_rdataset("airquality").data

# Display the raw frame as the cell output.
airquality_df

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
0,41.0,190.0,7.4,67,5,1
1,36.0,118.0,8.0,72,5,2
2,12.0,149.0,12.6,74,5,3
3,18.0,313.0,11.5,62,5,4
4,,,14.3,56,5,5
...,...,...,...,...,...,...
148,30.0,193.0,6.9,70,9,26
149,,145.0,13.2,77,9,27
150,14.0,191.0,14.3,75,9,28
151,18.0,131.0,8.0,76,9,29


In [18]:
# Rebuild the air-quality frame step by step from a fresh copy of the
# "airquality" R dataset exposed by statsmodels.
airquality_df = sm.datasets.get_rdataset("airquality").data

# Normalise the column names to snake_case (pyjanitor).
airquality_df = airquality_df.clean_names(case_type="snake")

# Add a constant `year` column so a full date can be assembled (pyjanitor).
airquality_df = airquality_df.add_column("year", 1973)

# Combine year / month / day into a single datetime column named `date`.
airquality_df = airquality_df.assign(
    date=pd.to_datetime(airquality_df[["year", "month", "day"]])
)

# Order the rows chronologically and use `date` as the index.
airquality_df = airquality_df.sort_values(by="date").set_index("date")

# Display the cleaned frame as the cell output.
airquality_df

Unnamed: 0_level_0,ozone,solar_r,wind,temp,month,day,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1973-05-01,41.0,190.0,7.4,67,5,1,1973
1973-05-02,36.0,118.0,8.0,72,5,2,1973
1973-05-03,12.0,149.0,12.6,74,5,3,1973
1973-05-04,18.0,313.0,11.5,62,5,4,1973
1973-05-05,,,14.3,56,5,5,1973
...,...,...,...,...,...,...,...
1973-09-26,30.0,193.0,6.9,70,9,26,1973
1973-09-27,,145.0,13.2,77,9,27,1973
1973-09-28,14.0,191.0,14.3,75,9,28,1973
1973-09-29,18.0,131.0,8.0,76,9,29,1973


#### Ajustamos modelos de regresión lineal

In [19]:
# Fit an ordinary least squares model of `temp` on `ozone` and show only the
# first table of the fit summary (the header with the observation count and
# goodness-of-fit statistics).
smf.ols(formula="temp ~ ozone", data=airquality_df).fit().summary().tables[0]

  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)


0,1,2,3
Dep. Variable:,temp,R-squared:,0.488
Model:,OLS,Adj. R-squared:,0.483
Method:,Least Squares,F-statistic:,108.5
Date:,"Sat, 09 Dec 2023",Prob (F-statistic):,2.93e-18
Time:,18:09:39,Log-Likelihood:,-386.27
No. Observations:,116,AIC:,776.5
Df Residuals:,114,BIC:,782.1
Df Model:,1,,
Covariance Type:,nonrobust,,


In [20]:
#consideramos variable adicional
(
    smf.ols(
        formula="temp ~ ozone + solar_r",
        data=airquality_df
    )
    .fit()
    .summary()
    .tables[0]
)

  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)


0,1,2,3
Dep. Variable:,temp,R-squared:,0.491
Model:,OLS,Adj. R-squared:,0.481
Method:,Least Squares,F-statistic:,52.07
Date:,"Sat, 09 Dec 2023",Prob (F-statistic):,1.47e-16
Time:,18:10:58,Log-Likelihood:,-369.78
No. Observations:,111,AIC:,745.6
Df Residuals:,108,BIC:,753.7
Df Model:,2,,
Covariance Type:,nonrobust,,


#### Reto: Datos de supervivientes

In [22]:
# Load the "flchain" dataset from the R "survival" package through
# statsmodels, keeping only the DataFrame stored on `.data`.
survival_df = sm.datasets.get_rdataset(
    dataname="flchain",
    package="survival",
).data

# Display the frame as the cell output.
survival_df

Unnamed: 0,age,sex,sample.yr,kappa,lambda,flc.grp,creatinine,mgus,futime,death,chapter
0,97,F,1997,5.700,4.860,10,1.7,0,85,1,Circulatory
1,92,F,2000,0.870,0.683,1,0.9,0,1281,1,Neoplasms
2,94,F,1997,4.360,3.850,10,1.4,0,69,1,Circulatory
3,92,F,1996,2.420,2.220,9,1.0,0,115,1,Circulatory
4,93,F,1996,1.320,1.690,6,1.1,0,1039,1,Circulatory
...,...,...,...,...,...,...,...,...,...,...,...
7869,52,F,1995,1.210,1.610,6,1.0,0,4997,0,
7870,52,F,1999,0.858,0.581,1,0.8,0,3652,0,
7871,54,F,2002,1.700,1.720,8,,0,2507,0,
7872,53,F,1995,1.710,2.690,9,,0,4982,0,


In [26]:
#consideramos variable adicional
(
    smf.ols(
        formula="death ~ age + chapter",
        data=survival_df
    )
    .fit()
    .summary()
    .tables[0]
)

  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return 1 - self.ssr/self.centered_tss


0,1,2,3
Dep. Variable:,death,R-squared:,-inf
Model:,OLS,Adj. R-squared:,-inf
Method:,Least Squares,F-statistic:,-134.5
Date:,"Sat, 09 Dec 2023",Prob (F-statistic):,1.0
Time:,18:22:44,Log-Likelihood:,65649.0
No. Observations:,2169,AIC:,-131300.0
Df Residuals:,2152,BIC:,-131200.0
Df Model:,16,,
Covariance Type:,nonrobust,,


In [25]:
#consideramos variable adicional
(
    smf.ols(
        formula="death ~ futime + chapter",
        data=survival_df
    )
    .fit()
    .summary()
    .tables[0]
)

  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return 1 - self.ssr/self.centered_tss


0,1,2,3
Dep. Variable:,death,R-squared:,-inf
Model:,OLS,Adj. R-squared:,-inf
Method:,Least Squares,F-statistic:,-134.5
Date:,"Sat, 09 Dec 2023",Prob (F-statistic):,1.0
Time:,18:20:01,Log-Likelihood:,62644.0
No. Observations:,2169,AIC:,-125300.0
Df Residuals:,2152,BIC:,-125200.0
Df Model:,16,,
Covariance Type:,nonrobust,,
