In [None]:
# Enable IPython's autoreload extension; with mode 2, edits made to imported
# modules are picked up live instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

from utils.constants import RAW_DIR
from utils.dataload import load_data
from utils.display import rdisplay

In [None]:
# Read the raw train and test CSV files (in that order) through the project's
# `load_data` helper, using `pd.read_csv` as the reader for both.
train_df, test_df = (
    load_data(RAW_DIR / csv_name, load_func=pd.read_csv)
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Preview the first five rows of the training data.
train_df.head(n=5)

## Check variables with the same values

In [None]:
# True only when `hogar_total` and `tamhog` compare equal on every row.
(train_df['hogar_total'] == train_df['tamhog']).all()

In [None]:
# Frequency of each (`hogar_total`, `tamhog`) pair; NaN combinations are kept.
train_df.value_counts(subset=['hogar_total', 'tamhog'], dropna=False)

## Check NaN variables

In [None]:
# Count the NaN values per column once and keep only the columns that have at
# least one. The callable passed to `.loc` filters those counts directly, so
# `isna().sum()` is not recomputed over the whole frame a second time.
train_df.isna().sum().loc[lambda nan_counts: nan_counts > 0]

In [None]:
# `v18q1` is NaN whenever `v18q` is 0, which follows from the survey design,
# so filling those NaN values with 0 (meaning absence of tablets) is a valid
# strategy.
(
    train_df
    .loc[:, ['v18q', 'v18q1']]
    .value_counts(dropna=False)
)

In [None]:
# The mean education (`meaneduc`) and the squared mean education (`sqbmeaned`)
# are related, so their NaN values are found in the same observations. A good
# imputation strategy for `sqbmeaned` is to solve the NaN values in `meaneduc`
# first and then square the result.
(
    train_df
    .loc[train_df['meaneduc'].isna(), ['meaneduc', 'sqbmeaned']]
    .value_counts(dropna=False)
)

In [None]:
# For the rows where `meaneduc` is NaN, count each combination of schooling
# years, age and household id (NaN combinations are kept).
(
    train_df
    .loc[lambda frame: frame['meaneduc'].isna(), ['escolari', 'age', 'idhogar']]
    .value_counts(dropna=False)
)

In [None]:
# There doesn't seem to be any reason for the NaN values: list every member of
# the three selected households next to their `meaneduc`.
(
    train_df
    .loc[
        train_df['idhogar'].isin([
            'faaebf71a',
            'a874b7ce7',
            '1b31fd159',
        ]),
        ['idhogar', 'escolari', 'age', 'meaneduc'],
    ]
)

In [None]:
# Because this matches the right value of `meaneduc` for this household, the
# NaN values in `meaneduc` will be calculated as the mean of `escolari`.
(
    train_df
    .loc[train_df['idhogar'].isin(['fff7d6be1']), ['escolari']]
    .mean()
)

In [None]:
# `rez_esc` only states the number of years the individual is behind in their
# education, so a NaN value could be set to 0.
rdisplay(
    train_df[['rez_esc', 'escolari']]
    .value_counts(dropna=False)
    .sort_index()
)

In [None]:
# Everyone who owns a fully paid house, or holds some other type of property,
# has no monthly rent (`v2a1`) set. In those cases the NaN values can simply
# be imputed with 0.
(
    train_df
    .loc[train_df['v2a1'].isna()]
    .value_counts(
        subset=[
            'v2a1',
            'tipovivi1',
            'tipovivi2',
            'tipovivi3',
            'tipovivi4',
            'tipovivi5',
        ],
        dropna=False,
    )
    .sort_index()
)

In [None]:
# Frequency of every combination of the `tipovivi*` flags over the whole
# training set, ordered by index.
(
    train_df
    .value_counts(
        subset=['tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5'],
        dropna=False,
    )
    .sort_index()
)

## Check household variables

In [None]:
# Frequency of each (`dis`, `male`, `female`) combination; NaN rows are kept.
(
    train_df
    .loc[:, ['dis', 'male', 'female']]
    .value_counts(dropna=False)
)

In [None]:
# Distribution of `qmobilephone` values (NaN kept), ordered by value.
(
    train_df
    .value_counts(subset=['qmobilephone'], dropna=False)
    .sort_index()
)

In [None]:
# Number of rows for each (`idhogar`, `qmobilephone`) pair, ordered by index;
# rows with NaN in either column are dropped here (dropna=True).
(
    train_df
    .value_counts(subset=['idhogar', 'qmobilephone'], dropna=True)
    .sort_index()
)

## Check variables with labels `yes` or `no` instead of numbers

In [None]:
# Flag each column that holds the string 'yes' or 'no' in at least one row,
# then show only the flagged columns.
columns = train_df.isin(['yes', 'no']).any()
columns.loc[columns]

In [None]:
# Frequency of each distinct value found in the `dependency` column.
train_df.loc[:, 'dependency'].value_counts()