In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

from utils.constants import RAW_DIR
from utils.dataload import load_data
from utils.display import rdisplay

In [None]:
# Read the raw train/test CSV files through the project's `load_data` helper,
# handing it `pd.read_csv` as the `load_func`.
train_csv = RAW_DIR / 'train.csv'
test_csv = RAW_DIR / 'test.csv'
train_df = load_data(train_csv, load_func=pd.read_csv)
test_df = load_data(test_csv, load_func=pd.read_csv)

In [None]:
# Preview the first five rows of the training data.
train_df.head(n=5)

## Check variables with the same values

In [None]:
# True only when `hogar_total` and `tamhog` agree on every single row.
(train_df['hogar_total'] == train_df['tamhog']).all()

In [None]:
# Joint frequency of the two columns, keeping combinations that contain NaN.
train_df.value_counts(subset=['hogar_total', 'tamhog'], dropna=False)

## Check NaN variables

In [None]:
# Columns that contain at least one NaN, together with their NaN counts.
# The per-column counts are computed once and then filtered with a callable,
# instead of scanning the whole frame a second time just to build the mask.
train_df.isna().sum().loc[lambda na_counts: na_counts > 0]

In [None]:
# By survey design, `v18q1` is NaN whenever `v18q` is 0, so filling it
# with 0 (meaning absence of tablets) is a valid strategy.
train_df.value_counts(subset=['v18q', 'v18q1'], dropna=False)

In [None]:
# The mean education (`meaneduc`) and the squared mean education (`sqbmeaned`)
# are related, so their NaN values are found in the same observations. A good
# imputation strategy for `sqbmeaned` is to resolve the NaN in `meaneduc`
# first and then square the values.
train_df.loc[train_df['meaneduc'].isna()].value_counts(
    subset=['meaneduc', 'sqbmeaned'],
    dropna=False,
)

In [None]:
# For the rows lacking `meaneduc`, list the schooling, age and household id
# combinations (NaN combinations included).
train_df.loc[train_df['meaneduc'].isna()].value_counts(
    subset=['escolari', 'age', 'idhogar'],
    dropna=False,
)

In [None]:
# There doesn't seem to be any reason for the NaN values.
# Show every member of the inspected households next to their `meaneduc`.
inspected_households = ['faaebf71a', 'a874b7ce7', '1b31fd159']
in_inspected_household = train_df['idhogar'].isin(inspected_households)
train_df.loc[in_inspected_household, ['idhogar', 'escolari', 'age', 'meaneduc']]

In [None]:
# Because this matches the recorded value of `meaneduc` for this household,
# the NaN values in `meaneduc` will be calculated as the mean of `escolari`.
# NOTE(review): this cell only prints the `escolari` mean; the `meaneduc`
# value it is compared against is not displayed here -- confirm the match.
train_df.loc[train_df['idhogar'] == 'fff7d6be1', ['escolari']].mean()

In [None]:
# `rez_esc` only states the number of years the individual is behind in
# their education, so when it is NaN it could be set to 0.
rdisplay(
    train_df
    .value_counts(subset=['rez_esc', 'escolari'], dropna=False)
    .sort_index()
)

In [None]:
# Everyone who owns a fully paid house, or holds another type of property,
# has no monthly rent set. In those cases the NaN in `v2a1` can simply be
# imputed with 0.
(
    train_df
    .loc[train_df['v2a1'].isna()]
    .value_counts(
        subset=[
            'v2a1',
            'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5',
        ],
        dropna=False,
    )
    .sort_index()
)

In [None]:
# Frequency of each `tipovivi1`..`tipovivi5` combination over the whole
# training set, sorted by the combination itself.
train_df.value_counts(
    subset=['tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5'],
    dropna=False,
).sort_index()

## Check household variables

In [52]:
# Joint frequency of `dis`, `male` and `female` (NaN combinations included).
train_df.value_counts(subset=['dis', 'male', 'female'], dropna=False)

dis  male  female
0    0     1         4657
     1     0         4350
1    0     1          280
     1     0          270
dtype: int64

In [64]:
# Number of rows per `qmobilephone` value, ordered by the value itself.
train_df.value_counts(subset=['qmobilephone'], dropna=False).sort_index()

qmobilephone
0                236
1               1233
2               3164
3               2219
4               1560
5                669
6                324
7                 50
8                 67
9                 18
10                17
dtype: int64

In [65]:
# Rows per (`idhogar`, `qmobilephone`) pair, ordered by household id.
# Unlike the neighbouring cells, pairs containing NaN are dropped here.
train_df.value_counts(
    subset=['idhogar', 'qmobilephone'],
    dropna=True,
).sort_index()

idhogar    qmobilephone
001ff74ca  1                2
003123ec2  2                4
004616164  2                2
004983866  2                2
005905417  1                3
006031de3  2                4
006555fe2  4                5
00693f597  3                4
006b64543  2                2
00941f1f4  0                4
009ae1cec  1                3
00e3e05c5  2                3
00e443b00  1                1
00edc0d0f  4                6
0108c62b8  3                6
013962b12  2                2
013e9ee6a  0                2
0172ab1d9  3                5
0194d569d  4                1
01c6fcb6e  1                1
01d2e2b0b  0                3
01ff6a086  2                2
020713493  1                1
023edfed0  2                3
0250e0b59  3                3
027651991  0                1
029f3d736  3                4
02a3da971  1                2
02ae50d8f  2                2
02e9bb4e7  1                3
02f34e26c  4                6
02ff93d1e  6                6
032d9f940  5    

## Check variables with labels `yes` or `no` instead of numbers

In [69]:
# Flag each column that holds the label 'yes' or 'no' in at least one row,
# then display only the flagged columns.
columns = train_df.isin(['yes', 'no']).any()
columns.loc[columns]

dependency    True
edjefe        True
edjefa        True
dtype: bool

In [71]:
# Frequency of each raw `dependency` value (NaN excluded, the default); the
# column mixes the 'yes'/'no' labels with numeric-looking entries.
train_df['dependency'].value_counts(dropna=True)

yes          2192
no           1747
.5           1497
2             730
1.5           713
.33333334     598
.66666669     487
8             378
.25           260
3             236
4             100
.75            98
.2             90
.40000001      84
1.3333334      84
2.5            77
5              24
1.25           18
3.5            18
.80000001      18
2.25           13
.71428573      12
1.75           11
1.2            11
.83333331      11
.22222222      11
.2857143        9
1.6666666       8
.60000002       8
6               7
.16666667       7
Name: dependency, dtype: int64