In [40]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Preprocess

During preprocessing, the variables of the raw dataset are transformed into the actual inputs used by each model.

In [41]:
import pandas as pd

import preprocess.X as X_pre
import preprocess.y as y_pre

from sklearn.pipeline import make_pipeline

from utils.constants import RAW_DIR
from utils.dataload import load_data
from utils.display import cdisplay
from utils.functions import inspect_nulls
from utils.transformers import (NameTransformer, AssignTransformer,
    AggregateTransformer)

In [42]:
# Read the raw train and test CSV files (in that order) through the project's
# `load_data` helper, using pandas' CSV reader as the loading function.
train_df, test_df = (
    load_data(RAW_DIR / csv_name, load_func=pd.read_csv)
    for csv_name in ('train.csv', 'test.csv')
)

## Transformers - `X`
Each of the following transformers processes the dataframe to make it suitable for training machine learning models. The steps in the pipeline are:
* `NameTransformer`: Transforms column names into readable variable names
* `AssignTransformer`: Imputes values for NaN observations in `monthly_rent`, `number_tablet`, `behind_school_years`, `education_years_mean_18+` and `education_years_mean_18+_sqd`
* `AggregateTransformer`: Brings all data to household level

In [None]:
# Step 1: rename the raw columns into readable variable names.
# (presumably KEEP_FEATURES restricts the columns that are kept; verify in utils.transformers)
name_transformer = NameTransformer(
    X_pre.NAMES_MAP,
    X_pre.KEEP_FEATURES,
)

# Step 2: impute values for NaN observations as configured in ASSIGN_MAP.
assign_transformer = AssignTransformer(X_pre.ASSIGN_MAP)

# Step 3: bring the data to household level, grouping on INDEX_KEY.
# NOTE(review): exact meaning of `keep` / `exclude` is defined in
# utils.transformers.AggregateTransformer; confirm there.
aggregate_transformer = AggregateTransformer(
    X_pre.AGGREGATE_MAP,
    X_pre.INDEX_KEY,
    keep=True,
    exclude=['rubbish_disposal_throw_river_creek_sea'],
)

In [None]:
# Chain the three feature transformers, in execution order, into one pipeline.
X_steps = (name_transformer, assign_transformer, aggregate_transformer)
X_pipeline = make_pipeline(*X_steps)

### NameTransformer

In [None]:
# Run only the renaming step on each split so its output can be inspected
# before the later pipeline stages are applied.
train_named_df, test_named_df = (
    name_transformer.transform(raw_df) for raw_df in (train_df, test_df)
)

In [None]:
# Compare the train frame's shape after renaming against the raw frame's shape.
shape_report = f'''
Train dataset:
    * {train_named_df.shape=}
    * {train_df.shape=}
'''
print(shape_report)

In [None]:
# Inspect missing values in the renamed train frame, before imputation.
# (presumably `inspect_nulls` reports nulls per column; see utils.functions)
inspect_nulls(train_named_df)

### AssignTransformer

In [None]:
# Apply the imputation step to the renamed splits, train first and then test.
train_assigned_df, test_assigned_df = (
    assign_transformer.transform(named_df)
    for named_df in (train_named_df, test_named_df)
)

In [None]:
# Re-run the null inspection after the AssignTransformer step, to compare
# against the pre-imputation report above.
inspect_nulls(train_assigned_df)

### AggregateTransformer

In [None]:
# Apply the aggregation step to the imputed splits, train first and then test.
train_aggregated_df, test_aggregated_df = (
    aggregate_transformer.transform(assigned_df)
    for assigned_df in (train_assigned_df, test_assigned_df)
)

In [None]:
# Shape of the train frame before aggregation (input to AggregateTransformer).
train_assigned_df.shape

In [None]:
# Shape of the train frame after aggregation, for comparison with the cell above.
train_aggregated_df.shape

In [None]:
# Show the aggregated train frame using the project's `cdisplay` helper.
cdisplay(train_aggregated_df)

## Transformers - `y`
The target variable is processed and obtained independently of the `X` features. The following transformers define the steps to follow:
* `NameTransformer`: Changes the name of the target variable and removes all other variables from the dataset, except `idhogar`
* `AssignTransformer`: Transforms target variable into a binary variable meaning *general poverty* or *no poverty*
* `AggregateTransformer`: Obtains data at household level

In [None]:
# Step 1: rename the target variable and keep only the columns in KEEP_FEATURES.
y_name_transformer = NameTransformer(
    y_pre.NAMES_MAP,
    y_pre.KEEP_FEATURES,
)

# Step 2: recode the target as configured in ASSIGN_MAP.
y_assign_transformer = AssignTransformer(y_pre.ASSIGN_MAP)

# Step 3: bring the target to household level. An empty aggregation map is
# passed here; NOTE(review): confirm how AggregateTransformer treats `{}` with
# `keep=True` in utils.transformers.
y_aggregate_transformer = AggregateTransformer(
    {},
    y_pre.INDEX_KEY,
    keep=True,
)

In [None]:
# Chain the three target transformers, in execution order, into one pipeline.
y_steps = (y_name_transformer, y_assign_transformer, y_aggregate_transformer)
y_pipeline = make_pipeline(*y_steps)

## Pipeline

In [None]:
# Fit the feature pipeline on the training split only, then reuse that fitted
# pipeline on the test split. Calling `fit_transform` on the test data would
# re-fit every step on it, which lets any state learned during `fit` come from
# the test set.
X_train = X_pipeline.fit_transform(train_df)
X_test = X_pipeline.transform(test_df)

# The target is only available for the training split.
y = y_pipeline.fit_transform(train_df)

In [None]:
# Shape of the feature frame produced by the full X pipeline.
X_train.shape

In [None]:
# Show the processed training features using the project's `cdisplay` helper.
cdisplay(X_train)

In [None]:
# Shape of the target frame produced by the y pipeline.
y.shape

In [None]:
# Show the processed target using the project's `cdisplay` helper.
cdisplay(y)

## Save datasets

In [None]:
from utils.constants import STAGE_DIR

# Destination folder for the preprocessed datasets.
preprocess_dir = STAGE_DIR / 'preprocess'

# `to_parquet` does not create missing parent folders, so make sure the target
# exists before writing (e.g. on a fresh checkout).
# NOTE(review): assumes STAGE_DIR is a pathlib.Path; confirm in utils.constants.
preprocess_dir.mkdir(parents=True, exist_ok=True)

# Persist the processed features and target for the next stage.
X_train.to_parquet(preprocess_dir / 'X_train.parquet')
X_test.to_parquet(preprocess_dir / 'X_test.parquet')
y.to_parquet(preprocess_dir / 'y.parquet')