In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell runs, so edits to the project's .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Preprocess

During preprocessing, the variables of the raw dataset are transformed into the actual inputs that each model receives.

In [5]:
import pandas as pd

import preprocess.preprocess as pre

from sklearn.pipeline import make_pipeline

from utils.constants import RAW_DIR
from utils.dataload import load_data
from utils.transformers import NameTransformer

In [3]:
# Read the raw train and test splits (in that order) from RAW_DIR,
# delegating the actual parsing to pandas' CSV reader.
train_df, test_df = (
    load_data(RAW_DIR / csv_name, load_func=pd.read_csv)
    for csv_name in ('train.csv', 'test.csv')
)

## Transformers
Each of the following transformers processes the dataframe to make it suitable for training machine learning models. The steps in the pipeline are:
* `NameTransformer`: Transforms column names into readable variable names

In [7]:
# Build the column-name transformer from the constants defined in
# preprocess.preprocess.
# NOTE(review): presumably NAMES_MAP maps raw column names to readable ones and
# KEEP_FEATURES lists the columns to retain — confirm against NameTransformer.
name_transformer = NameTransformer(pre.NAMES_MAP, pre.KEEP_FEATURES)

In [8]:
# Single-step scikit-learn pipeline; further transformers can be appended as
# additional positional arguments.
pipeline = make_pipeline(name_transformer)

## Run preprocessing

In [9]:
# Apply the same transformer instance to both splits, train first, then test.
# NOTE(review): this calls `name_transformer` directly rather than going
# through `pipeline` — confirm that bypassing the pipeline is intended.
train_named_df, test_named_df = (
    name_transformer.transform(raw_frame)
    for raw_frame in (train_df, test_df)
)

In [12]:
# Sanity check: report the shape of each split after and before the name
# transformation, plus the resulting column index. The `=` specifier in an
# f-string prints the expression text followed by its value.
print(f'''
Train dataset:
    * {train_named_df.shape=}
    * {train_df.shape=}
    * {train_named_df.columns}

Test dataset:
    * {test_named_df.shape=}
    * {test_df.shape=}
    * {test_named_df.columns}
''')


Train dataset:
    * train_named_df.shape=(9557, 138)
    * train_df.shape=(9557, 143)
    * Index(['monthly_rent', 'is_overcrowd_by_bedrooms', 'number_rooms',
       'is_overcrowd_by_rooms', 'has_toilet', 'has_refrigerator', 'has_tablet',
       'number_tablet', 'male_12-', 'male_12+',
       ...
       'scholarship_years_sqd', 'age_sqd', 'total_household_sqd',
       'head_education_sqd', 'children_19-_sqd', 'members_per_room_sqd',
       'dependency_rate_sqd', '', 'idhogar', 'age'],
      dtype='object', length=138)

Test dataset:
    * test_named_df.shape=(23856, 138)
    * test_df.shape=(23856, 142)
    * Index(['monthly_rent', 'is_overcrowd_by_bedrooms', 'number_rooms',
       'is_overcrowd_by_rooms', 'has_toilet', 'has_refrigerator', 'has_tablet',
       'number_tablet', 'male_12-', 'male_12+',
       ...
       'scholarship_years_sqd', 'age_sqd', 'total_household_sqd',
       'head_education_sqd', 'children_19-_sqd', 'members_per_room_sqd',
       'dependency_rate_sqd', '', 'i