In [1]:
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2: reload all modules before running each cell, so edits to the
# project's modules are picked up without restarting the kernel.
%autoreload 2

# Preprocess

In this preprocessing step, the variables of the raw dataset are transformed into the actual inputs expected by each model.

In [13]:
import pandas as pd

import preprocess.preprocess as pre

from sklearn.pipeline import make_pipeline

from utils.constants import RAW_DIR
from utils.dataload import load_data
from utils.functions import inspect_nulls
from utils.transformers import NameTransformer, AssignTransformer

In [3]:
# Read the raw train and test CSV files from RAW_DIR, in that order, using
# the project's load_data helper with pandas' CSV reader.
train_df, test_df = (
    load_data(RAW_DIR / csv_name, load_func=pd.read_csv)
    for csv_name in ('train.csv', 'test.csv')
)

## Transformers
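Each of the following transformers processes the dataframe to make it suitable for training machine learning models. The steps in the pipeline are:
* `NameTransformer`: Transforms column names into readable variable names
* `AssignTransformer`: Applies the assignments defined in `pre.ASSIGN_MAP`; after this step the training set has no remaining null values (see the null checks under *Run preprocessing*)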

In [9]:
# Step 1: column renaming, configured with the project's name map and the
# list of features to keep.
name_transformer = NameTransformer(
    pre.NAMES_MAP,
    pre.KEEP_FEATURES,
)

# Step 2: assignments, configured with the project's assign map.
assign_transformer = AssignTransformer(
    pre.ASSIGN_MAP,
)

In [5]:
# Chain the two steps in order: renaming first, then the assignments.
# The cells below call each transformer's transform() directly rather than
# going through this pipeline object.
pipeline = make_pipeline(name_transformer, assign_transformer)

## Run preprocessing

### NameTransformer

In [6]:
# Apply the renaming step to the train frame and then the test frame.
train_named_df, test_named_df = map(
    name_transformer.transform,
    (train_df, test_df),
)

In [11]:
# Report the shape of each frame after and before the NameTransformer step.
# The `{expr=}` f-string form prints the expression text followed by its value.
print(f'''
Train dataset:
    * {train_named_df.shape=}
    * {train_df.shape=}

Test dataset:
    * {test_named_df.shape=}
    * {test_df.shape=}
''')


Train dataset:
    * train_named_df.shape=(9557, 138)
    * train_df.shape=(9557, 143)

Test dataset:
    * test_named_df.shape=(23856, 138)
    * test_df.shape=(23856, 142)



In [14]:
# Null counts per column of the renamed training frame.
# NOTE(review): the output lists only a few columns, so the helper presumably
# omits columns without nulls — confirm in utils.functions.
inspect_nulls(train_named_df)

monthly_rent                    6860
number_tablet                   7342
behind_school_years             7928
education_years_mean_18+           5
education_years_mean_18+_sqd       5
dtype: int64

### AssignTransformer

In [10]:
# Apply the assignment step to the renamed train frame and then the renamed
# test frame.
train_assigned_df, test_assigned_df = (
    assign_transformer.transform(named_df)
    for named_df in (train_named_df, test_named_df)
)

In [16]:
# Re-run the null check on the training frame after the AssignTransformer
# step. Only the training frame is inspected here.
inspect_nulls(train_assigned_df)

Series([], dtype: int64)