# 4. Preparing the Data

This notebook creates the pre-processing pipeline that transforms the data following the ```df_decision``` from the EDA, and then fits some models with default parameters to check how they adapt to the data and to make a final cut for fine-tuning. Use:

```make pipeline```

To create the pipeline, generate X_train, y_train, X_valid_transformed and y_valid, run the models and generate ```df_classifiers``` with the scores.

## Preparing our toolbox

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Libraries created for this project
from src.data.explore_data import (
    list_datasets,
    describe_feature,
    overview_data,
    create_dataframe,
    describe_features,
    create_exploratory_dataset,
)
from src.features.pipeline import (
    NanTransformer,
    trim_outliers,
    run_preprocessing_pipeline,
    create_classifiers_dataset,
)

# Basic Imports
import pandas as pd
import numpy as np
import re

# Sklearn Base
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.base import clone

# Sklearn Models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Imblearn
from imblearn import FunctionSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

# Feature Engine
from feature_engine.imputation import (
    MeanMedianImputer,
    CategoricalImputer,
    RandomSampleImputer,
)
from feature_engine.encoding import OrdinalEncoder
from feature_engine.outliers import OutlierTrimmer
from feature_engine.creation import MathFeatures, RelativeFeatures
from feature_engine.selection import DropFeatures

## Pre-processing data

In [2]:
# Loading DataFrames: the raw application data and the per-column decisions
# table saved in the interim folder
df = create_dataframe("raw", "application_train.csv")
df_decision = create_dataframe("interim", "application_decision.csv")

# Creating X and y (drop already returns a new frame, so no extra copy is needed)
X = df.drop(columns="TARGET")
y = df["TARGET"].copy()

# Splitting into Train and Validation (80/20, stratified on the target)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


def _columns_where(mask):
    """Return the df_decision["Column"] values of the rows selected by ``mask``."""
    return df_decision.loc[mask, "Column"].to_list()


# Defining features to be processed in the pipeline - following the decision dataframe
# A column is dropped when any of its decision cells equals "drop"
drop_cols = _columns_where(df_decision.isin(["drop"]).any(axis=1))
xna_cols = ["CODE_GENDER", "ORGANIZATION_TYPE"]
unknown_cols = ["NAME_FAMILY_STATUS"]
# na=False: rows without a TypeDecision would otherwise yield NaN in the mask,
# and boolean indexing with a NaN-containing mask raises
ordinal_encoder_cols = _columns_where(
    df_decision["TypeDecision"].str.contains("OrdinalEncoder", na=False)
)
nan_to_frequent_cols = _columns_where(df_decision["NanDecision"] == "frequent")
nan_to_random_cols = _columns_where(df_decision["NanDecision"] == "random")
outliers_cols = _columns_where(df_decision["OutliersDecision"] == "remove")
group_cols = _columns_where(df_decision["CorrDecision"] == "group")
# [numerator, denominator] pairs for the ratio features created in the pipeline
rate_annuity_cols = ["AMT_CREDIT", "AMT_ANNUITY"]
rate_credit_cols = ["AMT_INCOME_TOTAL", "AMT_CREDIT"]
rate_income_cols = ["AMT_INCOME_TOTAL", "CNT_FAM_MEMBERS"]

In [5]:
def _ratio_step(numerator, denominator):
    """Build the (name, transformer) step that adds ``numerator / denominator``.

    The step name is derived from the two column names, so it stays in sync
    with the ``rate_*_cols`` lists instead of being typed out by hand.
    """
    return (
        f"{numerator}_div_{denominator}_creator",
        RelativeFeatures(
            variables=[numerator],
            reference=[denominator],
            func=["div"],
            drop_original=False,
        ),
    )


preprocessing_pipeline = imbpipeline(
    [
        # Step 1. Drop Columns with NanPercentage > 30% and SK_ID_CURR
        ("drop_features", DropFeatures(features_to_drop=drop_cols)),

        # Step 2. Reveal Hidden NaNs (placeholder strings "XNA" / "Unknown")
        ("xna_transformer", NanTransformer(nan_string="XNA", variables=xna_cols)),
        (
            "unknown_transformer",
            NanTransformer(nan_string="Unknown", variables=unknown_cols),
        ),

        # Step 3. Impute NaN Values
        # Numerical imputer is used with the library defaults (no arguments)
        ("num_imputer", MeanMedianImputer()),
        (
            "cat1_imputer",
            CategoricalImputer(
                imputation_method="frequent", variables=nan_to_frequent_cols
            ),
        ),
        (
            "cat2_imputer",
            RandomSampleImputer(random_state=42, variables=nan_to_random_cols),
        ),

        # Step 4. Encode Categorical Features
        ("ordinal_encoder", OrdinalEncoder(variables=ordinal_encoder_cols)),

        # Step 5. Trim Outliers (with Z-score > +-3)
        # Wrapped in a FunctionSampler because trimming removes rows, so X and y
        # have to be resampled together.
        (
            "trim_outliers",
            FunctionSampler(
                func=trim_outliers,
                validate=False,
                kw_args={
                    "capping_method": "gaussian",
                    "tail": "both",
                    "variables": outliers_cols,
                },
            ),
        ),

        # Step 6. Group Document Columns
        # "sum" is passed as a string: pandas warns when np.sum is handed to
        # agg, because callables will stop being mapped to the DataFrame method.
        (
            "grouper",
            MathFeatures(
                variables=group_cols,
                func=["sum"],
                new_variables_names=["DOCUMENTS_PROVIDED"],
                drop_original=True,
            ),
        ),

        # Step 7. Create new Features (ratios; the original columns are kept)
        _ratio_step(*rate_annuity_cols),
        _ratio_step(*rate_credit_cols),
        _ratio_step(*rate_income_cols),

        # Step 8. Scale features
        ("scaler", MinMaxScaler()),
    ]
)

### Reproducing Train and Validation DataFrames

Running the pipeline, and saving training and validation datasets to the ```data/processed/``` folder for reproducibility purposes.

In [3]:
# Project helper imported from src.features.pipeline; presumably rebuilds the
# pipeline and writes the processed datasets to disk — confirm in src.
run_preprocessing_pipeline()

# 5. Model Selection and Training

### Short-Listing Promising Models

In [6]:
# Start from an unfitted copy of the preprocessing pipeline and add a final
# "classifier" step. The DummyClassifier is only a placeholder: the grid search
# below replaces it with each candidate in turn.
# The step is appended as a (name, estimator) tuple, like every other step.
classifiers_pipeline = clone(preprocessing_pipeline)
classifiers_pipeline.steps.append(("classifier", DummyClassifier(random_state=42)))

# Candidate models, all with default hyper-parameters apart from the seed
# (plus max_iter for LogisticRegression and verbose=False for CatBoost)
candidate_classifiers = [
    DummyClassifier(strategy="stratified", random_state=42),
    LogisticRegression(random_state=42, max_iter=1000),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=42),
    LinearSVC(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    XGBClassifier(random_state=42),
    CatBoostClassifier(random_state=42, verbose=False),
    LGBMClassifier(random_state=42),
]

# One single-value grid per candidate, so each model is evaluated exactly once
classifiers = [{"classifier": [estimator]} for estimator in candidate_classifiers]

# 5-fold stratified CV scored by ROC AUC. The whole pipeline is refit inside
# every fold, so the preprocessing never sees the held-out data.
classifiers_grid = GridSearchCV(
    estimator=classifiers_pipeline,
    param_grid=classifiers,
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
)

classifiers_grid.fit(X_train, y_train)

### Reproducing Classifiers DataFrames

Running the pipeline with multiple classifiers, and saving the results to the ```data/interim/``` folder for reproducibility purposes.

In [7]:
# Project helper imported from src.features.pipeline; presumably runs the
# classifier comparison and saves the scores to disk — confirm in src.
create_classifiers_dataset()

In [8]:
# Load the saved classifier scores from the interim folder and render them
display(create_dataframe("interim", "classifiers.csv"))

Unnamed: 0,rank_test_score,classifiers,params,mean_test_score,std_test_score,mean_fit_time,mean_score_time
0,1,CatBoostClassifier,Default,0.756751,0.003777,204.436009,27.909439
1,2,LGBMClassifier,Default,0.755913,0.004033,92.291189,18.674409
2,3,GradientBoostingClassifier,Default,0.75189,0.004491,217.3827,28.272827
3,4,XGBClassifier,Default,0.747783,0.004165,208.38456,29.231811
4,5,AdaBoostClassifier,Default,0.744451,0.005151,133.552438,28.745857
5,6,LinearSVC,Default,0.74015,0.005537,149.822774,27.001444
6,7,LogisticRegression,Default,0.739509,0.005555,105.96312,26.964667
7,8,RandomForestClassifier,Default,0.710713,0.002814,185.298346,29.786624
8,9,KNeighborsClassifier,Default,0.581665,0.002348,105.977588,166.06024
9,10,DecisionTreeClassifier,Default,0.537125,0.004003,114.555107,26.133809


I have decided to go ahead with fine-tuning the two top-scoring classifiers, CatBoost and LightGBM, and to combine them into a final voting ensemble.

*Next notebook: [5.0-fine-tunning-the-model.ipynb](5.0-fine-tunning-the-model.ipynb).*