## Import dependencies


In [None]:
import os
import sys

# Make the project's local packages (imported just below as `data`,
# `visualization` and `features`) importable by adding the "../src" directory
# to the module search path.
# "../src" is resolved against the current working directory, so this assumes
# the notebook is executed from its own folder.
src_path = os.path.abspath("../src")
if src_path not in sys.path:  # guard: re-running the cell must not add duplicates
    sys.path.append(src_path)

import data.make_dataset as make_dataset
import visualization.visualize as visualize
import features.build_features as build_features

# numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px



## Prepare data loading: set correct variable types


In [None]:
# Read only the header row (nrows=0) to get the column names without loading data
column_names = pd.read_csv(
    "../data/processed/application_train.csv", nrows=0
).columns.values

# Columns that are categorical even though their name does not follow the
# NAME_* / *_TYPE naming patterns handled below.
# NOTE(review): the previous comment linked to the Open Food Facts field list,
# which does not describe this dataset; link the actual column description here.
EXTRA_CATEGORICAL_COLUMNS = (
    "CODE_GENDER",
    "WEEKDAY_APPR_PROCESS_START",
    "FONDKAPREMONT_MODE",
    "HOUSETYPE_MODE",
    "WALLSMATERIAL_MODE",
    "EMERGENCYSTATE_MODE",
)

# Map every categorical column to the pandas "category" dtype
column_types = {
    col: "category"
    for col in column_names
    if col.startswith("NAME_")
    or col.endswith("_TYPE")
    or col in EXTRA_CATEGORICAL_COLUMNS
}

# Map FLAG_*, REG_* and LIVE_* columns to bool. This update is applied second,
# so bool wins over "category" for any column that matches both rules.
column_types |= {
    col: bool
    for col in column_names
    if col.startswith(("FLAG_", "REG_", "LIVE_"))
}


## Application and Test data loading and first observations


### Application training data


In [None]:
# Parsing options for the training applications file:
# - dtype: the per-column types prepared in `column_types`
# - true_values / false_values: spellings parsed as True / False
# - na_values: "XNA" is read as a missing value
train_read_options = {
    "dtype": column_types,
    "true_values": ["Y", "Yes", "1"],
    "false_values": ["N", "No", "0"],
    "na_values": ["XNA"],
}
app_train_df = pd.read_csv(
    "../data/processed/application_train.csv", **train_read_options
)

# Preview the first rows of the training data
app_train_df.head()


In [None]:
# Print a concise summary of the training frame: column dtypes, non-null
# counts and memory usage (see pandas `DataFrame.info`)
app_train_df.info()


In [None]:
# Summary statistics for every training column; include="all" keeps the
# non-numeric columns in the report alongside the numeric ones
app_train_df.describe(include="all")


### Application testing data


In [None]:
# Parsing options for the test applications file; they mirror the ones used
# for the training file so both frames share the same dtypes:
# - dtype: the per-column types prepared in `column_types`
# - true_values / false_values: spellings parsed as True / False
# - na_values: "XNA" is read as a missing value
test_read_options = {
    "dtype": column_types,
    "true_values": ["Y", "Yes", "1"],
    "false_values": ["N", "No", "0"],
    "na_values": ["XNA"],
}
app_test_df = pd.read_csv(
    "../data/processed/application_test.csv", **test_read_options
)

# Preview the first rows of the test data
app_test_df.head()


In [None]:
# Print a concise summary of the test frame: column dtypes, non-null counts
# and memory usage (see pandas `DataFrame.info`)
app_test_df.info()


In [None]:
# Summary statistics for every test column; include="all" keeps the
# non-numeric columns in the report alongside the numeric ones
app_test_df.describe(include="all")


## Exploratory Data Analysis (EDA)


### Look for empty columns

In [None]:
# Plot the emptiness ratio of each training column.
# NOTE(review): the exact metric is defined in visualize.plot_empty_values,
# which is not shown here; presumably the share of missing values per column.
visualize.plot_empty_values(app_train_df)


In [None]:
# Plot the emptiness ratio of each test column.
# NOTE(review): the exact metric is defined in visualize.plot_empty_values,
# which is not shown here; presumably the share of missing values per column.
visualize.plot_empty_values(app_test_df)


### Look at numerical variables


In [None]:
# Accepted [min, max] range for selected training columns.
# -100 * 365 is one hundred years expressed in (negative) days.
# NOTE(review): how out-of-range values are handled (rows dropped vs. values
# blanked) is decided inside build_features.drop_impossible_values; confirm there.
train_value_constraints = {
    "AMT_INCOME_TOTAL": {"min": 0, "max": 20 * 1000 * 1000},
    "DAYS_EMPLOYED": {"min": -100 * 365, "max": 0},
    "AMT_REQ_CREDIT_BUREAU_QRT": {"min": 0, "max": 20},
    "OBS_30_CNT_SOCIAL_CIRCLE": {"min": 0, "max": 200},
    "DEF_30_CNT_SOCIAL_CIRCLE": {"min": 0, "max": 20},
    "OBS_60_CNT_SOCIAL_CIRCLE": {"min": 0, "max": 200},
    "DEF_60_CNT_SOCIAL_CIRCLE": {"min": 0, "max": 20},
}

# Step 1: apply the range constraints to the raw training frame
clean_app_train_df = build_features.drop_impossible_values(
    app_train_df, constraints=train_value_constraints
)

# Step 2: outlier treatment on the region population ratio only
clean_app_train_df = build_features.drop_outliers(
    clean_app_train_df, columns=["REGION_POPULATION_RELATIVE"]
)

# Numeric columns to inspect, one box plot each
numeric_columns_to_plot = [
    "AMT_INCOME_TOTAL",
    "AMT_CREDIT",
    "AMT_ANNUITY",
    "AMT_GOODS_PRICE",
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "OWN_CAR_AGE",
    "REGION_RATING_CLIENT",
    "REGION_RATING_CLIENT_W_CITY",
    "EXT_SOURCE_1",
    "EXT_SOURCE_2",
    "EXT_SOURCE_3",
    "DAYS_LAST_PHONE_CHANGE",
    "AMT_REQ_CREDIT_BUREAU_YEAR",
]

# Draw the box plots of each numeric column, split by the TARGET value
visualize.plot_boxes(
    dataframe=clean_app_train_df,
    plot_columns=numeric_columns_to_plot,
    categorical_column="TARGET",
)


In [None]:
# Accepted [min, max] range for selected test columns.
# -100 * 365 is one hundred years expressed in (negative) days.
# NOTE(review): unlike the training cell, no AMT_INCOME_TOTAL constraint is
# applied here; confirm this difference is intentional.
test_value_constraints = {
    "DAYS_EMPLOYED": {"min": -100 * 365, "max": 0},
    "AMT_REQ_CREDIT_BUREAU_QRT": {"min": 0, "max": 20},
    "OBS_30_CNT_SOCIAL_CIRCLE": {"min": 0, "max": 200},
    "DEF_30_CNT_SOCIAL_CIRCLE": {"min": 0, "max": 20},
    "OBS_60_CNT_SOCIAL_CIRCLE": {"min": 0, "max": 200},
    "DEF_60_CNT_SOCIAL_CIRCLE": {"min": 0, "max": 20},
}

# Step 1: apply the range constraints to the raw test frame
clean_app_test_df = build_features.drop_impossible_values(
    app_test_df, constraints=test_value_constraints
)

# Step 2: outlier treatment on the region population ratio only
clean_app_test_df = build_features.drop_outliers(
    clean_app_test_df, columns=["REGION_POPULATION_RELATIVE"]
)

### Look at categorical variables


In [None]:
# Categorical / boolean columns to inspect, one bar chart each
categorical_columns_to_plot = [
    "NAME_CONTRACT_TYPE",
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "OCCUPATION_TYPE",
    "FLAG_MOBIL",
]

# Bar charts of the cleaned training data, split by the TARGET value
visualize.plot_categories_bars(
    clean_app_train_df,
    plot_columns=categorical_columns_to_plot,
    categorical_column="TARGET",
)


#### One Hot Encoding

The categorical variables have no natural order (no ordinal data), so one-hot encoding is preferable to label encoding.


In [None]:
# One-hot encode the categorical variables of both cleaned frames;
# the generated dummy columns are stored as booleans
encoded_app_train_df = pd.get_dummies(clean_app_train_df, dtype=bool)
encoded_app_test_df = pd.get_dummies(clean_app_test_df, dtype=bool)

# Set the labels aside before aligning: the inner join on columns removes any
# column that is missing from the test frame (presumably TARGET among them)
train_labels = encoded_app_train_df["TARGET"]

# Keep only the columns present in both frames (axis=1: rows are not aligned)
encoded_app_train_df, encoded_app_test_df = encoded_app_train_df.align(
    encoded_app_test_df,
    join="inner",
    axis=1,
)

# Put the labels back on the training frame
encoded_app_train_df["TARGET"] = train_labels

print('Training Features shape: ', encoded_app_train_df.shape)
print('Testing Features shape: ', encoded_app_test_df.shape)


#### Features selection

Keep the variables that are not highly correlated with one another and that are at least slightly correlated with TARGET.

In [None]:
# Pairwise correlation matrix of the encoded training frame (TARGET included)
corrs_app_train_df = encoded_app_train_df.corr()

# Render the matrix as a square heatmap
heatmap_side_px = 1200
fig = px.imshow(
    corrs_app_train_df,
    title="Correlations between features",
    width=heatmap_side_px,
    height=heatmap_side_px,
)
fig.show()

In [None]:
# Cut-offs for the two screens below
HIGH_PAIR_CORRELATION = 0.99  # |corr| above this: the pair is considered redundant
LOW_TARGET_CORRELATION = 0.001  # |corr with TARGET| below this: considered uninformative

# --- Screen 1: pairs of features that are almost perfectly correlated ---
corr_matrix = corrs_app_train_df.to_numpy()
corr_labels = corrs_app_train_df.columns

# Keep the strict upper triangle only (k=1): each unordered pair is visited
# once and the diagonal (self-correlation) is skipped. NaN correlations
# compare as False and are therefore never flagged.
strong_pairs = np.triu(np.abs(corr_matrix) > HIGH_PAIR_CORRELATION, k=1)
pair_rows, pair_cols = np.nonzero(strong_pairs)  # row-major order, like a nested loop

for i, j in zip(pair_rows, pair_cols):
    print(
        "Highly correlated pair : ",
        corr_labels[i],
        corr_labels[j],
        corr_matrix[i, j],
    )

# First member of every flagged pair, each listed once in first-seen order
# (a column with several partners used to be appended once per partner).
highly_correlated_columns = list(dict.fromkeys(corr_labels[pair_rows]))

# --- Screen 2: features with (almost) no linear correlation to TARGET ---
# Row "TARGET" of the matrix holds the correlation of every column with TARGET;
# TARGET itself is excluded from the screen.
target_corrs = corrs_app_train_df.loc["TARGET"].drop("TARGET")
weak_target_corrs = target_corrs[target_corrs.abs() < LOW_TARGET_CORRELATION]

highly_decorrelated_from_target_columns = weak_target_corrs.index.tolist()
for col in highly_decorrelated_from_target_columns:
    print(
        "Highly de-correlated from TARGET : ",
        col,
        weak_target_corrs.loc[col],
    )


---
---
---
---
---