# This notebook covers early preprocessing steps to begin initial modeling.

## Preprocessing steps:
- Drop columns with more than 50% missing values
- Replace the sentinel value (365243) in DAYS_EMPLOYED with NaN
- Split the data frame into X/y training and validation sets (y contains only "TARGET", the default indicator)
- Impute missing numerical values in X train/valid
- One-hot encode the categorical columns in X train/valid

After these steps, the data will have gone through early preprocessing and be ready for ML modeling. This notebook represents a general data processing pipeline that will be followed in later notebooks and, eventually, integrated into a .py script.

In [2]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the interim application data.
# The location is resolved from the parent of the current working directory,
# so the notebook must be started one level below the folder that holds data/.
root = Path.cwd().parent
path = root.joinpath("data", "interim", "application_train.csv")

df = pd.read_csv(path)

In [4]:
# Work on an independent copy so that later transformations can never touch
# the raw frame `df`. A plain assignment (`df_processing = df`) would only
# create a second name for the same object and would not protect the original.
df_processing = df.copy()

In [5]:
# Fraction of missing values per column (0.0 = no NaN, 1.0 = entirely NaN)
missing = df_processing.isnull().mean()

In [6]:
# Boolean mask over the columns: True where more than 50% of the values are missing
cols_to_drop = missing.gt(0.5)

In [7]:
# Keep only the columns that are NOT flagged for removal.
# The explicit copy detaches the result from the frame it was sliced from.
keep_mask = ~cols_to_drop
df_processing = df_processing.loc[:, keep_mask].copy()

In [8]:
# DAYS_EMPLOYED uses 365243 as a sentinel value; blank those entries out with NaN
is_sentinel = df_processing["DAYS_EMPLOYED"].eq(365243)
df_processing["DAYS_EMPLOYED"] = df_processing["DAYS_EMPLOYED"].mask(is_sentinel)

In [9]:
# (rows, columns) of the frame after the high-missing columns were removed
df_processing.shape

(307511, 81)

In [10]:
# Preview the first five rows of the cleaned frame
df_processing.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Separate the features (X) from the label (y). "TARGET" is the label: it
# indicates a default within an account.
# The split is stratified on y so the training and validation sets keep a
# similar default rate.
# NOTE(review): SK_ID_CURR looks like a row identifier and remains in X as a
# feature - confirm that this is intended.

y = df_processing["TARGET"]
X = df_processing.drop(columns="TARGET")

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
    stratify=y,
)

# Sanity check: both splits should show a similar default rate (around 8.07%)
print(y_train.mean(), y_valid.mean())

0.08072908198107379 0.08072776937710356


In [12]:
# Collect the numeric column labels from the training features so that
# imputation is applied to numeric columns only
numeric_cols = X_train.select_dtypes(include="number").columns

In [13]:
# Median-impute NaN values in the numeric columns with scikit-learn's SimpleImputer.
# The imputer is fitted on the training split only; the validation split is
# filled with the statistics learned from training.
imputer = SimpleImputer(strategy="median")
imputer.fit(X_train[numeric_cols])

# Copy first so the assignments below write into independent frames
X_train, X_valid = X_train.copy(), X_valid.copy()

X_train[numeric_cols] = imputer.transform(X_train[numeric_cols])
X_valid[numeric_cols] = imputer.transform(X_valid[numeric_cols])

In [14]:
# Preview the training features after numeric imputation
X_train.head(5)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
161321,287018.0,Cash loans,F,N,N,1.0,270000.0,450000.0,22018.5,450000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
229430,365737.0,Cash loans,F,N,Y,1.0,157500.0,164952.0,13162.5,130500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0
74168,186010.0,Cash loans,F,N,Y,1.0,135000.0,509602.5,34605.0,387000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
244221,382678.0,Cash loans,M,N,Y,0.0,225000.0,765000.0,27234.0,765000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53353,161792.0,Cash loans,F,N,Y,0.0,72000.0,675000.0,21906.0,675000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [23]:
# Collect the categorical column labels so that one-hot encoding is applied to
# categorical columns only.
# Every non-numeric column is selected (the exact complement of the
# `np.number` selector used for imputation) instead of only "object" dtype:
# this way a column stored as bool, category or a dedicated string dtype cannot
# silently fall outside both groups and vanish from the final feature matrix.
categorical_cols = X_train.select_dtypes(exclude=[np.number]).columns

In [None]:
# Display the categorical column labels that will be one-hot encoded
categorical_cols

In [16]:
# Configure the one-hot encoder:
#   - handle_unknown="ignore": categories not seen during fit do not raise at transform time
#   - sparse_output=False: return a dense NumPy array rather than a sparse matrix
one_hot_encoding = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

In [17]:
# Learn the category levels from the training split only, then encode both splits
one_hot_encoding.fit(X_train[categorical_cols])

X_train_cat = one_hot_encoding.transform(X_train[categorical_cols])
X_valid_cat = one_hot_encoding.transform(X_valid[categorical_cols])

In [18]:
# Extract the numeric columns as NumPy arrays so they can be combined with the
# one-hot encoded categorical columns
X_train_num = X_train.loc[:, numeric_cols].to_numpy()
X_valid_num = X_valid.loc[:, numeric_cols].to_numpy()

In [19]:
# Join the numeric block and the one-hot encoded block column-wise to form the
# final (imputed and encoded) feature matrices
X_train_final = np.concatenate((X_train_num, X_train_cat), axis=1)
X_valid_final = np.concatenate((X_valid_num, X_valid_cat), axis=1)

In [20]:
# Sanity check: both feature matrices must have the same number of columns,
# and each must have one row per label in its split.
print(X_train_final.shape)
print(X_valid_final.shape)

# Fail loudly instead of relying on someone reading the printed shapes
assert X_train_final.shape[1] == X_valid_final.shape[1], "train/valid feature counts differ"
assert X_train_final.shape[0] == len(y_train), "X_train_final rows do not match y_train"
assert X_valid_final.shape[0] == len(y_valid), "X_valid_final rows do not match y_valid"

(246008, 196)
(61503, 196)


In [21]:
# Sanity check: no NaN values should remain in the final feature matrices
# (numeric NaNs were imputed earlier, they were not dropped).
nan_count_train = np.isnan(X_train_final).sum()
nan_count_valid = np.isnan(X_valid_final).sum()

print(nan_count_train)
print(nan_count_valid)

# Fail loudly instead of relying on someone reading the printed counts
assert nan_count_train == 0, "NaN values remain in X_train_final"
assert nan_count_valid == 0, "NaN values remain in X_valid_final"

0
0


In [22]:
# Preview X_train once more. Note that X_train itself holds the imputed numeric
# columns but still the original categorical columns; the one-hot encoded
# features live in X_train_final.
X_train.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
161321,287018.0,Cash loans,F,N,N,1.0,270000.0,450000.0,22018.5,450000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
229430,365737.0,Cash loans,F,N,Y,1.0,157500.0,164952.0,13162.5,130500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0
74168,186010.0,Cash loans,F,N,Y,1.0,135000.0,509602.5,34605.0,387000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
244221,382678.0,Cash loans,M,N,Y,0.0,225000.0,765000.0,27234.0,765000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53353,161792.0,Cash loans,F,N,Y,0.0,72000.0,675000.0,21906.0,675000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
