# 3. Exploring the Data

Exploratory Data Analysis of the flagship dataset: **application_train.csv**. The goal of the analysis is to gain insights for a first iteration of data preparation.

## Preparing our toolbox 🧰

In [234]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from src.data.explore_data import (
        list_datasets, 
        describe_feature, 
        overview_data, 
        create_dataframe, 
        describe_features, 
        create_exploratory_dataset
)
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from imblearn import FunctionSampler
from imblearn.pipeline import Pipeline as imbpipeline
from feature_engine.selection import DropFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [172]:
# Load the training applications table through the project helper.
# NOTE(review): presumably 'raw' selects the raw-data folder and the second
# argument is the file name inside it -- confirm against
# src.data.explore_data.create_dataframe.
df = create_dataframe('raw', 'application_train.csv')

In [235]:
# Separate the predictors from the TARGET label; both are copies, so later
# edits never touch `df`.
X = df.drop(columns='TARGET').copy()
y = df['TARGET'].copy()

# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [194]:
# Features whose percentage of missing values is higher than 30%.
# The housing measurements come in three parallel families (_AVG, _MODE,
# _MEDI) built from the same stems, so those names are generated rather than
# spelled out one by one; the resulting order is family by family.
_housing_stems = [
    'APARTMENTS',
    'BASEMENTAREA',
    'YEARS_BEGINEXPLUATATION',
    'YEARS_BUILD',
    'COMMONAREA',
    'ELEVATORS',
    'ENTRANCES',
    'FLOORSMAX',
    'FLOORSMIN',
    'LANDAREA',
    'LIVINGAPARTMENTS',
    'LIVINGAREA',
    'NONLIVINGAPARTMENTS',
    'NONLIVINGAREA',
]

features_to_drop = (
    ['OWN_CAR_AGE', 'OCCUPATION_TYPE', 'EXT_SOURCE_1']
    + [
        f'{stem}_{suffix}'
        for suffix in ('AVG', 'MODE', 'MEDI')
        for stem in _housing_stems
    ]
    + [
        'FONDKAPREMONT_MODE',
        'HOUSETYPE_MODE',
        'TOTALAREA_MODE',
        'WALLSMATERIAL_MODE',
        'EMERGENCYSTATE_MODE',
    ]
)

In [195]:
def drop_hidden_string_nans(X, y):
    """Drop the rows whose ``CODE_GENDER`` is the placeholder string ``'XNA'``.

    Intended to be wrapped in ``imblearn.FunctionSampler`` so that the same
    rows are removed from the features and from the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; must contain a ``CODE_GENDER`` column.
    y : pandas.Series
        Target values, indexed like ``X``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The filtered features and target. Neither input is modified.
    """
    keep = X['CODE_GENDER'] != 'XNA'
    # Select from y by label with the same mask instead of dropping in place,
    # so the caller's Series is left untouched.
    return X.loc[keep], y.loc[keep]

drop_hidden_nans = FunctionSampler(func=drop_hidden_string_nans, validate=False)
drop_high_nan_percentage = DropFeatures(features_to_drop)

pipeline = imbpipeline(steps=[('oi', drop_hidden_nans), ('tudo', drop_high_nan_percentage)])
print(X_train.shape,y_train.shape)
# Pipeline.fit_transform returns only the transformed X; the resampled y stays
# inside the pipeline. Run the sampler explicitly first so that X_train and
# y_train are filtered together and stay aligned.
X_train, y_train = drop_hidden_nans.fit_resample(X_train, y_train)
# Fitting the pipeline re-runs the sampler (now a no-op, no 'XNA' rows remain)
# and fits/applies DropFeatures; the fitted pipeline is reused on X_valid.
X_train = pipeline.fit_transform(X_train, y_train)
print(X_train.shape,y_train.shape)

(196806, 121) (196806,)
(196804, 71) (196804,)


In [197]:
# Apply the already-fitted pipeline to the validation split. Only the column
# drop changes the shape here: the row count stays the same in the output
# below, i.e. no rows are removed at transform time.
print(f'{X_valid.shape} {y_valid.shape}')
X_valid = pipeline.transform(X_valid)
print(f'{X_valid.shape} {y_valid.shape}')

(49202, 121) (49202,)
(49202, 71) (49202,)


In [243]:
# CODE_GENDER hides missing values behind the literal string 'XNA'. Treat that
# string as the missing marker and fill it with the column's most frequent value.
cols = ['CODE_GENDER']

impute_hidden = SimpleImputer(strategy='most_frequent', missing_values='XNA')
string_pipe = imbpipeline(steps=[('eai', impute_hidden)])
preprocessor = ColumnTransformer(
    transformers=[('str', string_pipe, cols)],
    remainder='passthrough',
)

# Try the preprocessor on a two-column slice: the imputed column plus the ID,
# which is passed through untouched.
X_trained = X_train.loc[:, ['SK_ID_CURR', 'CODE_GENDER']].copy()
X_trained = preprocessor.fit_transform(X_trained, y_train)

In [251]:
# The ColumnTransformer emits the transformed columns first, so position 0 is
# the imputed CODE_GENDER; no 'XNA' category should remain in the counts.
oi = pd.DataFrame(data=X_trained)
oi.loc[:, 0].value_counts()

F    129395
M     67411
Name: 0, dtype: int64