In [1]:
import os
import pandas as pd
import numpy  as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
DATA_REPOSITORY= '../data'
RAW_DATA_FILENAME= 'raw_modeling_data.csv'

In [4]:
RAW_DATA_PATH= os.path.join(
    DATA_REPOSITORY,
    RAW_DATA_FILENAME
)

# Feature Engineering 

### Load raw_modeling_data into a pd.DataFrame object 

In [5]:
# Load the raw modeling CSV: the first row holds the column names and
# ID_CLIENT is used as the row index.
raw_modeling_data= pd.read_csv(
    filepath_or_buffer= RAW_DATA_PATH,
    index_col= 'ID_CLIENT',
    header= 0
)

In [6]:
# Fixed seed so the preview shows the same five rows on every run.
raw_modeling_data.sample(5, random_state= 42)

Unnamed: 0_level_0,CLERK_TYPE,PAYMENT_DAY,APPLICATION_SUBMISSION_TYPE,QUANT_ADDITIONAL_CARDS,POSTAL_ADDRESS_TYPE,SEX,MARITAL_STATUS,QUANT_DEPENDANTS,CLI_EDUCATION_LEVEL,STATE_OF_BIRTH,...,FLAG_HOME_ADDRESS_DOCUMENT,FLAG_RG,FLAG_CPF,FLAG_INCOME_PROOF,PRODUCT,FLAG_ACSP_RECORD,AGE,RESIDENCIAL_ZIP_3,PROFESSIONAL_ZIP_3,TARGET_LABEL_BAD=1
ID_CLIENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3924,C,20,Web,0,1,F,7,0,0,AL,...,0,0,0,0,1,N,38,570,570,0
2933,C,10,0,0,1,F,1,0,0,PE,...,0,0,0,0,1,N,31,563,563,1
1658,C,5,0,0,1,M,1,0,0,BA,...,0,0,0,0,1,N,29,456,456,0
21480,C,25,Carga,0,1,F,1,0,0,RJ,...,0,0,0,0,1,N,48,244,244,1
39900,C,25,Web,0,1,M,1,2,0,PE,...,0,0,0,0,1,N,22,555,555,0


### Understanding base columns description

First thing we will do is to check:
- The data type of each column
- The number of unique values for each column
- The number of missing values for each column
- The percentage of missing values over the total number of records in the **raw_modeling_data** dataframe

In [7]:
def get_columns_metrics(df: pd.DataFrame, data_types: bool = True, unique_values: bool = True,null_values_q: bool = True, null_values_p: bool = True, numerator:int = None):
    """Build a per-column summary table for ``df``.

    The result has one row per column of ``df`` and one column per requested
    metric.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to describe. It is only read, never modified.
    data_types : bool
        Include the dtype of each column ("Data type").
    unique_values : bool
        Include the number of distinct non-null values ("# Unique Values").
    null_values_q : bool
        Include the count of missing values ("# NULL Values").
    null_values_p : bool
        Include the percentage of missing values over the number of rows,
        rounded to two decimals ("% NULL Values").
    numerator : int, optional
        When given, every metric name gets a `` DFnn`` suffix (zero-padded to
        two digits) so summaries of several frames can be placed side by side.

    Returns
    -------
    pd.DataFrame
        The requested metrics, indexed by the column names of ``df``. When
        every metric flag is False an empty frame (no columns) with that same
        index is returned instead of letting ``pd.concat`` raise.
    """
    suffix= "" if numerator is None else f" DF{numerator:02d}"
    metrics= []

    if data_types:
        metrics.append(df.dtypes.rename(f"Data type{suffix}"))

    if unique_values:
        metrics.append(df.nunique().rename(f"# Unique Values{suffix}"))

    if null_values_q or null_values_p:
        # Count the missing values once and derive both null metrics from it.
        null_counts= df.isna().sum()

        if null_values_q:
            metrics.append(null_counts.rename(f"# NULL Values{suffix}"))

        if null_values_p:
            null_percentages= np.round(null_counts / df.shape[0] * 100, 2)
            metrics.append(null_percentages.rename(f"% NULL Values{suffix}"))

    if not metrics:
        # pd.concat refuses an empty list; return an empty summary instead.
        return pd.DataFrame(index= df.columns)

    return pd.concat(
        objs= metrics,
        axis= 1
    )

In [8]:
columns_metrics= get_columns_metrics(raw_modeling_data,)

In [9]:
columns_metrics

Unnamed: 0,Data type,# Unique Values,# NULL Values,% NULL Values
CLERK_TYPE,object,1,0,0.0
PAYMENT_DAY,int64,6,0,0.0
APPLICATION_SUBMISSION_TYPE,object,3,0,0.0
QUANT_ADDITIONAL_CARDS,int64,1,0,0.0
POSTAL_ADDRESS_TYPE,int64,2,0,0.0
SEX,object,4,0,0.0
MARITAL_STATUS,int64,8,0,0.0
QUANT_DEPENDANTS,int64,17,0,0.0
CLI_EDUCATION_LEVEL,int64,1,0,0.0
STATE_OF_BIRTH,object,29,0,0.0


For starters, we can see there are a few columns that can be dropped because:
1. They hold a single unique value and have no missing values, i.e. they are constant -> they add no value for the predictive model or the exploratory analysis.

For each reason we will create a mask to detect which columns to drop.

In [10]:
# Mask 01: constant columns, i.e. exactly one unique value and no missing values.
drop_col_mask_01= (
        (columns_metrics["# Unique Values"] == 1) 
    &   (columns_metrics["# NULL Values"] == 0)
)

In [11]:
cols_to_drop= list(columns_metrics[drop_col_mask_01].index)

In [12]:
cols_to_drop

['CLERK_TYPE',
 'QUANT_ADDITIONAL_CARDS',
 'CLI_EDUCATION_LEVEL',
 'FLAG_MOBILE_PHONE',
 'FLAG_HOME_ADDRESS_DOCUMENT',
 'FLAG_RG',
 'FLAG_CPF',
 'FLAG_INCOME_PROOF',
 'FLAG_ACSP_RECORD']

Let's now take a look at the modeling data after removing the columns that match any of the mask conditions.

In [13]:
cols_to_keep= [col for col in raw_modeling_data.columns if col not in cols_to_drop]

In [14]:
columns_metrics= get_columns_metrics(raw_modeling_data[cols_to_keep])

In [15]:
columns_metrics

Unnamed: 0,Data type,# Unique Values,# NULL Values,% NULL Values
PAYMENT_DAY,int64,6,0,0.0
APPLICATION_SUBMISSION_TYPE,object,3,0,0.0
POSTAL_ADDRESS_TYPE,int64,2,0,0.0
SEX,object,4,0,0.0
MARITAL_STATUS,int64,8,0,0.0
QUANT_DEPENDANTS,int64,17,0,0.0
STATE_OF_BIRTH,object,29,0,0.0
CITY_OF_BIRTH,object,9910,0,0.0
NACIONALITY,int64,3,0,0.0
RESIDENCIAL_STATE,object,27,0,0.0


## Analysing expected value

In this section, we will analyze the expected values in order to detect inconsistencies between what the data contains and what the dataset description states.

### Retrieve the Variables Description dataset 

In [16]:
variables_data_path= os.path.join(
    DATA_REPOSITORY,
    'var_descriptions.csv'
)

In [17]:
df_variables= pd.read_csv(
    filepath_or_buffer= variables_data_path
).set_index('Var_Title')


Let's take a look at the data in the Field_Content column after a few conversions.

In [18]:
def f_field_content_prep(x):
    """Split a raw field-content description into its comma-separated tokens.

    The value is converted to ``str``; an ellipsis ('...') becomes the token
    'etc', every remaining '.' is treated as a separator (turned into ','),
    and the text is then split on ','. Tokens are returned unstripped.
    """
    text= str(x)
    text= text.replace('...', 'etc')
    text= text.replace('.', ',')
    return text.split(',')

In [19]:
from typing import List 

In [20]:
def data_conversion(values: List[str]):
    """Normalise the raw tokens of a field-content description.

    Each token is mapped, in this order of precedence:
    - a decimal string (ignoring surrounding whitespace) -> ``int``
    - 'NULL' (any case, ignoring surrounding whitespace) -> ``np.nan``
    - a token containing '=male' (any case) -> 'M'
    - a token containing '=female' (any case) -> 'F'
    - anything else -> the token with surrounding whitespace stripped

    Returns a new list with one converted entry per input token.
    """
    def _convert(raw):
        token= raw.strip()
        if token.isdecimal():
            return int(raw)
        if token.upper() == 'NULL':
            return np.nan

        lowered= raw.lower()
        if '=male' in lowered:
            return 'M'
        if '=female' in lowered:
            return 'F'
        return token

    return [_convert(value) for value in values]

In [21]:
# For every variable with a documented Field_Content, tokenise the
# description and convert the tokens to comparable Python values.
var_content= (
    df_variables
    .loc[df_variables['Field_Content'].notna(), 'Field_Content']
    .apply(f_field_content_prep)
    .apply(data_conversion)
    .to_frame()
)

In [22]:
# True for variables whose token list contains an 'etc' or 'XX' token.
mask_series= var_content.iloc[:, 0].apply(
    lambda elements: any(marker in elements for marker in ('etc', 'XX'))
)

In [23]:
# Variable name -> list of admitted values, for the variables not flagged by mask_series.
var_content_dict= var_content.loc[~mask_series, 'Field_Content'].to_dict()

In [24]:
var_content_dict

{'ID_CLIENT': ['1-50000', '50001-70000', '70001-90000'],
 'CLERK_TYPE': ['C'],
 'PAYMENT_DAY': [1, 5, 10, 15, 20, 25],
 'APPLICATION_SUBMISSION_TYPE': ['Web', 'Carga'],
 'QUANT_ADDITIONAL_CARDS': [1, 2, nan],
 'POSTAL_ADDRESS_TYPE': [1, 2],
 'SEX': ['M', 'F'],
 'MARITAL_STATUS': [1, 2, 3, 4, 5, 6, 7],
 'CLI_EDUCATION_LEVEL': [1, 2, 3, 4, 5],
 'NACIONALITY': [0, 1, 2],
 'FLAG_RESIDENCIAL_PHONE': ['Y', 'N'],
 'RESIDENCE_TYPE': [1, 2, 3, 4, 5, nan],
 'FLAG_MOBILE_PHONE': ['Y', 'N'],
 'FLAG_EMAIL': [0, 1],
 'FLAG_VISA': [0, 1],
 'FLAG_MASTERCARD': [0, 1],
 'FLAG_DINERS': [0, 1],
 'FLAG_AMERICAN_EXPRESS': [0, 1],
 'FLAG_OTHER_CARDS': [0, 1, nan],
 'QUANT_BANKING_ACCOUNTS': [0, 1, 2],
 'QUANT_SPECIAL_BANKING_ACCOUNTS': [0, 1, 2],
 'COMPANY': ['Y', 'N'],
 'FLAG_PROFESSIONAL_PHONE': ['Y', 'N'],
 'OCCUPATION_TYPE': [1, 2, 3, 4, 5, nan],
 'MATE_EDUCATION_LEVEL': [1, 2, 3, 4, 5],
 'FLAG_HOME_ADDRESS_DOCUMENT': [0, 1],
 'FLAG_RG': [0, 1],
 'FLAG_CPF': [0, 1],
 'FLAG_INCOME_PROOF': [0, 1],
 'PRODUC

In [25]:
def restrinct_to_expected_var_content(data: pd.DataFrame, expected_var_content: dict, verbose:bool = False):
    """Replace values that are not admitted for a column with NaN.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to clean. It is not modified; a cleaned copy is returned.
    expected_var_content : dict
        Maps a column name to the list of values admitted for that column.
        Columns of ``data`` that are not keys of this dict are left untouched.
    verbose : bool
        When True, print a header for every checked column, followed by the
        distinct non-null values that were rejected (if any).

    Returns
    -------
    pd.DataFrame
        Copy of ``data`` where every value outside the admitted list of its
        column has been replaced by NaN.
    """
    data= data.copy()

    for column in data.columns:
        # Use the mapping passed by the caller, not a notebook-level global.
        if column not in expected_var_content:
            continue

        if verbose:
            print(f"--- {column.center(10)} ---")

        # Vectorised membership test, replacing a Python loop over every row.
        not_admitted_mask= ~data[column].isin(expected_var_content[column])

        if verbose:
            # Missing values are not reported: they stay missing either way.
            not_admitted_found= data.loc[not_admitted_mask, column].dropna().unique().tolist()
            if not_admitted_found:
                print(f"! {not_admitted_found} !")

        data[column]= data[column].mask(not_admitted_mask)

    return data

In [26]:
raw_modeling_data_mod= restrinct_to_expected_var_content(
    data= raw_modeling_data, 
    expected_var_content= var_content_dict
)

In [27]:
# Percentage of nulls before (DF01) and after (DF02) restricting values to
# the expected content, over the kept columns only.
null_analysis= [
    get_columns_metrics(frame, numerator= position, data_types= False, unique_values= False, null_values_q= False)
    for position, frame in enumerate(
        [raw_modeling_data[cols_to_keep], raw_modeling_data_mod[cols_to_keep]],
        start= 1
    )
]

pd.concat(null_analysis, axis= 1)

Unnamed: 0,% NULL Values DF01,% NULL Values DF02
PAYMENT_DAY,0.0,0.0
APPLICATION_SUBMISSION_TYPE,0.0,38.92
POSTAL_ADDRESS_TYPE,0.0,0.0
SEX,0.0,0.13
MARITAL_STATUS,0.0,0.4
QUANT_DEPENDANTS,0.0,0.0
STATE_OF_BIRTH,0.0,0.0
CITY_OF_BIRTH,0.0,0.0
NACIONALITY,0.0,0.0
RESIDENCIAL_STATE,0.0,0.0


### Expected values

What we can see is that, if we restrict values to what the variable descriptions allow, the percentage of values to impute increases for some fields. Although these percentages may be high, they are not high enough to justify discarding the fields. During the training process we will iterate on the imputation strategy, or on which fields to discard, but for now they will stay in the dataset.

We will go on with **raw_modeling_data_mod**, without the unwanted columns.

In [28]:
raw_modeling_data_mod= raw_modeling_data_mod[cols_to_keep]

### Convert object columns to uppercase and replace blank (single-space) strings with NaNs

In [29]:
# Compute the column summary once and reuse it to select the object-typed columns.
df_col_metrics= get_columns_metrics(raw_modeling_data_mod)
object_columns= df_col_metrics[df_col_metrics["Data type"] == 'object'].index

In [30]:
df_aux= raw_modeling_data_mod.copy()

In [31]:
# Upper-case every string value, then turn the single-space string into NaN.
# NOTE(review): only the exact value " " is treated as missing; '' or longer
# runs of blanks are left as they are — confirm this matches the data.
for col in object_columns:
    df_aux[col]= (
        df_aux[col]
        .map(lambda s: s.upper() if isinstance(s, str) else s)
        .map(lambda s: np.nan if s == " " else s)
    )

In [32]:
# Percentage of nulls before (DF01) and after (DF02) the upper-casing /
# blank-to-NaN step.
objs= [
    get_columns_metrics(
        frame, data_types= False, unique_values= False, null_values_q= False, numerator= position
    )
    for position, frame in enumerate([raw_modeling_data_mod, df_aux], start= 1)
]

pd.concat(
    objs= objs,
    axis= 1
)

Unnamed: 0,% NULL Values DF01,% NULL Values DF02
PAYMENT_DAY,0.0,0.0
APPLICATION_SUBMISSION_TYPE,38.92,38.92
POSTAL_ADDRESS_TYPE,0.0,0.0
SEX,0.13,0.13
MARITAL_STATUS,0.4,0.4
QUANT_DEPENDANTS,0.0,0.0
STATE_OF_BIRTH,0.0,4.13
CITY_OF_BIRTH,0.0,4.13
NACIONALITY,0.0,0.0
RESIDENCIAL_STATE,0.0,0.0


We can see that, after these changes, there is a high percentage of null values in the profession-related columns.

It is not a good idea to get rid of those columns: the null values may correspond to people without a job, and the columns can hold valuable information for applicants who do have one.

Let's take a look at the variable descriptions for those columns.

In [33]:
prof_related_columns= [
    'COMPANY', 'PROFESSIONAL_STATE', 'PROFESSIONAL_CITY',
   'PROFESSIONAL_BOROUGH', 'FLAG_PROFESSIONAL_PHONE',
   'PROFESSIONAL_PHONE_AREA_CODE', 'MONTHS_IN_THE_JOB',
]

In [34]:
# Print the documented description of every profession-related column.
# Series.items() is used because Series.iteritems() no longer exists in pandas 2.x.
for idx, value in df_variables.loc[prof_related_columns, 'Var_Description'].items():
    print(
        f"--- {idx.center(10)} ---",
        value,
        sep= '\n',
        end="\n"*2
    )

---  COMPANY   ---
If the applicant has supplied the name of the company where he/she formally works

--- PROFESSIONAL_STATE ---
State where the applicant works

--- PROFESSIONAL_CITY ---
City where the applicant works

--- PROFESSIONAL_BOROUGH ---
Borough where the applicant works

--- FLAG_PROFESSIONAL_PHONE ---
Indicates if the professional phone number was supplied

--- PROFESSIONAL_PHONE_AREA_CODE ---
Three-digit pseudo-code

--- MONTHS_IN_THE_JOB ---
Time in the current job in months



As expected, almost all of these columns relate to the applicant's profession, and **COMPANY** is a flag column which states whether the applicant has provided the name of the company where they work.

Let's group the columns by COMPANY (Y, N) and check the percentage of null values.

In [35]:
get_columns_metrics(df_aux).loc[prof_related_columns[1:], :]

Unnamed: 0,Data type,# Unique Values,# NULL Values,% NULL Values
PROFESSIONAL_STATE,object,27,34307,68.61
PROFESSIONAL_CITY,object,1648,34114,68.23
PROFESSIONAL_BOROUGH,object,4239,34713,69.43
FLAG_PROFESSIONAL_PHONE,object,2,0,0.0
PROFESSIONAL_PHONE_AREA_CODE,object,86,36532,73.06
MONTHS_IN_THE_JOB,int64,21,0,0.0


In [36]:
# Number of rows in each COMPANY group for every profession-related column
# except COMPANY itself; transposed so the columns become rows.
total_values= df_aux.groupby('COMPANY').agg(
    {key: (lambda x: x.shape[0]) for key in prof_related_columns[1:]}
).T
# NOTE(review): this sets a plain `name` attribute on the result; confirm it is needed.
total_values.name= 'TOTAL'

In [37]:
# Percentage of missing values (rounded to two decimals) in each COMPANY
# group for every profession-related column except COMPANY itself;
# transposed so the columns become rows.
null_values= df_aux.groupby('COMPANY').agg(
    {key: (lambda x: np.round(x.isna().sum() / x.shape[0] * 100, 2) ) for key in prof_related_columns[1:]}
).T

In [38]:
null_values.columns = [f"% NULL | {value}" for value in null_values.columns]

In [39]:
null_values

Unnamed: 0,% NULL | N,% NULL | Y
PROFESSIONAL_STATE,97.05,32.55
PROFESSIONAL_CITY,97.03,31.69
PROFESSIONAL_BOROUGH,97.15,34.26
FLAG_PROFESSIONAL_PHONE,0.0,0.0
PROFESSIONAL_PHONE_AREA_CODE,96.59,43.22
MONTHS_IN_THE_JOB,0.0,0.0


When the applicant hasn't supplied the name of the company where he/she formally works, the profession-related features are almost always null, which makes sense. On the other hand, even when the applicant has supplied it, a considerable percentage of values is still NULL and will need to be imputed.

In [40]:
# Start the next stage of the data from df_aux without two mate-related columns.
# NOTE(review): the variable descriptions list 'MATE_EDUCATION_LEVEL', while the
# column dropped here is spelled 'MAT_EDUCATION_LEVEL' — confirm the name used in the CSV.
raw_modeling_data_mod_02= df_aux.drop(
columns= ['MATE_PROFESSION_CODE', 'MAT_EDUCATION_LEVEL'])

In [41]:
get_columns_metrics(raw_modeling_data_mod_02[null_values.index.to_list()])

Unnamed: 0,Data type,# Unique Values,# NULL Values,% NULL Values
PROFESSIONAL_STATE,object,27,34307,68.61
PROFESSIONAL_CITY,object,1648,34114,68.23
PROFESSIONAL_BOROUGH,object,4239,34713,69.43
FLAG_PROFESSIONAL_PHONE,object,2,0,0.0
PROFESSIONAL_PHONE_AREA_CODE,object,86,36532,73.06
MONTHS_IN_THE_JOB,int64,21,0,0.0


---

Let's check the percentage of values in the top 10 most frequent categories for the categorical features.

A feature will be considered categorical if the column data type is object or the number of unique values is at most 20.

In [42]:
df_col_metrics= get_columns_metrics(raw_modeling_data_mod_02)

In [43]:
# Categorical features: object-typed columns or columns with at most 20 unique values.
# NOTE(review): the trailing [:-1] drops the last matching column — presumably the
# target, assuming it is the last column of the frame and matches the rule; verify.
cat_features= df_col_metrics[
        (df_col_metrics['Data type'] == 'object')
    |   (df_col_metrics['# Unique Values'] <= 20)
].index.to_list()[:-1]

# Numerical features: every column except the last one that is not categorical.
num_features= [col for col in raw_modeling_data_mod_02.columns[:-1] if col not in cat_features]

In [44]:
# Store every categorical feature with the pandas 'category' dtype.
raw_modeling_data_mod_02[cat_features]= raw_modeling_data_mod_02[cat_features].astype('category')


In [51]:
def remove_unncessary_columns(df, per_top_value_threshold= 98, per_low_top_ten_threshold= 60, cat_features: list= None):
    """Flag categorical features that are nearly constant or too scattered.

    For every categorical feature the cumulative share (in %, rounded to two
    decimals) of its ten most frequent values is computed, counting missing
    values as a value of their own. A feature is flagged when

    - its most frequent value alone covers at least
      ``per_top_value_threshold`` percent of the rows, or
    - it has at least ten distinct values and the ten most frequent ones
      together cover at most ``per_low_top_ten_threshold`` percent of the rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to analyse. It is only read, never modified.
    per_top_value_threshold : float
        Minimum share (%) of the top value for a feature to be flagged.
    per_low_top_ten_threshold : float
        Maximum cumulative share (%) of the top ten values for a feature
        to be flagged.
    cat_features : list, optional
        Features to analyse. When it is not a list, every column with
        'object' or 'category' dtype is used.

    Returns
    -------
    (pd.DataFrame, list)
        The cumulative shares of the flagged features (one row per feature,
        one column per rank 0..9) and the list of flagged feature names.
    """
    if cat_features is None or not isinstance(cat_features, list):
        cat_features= df\
            .select_dtypes(include= ['object', 'category'])\
            .columns\
            .to_list()

    if not cat_features:
        # Nothing to analyse, hence nothing to flag.
        return pd.DataFrame(), []

    # Key each series by its feature name explicitly: pandas 2.x names the
    # value_counts(normalize=True) result 'proportion', not after the column.
    cumulative_shares= {}
    for feature in cat_features:
        shares= df[feature].value_counts(
            normalize= True,
            sort= True,
            ascending= False,
            dropna= False
        )
        cumulative_shares[feature]= np.round(
            shares\
                .head(10)\
                .cumsum(skipna= False)\
                .reset_index(drop= True) * 100,
            decimals= 2
        )

    # Rows are ranks 0..9. Forcing all ten ranks to exist keeps the rank-9
    # lookup valid even when no feature has ten distinct values.
    result= pd.DataFrame(cumulative_shares).reindex(range(10))

    top_value_share= result.iloc[0]
    top_ten_share= result.iloc[9]

    result= result.T[
            (top_value_share >= per_top_value_threshold)
        |   (
                top_ten_share.notna()
            &   (top_ten_share <= per_low_top_ten_threshold)
        )
    ]

    output= result.index.to_list()

    return result, output

In [52]:
result, output= remove_unncessary_columns(df= raw_modeling_data_mod_02, cat_features= cat_features)

In [53]:
columns_to_keep= [col for col in raw_modeling_data_mod_02.columns if col not in output]

In [55]:
raw_modeling_data_mod_02= raw_modeling_data_mod_02[columns_to_keep]

In [56]:
df_col_metrics= get_columns_metrics(raw_modeling_data_mod_02)

In [57]:
df_col_metrics

Unnamed: 0,Data type,# Unique Values,# NULL Values,% NULL Values
PAYMENT_DAY,category,6,0,0.0
APPLICATION_SUBMISSION_TYPE,category,2,19461,38.92
SEX,category,2,65,0.13
MARITAL_STATUS,category,7,202,0.4
QUANT_DEPENDANTS,category,17,0,0.0
STATE_OF_BIRTH,category,28,2064,4.13
NACIONALITY,category,3,0,0.0
RESIDENCIAL_STATE,category,27,0,0.0
FLAG_RESIDENCIAL_PHONE,category,2,0,0.0
RESIDENCE_TYPE,category,5,2109,4.22


In [58]:
# Recompute the feature lists on the filtered frame: categorical features are
# 'category'-typed columns or columns with at most 20 unique values.
# NOTE(review): the trailing [:-1] drops the last matching column — presumably the
# target, assuming it is the last column of the frame and matches the rule; verify.
cat_features= df_col_metrics[
        (df_col_metrics['Data type'] == 'category')
    |   (df_col_metrics['# Unique Values'] <= 20)
].index.to_list()[:-1]

# Numerical features: every column except the last one that is not categorical.
num_features= [col for col in raw_modeling_data_mod_02.columns[:-1] if col not in cat_features]

In [59]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [61]:
SimpleImputer?

In [68]:
# Split the profession-related columns out of the generic feature lists.
# Order-preserving comprehensions are used instead of set arithmetic: the
# iteration order of a set of strings can differ between interpreter runs,
# which would change the column order of the transformed matrix.
# NOTE: this cell overwrites cat_features / num_features, so it must be run
# exactly once after the cell that defines them.
prof_cat_features= [col for col in cat_features if col in prof_related_columns]
prof_num_features= [col for col in num_features if col in prof_related_columns]
cat_features= [col for col in cat_features if col not in prof_cat_features]
num_features= [col for col in num_features if col not in prof_num_features]

In [72]:
# Numerical features: missing values are replaced by the column median.
median_imputer= SimpleImputer(
    strategy= 'median', 
    missing_values= pd.NA
)

# Generic categorical features: missing values are replaced by the most frequent value.
# NOTE(review): per the scikit-learn SimpleImputer docs, fill_value is only used
# with strategy='constant', so 'N' should have no effect here — confirm the intent.
categorical_imputer= SimpleImputer(
    strategy= 'most_frequent',
    missing_values= pd.NA,
    fill_value= 'N'
)

# Profession-related categorical features: missing values are replaced by a constant.
# No fill_value is given, so the imputer falls back to its documented default.
constant_imputer= SimpleImputer(
    strategy= "constant",
    missing_values= pd.NA,
    
)



# One encoder instance per pipeline.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
# this spelling only works on older releases — confirm the pinned version.
one_hot_encoder= OneHotEncoder(
    drop= 'first',
    handle_unknown= 'ignore',
    sparse= False
)
one_hot_encoder_prof= OneHotEncoder(
    drop= 'first',
    handle_unknown= 'ignore',
    sparse= False
)

# Generic categorical features: most-frequent imputation, then one-hot encoding.
categorical_pipeline= Pipeline(
    steps= [
        ('categorical_imputer', categorical_imputer),
        ('one_hot_encoder', one_hot_encoder)
    ]
)
standard_scaler=  StandardScaler()

# Numerical features: median imputation, then standardisation.
numerical_pipeline= Pipeline(
    steps= [
        ('median_imputer', median_imputer),
        ('standard_scaler', standard_scaler)
    ]
)

# Profession-related categorical features: constant imputation, then one-hot encoding.
prof_categorical_pipeline= Pipeline(
    steps= [
        ('constant_imputer', constant_imputer),
        ('one_hot_encoder_prf', one_hot_encoder_prof)
    ]
)



In [73]:
# Route each group of columns to its pipeline; profession-related numerical
# features share the generic numerical pipeline, and any column not listed
# in one of the three groups is dropped.
col_transformer= ColumnTransformer(
    transformers= [
        ('cat_columns', categorical_pipeline, cat_features),
        ('num_columns', numerical_pipeline, num_features + prof_num_features),
        ('cat_prof_features', prof_categorical_pipeline, prof_cat_features)
    ], 
    remainder= 'drop',
    verbose= True
)

In [74]:
# Features are every column but the last; the last column is the target.
X= raw_modeling_data_mod_02.iloc[:, :-1]
y= raw_modeling_data_mod_02.iloc[:, -1]

In [76]:
# Hold out 20% of the rows (X_rest / y_rest); the remaining 80% is the training set.
X_train, X_rest, y_train, y_rest= train_test_split(X, y, test_size= 0.2, random_state= 42)

In [77]:
# Split the hold-out into validation and test sets.
# NOTE(review): test_size=0.2 is applied to the 20% hold-out, so validation is
# about 16% and test about 4% of all rows — confirm this is the intended ratio.
X_val, X_test, y_val, y_test= train_test_split(X_rest, y_rest, test_size= 0.2, random_state= 42)

In [80]:
col_transformer.fit(X_train, y= None)

[ColumnTransformer] ... (1 of 3) Processing cat_columns, total=   0.9s
[ColumnTransformer] ... (2 of 3) Processing num_columns, total=   0.0s
[ColumnTransformer]  (3 of 3) Processing cat_prof_features, total=   1.0s


In [86]:
X_train_transformed= col_transformer.transform(X_train)
X_val_transformed= col_transformer.transform(X_val)
X_test_transformed= col_transformer.transform(X_test)

In [91]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

In [93]:
# Seeded (same seed as the data splits) so the fitted forest and its
# predictions are reproducible across runs.
random_forest= RandomForestClassifier(random_state= 42)
random_forest.fit(X_train_transformed, y_train)

In [94]:
random_forest.predict(X_test_transformed)

array([0, 0, 0, ..., 0, 0, 0])

In [92]:
# 5-fold cross-validated accuracy (train and validation scores) on the
# transformed training set. The estimator is seeded so scores are
# reproducible across runs.
# NOTE(review): the features were transformed by a ColumnTransformer fitted on
# the whole training set, so the folds share imputation/scaling statistics.
cv_result= cross_validate(
    estimator= RandomForestClassifier(random_state= 42),
    X= X_train_transformed,
    y= y_train,
    scoring= 'accuracy',
    cv= 5,
    n_jobs= -1,
    return_train_score= True
)

KeyboardInterrupt: 