In [0]:
import pandas as pd

# Let wide frames render up to 500 columns instead of being truncated.
pd.set_option('display.max_columns', 500)

In [0]:
# Read the training and validation splits, in that order.
df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ('train_data_nonull.csv', 'val_data_nonull.csv')
)

In [0]:
# Column groups that are excluded from the generic feature list below.
keys = ['id', 'vintage']
product = ['installments']
relevance_features = ['is_loan', 'is_default', 'relevance_score']
hold_feature = ['internal_feature_3']

# Every df_train column that does not belong to one of the groups above,
# in the frame's original column order.
_reserved_columns = keys + product + relevance_features + hold_feature
all_remaining = [col for col in df_train.columns if col not in _reserved_columns]

Treatment for categorical features

In [0]:
def get_regiao(value):
    """Map a state code to the name of the region it is grouped under.

    Returns 'Sul', 'Sudeste', 'Centro_Oeste', 'Nordeste' or 'Norte' for the
    codes listed below. Any other value (including codes that are not
    listed) falls through to 'SP'.
    """
    regions = (
        ('Sul', ('RS', 'PR', 'SC')),
        ('Sudeste', ('MG', 'RJ', 'ES')),
        ('Centro_Oeste', ('MS', 'MT', 'GO', 'DF')),
        ('Nordeste', ('BA', 'PI', 'MA', 'CE', 'RN', 'PB', 'PE', 'AL', 'SE')),
        ('Norte', ('AC', 'RO', 'AM', 'AP', 'PA', 'RR', 'TO')),
    )
    for region, states in regions:
        if value in states:
            return region
    # Fallback bucket for everything that matched no group above.
    return 'SP'

# Replace internal_feature_11 with the output of get_regiao on both frames.
for _frame in (df_train, df_val):
    _frame['internal_feature_11'] = _frame['internal_feature_11'].apply(get_regiao)

In [0]:
# trans_dict[feature] maps every category seen in df_train either to itself
# (when it covers more than 1% of the training rows) or to 'Others'.
trans_dict = {}

for feature in ['internal_feature_0', 'internal_feature_4']:
    # Relative frequency of each category, measured on the training data only.
    # Iterating the Series directly avoids depending on the column names that
    # reset_index() produces, which changed in pandas 2.0.
    shares = df_train[feature].value_counts(normalize=True)

    trans_dict[feature] = {
        category: (category if share > 0.01 else 'Others')
        for category, share in shares.items()
    }

for feature, mapping in trans_dict.items():
    # Apply the training-derived mapping to both splits so they share the same
    # category set. Values absent from the mapping (not seen in df_train, or
    # missing) are sent to 'Others' instead of raising KeyError.
    df_train[feature] = df_train[feature].apply(lambda x, m=mapping: m.get(x, 'Others'))
    df_val[feature] = df_val[feature].apply(lambda x, m=mapping: m.get(x, 'Others'))

In [0]:
def transform_if5(value):
    """Collapse a raw internal_feature_5 code into a coarser group.

    Returns one of 'BUSINESS', 'DEBT', 'PURCHASE', 'HOUSE' or 'MEDICAL' for
    the codes listed below, and 'OTHERS' for any other value.
    """
    if value in ['IEMP', 'BUSI']:
        return 'BUSINESS'
    elif value in ['DEBT', 'PCRE', 'PDEB', 'PEMP', 'PDIV', 'PORT']:
        return 'DEBT'
    elif value in ['VEIC', 'ELET']:
        return 'PURCHASE'
    elif value in ['IMOB', 'FURN']:
        return 'HOUSE'
    elif value in ['ESTH', 'MEDI']:
        return 'MEDICAL'
    else:
        # Everything not listed above shares one bucket.
        return 'OTHERS'

# Replace internal_feature_5 with the output of transform_if5 on both frames.
for _frame in (df_train, df_val):
    _frame['internal_feature_5'] = _frame['internal_feature_5'].apply(transform_if5)

Out[6]: array(['PCRE', 'PDIV', 'WEDD', 'PDEB', 'TRAV', 'EDUC', 'BUSI', 'FURN',
       'VEIC', 'IMOB', 'DEBT', 'ELET', 'IEMP', 'MEDI', 'PORT', 'PEMP',
       'ESTH'], dtype=object)

In [0]:
# internal_feature_9

def get_status(value):
    """Reduce a status label to 'DIVORCED', 'MARRIED' or 'SINGLE'.

    'SEPARATED' is folded into 'DIVORCED' and 'STABLE_UNION' into 'MARRIED'.
    Every other value is returned as 'SINGLE'.
    """
    if value in ('DIVORCED', 'SEPARATED'):
        return 'DIVORCED'
    if value in ('MARRIED', 'STABLE_UNION'):
        return 'MARRIED'
    return 'SINGLE'

# Replace internal_feature_9 with the output of get_status on both frames.
for _frame in (df_train, df_val):
    _frame['internal_feature_9'] = _frame['internal_feature_9'].apply(get_status)

In [0]:
bureau_cat = [
    'bureau_feature_16',
    'bureau_feature_17',
    'bureau_feature_18',
    'bureau_feature_19',
    'bureau_feature_20',
    'bureau_feature_21',
]


def _replace_not_found(x):
    """Return 'NAO_CONSTAM_OCORRENCIAS' for 'Not_found', otherwise x unchanged."""
    return 'NAO_CONSTAM_OCORRENCIAS' if x == 'Not_found' else x


# Fold the 'Not_found' marker into 'NAO_CONSTAM_OCORRENCIAS' in every bureau
# categorical column, on the training frame first and then the validation one.
for feature in bureau_cat:
    for _frame in (df_train, df_val):
        _frame[feature] = _frame[feature].apply(_replace_not_found)

Treatment for numerical columns

In [0]:
def concentration(database, threshold=0.70):
    """List the numeric columns dominated by a single value.

    Parameters
    ----------
    database : pandas.DataFrame
        Frame to inspect. Only columns of dtype float64 or int64 are examined.
    threshold : float, default 0.70
        A column is reported when one of its values accounts for strictly more
        than this share of the column's counted entries.

    Returns
    -------
    list
        Names of the columns to remove, in the frame's column order.
    """
    numeric_columns = database.select_dtypes(include=['float64', 'int64']).columns
    to_remove = []
    for var in numeric_columns:
        # Relative frequency of each distinct value in the column.
        shares = database[var].value_counts(normalize=True)
        if (shares > threshold).any():
            to_remove.append(var)
    return to_remove

In [0]:
# Columns of df_train reported by concentration(). The result is stored in
# `drop` but is not referenced again in the cells visible here.
# NOTE(review): the columns actually dropped below come from a literal list;
# confirm whether this result is still needed.
drop = concentration(df_train)

In [0]:
# Numeric columns removed from both df_train and df_val.
drop_variables_numeric = [
    # bacen_* columns
    'bacen_feature_0',
    'bacen_feature_4',
    'bacen_feature_5',
    'bacen_feature_7',
    'bacen_feature_8',
    'bacen_feature_9',
    'bacen_feature_10',
    'bacen_feature_12',
    'bacen_feature_13',
    'bacen_feature_14',
    'bacen_feature_15',
    'bacen_feature_17',
    # bureau_* columns
    'bureau_feature_1',
    'bureau_feature_2',
    'bureau_feature_3',
    'bureau_feature_5',
    'bureau_feature_6',
    'bureau_feature_7',
    'bureau_feature_9',
    'bureau_feature_11',
    'bureau_feature_12',
    'bureau_feature_13',
    'bureau_feature_14',
]

In [0]:
# Drop the columns listed in drop_variables_numeric from both frames.
df_train = df_train.drop(columns=drop_variables_numeric)
df_val = df_val.drop(columns=drop_variables_numeric)

In [0]:
# Outliers Analysis

# Variables ok → internal_feature_2 internal_feature_8  installments bureau_feature_4 bureau_feature_15 
# Outliers need attention → internal_feature_1 internal_feature_6 internal_feature_7 bacen_feature_1 bacen_feature_2 bacen_feature_3 bacen_feature_6 bacen_feature_11 bacen_feature_16