# 0. Libraries

In [1]:
import os
import pickle
import warnings

import pandas as pd
import numpy  as np

from utils              import *
from sklearn.ensemble   import RandomForestClassifier, GradientBoostingClassifier
from sklearn.exceptions import ConvergenceWarning

# suppress ConvergenceWarning messages for the rest of the session
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

## 0.1. Settings parameters

### 0.1.1. Seed

In [2]:
# fixed random seed; passed as random_state to the model trained in this notebook
seed: int = 42

### 0.1.2. Paths

In [3]:
# directory the kernel is currently running in
actual_path = os.getcwd()

# directory one level above the working directory, and its bare folder name
parent_path = os.path.dirname(actual_path)
parent_folder_name = os.path.basename(parent_path)

# directory two levels above the working directory; base of every data path below
root_path = os.path.abspath(os.path.join(actual_path, os.pardir, os.pardir))

print(f"Parent path name: {parent_folder_name}")

Parent path name: PROJECT_17


# 1. Data

In [5]:
# location of the training table inside the project's DATAS/GOLD folder
train_csv_path = f'{root_path}/{parent_folder_name}/DATAS/GOLD/abt_train_00_{parent_folder_name}.csv'

# reading the training table
abt_train_00 = pd.read_csv(train_csv_path)

# preview of the first rows
abt_train_00.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,...,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,TARGET
0,-0.575932,-1.002571,-0.19367,-0.339207,-0.090181,-1.562639,2.127881,0.233488,-0.864784,0.0,...,-0.23363,-0.446176,-0.464518,-0.452648,-0.552146,-0.27411,-0.009418,-0.007295,-1.031318,0
1,-0.575932,-0.338096,-1.071741,-0.716216,1.064266,-0.05312,-0.457387,-1.083388,-1.129518,-0.297457,...,-0.23363,-0.446176,-0.464518,-0.452648,1.811116,-0.27411,-0.009418,-0.007295,-1.031318,0
2,0.792432,-0.79012,-0.08235,-0.349766,-1.132932,0.907441,-0.457719,0.58539,1.774599,0.0,...,-0.23363,-0.446176,-0.464518,-0.452648,1.811116,-0.27411,-0.009418,-0.007295,-1.031318,1
3,-0.575932,0.113928,-0.818402,-0.940745,1.064266,0.849295,-0.472233,-0.282352,-0.817012,0.0,...,-0.23363,-0.446176,-0.464518,-0.452648,1.811116,-0.27411,-0.009418,-0.007295,-1.031318,0
4,-0.575932,-0.338096,-0.986221,-1.173659,-0.911693,0.796185,-0.458447,1.152107,-0.018828,-1.159942,...,-0.23363,-0.446176,-0.464518,-0.452648,-0.552146,-0.27411,-0.009418,-0.007295,0.969633,0


In [6]:
# location of the test table inside the project's DATAS/GOLD folder
test_csv_path = f'{root_path}/{parent_folder_name}/DATAS/GOLD/abt_test_00_{parent_folder_name}.csv'

# reading the test table
abt_test_00 = pd.read_csv(test_csv_path)

# preview of the first rows
abt_test_00.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,...,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,TARGET
0,0.792432,-0.880524,-0.856441,-0.474918,-0.784369,0.379316,-0.480112,0.644181,-0.870092,0.0,...,-0.23363,-0.446176,-0.464518,-0.452648,-0.552146,-0.27411,-0.009418,-0.007295,0.969633,0
1,0.792432,-0.202488,-0.134598,-0.11002,-0.809389,0.026774,-0.472643,0.485331,-0.679669,0.565028,...,4.280277,-0.446176,-0.464518,-0.452648,-0.552146,-0.27411,-0.009418,-0.007295,0.969633,0
2,-0.575932,-0.112084,1.013729,1.080635,-1.143112,-1.391634,2.127881,-2.366063,-0.697583,0.0,...,-0.23363,-0.446176,-0.464518,-0.452648,-0.552146,-0.27411,-0.009418,-0.007295,-1.031318,0
3,0.792432,4.634168,-0.930281,-1.096021,-0.8942,0.395798,-0.490281,0.872281,-1.162693,-1.016195,...,-0.23363,-0.446176,-0.464518,-0.452648,1.811116,-0.27411,-0.009418,-0.007295,-1.031318,0
4,-0.575932,-0.564108,-0.063521,-0.293556,0.744307,-0.776059,-0.490146,-1.93304,0.482111,0.0,...,-0.23363,-0.446176,-0.464518,-0.452648,-0.552146,-0.27411,-0.009418,-0.007295,0.969633,0


## 1.1. Merge Data

In [7]:
# Stack the train and test rows into one table for the feature-selection model.
# ignore_index=True: both frames are read with a default 0..n-1 index, so a plain
# concat would repeat every label and make label-based operations on the result ambiguous.
# NOTE(review): ranking features on train + test lets test rows influence a selection
# that is later applied to that same test table — consider fitting on train only.
abt_full_00 = pd.concat([abt_train_00, abt_test_00], ignore_index=True)

# checking
abt_full_00.shape

(80540, 846)

# 2. Split

In [7]:
# name of the label column
target_column = 'TARGET'

# y is the label column; X is every other column of the merged table
y = abt_full_00[target_column]
X = abt_full_00.drop(columns=[target_column])

# 3. Modeling

## 3.1. Training

In [8]:
# gradient-boosting classifier; only random_state is set, everything else is left at its default
clf = GradientBoostingClassifier(random_state=seed)

# fitting on the feature matrix and label vector built in the split step
clf.fit(X=X, y=y)

## 3.2. Feature Importances

In [9]:
# importances reported by the fitted model
feature_importance = clf.feature_importances_

# one row per column of X, sorted from the highest to the lowest importance
data_feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': feature_importance})
    .sort_values(by='importance', ascending=False)
)

# checking the first 15 rows
data_feature_importance.head(15)

Unnamed: 0,feature,importance
25,EXT_SOURCE_3,0.265709
24,EXT_SOURCE_2,0.236105
23,EXT_SOURCE_1,0.071897
794,PAYMENT_RATE,0.04527
795,CREDIT_TO_GOODS_RATIO,0.035795
644,MIN_DEBT_CREDIT_DIFF_B,0.025806
5,DAYS_BIRTH,0.019682
796,INCOME_TO_EMPLOYED_RATIO,0.016939
137,SUM_CREDIT_TYPE_Microloan_AMT_CREDIT_SUM_DEBT_B,0.013024
639,SUM_BUREAU_CREDIT_DEBT_RATIO_B,0.012437


In [10]:
# fraction of the largest importance that a feature must exceed to be kept
cutoff_max = 0.005

# absolute threshold: the fraction above scaled by the largest importance
cutoff = cutoff_max * feature_importance.max()

# flag is 1 when the importance is strictly above the threshold, 0 otherwise
# (vectorised comparison instead of calling a Python lambda once per row)
data_feature_importance['flag_cutoff'] = (
    data_feature_importance['importance'].gt(cutoff).astype('int64')
)

# names of the flagged features, in the current row order of the table
is_selected = data_feature_importance['flag_cutoff'].eq(1)
list_of_selected_features = data_feature_importance.loc[is_selected, 'feature'].tolist()

print('List of selected features:\n', list_of_selected_features)
print('Len of list of selected features:', len(list_of_selected_features))

List of selected features: ['EXT_SOURCE_3', 'EXT_SOURCE_2', 'EXT_SOURCE_1', 'PAYMENT_RATE', 'CREDIT_TO_GOODS_RATIO', 'MIN_DEBT_CREDIT_DIFF_B', 'DAYS_BIRTH', 'INCOME_TO_EMPLOYED_RATIO', 'SUM_CREDIT_TYPE_Microloan_AMT_CREDIT_SUM_DEBT_B', 'SUM_BUREAU_CREDIT_DEBT_RATIO_B', 'NAME_EDUCATION_TYPE_Higher education', 'AMT_ANNUITY', 'FLAG_DOCUMENT_3', 'MIN_BUREAU_CREDIT_DEBT_RATIO_B', 'MAX_DAYS_CREDIT_B', 'DAYS_ID_PUBLISH', 'AVG_DAYS_CREDIT_B', 'AVG_CREDIT_TYPE_Consumer_credit_DAYS_CREDIT_ENDDATE_B', 'DEF_30_CNT_SOCIAL_CIRCLE', 'SUM_CREDIT_TYPE_Consumer_credit_AMT_CREDIT_SUM_OVERDUE_B', 'AVG_CREDIT_ACTIVE_Active_B', 'MAX_CREDIT_ACTIVE_Active_AMT_CREDIT_MAX_OVERDUE_B', 'MIN_CREDIT_ACTIVE_Active_AMT_CREDIT_MAX_OVERDUE_B', 'MAX_CREDIT_TYPE_Consumer_credit_DAYS_CREDIT_ENDDATE_B', 'ANNUITY_INCOME_PERC', 'var_33', 'CAR_TO_EMPLOYED_RATIO', 'SUM_CREDIT_TYPE_Consumer_credit_AMT_CREDIT_MAX_OVERDUE_B', 'MIN_CREDIT_TYPE_Consumer_credit_AMT_CREDIT_MAX_OVERDUE_B', 'AMT_CREDIT', 'OCCUPATION_TYPE', 'SUM_CREDIT_

In [11]:
# saving the list of selected feature names as a pickle file in the project's PKL folder
selected_features_path = f'{root_path}/{parent_folder_name}/PKL/prd_list_of_selected_features_{parent_folder_name}.pkl'
with open(selected_features_path, 'wb') as file:
    pickle.dump(list_of_selected_features, file)

In [12]:
# # saving csv (cell currently disabled)
# data_feature_importance[data_feature_importance['flag_cutoff'] == 1].to_csv(f'{root_path}/{parent_folder_name}/DATAS/df_feature_importance_{parent_folder_name}.csv')

# 4. Results

In [8]:
# rows flagged by the cutoff, ordered from least to most important so that the
# most important feature is drawn as the top bar of the horizontal chart
data_selected_features = data_feature_importance.sort_values('importance', ascending=True)
data_selected_features = data_selected_features[data_selected_features['flag_cutoff'] == 1]

# figure height scales with the number of selected features (0.3 per bar)
# NOTE(review): plt is not imported explicitly in this notebook — presumably it is
# provided by `from utils import *`; confirm.
plt.figure(figsize=(10, len(data_selected_features) * 0.3))

# one horizontal bar per selected feature
plt.barh(data_selected_features['feature'], data_selected_features['importance'], color=(0.25, 0.5, 1))
plt.xlabel("Feature Importance")
# the importances come from the GradientBoostingClassifier fitted in section 3.1,
# so the title names that model rather than "Random Forest"
plt.title("Variáveis Selecionadas - Gradient Boosting")
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

NameError: name 'data_feature_importance' is not defined

# 5. Analytical Base Table (ABT)

In [None]:
# training table restricted to the columns kept by the importance cutoff
abt_train_01 = abt_train_00.loc[:, list_of_selected_features]


In [14]:
# bringing target to dataframe, matched by row index
# Any TARGET column left by a previous run of this cell is dropped first, so the
# cell can be re-run without the merge producing TARGET_x / TARGET_y columns.
abt_train_01 = (
    abt_train_01
    .drop(columns=['TARGET'], errors='ignore')
    .merge(abt_train_00[['TARGET']], left_index=True, right_index=True, how='inner')
)

# checking
abt_train_01.head()

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,PAYMENT_RATE,CREDIT_TO_GOODS_RATIO,MIN_DEBT_CREDIT_DIFF_B,DAYS_BIRTH,INCOME_TO_EMPLOYED_RATIO,SUM_CREDIT_TYPE_Microloan_AMT_CREDIT_SUM_DEBT_B,SUM_BUREAU_CREDIT_DEBT_RATIO_B,...,var_19,MAX_CREDIT_ACTIVE_Active_AMT_CREDIT_SUM_OVERDUE_B,var_49,AVG_ENDDATE_DIFF_B,MIN_BUREAU_CREDIT_FACT_DIFF_B,AVG_CREDIT_ACTIVE_Sold_DAYS_CREDIT_ENDDATE_B,var_47,SUM_CREDIT_TYPE_Credit_card_AMT_CREDIT_SUM_B,SUM_CREDIT_TYPE_Consumer_credit_CNT_CREDIT_PROLONG_B,TARGET
0,0.9388575,-0.330462,1.739545,-0.493629,0.289744,0.547377,-1.562639,0.254763,-0.02725533,-0.007905,...,-1.139534,-0.01743356,-1.378984,4.11411e-17,2.667306e-16,0.016551,-0.997497,-0.643717,-0.07874,0
1,-6.373303e-16,-1.950521,0.0,2.039525,0.074927,0.0,-0.05312,-0.060507,-3.67896e-18,0.0,...,0.415795,-2.906674e-18,0.611346,4.11411e-17,2.667306e-16,0.0,-0.589655,0.0,0.0,0
2,-6.373303e-16,0.142739,0.0,-0.655337,0.61197,0.0,0.907441,0.05947,-3.67896e-18,0.0,...,1.553617,-2.906674e-18,-0.078582,4.11411e-17,2.667306e-16,0.0,-0.948428,0.0,0.0,1
3,1.3705,0.678846,0.0,-0.165864,-0.999158,0.49058,0.849295,0.15975,-0.02725533,-0.008741,...,1.44179,-0.01743356,0.301569,-0.02271842,-0.6782917,0.016551,0.093008,-0.643717,-0.07874,0
4,1.217635,-2.14004,-0.469618,-0.165864,-0.999158,-0.17759,0.796185,0.001501,-0.02725533,-0.007056,...,1.067252,-0.01743356,-1.271498,-0.3057576,0.806845,0.016551,0.178394,0.000551,-0.07874,0


# 6. Applying on test

In [15]:
# selecting the chosen features on the test table
# NOTE(review): prod_feature_selection is not defined in this notebook (presumably it
# comes from `from utils import *`); the target column is attached in the following
# cell, not here — confirm against the helper.
abt_test_01 = prod_feature_selection(abt_test_00, list_of_selected_features)

In [16]:
# bringing target to dataframe, matched by row index
# Any TARGET column left by a previous run of this cell is dropped first, so the
# cell can be re-run without the merge producing TARGET_x / TARGET_y columns.
abt_test_01 = (
    abt_test_01
    .drop(columns=['TARGET'], errors='ignore')
    .merge(abt_test_00[['TARGET']], left_index=True, right_index=True, how='inner')
)

# checking
abt_test_01.head()

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,PAYMENT_RATE,CREDIT_TO_GOODS_RATIO,MIN_DEBT_CREDIT_DIFF_B,DAYS_BIRTH,INCOME_TO_EMPLOYED_RATIO,SUM_CREDIT_TYPE_Microloan_AMT_CREDIT_SUM_DEBT_B,SUM_BUREAU_CREDIT_DEBT_RATIO_B,...,var_19,MAX_CREDIT_ACTIVE_Active_AMT_CREDIT_SUM_OVERDUE_B,var_49,AVG_ENDDATE_DIFF_B,MIN_BUREAU_CREDIT_FACT_DIFF_B,AVG_CREDIT_ACTIVE_Sold_DAYS_CREDIT_ENDDATE_B,var_47,SUM_CREDIT_TYPE_Credit_card_AMT_CREDIT_SUM_B,SUM_CREDIT_TYPE_Consumer_credit_CNT_CREDIT_PROLONG_B,TARGET
0,-1.415824,-2.307964,0.0,1.144388,0.074927,-1.875223,0.379316,0.224277,-0.027255,-0.006731,...,1.711052,-0.017434,0.113956,-0.287703,0.086565,0.016551,1.702262,-0.244271,-0.07874,0
1,-0.762793,-0.681086,0.0,-0.305554,0.719378,-0.145898,0.026774,0.177966,-0.027255,-0.007086,...,1.248251,-0.017434,-0.559155,-0.104262,1.330045,0.016551,1.57556,-0.257156,-0.07874,0
2,0.587961,0.970254,0.0,-0.499078,-0.032481,-0.013975,-1.391634,0.25511,-0.027255,-0.007585,...,-1.019134,-0.017434,-1.452173,-0.299678,0.76696,0.016551,-0.091802,-0.643717,-0.07874,0
3,1.393724,0.713441,0.0,-0.165864,-0.999158,-0.138708,0.395798,0.083686,-0.027255,-0.008741,...,1.644779,-0.017434,-0.386096,-0.215722,-0.087052,0.016551,0.870895,-0.605061,-0.07874,0
4,1.225925,0.593621,2.882229,-0.615135,0.934211,-0.138708,-0.776059,0.223882,-0.027255,-0.008122,...,-1.462541,-0.017434,-0.4175,-0.296493,0.651997,0.016551,-0.28467,0.058535,-0.07874,0


# 7. Saving Analytical Base Table

In [17]:
# saving the reduced training table to the GOLD folder without the row index
# (index=False is the documented boolean form of the flag)
abt_train_01.to_csv(f'{root_path}/{parent_folder_name}/DATAS/GOLD/abt_train_01_{parent_folder_name}.csv', index=False)

In [18]:
# saving the reduced test table to the GOLD folder without the row index
# (index=False is the documented boolean form of the flag)
abt_test_01.to_csv(f'{root_path}/{parent_folder_name}/DATAS/GOLD/abt_test_01_{parent_folder_name}.csv', index=False)