# 0. Libraries

In [15]:
import os
import warnings
import pickle

import pandas as pd
import numpy  as np

from utils                 import *
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.exceptions    import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

## 0.1. Settings parameters

### 0.1.1. Seed

In [None]:
# seed
seed = 42

### 0.1.2. Paths

In [None]:
# gets the current working directory (where the notebook is located)
actual_path = os.getcwd()

# gets the current working root directory
root_path = os.path.abspath(os.path.join(actual_path, "..", ".."))

# gets the parent directory (one level up)
parent_path = os.path.dirname(actual_path)

# extract folder name from parent directory
parent_folder_name = os.path.basename(parent_path)

print(f"Parent path name: {parent_folder_name}")

Parent path name: PROJECT_17


# 1. Data

In [18]:
# loading
data_test_scoring_01 = pd.read_csv(f'{root_path}/DADOS/application_test_merged_bureau.csv')

# checking
data_test_scoring_01.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,MAX_CREDIT_CURRENCY_currency_4_AMT_CREDIT_SUM_B,MAX_CREDIT_CURRENCY_currency_4_AMT_CREDIT_SUM_DEBT_B,MAX_CREDIT_CURRENCY_currency_4_AMT_CREDIT_SUM_LIMIT_B,MAX_CREDIT_CURRENCY_currency_4_AMT_CREDIT_SUM_OVERDUE_B,MAX_CREDIT_CURRENCY_currency_4_AMT_ANNUITY_B,MAX_CREDIT_CURRENCY_currency_4_DAYS_CREDIT_B,MAX_CREDIT_CURRENCY_currency_4_CREDIT_DAY_OVERDUE_B,MAX_CREDIT_CURRENCY_currency_4_DAYS_CREDIT_ENDDATE_B,MAX_CREDIT_CURRENCY_currency_4_DAYS_ENDDATE_FACT_B,MAX_CREDIT_CURRENCY_currency_4_DAYS_CREDIT_UPDATE_B
0,384575,Cash loans,M,Y,N,2,207000.0,465457.5,52641.0,418500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,214010,Cash loans,F,Y,Y,0,247500.0,1281712.5,48946.5,1179000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,142232,Cash loans,F,Y,N,0,202500.0,495000.0,39109.5,495000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,389171,Cash loans,F,N,Y,0,247500.0,254700.0,24939.0,225000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,283617,Cash loans,M,N,Y,0,112500.0,308133.0,15862.5,234000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 1.1. Filtering

In [None]:
# dropping sensible variables
data_test_scoring_01 = data_test_scoring_01.drop(columns='CODE_GENDER')

# is required to drop lines where 'NAME_FAMILY_STATUS' is equals to 'Unknown' because there is only one in the entire database
data_test_scoring_01 = data_test_scoring_01[data_test_scoring_01['NAME_FAMILY_STATUS'] != 'Unknown']

## 1.2. Feature Engineering

In [None]:
# feature engineering
data_test_scoring_01 = generate_feature_engineering(data_test_scoring_01)

# checking
data_test_scoring_01.head()

# 2. Loading pickles

In [21]:
# Carregar lista de variáveis que foram excluidas por execesso de nulos
with open(f'{root_path}/{parent_folder_name}/PKL/prd_list_variables_to_drop_{parent_folder_name}.pkl', 'rb') as f:
    list_variables_to_drop = pickle.load(f)

In [22]:
# Carregar lista de variáveis que foram excluidas por execesso de nulos
with open(f'{root_path}/{parent_folder_name}/PKL/prd_list_variables_to_drop_nan_{parent_folder_name}.pkl', 'rb') as f:
    list_variables_to_drop_nan = pickle.load(f)

In [23]:
# Carregar lista de variáveis que foram excluidas por execesso de nulos
with open(f'{root_path}/{parent_folder_name}/PKL/prd_dict_to_fillna_{parent_folder_name}.pkl', 'rb') as f:
    dict_to_fillna = pickle.load(f)

In [24]:
# Carregar lista de variáveis que foram excluidas por execesso de nulos
with open(f'{root_path}/{parent_folder_name}/PKL/prd_dict_to_label_encoder_{parent_folder_name}.pkl', 'rb') as f:
    dict_to_label_encoder = pickle.load(f)

In [25]:
# Carregar lista de variáveis que foram excluidas por execesso de nulos
with open(f'{root_path}/{parent_folder_name}/PKL/prd_dict_to_one_hot_encoder_{parent_folder_name}.pkl', 'rb') as f:
    dict_to_one_hot_encoder = pickle.load(f)

In [26]:
# Carregar lista de variáveis que foram excluidas por execesso de nulos
with open(f'{root_path}/{parent_folder_name}/PKL/prd_list_of_variables_to_keep_low_correlation_{parent_folder_name}.pkl', 'rb') as f:
    list_of_variables_to_keep_low_correlation = pickle.load(f)

In [27]:
# Carregar lista de variáveis que foram excluidas por execesso de nulos
with open(f'{root_path}/{parent_folder_name}/PKL/prd_scaler_{parent_folder_name}.pkl', 'rb') as f:
    scaler = pickle.load(f)

In [28]:
# Carregar lista de variáveis que foram excluidas por execesso de nulos
with open(f'{root_path}/{parent_folder_name}/PKL/prd_list_of_selected_features_{parent_folder_name}.pkl', 'rb') as f:
    list_of_selected_features = pickle.load(f)

In [29]:
# Carregar lista de variáveis que foram excluidas por execesso de nulos
with open(f'{root_path}/{parent_folder_name}/PKL/model_{parent_folder_name}.pkl', 'rb') as f:
    model = pickle.load(f)

# 3. Applying Flow

In [30]:
# dropping first variables
data_test_scoring_02 = prod_variables_to_drop(data_test_scoring_01, list_variables_to_drop)

In [31]:
# dropping nan variables
data_test_scoring_03 = prod_variables_to_drop_nan(data_test_scoring_02, list_variables_to_drop_nan)

In [32]:
# filling nan values (categorical and numerical)
data_test_scoring_04 = prod_variables_to_fill_nan(data_test_scoring_03, dict_to_fillna)

In [33]:
# applying label encoder
data_test_scoring_04 = prod_label_encoder(data_test_scoring_04, dict_to_label_encoder)

In [34]:
# applying one hot encoder
data_test_scoring_04 = prod_one_hot_encoder(data_test_scoring_04, dict_to_one_hot_encoder)



In [35]:
# keeping low correlation variables
data_test_scoring_04 = prod_correlation(data_test_scoring_04, list_of_variables_to_keep_low_correlation)

In [36]:
# applying scaler
data_test_scoring_04 = prod_scaler(data_test_scoring_04, scaler)

In [37]:
# applying feature selection
data_test_scoring_05 = prod_feature_selection(data_test_scoring_04, list_of_selected_features)

# 4. Model Predict

In [40]:
# predicting probabilities
predict_proba = model.predict_proba(data_test_scoring_05)

# predicting results
predict = model.predict(data_test_scoring_05)

# inserting on dataframe
data_test_scoring_05['score'] = predict_proba[:, 1]
data_test_scoring_05['class'] = predict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test_scoring_05['score'] = predprob[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test_scoring_05['class'] = predict


# 5. ABT

In [41]:
# bringing SK_ID_CURR
abt_test_scored = data_test_scoring_05.merge(data_test_scoring_01[['SK_ID_CURR']], left_index=True, right_index=True, how='inner')

In [46]:
# abt_test_scored.sample(20000, random_state=seed).to_csv(f'{root_path}/{parent_folder_name}/DATAS/abt_test_scoring_{parent_folder_name}.csv')

In [47]:
# send to kaggle
abt_send_kaggle = abt_test_scored[['SK_ID_CURR', 'score']]
abt_send_kaggle.rename(columns={'score': 'TARGET', 'SK_ID_CURR': 'ID'}, inplace=True)

# checking
abt_send_kaggle.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abt_envio_kaggle.rename(columns={'score': 'TARGET', 'SK_ID_CURR': 'ID'}, inplace=True)


Unnamed: 0,ID,TARGET
0,384575,0.757881
1,214010,0.396161
2,142232,0.722206
3,389171,0.337442
4,283617,0.425381


# 5. Saving Table

In [49]:
# saving table abt
abt_send_kaggle.to_csv(f'{root_path}/{parent_folder_name}/DATAS/GOLD/abt_submission_{parent_folder_name}.csv', index=None)