# Features from POS_CASH_balance dataset

Этот блокнот создает признаки из набора данных POS_CASH_balance. Набор данных POS_CASH_balance содержит ежемесячные снимки баланса предыдущих POS (точек продаж) и кредитов наличными, которые заявитель имел в Home Credit.

In [None]:
import numpy as np
import pandas as pd

# -----------------------------------------------------
from google.colab import drive

# -----------------------------------------------------
import zipfile
import time
import sys
import os
import gc

In [None]:
# Remove the limit on the number of columns pandas displays
pd.set_option("display.max_columns", None)

# Limit the number of rows pandas displays to 400
pd.set_option("display.max_rows", 400)

In [None]:
# Mount Google Drive into the current Colab runtime

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Helper that unpacks a zip archive into a given directory

def extract_zip_file(zip_path, extract_path):
    """
    Unpack every member of a zip archive into the given directory

    :param zip_path: Path to the archive
    :param extract_path: Path to the directory the archive contents are extracted into
    """
    with zipfile.ZipFile(zip_path, mode="r") as archive:
        archive.extractall(path=extract_path)

In [None]:
# Extract the home-credit-default-risk archive from Google Drive into /content/

zip_file = "/content/drive/My Drive/home-credit-default-risk.zip"
extract_path = "/content/"

extract_zip_file(zip_file, extract_path)

In [None]:
# Extract the "4 - Credit_card_balance features" archive from Google Drive into /content/

zip_file = "/content/drive/My Drive/4 - Credit_card_balance features.zip"
extract_path = "/content/"

extract_zip_file(zip_file, extract_path)

In [None]:
# Load the data

# application.csv and the column descriptions are read from the extracted
# "4 - Credit_card_balance features" folder
application = pd.read_csv("/content/4 - Credit_card_balance features/application.csv")

# The raw POS_CASH_balance table is read from the extracted home-credit-default-risk folder
pos_cash_balance = pd.read_csv("/content/home-credit-default-risk/POS_CASH_balance.csv")
homecredit_columns_description = pd.read_csv("/content/4 - Credit_card_balance features/homecredit_columns_description.csv", encoding="ISO-8859-1")

In [None]:
# Sanity check: dimensions of application before the new features are added

application.shape

(356255, 208)

In [None]:
# Print the documented meaning of each POS_CASH_balance column, then preview the table

description_application = homecredit_columns_description.loc[
    homecredit_columns_description["Table"] == "POS_CASH_balance.csv"
]
for column_name, column_text in zip(
    description_application["Row"], description_application["Description"]
):
    print(column_name, column_text)

pos_cash_balance.head(20)

SK_ID_PREV  ID of previous credit in Home Credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loans in Home Credit)
SK_ID_CURR ID of loan in our sample
MONTHS_BALANCE Month of balance relative to application date (-1 means the information to the freshest monthly snapshot, 0 means the information at application - often it will be the same as -1 as many banks are not updating the information to Credit Bureau regularly )
CNT_INSTALMENT Term of previous credit (can change over time)
CNT_INSTALMENT_FUTURE Installments left to pay on the previous credit
NAME_CONTRACT_STATUS Contract status during the month
SK_DPD DPD (days past due) during the month of previous credit
SK_DPD_DEF DPD during the month with tolerance (debts with low loan amounts are ignored) of the previous credit


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0
5,2207092,342166,-32,12.0,12.0,Active,0,0
6,1110516,204376,-38,48.0,43.0,Active,0,0
7,1387235,153211,-35,36.0,36.0,Active,0,0
8,1220500,112740,-31,12.0,12.0,Active,0,0
9,2371489,274851,-32,24.0,16.0,Active,0,0


In [None]:
# Feature extraction from POS_CASH_balance

# One accumulator list per new feature; each receives exactly one value per application row
nb_pos_cash_list = []
nb_active_pos_cash_list = []
nb_future_instalments_list = []
nb_total_instalments_list = []
nb_days_past_due_list = []
nb_days_past_due_def_list = []
tot_nb_days_past_due_list = []
tot_nb_days_past_due_def_list = []
fr_active_credit_remaing = []
nb_months_dpd_def_event_list = []

# Map every SK_ID_CURR to the positions of its balance rows once, so the loop below
# does a dictionary lookup instead of rescanning the whole table for each applicant
pos_cash_positions = pos_cash_balance.groupby("SK_ID_CURR").indices

counter = 0
length = len(application)
for sk_id_curr in application["SK_ID_CURR"]:
    counter += 1
    sys.stdout.write("\rProgress:" + str(round(counter / length * 100, 1)) + "%")

    # Defaults used when the applicant has no POS_CASH_balance history
    nb_pos_cash = 0
    nb_active_pos_cash = 0
    nb_future_instalments = 0
    nb_total_instalments = 0
    nb_days_past_due = 0
    nb_days_past_due_def = 0
    tot_nb_days_past_due = 0
    tot_nb_days_past_due_def = 0
    nb_months_dpd_def_event = 0

    positions = pos_cash_positions.get(sk_id_curr)
    if positions is not None:
        # Sort by MONTHS_BALANCE descending so the first row of every credit is the
        # snapshot with the highest MONTHS_BALANCE (the freshest one)
        df = pos_cash_balance.iloc[positions].sort_values(by="MONTHS_BALANCE", ascending=False)
        unique_credit = df["SK_ID_PREV"].unique()
        # Number of past Home Credit loans (distinct SK_ID_PREV)
        nb_pos_cash = len(unique_credit)
        # Walk over every distinct credit
        for credit in unique_credit:
            tmp_current = df[df["SK_ID_PREV"] == credit].iloc[0]
            # A credit counts as active when its freshest snapshot has status "Active"
            # and still has instalments left to pay
            if (tmp_current["NAME_CONTRACT_STATUS"] == "Active") and (tmp_current["CNT_INSTALMENT_FUTURE"] > 0):
                nb_active_pos_cash += 1
                # Instalments still to be paid on active credits
                nb_future_instalments += tmp_current["CNT_INSTALMENT_FUTURE"]
                # Total number of instalments on active credits
                nb_total_instalments += tmp_current["CNT_INSTALMENT"]
                # Current DPD summed over active credits
                nb_days_past_due += tmp_current["SK_DPD"]
                # Current DPD with tolerance summed over active credits
                nb_days_past_due_def += tmp_current["SK_DPD_DEF"]
        # Monthly snapshots that still had instalments left to pay
        outstanding = df[df["CNT_INSTALMENT_FUTURE"] > 0]
        # Total DPD over those snapshots
        tot_nb_days_past_due = outstanding["SK_DPD"].sum()
        # Total DPD with tolerance over those snapshots
        tot_nb_days_past_due_def = outstanding["SK_DPD_DEF"].sum()
        # Number of months with DPD even with tolerance
        nb_months_dpd_def_event = len(outstanding[outstanding["SK_DPD_DEF"] > 0])

    nb_pos_cash_list.append(nb_pos_cash)
    nb_active_pos_cash_list.append(nb_active_pos_cash)
    nb_future_instalments_list.append(nb_future_instalments)
    nb_total_instalments_list.append(nb_total_instalments)
    nb_days_past_due_list.append(nb_days_past_due)
    nb_days_past_due_def_list.append(nb_days_past_due_def)
    tot_nb_days_past_due_list.append(tot_nb_days_past_due)
    # Store the total over all months; the current-month value already lives in
    # nb_days_past_due_def_list
    tot_nb_days_past_due_def_list.append(tot_nb_days_past_due_def)
    # Fraction of instalments remaining on active credits; undefined without active credits
    if nb_total_instalments > 0:
        fr_active_credit_remaing.append(nb_future_instalments / nb_total_instalments)
    else:
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
        fr_active_credit_remaing.append(np.nan)
    nb_months_dpd_def_event_list.append(nb_months_dpd_def_event)

Progress:100.0%

In [None]:
# Helper that registers the description of a newly created feature

def add_description(table, row, description):
    """
    Append a description record for a feature to homecredit_columns_description

    :param table: Value written to the record's table field
    :param row: Value written to the record's row (feature name) field
    :param description: Value written to the record's description field
    :return: The module-level homecredit_columns_description DataFrame, modified in place
    """
    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
    # The list is assigned positionally; assumes the frame has five columns with
    # Table / Row / Description in positions 2-4 -- TODO confirm against the CSV header.
    list_description = [np.nan, table, row, description, np.nan]
    # len(...) is used as the new row label, i.e. the record is appended at the end
    # when the frame has a default 0..n-1 index
    homecredit_columns_description.loc[len(homecredit_columns_description)] = list_description
    return homecredit_columns_description

In [None]:
# Attach the new POS_CASH_balance features to application and register their descriptions

new_pos_cash_features = [
    ("nb_pos_cash_list", nb_pos_cash_list,
     "Number of past home credit loans"),
    ("nb_active_pos_cash_list", nb_active_pos_cash_list,
     "Number of active past home credit loans"),
    ("nb_future_instalments_list", nb_future_instalments_list,
     "Number of future instalments for active credits"),
    ("nb_total_instalments_list", nb_total_instalments_list,
     "Total number of instalments for active credits"),
    ("nb_days_past_due_list", nb_days_past_due_list,
     "Total current number of DPD for active credits"),
    ("nb_days_past_due_def_list", nb_days_past_due_def_list,
     "Total current number of DPD with tolerance for active credits"),
    ("tot_nb_days_past_due_list", tot_nb_days_past_due_list,
     "Total number of days DPD"),
    ("tot_nb_days_past_due_def_list", tot_nb_days_past_due_def_list,
     "Total number of days DPD with tolerance"),
    ("fr_active_credit_remaing", fr_active_credit_remaing,
     "Fraction of instalments remaining on the credit"),
    ("nb_months_dpd_def_event_list", nb_months_dpd_def_event_list,
     "Number of months with DPD even with tolerance"),
]

# Same order as the feature lists above: add the column, then log its description
for feature_name, feature_values, feature_description in new_pos_cash_features:
    application[feature_name] = feature_values
    add_description("POS_CASH_balance.csv", feature_name, feature_description)

In [None]:
# Sanity check: dimensions of application after the new features were added

application.shape

(356255, 218)

In [None]:
# Save application (written to the current working directory, without the index column)

application.to_csv("application.csv", index=False)

In [None]:
# Save homecredit_columns_description (written to the current working directory, without the index column)

homecredit_columns_description.to_csv("homecredit_columns_description.csv", index=False)