# Features from installments_payments dataset

Этот блокнот создает признаки из набора данных installments_payments. Набор данных installments_payments содержит историю погашения ранее выданных кредитов в Home Credit.

In [None]:
import numpy as np
import pandas as pd

# -----------------------------------------------------
from google.colab import drive

# -----------------------------------------------------
import zipfile
import time
import sys
import os
import gc

In [None]:
# Pandas display settings: show every column, and at most 400 rows
display_settings = (
    ("display.max_columns", None),
    ("display.max_rows", 400),
)
for option_name, option_value in display_settings:
    pd.set_option(option_name, option_value)

In [None]:
# Mount Google Drive into the current Colab runtime

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Helper that unpacks an archive into a target directory

def extract_zip_file(zip_path, extract_path):
    """
    Unpack every member of a zip archive into a directory.

    :param zip_path: Path to the zip archive
    :param extract_path: Directory the archive contents are extracted into
    """
    with zipfile.ZipFile(file=zip_path, mode="r") as archive:
        archive.extractall(path=extract_path)

In [None]:
# Unpack the home-credit-default-risk archive into /content/

zip_file, extract_path = "/content/drive/My Drive/home-credit-default-risk.zip", "/content/"
extract_zip_file(zip_path=zip_file, extract_path=extract_path)

In [None]:
# Unpack the "6 - Previous_application features" archive into /content/

zip_file, extract_path = "/content/drive/My Drive/6 - Previous_application features.zip", "/content/"
extract_zip_file(zip_path=zip_file, extract_path=extract_path)

In [None]:
# Load the input tables

application_csv = "/content/6 - Previous_application features/application.csv"
installments_csv = "/content/home-credit-default-risk/installments_payments.csv"
description_csv = "/content/6 - Previous_application features/homecredit_columns_description.csv"

# Application table as written by the previous feature notebook
application = pd.read_csv(application_csv)
# Raw installment payment history
installments_payments = pd.read_csv(installments_csv)
# The column description file is read as ISO-8859-1
homecredit_columns_description = pd.read_csv(description_csv, encoding="ISO-8859-1")

In [None]:
# Sanity check: shape of the application table before new features are added

application.shape

(356255, 251)

In [None]:
# Column descriptions for installments_payments

is_installments_row = homecredit_columns_description["Table"] == "installments_payments.csv"
description_application = homecredit_columns_description[is_installments_row]
# Print "<column name> <description>" for every documented column
for column_name, column_description in zip(
    description_application["Row"], description_application["Description"]
):
    print(column_name, column_description)

installments_payments.head(20)

SK_ID_PREV  ID of previous credit in Home credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loans in Home Credit)
SK_ID_CURR ID of loan in our sample
NUM_INSTALMENT_VERSION Version of installment calendar (0 is for credit card) of previous credit. Change of installment version from month to month signifies that some parameter of payment calendar has changed
NUM_INSTALMENT_NUMBER On which installment we observe payment
DAYS_INSTALMENT When the installment of previous credit was supposed to be paid (relative to application date of current loan)
DAYS_ENTRY_PAYMENT When was the installments of previous credit paid actually (relative to application date of current loan)
AMT_INSTALMENT What was the prescribed installment amount of previous credit on this installment
AMT_PAYMENT What the client actually paid on previous credit on this installment


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585
5,1137312,164489,1.0,12,-1384.0,-1417.0,5970.375,5970.375
6,2234264,184693,4.0,11,-349.0,-352.0,29432.295,29432.295
7,1818599,111420,2.0,4,-968.0,-994.0,17862.165,17862.165
8,2723183,112102,0.0,14,-197.0,-197.0,70.74,70.74
9,1413990,109741,1.0,4,-570.0,-609.0,14308.47,14308.47


In [None]:
# Feature extraction from installments_payments
#
# All features are computed with a single groupby over SK_ID_CURR instead of
# filtering the whole installments table once per application row, which
# scaled as len(application) * len(installments_payments).
#
# DAYS_INSTALMENT / DAYS_ENTRY_PAYMENT are relative to the application date of
# the current loan, so a larger value is a more recent date.

# Per-payment helper columns
payments = installments_payments.assign(
    # Positive when the payment was entered after its due date, negative when
    # it was early. The previous DAYS_INSTALMENT - DAYS_ENTRY_PAYMENT form had
    # the opposite sign and contradicted the "late" test below.
    PAYMENT_DELAY=installments_payments["DAYS_ENTRY_PAYMENT"] - installments_payments["DAYS_INSTALMENT"],
    # Late: paid after the due date (comparisons involving NaN are False)
    IS_LATE=installments_payments["DAYS_INSTALMENT"] < installments_payments["DAYS_ENTRY_PAYMENT"],
    # Incomplete: paid less than the prescribed amount
    IS_INCOMPLETE=installments_payments["AMT_PAYMENT"] < installments_payments["AMT_INSTALMENT"],
)

# One row of aggregates per SK_ID_CURR
inst_features = payments.groupby("SK_ID_CURR").agg(
    # Number of distinct previous credits with installment payments
    nb_inst_pay_credits=("SK_ID_PREV", "nunique"),
    # Number of past installment payment records
    nb_inst_payments=("SK_ID_PREV", "size"),
    # Most recent required installment / payment: max, not min, because the
    # day offsets grow towards the application date
    days_last_req_inst=("DAYS_INSTALMENT", "max"),
    days_last_inst_pay=("DAYS_ENTRY_PAYMENT", "max"),
    # Average and maximum payment delay
    avg_delay_inst_pay=("PAYMENT_DELAY", "mean"),
    max_delay_inst_pay=("PAYMENT_DELAY", "max"),
    # Average required installment and average actual payment
    avg_inst_req=("AMT_INSTALMENT", "mean"),
    avg_inst_pay=("AMT_PAYMENT", "mean"),
    # Number of late / incomplete payments
    nb_late_inst_pay=("IS_LATE", "sum"),
    nb_incomplete_inst_pay=("IS_INCOMPLETE", "sum"),
    # Highest installment calendar version seen
    max_reworked_inst=("NUM_INSTALMENT_VERSION", "max"),
)

# Ratio of average paid to average required; left as NaN unless the average
# required amount is strictly positive
inst_features["fr_inst_pay_req"] = (
    inst_features["avg_inst_pay"] / inst_features["avg_inst_req"]
).where(inst_features["avg_inst_req"] > 0)
# Fractions of late / incomplete payments (every group has at least one row)
inst_features["fr_late_inst_pay"] = inst_features["nb_late_inst_pay"] / inst_features["nb_inst_payments"]
inst_features["fr_incomplete_inst_pay"] = inst_features["nb_incomplete_inst_pay"] / inst_features["nb_inst_payments"]

# Align to the row order of application; clients without any installment
# history get NaN here (counts are turned into 0 below)
inst_features = inst_features.reindex(application["SK_ID_CURR"])

# One list per new feature, in application row order
nb_inst_pay_credits_list = inst_features["nb_inst_pay_credits"].fillna(0).astype(int).tolist()
nb_inst_payments_list = inst_features["nb_inst_payments"].fillna(0).astype(int).tolist()
days_last_req_inst_list = inst_features["days_last_req_inst"].tolist()
days_last_inst_pay_list = inst_features["days_last_inst_pay"].tolist()
avg_delay_inst_pay_list = inst_features["avg_delay_inst_pay"].tolist()
max_delay_inst_pay_list = inst_features["max_delay_inst_pay"].tolist()
avg_inst_req_list = inst_features["avg_inst_req"].tolist()
avg_inst_pay_list = inst_features["avg_inst_pay"].tolist()
fr_inst_pay_req_list = inst_features["fr_inst_pay_req"].tolist()
nb_late_inst_pay_list = inst_features["nb_late_inst_pay"].fillna(0).astype(int).tolist()
fr_late_inst_pay_list = inst_features["fr_late_inst_pay"].tolist()
nb_incomplete_inst_pay_list = inst_features["nb_incomplete_inst_pay"].fillna(0).astype(int).tolist()
fr_incomplete_inst_pay_list = inst_features["fr_incomplete_inst_pay"].tolist()
max_reworked_inst_list = inst_features["max_reworked_inst"].tolist()

Progress:100.0%

In [None]:
# Helper that appends the description of a new feature to homecredit_columns_description

def add_description(table, row, description):
    """
    Append one description row to the global homecredit_columns_description.

    The new row is written at label len(homecredit_columns_description) and
    holds five positional values, so the table must have exactly five columns
    (presumably index, Table, Row, Description, Special - confirm against the
    CSV). The first and last values are left missing.

    :param table: Name of the source table the feature was derived from
    :param row: Name of the feature (column) being described
    :param description: Human-readable description of the feature
    :return: The updated homecredit_columns_description DataFrame
    """
    # np.nan rather than np.NaN: the NaN alias was removed in NumPy 2.0
    list_description = [np.nan, table, row, description, np.nan]
    homecredit_columns_description.loc[len(homecredit_columns_description)] = list_description
    return homecredit_columns_description

In [None]:
# Attach the new installments_payments features to application and register
# a description for each of them.

# Must match the table name used when the descriptions are filtered
# (the previous "installment_payments.csv" spelling never matched it).
installments_table = "installments_payments.csv"

# (column name, values in application row order, description)
installment_features = [
    ("nb_inst_pay_credits_list", nb_inst_pay_credits_list, "Number of past installment payments credits"),
    ("nb_inst_payments_list", nb_inst_payments_list, "Number of past installment payments"),
    ("days_last_req_inst_list", days_last_req_inst_list, "Days since last required installment"),
    ("days_last_inst_pay_list", days_last_inst_pay_list, "Days since last installment payment"),
    ("avg_delay_inst_pay_list", avg_delay_inst_pay_list, "Average delay in installment payments"),
    ("max_delay_inst_pay_list", max_delay_inst_pay_list, "Maximum delay in installment payments"),
    ("avg_inst_req_list", avg_inst_req_list, "Average installment required"),
    ("avg_inst_pay_list", avg_inst_pay_list, "Average installment payments"),
    ("fr_inst_pay_req_list", fr_inst_pay_req_list, "Fraction installment payments over required"),
    ("nb_late_inst_pay_list", nb_late_inst_pay_list, "Number of late installment payments"),
    ("fr_late_inst_pay_list", fr_late_inst_pay_list, "Fraction of late installment payments"),
    ("nb_incomplete_inst_pay_list", nb_incomplete_inst_pay_list, "Number of incomplete installment payments"),
    # Previously described as "Fraction of late installment payments" (copy-paste slip)
    ("fr_incomplete_inst_pay_list", fr_incomplete_inst_pay_list, "Fraction of incomplete installment payments"),
    ("max_reworked_inst_list", max_reworked_inst_list, "Maximum number of reworked installments"),
]

for feature_name, feature_values, feature_description in installment_features:
    application[feature_name] = feature_values
    add_description(installments_table, feature_name, feature_description)


In [None]:
# Sanity check: shape of the application table after the new features are added

application.shape

(356255, 265)

In [None]:
# Save application (written to the current working directory, without the index column)

application.to_csv("application.csv", index=False)

In [None]:
# Save homecredit_columns_description (written to the current working directory, without the index column)

homecredit_columns_description.to_csv("homecredit_columns_description.csv", index=False)