# Работа с параметрами

## Чтение данных

In [1]:
import pandas as pd
# Show every row when a frame or series is displayed (no truncation).
pd.options.display.max_rows = None

In [2]:
# Load the loans table and preview the first rows, one column per output line.
source_path = 'lending_data/accepted_clean.csv'
accepted = pd.read_csv(source_path)
accepted.head().T

Unnamed: 0,0,1,2,3,4
loan_amnt,3600.0,24700.0,10400.0,20000.0,20000.0
installment,123.03,820.28,289.91,637.58,631.26
sub_grade,14,11,26,7,6
emp_length,10,10,3,10,10
home_ownership,MORTGAGE,MORTGAGE,MORTGAGE,MORTGAGE,MORTGAGE
annual_inc,55000.0,65000.0,104433.0,180000.0,85000.0
verification_status,Not Verified,Not Verified,Source Verified,Not Verified,Not Verified
issue_d,2015-12-01,2015-12-01,2015-12-01,2015-12-01,2015-12-01
purpose,debt_consolidation,small_business,major_purchase,debt_consolidation,major_purchase
addr_state,PA,SD,PA,MN,SC


## Конвертация существующих параметров

### Конвертируем даты

In [3]:
# Columns that hold dates and need to be parsed into datetime64 values.
dates_cols = ['issue_d', 'earliest_cr_line', 'last_credit_pull_d']

# Parse each listed column with pd.to_datetime and write the results back.
accepted[dates_cols] = accepted[dates_cols].apply(pd.to_datetime)
accepted[dates_cols].dtypes

issue_d               datetime64[ns]
earliest_cr_line      datetime64[ns]
last_credit_pull_d    datetime64[ns]
dtype: object

## Синтетические параметры

In [4]:
# List the available columns before deriving new features from them.
accepted.columns

Index(['loan_amnt', 'installment', 'sub_grade', 'emp_length', 'home_ownership',
       'annual_inc', 'verification_status', 'issue_d', 'purpose', 'addr_state',
       'dti', 'delinq_2yrs', 'earliest_cr_line', 'fico_range_low',
       'fico_range_high', 'inq_last_6mths', 'open_acc', 'pub_rec', 'total_acc',
       'initial_list_status', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'acc_now_delinq', 'tot_coll_amt',
       'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths',
       'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths',
       'delinq_amnt', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op',
       'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',
       'mths_since_recent_bc', 'mths_since_recent_inq',
       'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
       'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
       'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m',
       'num_tl_30dpd', 'nu

### Средний FICO

In [5]:
# Midpoint of the reported FICO range: (low + high) / 2.
fico_low = accepted['fico_range_low']
fico_high = accepted['fico_range_high']
accepted['fico_range_mean'] = (fico_low + fico_high) / 2

### Datetime: extract year and month

In [6]:
# Date columns for which separate `<col>_year` and `<col>_month` features are added.
dates_cols = ['issue_d', 'earliest_cr_line', 'last_credit_pull_d']

for col in dates_cols:
    # Parse the column once and reuse it for both features instead of
    # calling pd.to_datetime twice per column.
    # NOTE: this must run before the date columns are replaced by integer
    # day counts later in the notebook; pd.to_datetime would then read those
    # integers as epoch offsets rather than dates.
    parsed = pd.to_datetime(accepted[col])
    accepted[f'{col}_year'] = parsed.dt.year
    accepted[f'{col}_month'] = parsed.dt.month

### Loan Issue Season (calendar quarter)

In [7]:
# The "season" feature is stored as the calendar quarter (1-4) of the issue date.
issue_dates = accepted['issue_d']
accepted['issue_d_season'] = issue_dates.dt.quarter

## Представление данных

### Переведем колонки с датами к целочисленному виду

In [8]:
# Replace each date column with the number of whole days since 1900-01-01.
dates_cols = ['issue_d', 'earliest_cr_line', 'last_credit_pull_d']
reference_date = pd.to_datetime('1900-01-01')

for col in dates_cols:
    elapsed = accepted[col] - reference_date
    accepted[col] = elapsed.dt.days
accepted[dates_cols].dtypes

issue_d               int64
earliest_cr_line      int64
last_credit_pull_d    int64
dtype: object

### Переведем оставшиеся категориальные колонки к целочисленному виду

In [9]:
# Integer-encode every remaining object-dtype (string) column.
str_cols = accepted.select_dtypes(include=['object']).columns.tolist()

for name in str_cols:
    as_category = accepted[name].astype('category')
    # Category codes number the sorted categories from 0.
    # NOTE(review): pandas assigns code -1 to missing values, so NaNs in
    # these columns become a regular integer here - confirm this is intended.
    accepted[name] = as_category.cat.codes

In [10]:
# Preview the first rows after encoding, one column per output line.
accepted.head().T

Unnamed: 0,0,1,2,3,4
loan_amnt,3600.0,24700.0,10400.0,20000.0,20000.0
installment,123.03,820.28,289.91,637.58,631.26
sub_grade,14,11,26,7,6
emp_length,10,10,3,10,10
home_ownership,1,1,1,1,1
annual_inc,55000.0,65000.0,104433.0,180000.0,85000.0
verification_status,0,0,1,0,0
issue_d,42337,42337,42337,42337,42337
purpose,2,11,6,2,6
addr_state,38,41,38,23,40


## Моделирование целевой переменной

В рамках нашей задачи мы рассматриваем базовое решение и проводим эксперименты.
В рамках базового решения мы будем рассматривать событие как наступление дефолта в первые 12 месяцев.

In [11]:
# Baseline target: the event occurred AND its duration is below 12.
# NOTE(review): `duration` is presumably measured in months, and the strict
# `< 12` excludes duration == 12 - confirm this matches "first 12 months".
event_flag = accepted['event']
short_duration = accepted['duration'] < 12
accepted['baseline_event'] = event_flag & short_duration
accepted[['baseline_event', 'event', 'duration']].head()

Unnamed: 0,baseline_event,event,duration
0,False,False,36
1,False,False,36
2,False,False,60
3,False,False,36
4,False,False,36


In [12]:
# Drop, in place, every row that still contains at least one missing value.
# NOTE(review): string columns were integer-encoded earlier in the notebook,
# where missing values become -1 rather than NaN, so such rows are not
# removed by this step - confirm this is intended.
accepted.dropna(axis=0, how='any', inplace=True)
accepted.head().T

Unnamed: 0,0,1,2,3,4
loan_amnt,3600.0,24700.0,10400.0,20000.0,20000.0
installment,123.03,820.28,289.91,637.58,631.26
sub_grade,14,11,26,7,6
emp_length,10,10,3,10,10
home_ownership,1,1,1,1,1
annual_inc,55000.0,65000.0,104433.0,180000.0,85000.0
verification_status,0,0,1,0,0
issue_d,42337,42337,42337,42337,42337
purpose,2,11,6,2,6
addr_state,38,41,38,23,40


Разделим данные для базового решения и для экспериментов.

In [13]:
# Experiment dataset: keeps the raw `event` column, without the baseline target.
accepted_se = accepted.drop('baseline_event', axis=1)
# Baseline dataset: keeps `baseline_event`, without the raw `event` column.
accepted_base = accepted.drop('event', axis=1)

## Выгрузка данных

In [14]:
# Write both feature sets to CSV without the index column.
export_targets = {
    'lending_data/accepted_se_features.csv': accepted_se,
    'lending_data/accepted_base_features.csv': accepted_base,
}
for out_path, frame in export_targets.items():
    frame.to_csv(out_path, index=False)