In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the loan records from the Excel workbook (path is relative to the working directory).
loans = pd.read_excel('LTV.xlsx')

In [3]:
# Preview ten random rows; the fixed seed keeps the preview identical across re-runs.
loans.sample(10, random_state=42)

Unnamed: 0,cid,value,ins_value,ins_pct,start_date,status,is_insuance_paid,paid_body,interest,commis,fees
57185,4383555,20000,,,2020-05-20,Погашен,,20000.0,8000.0,2600.0,0.0
98617,3602251,5175,675.0,0.15,2020-06-20,Просрочен,0.0,,,,
12192,4616502,20700,2700.0,0.15,2020-11-04,Погашен,1.0,20700.0,2869.02,0.0,0.0
43156,4767704,4600,600.0,0.15,2020-12-27,Погашен,1.0,4600.0,0.0,0.0,0.0
48392,4073403,3450,450.0,0.15,2020-05-09,Погашен,1.0,3450.0,1863.0,390.0,45.36
26918,4493744,4000,,,2020-08-09,Погашен,,4000.0,0.0,0.0,0.0
93947,3906628,9200,1200.0,0.15,2020-10-30,Погашен,1.0,9200.0,4278.13,0.0,15.42
14561,4541796,7360,960.0,0.15,2020-09-03,Погашен,1.0,7360.0,1840.0,0.0,0.0
22005,4548012,9200,1200.0,0.15,2020-11-19,Погашен,1.0,9200.0,2879.52,0.0,0.0
103287,3017348,4000,,,2020-05-20,Погашен,,4000.0,118.8,520.0,0.0


In [4]:
# Print column dtypes, non-null counts and memory usage of the loaded frame.
loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109077 entries, 0 to 109076
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   cid               109077 non-null  int64         
 1   value             109077 non-null  int64         
 2   ins_value         39109 non-null   float64       
 3   ins_pct           39109 non-null   float64       
 4   start_date        109077 non-null  datetime64[ns]
 5   status            109077 non-null  object        
 6   is_insuance_paid  39109 non-null   float64       
 7   paid_body         93934 non-null   float64       
 8   interest          93934 non-null   float64       
 9   commis            93934 non-null   float64       
 10  fees              93934 non-null   float64       
dtypes: datetime64[ns](1), float64(7), int64(2), object(1)
memory usage: 9.2+ MB


In [5]:
# Count missing values in each column.
loans.isna().sum()

cid                     0
value                   0
ins_value           69968
ins_pct             69968
start_date              0
status                  0
is_insuance_paid    69968
paid_body           15143
interest            15143
commis              15143
fees                15143
dtype: int64

### Обработка пропусков

In [6]:
# Replace missing values with zeros in the insurance and repayment columns
# (ins_value, ins_pct, paid_body, interest, commis and fees).
zero_fill_cols = ['ins_value', 'ins_pct', 'paid_body', 'interest', 'commis', 'fees']
loans[zero_fill_cols] = loans[zero_fill_cols].fillna(0)

In [7]:
# Show the first ten rows to inspect the frame after the zero-fill.
loans.head(10)

Unnamed: 0,cid,value,ins_value,ins_pct,start_date,status,is_insuance_paid,paid_body,interest,commis,fees
0,4541262,13800,1800.0,0.15,2020-09-03,Погашен,1.0,13800.0,4756.36,0.0,0.0
1,4542542,5750,750.0,0.15,2020-09-04,Погашен,1.0,5750.0,920.0,0.0,0.0
2,4522946,8050,1050.0,0.15,2020-09-04,Погашен,1.0,8050.0,2898.0,0.0,0.0
3,4542687,5750,750.0,0.15,2020-09-04,Просрочен,0.0,0.0,1725.0,0.0,0.0
4,4541835,20700,2700.0,0.15,2020-09-04,Погашен,1.0,20700.0,5153.82,0.0,0.0
5,4541962,4600,600.0,0.15,2020-09-04,Просрочен,0.0,0.0,0.0,0.0,0.0
6,3350059,1955,255.0,0.15,2020-09-04,Просрочен,0.0,0.0,0.0,0.0,0.0
7,4441493,10925,1425.0,0.15,2020-09-04,Погашен,1.0,10925.0,2294.25,0.0,0.0
8,4371211,2300,300.0,0.15,2020-09-04,Погашен,1.0,2300.0,633.22,0.0,0.0
9,4543967,12650,1650.0,0.15,2020-09-05,Погашен,1.0,12650.0,7969.5,0.0,159.39


In [8]:
# Re-count missing values per column to verify the fill.
loans.isna().sum()

cid                     0
value                   0
ins_value               0
ins_pct                 0
start_date              0
status                  0
is_insuance_paid    69968
paid_body               0
interest                0
commis                  0
fees                    0
dtype: int64

### Расчет дополнительных показателей

In [9]:
# Multiplier applied to the insurance premium to get the cost of the insurance.
INS_COST_RATE = 0.09

# Split the requested loan amount and the insurance premium into separate columns:
# `value` is loaded with the premium included, so subtract it to leave the loan amount.
# The subtraction is guarded by the presence of `ins_cost` (created just below) so that
# re-running this cell does not subtract the premium a second time.
if 'ins_cost' not in loans.columns:
    loans['value'] = loans['value'] - loans['ins_value']

# Cost of the insurance; loans without insurance have ins_value == 0 after the
# zero-fill, so their cost is 0 as well.
loans['ins_cost'] = loans['ins_value'] * INS_COST_RATE

In [10]:
# Reorder the columns so that ins_cost sits right after ins_value.
# .copy() makes the result an independent frame, so adding columns to `loans`
# afterwards does not raise SettingWithCopyWarning.
column_order = ['cid', 'value', 'ins_value', 'ins_cost', 'ins_pct', 'start_date',
                'status', 'is_insuance_paid', 'paid_body', 'interest', 'commis', 'fees']
loans = loans[column_order].copy()

In [11]:
# Amount counted against each loan: loan amount plus the cost of the insurance.
outlay = loans['value'].add(loans['ins_cost'])
# Everything recorded as paid on the loan: body, interest, commissions and fees.
receipts = (
    loans['paid_body']
    .add(loans['interest'])
    .add(loans['commis'])
    .add(loans['fees'])
)

loans['client_get'] = outlay
loans['client_pay'] = receipts
# Income per loan is the difference between the two totals.
loans['income'] = receipts.sub(outlay)

In [12]:
# Keep only the fields needed for the LTV calculation. .copy() yields an independent
# frame, so adding a column below does not write into a slice of `loans`.
ltv_data = loans[['cid', 'start_date', 'income']].copy()

# Truncate each start date to the first day of its month.
# Series.astype('datetime64[M]') is rejected by pandas >= 2.0, so convert through a
# monthly Period and back to a timestamp (default how='start' gives the month start).
ltv_data['loan_month'] = ltv_data['start_date'].dt.to_period('M').dt.to_timestamp()

In [13]:
ltv_data.head(10)

Unnamed: 0,cid,start_date,income,loan_month
0,4541262,2020-09-03,6394.36,2020-09-01
1,4542542,2020-09-04,1602.5,2020-09-01
2,4522946,2020-09-04,3853.5,2020-09-01
3,4542687,2020-09-04,-3342.5,2020-09-01
4,4541835,2020-09-04,7610.82,2020-09-01
5,4541962,2020-09-04,-4054.0,2020-09-01
6,3350059,2020-09-04,-1722.95,2020-09-01
7,4441493,2020-09-04,3591.0,2020-09-01
8,4371211,2020-09-04,906.22,2020-09-01
9,4543967,2020-09-05,9630.39,2020-09-01


### Поиск уникальных заемщиков

In [14]:
# For every client, take the month of the chronologically first loan.
loans_by_client_and_date = ltv_data.sort_values(by=['cid', 'start_date'])
first_loans = (
    loans_by_client_and_date
    .groupby(by='cid')['loan_month']
    .first()
    .rename('first_loan_month')
    .reset_index()
)
first_loans

Unnamed: 0,cid,first_loan_month
0,8417,2020-08-01
1,38555,2020-01-01
2,42769,2020-12-01
3,53567,2020-02-01
4,55114,2020-05-01
...,...,...
109072,4847348,2021-01-01
109073,4847353,2021-01-01
109074,4847378,2021-01-01
109075,4847382,2021-01-01


In [15]:
# Count rows per client id and show the ten largest counts.
loans['cid'].value_counts().head(10)

4722690    1
4480301    1
4775758    1
4519755    1
4515657    1
4759366    1
2918213    1
4241219    1
4763456    1
4415294    1
Name: cid, dtype: int64