## 4 - Модуль предупреждающих сигналов

### 4.1 - Библиотеки

In [1]:
import pandas as pd
import numpy as np

from _utils.text_preporation import master_scale
from _utils.qualitative_features_analysis import comprehensive_feature_analysis, recommend_features_to_drop

import yaml

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Never truncate wide DataFrames when they are displayed: show every column.
pd.options.display.max_columns = None

### 4.2 - Пути

In [3]:
# Load the project configuration. The encoding is pinned to UTF-8 so that parsing
# does not depend on the platform's locale default (which differs between OSes).
with open('../CONFIGS.yaml', 'r', encoding='utf-8') as config_file:
    CONFIG = yaml.safe_load(config_file)

# Sub-tree of the configuration that holds the data locations used in this notebook.
PATHS = CONFIG['data_paths']

In [4]:
# Short aliases for the input/output locations taken from the PATHS configuration.
filepath = dict(
    warnings_processed=PATHS['module_warnings']['processed_data'],
    warnings_raw=PATHS['module_warnings']['raw_data'],
    rating_weights=PATHS['ratings']['weights'],
    youngs_data=PATHS['case_youngs']['processed_data'],
    master_scale=PATHS['ratings']['master_scale'],
    final_signals=PATHS['module_warnings']['final'],
)

In [5]:
# Column with the default flag and column with the observation weight.
target_col, weight_col = 'target', 'W_norm'

# Key columns plus target and weight; they are kept next to the signal columns.
id_cols = ['client_id', 'rating_id', target_col, weight_col]

# When True, the signals flagged in section 4.6.2 are removed from the selection in 4.6.3.
EXCLUDE_SIGNALS = False

### 4.3 - Чтение файлов

In [6]:
# Processed signal observations.
df = pd.read_parquet(filepath['warnings_processed'])

# Raw signal reference table; it also provides the code -> description lookup.
df_warnings = pd.read_excel(filepath['warnings_raw'])
warnings_names = (
    df_warnings[['Показатель', 'Код интеграции']]
    .set_axis(['definition', 'code'], axis=1)
)

df_weights = pd.read_parquet(filepath['rating_weights'])
# NOTE(review): this path is read straight from PATHS rather than through `filepath`.
df_dpd = pd.read_parquet(PATHS['risk']['dpd_data'])

# Master scale with its Russian column headers mapped to short English names.
master_column_names = {
    'Рейтинг': 'rating',
    'Нижний предел': 'lower_bound',
    'Вероятность Дефолта': 'pd',
    'Верхний предел': 'upper_bound',
}
master = pd.read_excel(filepath['master_scale']).rename(columns=master_column_names)

### 4.4 - Формирование датасета

In [7]:
# Keep only the signal code and its rating column from the reference table, then
# label each signal: 'additive' when current_rating < 17, otherwise 'target'
# (a missing current_rating also falls into 'target', since the comparison is False).
# NOTE(review): the meaning of the threshold 17 is not documented here - confirm.
df_warnings = (
    df_warnings
    .rename(columns={'Код интеграции': 'signal', 'Ухудшение рейтинга': 'current_rating'})
    .loc[:, ['signal', 'current_rating']]
    .assign(
        signal_type=lambda ref: np.where(ref['current_rating'] < 17, 'additive', 'target')
    )
)

In [8]:
# Number of signals per type; missing values are counted as their own category.
df_warnings['signal_type'].value_counts(dropna=False)

signal_type
target      32
additive    10
Name: count, dtype: int64

In [9]:
# Attach rating weights (matched on client and rating) and then the DPD data
# (matched on rating only). Both joins are inner joins, so unmatched rows are dropped.
df_signals = (
    df
    .merge(df_weights, on=['client_id', 'rating_id'], how='inner')
    .merge(df_dpd, on=['rating_id'], how='inner')
)

df_signals.head(3)

Unnamed: 0,client_id,rating_id,kws1,kws2,kws3,kws4,kws5,kws7,kws8,kws9,kws10,kws11,kws12,kws14,kws17,kws18,DT0800,kws26,kws27,kws25,DT0100,DT0200,DT0400,DT0500,DT0600,DT0700,DT0700_1,DT0700_2,DT0700_3,DT0700_4,DT0700_5,DT0700_6,DT0700_7,DT0700_8,DT0700_9,DT0700_10,DT0900,DT1000,rating_assignment_date,target,valid_date,weight_day,weight_1,weight_2,W,weight_day_norm,weight_1_norm,weight_2_norm,W_norm,dpd
0,176783,1737,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,2018-12-12,0,2019-12-12,0.003344,2.92092,1.30492,3.219233,0.000296,0.590623,1.30492,2.843321,0
1,197185,1048,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,2018-08-14,0,2019-08-14,0.010526,3.503319,1.30492,3.861112,0.000931,0.491416,1.30492,2.365725,0
2,197255,912,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,Нет,2018-07-30,0,2018-12-19,0.015873,2.381363,1.30492,2.624572,0.001404,0.210654,1.30492,1.014112,0


### 4.5 - Определение предсигналов

#### 4.5.1 - Обработка основных предсигналов

In [10]:
# Signal columns are recognised by name: anything containing 'kws' or 'DT'.
warnings_cols = [
    name for name in df_signals.columns
    if any(marker in name for marker in ('kws', 'DT'))
]
warnings_cols

['kws1',
 'kws2',
 'kws3',
 'kws4',
 'kws5',
 'kws7',
 'kws8',
 'kws9',
 'kws10',
 'kws11',
 'kws12',
 'kws14',
 'kws17',
 'kws18',
 'DT0800',
 'kws26',
 'kws27',
 'kws25',
 'DT0100',
 'DT0200',
 'DT0400',
 'DT0500',
 'DT0600',
 'DT0700',
 'DT0700_1',
 'DT0700_2',
 'DT0700_3',
 'DT0700_4',
 'DT0700_5',
 'DT0700_6',
 'DT0700_7',
 'DT0700_8',
 'DT0700_9',
 'DT0700_10',
 'DT0900',
 'DT1000']

In [11]:
# Narrow the frame to keys/target/weight, the signal columns and 'dpd'; work on an independent copy.
df_signals = df_signals.loc[:, [*id_cols, *warnings_cols, 'dpd']].copy()

In [12]:
# Turn the textual yes/no flags into integers: missing -> 0, 'да' -> 1, 'нет' -> 0.
# Values are lower-cased and stripped first; anything else must already be parseable
# as an integer, otherwise astype(int) raises.
for signal_col in warnings_cols:
    cleaned = (
        df_signals[signal_col]
        .fillna('0')
        .apply(lambda raw: str(raw).lower().strip())
    )
    df_signals[signal_col] = cleaned.replace({'да': '1', 'нет': '0'}).astype(int)

#### 4.5.2 - Новые предсигналы по просрочке

In [13]:
# Two extra binary signals derived from 'dpd': 1 when the value lies in the
# inclusive range 5..15 / 16..29, otherwise 0 (a missing dpd also gives 0).
df_signals['5_15_dpd'] = np.where((df_signals['dpd'] >= 5) & (df_signals['dpd'] <= 15), 1, 0)
df_signals['16_29_dpd'] = np.where((df_signals['dpd'] >= 16) & (df_signals['dpd'] <= 29), 1, 0)

# Register the new signals only once: the original `+=` appended the names again on
# every re-run of this cell, which later produced duplicated signal columns.
dpd_signal_cols = ['5_15_dpd', '16_29_dpd']
warnings_cols = warnings_cols + [name for name in dpd_signal_cols if name not in warnings_cols]

### 4.6 - Анализ предсигналов

#### 4.6.1 - Подсчет метрик вариативности

In [14]:
# Run the project's feature analysis over every signal against the target, then
# attach the human-readable signal description (left join on the signal code).
signals_analysis = (
    comprehensive_feature_analysis(
        df=df_signals,
        features=warnings_cols,
        target_col=target_col,
        verbose=False,
    )
    .merge(warnings_names, left_on='feature', right_on='code', how='left')
)

#### 4.6.2 - Определение незначимых по метрикам предсигналов

In [15]:
# Candidates for exclusion must satisfy all three conditions at once:
# at most one unique value, zero default-rate spread and zero information value.
signals_constants = signals_analysis.query('unique_values<=1').feature.values.tolist()
signals_with_no_def_spread = signals_analysis.query('default_rate_spread == 0 ')['feature'].values
signals_with_no_IV = signals_analysis.query('information_value == 0 ')['feature'].values
excluded_signals = (
    set(signals_constants)
    & set(signals_with_no_def_spread)
    & set(signals_with_no_IV)
)
print(f'Сигналы на рассмотрение на исключение ({len(excluded_signals)} штук из {len(warnings_cols)})')
# Set iteration order varies between kernel sessions; print sorted so the output is reproducible.
print(sorted(excluded_signals))

Сигналы на рассмотрение на исключение (22 штук из 38)
['DT1000', 'DT0700_10', 'DT0700_2', 'DT0700_9', 'DT0700_5', 'DT0700_3', 'DT0400', 'DT0700_1', 'kws11', 'DT0700_4', 'kws5', 'DT0700', 'DT0700_8', 'DT0200', 'DT0100', 'DT0500', 'kws7', 'kws25', 'DT0700_7', 'DT0900', 'DT0800', 'DT0700_6']


#### 4.6.3 - Выбор полного или сокращенного набора предсигналов

In [16]:
# Use the full signal list unless EXCLUDE_SIGNALS asks for the exclusion candidates to be dropped.
if EXCLUDE_SIGNALS:
    selected_signals = [name for name in warnings_cols if name not in excluded_signals]
else:
    selected_signals = warnings_cols

print(f'{len(selected_signals)} / {len(warnings_cols)}')

38 / 38


In [17]:
# Only when signals were excluded: show the key metrics of the ones that remain.
if EXCLUDE_SIGNALS:
    report_cols = ['feature', 'unique_values', 'information_value', 'default_rate_spread', 'definition']
    is_selected = signals_analysis['feature'].isin(selected_signals)
    display(signals_analysis.loc[is_selected, report_cols])

### 4.7 - Анализ текущих и переподсчитанных рейтингов для каждого сигнала на основании срабатываний и дефолтов на текущих анализируемых данных

#### 4.7.1 - Подсчет

In [18]:
columns = warnings_cols
n_observations = df_signals.shape[0]

# One summary record per signal: how often it fired and how many defaults
# (as counts and as summed weights) fell on the rows where it fired.
signal_records = []
for signal in columns:
    triggered = df_signals[df_signals[signal] == 1]
    triggered_defaults = triggered[triggered[target_col] == 1]
    signal_records.append({
        'signal': signal,
        'total_observations': n_observations,
        'signal_triggered_count': triggered.shape[0],
        'defaults_when_triggered': triggered[target_col].sum(),
        'total_weight_triggered': triggered[weight_col].sum(),
        'default_weight_triggered': triggered_defaults[weight_col].sum(),
    })

w = pd.DataFrame(
    signal_records,
    columns=[
        'signal',
        'total_observations',
        'signal_triggered_count',
        'defaults_when_triggered',
        'total_weight_triggered',
        'default_weight_triggered',
    ],
)

# Bring in the rating and signal type from the reference table.
w = w.merge(df_warnings, how='left', on=['signal'])

# Unweighted rating: default rate among triggered rows (0 when the signal never fired),
# passed through master_scale - presumably it maps a rate to a master-scale grade; confirm in _utils.
w['triggered_dr'] = (w['defaults_when_triggered'] / w['signal_triggered_count']).fillna(0)
w['unweighted_rating'] = w['triggered_dr'].apply(lambda rate: master_scale(rate, master))

# Weighted rating: same calculation on summed weights instead of row counts.
w['triggered_weighted_dr'] = (w['default_weight_triggered'] / w['total_weight_triggered']).fillna(0)
w['weighted_rating'] = w['triggered_weighted_dr'].apply(lambda rate: master_scale(rate, master))

# Signals missing from the reference table are treated as 'target'.
w['signal_type'] = w['signal_type'].fillna('target')

# Extra metric: coverage = share of all defaults in the data that the signal caught.
total_defaults = df_signals[target_col].sum()
w['coverage'] = w['defaults_when_triggered'] / total_defaults

# Sort (descending) for readability and append the signal description.
sort_keys = ['coverage', 'current_rating', 'unweighted_rating', 'weighted_rating']
w = (
    w.sort_values(sort_keys, ascending=False)
     .merge(warnings_names, left_on='signal', right_on='code', how='left')
     .drop(columns='code')
)
w

Unnamed: 0,signal,total_observations,signal_triggered_count,defaults_when_triggered,total_weight_triggered,default_weight_triggered,current_rating,signal_type,triggered_dr,unweighted_rating,triggered_weighted_dr,weighted_rating,coverage,definition
0,kws14,4984,31,24,28.998697,20.654883,25,target,0.774194,25.0,0.712269,25.0,0.096386,Просрочка платежа от 30 до 60 календарных дней...
1,5_15_dpd,4984,82,23,50.229038,10.357334,23,target,0.280488,23.0,0.206202,22.0,0.092369,Просрочка платежа от 5 до 16 календарных дней ...
2,16_29_dpd,4984,22,15,15.74883,11.00012,25,target,0.681818,25.0,0.698472,25.0,0.060241,Просрочка платежа от 16 до 30 календарных дней...
3,kws1,4984,11,8,9.416465,7.259177,2,additive,0.727273,25.0,0.770903,25.0,0.032129,Наложение кредиторами ареста на расчетные счет...
4,kws9,4984,23,4,23.938444,2.948196,23,target,0.173913,21.0,0.123157,20.0,0.016064,Существенное нарушение условий кредитной докум...
5,kws4,4984,30,4,35.133426,5.494774,3,additive,0.133333,20.0,0.156397,21.0,0.016064,Наличие негативной информации о деловой репута...
6,kws17,4984,66,4,59.075191,4.922502,2,additive,0.060606,18.0,0.083326,19.0,0.016064,"Наличие просрочек по налоговым выплатам, несущ..."
7,kws3,4984,3,3,1.59802,1.59802,2,additive,1.0,26.0,1.0,26.0,0.012048,Нестабильное поведение уполномоченных представ...
8,kws18,4984,6,2,4.662187,1.366822,24,target,0.333333,23.0,0.293172,23.0,0.008032,"Наличие просрочек по налоговым выплатам, сущес..."
9,DT0600,4984,1,1,1.120919,1.120919,26,target,1.0,26.0,1.0,26.0,0.004016,Реструктуризация: реструктуризация обязательст...


#### 4.7.2 - Сравнение

In [19]:
# Selected signals whose current rating is lower than the larger of the two recomputed ratings.
best_recomputed_rating = w[['unweighted_rating', 'weighted_rating']].max(axis=1)
w[w['current_rating'] < best_recomputed_rating].query('signal in @selected_signals')

Unnamed: 0,signal,total_observations,signal_triggered_count,defaults_when_triggered,total_weight_triggered,default_weight_triggered,current_rating,signal_type,triggered_dr,unweighted_rating,triggered_weighted_dr,weighted_rating,coverage,definition
3,kws1,4984,11,8,9.416465,7.259177,2,additive,0.727273,25.0,0.770903,25.0,0.032129,Наложение кредиторами ареста на расчетные счет...
5,kws4,4984,30,4,35.133426,5.494774,3,additive,0.133333,20.0,0.156397,21.0,0.016064,Наличие негативной информации о деловой репута...
6,kws17,4984,66,4,59.075191,4.922502,2,additive,0.060606,18.0,0.083326,19.0,0.016064,"Наличие просрочек по налоговым выплатам, несущ..."
7,kws3,4984,3,3,1.59802,1.59802,2,additive,1.0,26.0,1.0,26.0,0.012048,Нестабильное поведение уполномоченных представ...
10,kws27,4984,1,1,1.803688,1.803688,19,target,1.0,26.0,1.0,26.0,0.004016,"Отсутствие выручки более, чем за одну отчетную..."
11,kws26,4984,7,1,8.863565,1.803688,17,target,0.142857,21.0,0.203495,22.0,0.004016,Отсутствие выручки за одну отчетную дату из пя...
12,kws12,4984,9,1,6.948239,0.09499,5,additive,0.111111,20.0,0.013671,13.0,0.004016,Существенное ухудшение финансового положения и...


In [20]:
# Selected signals of type 'target' whose current rating is not 26.
w[
    w['signal'].isin(selected_signals)
    & (w['signal_type'] == 'target')
    & (w['current_rating'] != 26)
]

Unnamed: 0,signal,total_observations,signal_triggered_count,defaults_when_triggered,total_weight_triggered,default_weight_triggered,current_rating,signal_type,triggered_dr,unweighted_rating,triggered_weighted_dr,weighted_rating,coverage,definition
0,kws14,4984,31,24,28.998697,20.654883,25,target,0.774194,25.0,0.712269,25.0,0.096386,Просрочка платежа от 30 до 60 календарных дней...
1,5_15_dpd,4984,82,23,50.229038,10.357334,23,target,0.280488,23.0,0.206202,22.0,0.092369,Просрочка платежа от 5 до 16 календарных дней ...
2,16_29_dpd,4984,22,15,15.74883,11.00012,25,target,0.681818,25.0,0.698472,25.0,0.060241,Просрочка платежа от 16 до 30 календарных дней...
4,kws9,4984,23,4,23.938444,2.948196,23,target,0.173913,21.0,0.123157,20.0,0.016064,Существенное нарушение условий кредитной докум...
8,kws18,4984,6,2,4.662187,1.366822,24,target,0.333333,23.0,0.293172,23.0,0.008032,"Наличие просрочек по налоговым выплатам, сущес..."
10,kws27,4984,1,1,1.803688,1.803688,19,target,1.0,26.0,1.0,26.0,0.004016,"Отсутствие выручки более, чем за одну отчетную..."
11,kws26,4984,7,1,8.863565,1.803688,17,target,0.142857,21.0,0.203495,22.0,0.004016,Отсутствие выручки за одну отчетную дату из пя...
32,kws10,4984,3,0,6.706056,0.0,25,target,0.0,,0.0,,0.0,"Имеются доказательства мошенничества, по инфор..."


### 4.8 - Финальные предсигналы

In [21]:
# Signals removed from the final set regardless of the analysis above.
warnings_cols_to_drop = ['kws13', 'kws15', 'DT0300', 'DT1100', 'DT0100']
selected_signals = [name for name in selected_signals if name not in warnings_cols_to_drop]

# Regroup by name pattern: 'kws' first, then 'dpd', then 'DT'; order inside a group is kept.
# A name matching none of the patterns is left out; one matching several appears in each group.
selected_signals = [
    name
    for pattern in ('kws', 'dpd', 'DT')
    for name in selected_signals
    if pattern in name
]
selected_signals

['kws1',
 'kws2',
 'kws3',
 'kws4',
 'kws5',
 'kws7',
 'kws8',
 'kws9',
 'kws10',
 'kws11',
 'kws12',
 'kws14',
 'kws17',
 'kws18',
 'kws26',
 'kws27',
 'kws25',
 '5_15_dpd',
 '16_29_dpd',
 'DT0800',
 'DT0200',
 'DT0400',
 'DT0500',
 'DT0600',
 'DT0700',
 'DT0700_1',
 'DT0700_2',
 'DT0700_3',
 'DT0700_4',
 'DT0700_5',
 'DT0700_6',
 'DT0700_7',
 'DT0700_8',
 'DT0700_9',
 'DT0700_10',
 'DT0900',
 'DT1000']

### 4.9 - Сохранение предсигналов

In [22]:
# Final number of signals kept versus the number analysed.
print(f'{len(selected_signals)} / {len(warnings_cols)}')

37 / 38


In [23]:
# Output layout: the two key columns followed by the final signals (target, weight and dpd are dropped).
df_signals = df_signals.loc[:, ['client_id', 'rating_id', *selected_signals]]

In [24]:
# Persist the final signal table as parquet at the configured 'final_signals' location.
df_signals.to_parquet(filepath['final_signals'])