In [32]:
!pip install optuna -q

### Загрузка данных

In [33]:
import pandas as pd

# чтение файлов
train_df = pd.read_parquet('train.parquet')
test_df = pd.read_parquet('test.parquet')

# отбросим столбец id
train_df.drop(["id"], axis=1, inplace=True)

In [34]:
train_df['okved'] = train_df['okved'].astype('category')

## Обработка данных

### Обработка категориальных признаков

In [35]:
cat_cols = [
    'channel_code', 'city', 'city_type',
    'index_city_code', 'ogrn_month', 'ogrn_year',
    'branch_code', 'okved', 'segment'
]

train_df[cat_cols] = train_df[cat_cols].astype('category')
test_df[cat_cols] = test_df[cat_cols].astype('category')

### Обработка пустых значений

### Признаки с малым кол-вом пропусков заменяем медианой или наиболее часто встречающимся значением

### Признаки с большим количеством пропусков
Первые топ-10 признаков по пропускам:

max_end_plan_non_fin_deals: Кол-во месяцев до максимальной плановой даты закрытия среди текущих сделок по всем срочным продуктам
min_end_plan_non_fin_deals: Кол-во месяцев до минимальной плановой даты закрытия среди текущих сделок по всем срочным продуктам
min_start_non_fin_deals: Кол-во месяцев до минимальной даты открытия среди текущих сделок по всем срочным продуктам
max_start_non_fin_deals: Кол-во месяцев до максимальной даты закрытия среди текущих сделок по всем срочным продуктам

max_end_fact_fin_deals: Кол-во месяцев до максимальной фактической даты закрытия среди закрытых сделок по всем срочным продуктам
max_start_fin_deals: Кол-во месяцев до максимальной даты закрытия среди закрытых сделок по всем срочным продуктам
min_end_fact_fin_deals: Кол-во месяцев до минимальной фактической даты закрытия среди закрытых сделок по всем срочным продуктам
min_start_fin_deals: Кол-во месяцев до минимальной даты открытия среди закрытых сделок по всем срочным продуктам

max_founderpres: Количество дней, прошедших с первой даты регистрации огрн
min_founderpres: Количество дней, прошедших с последней даты регистрации огрн

Заметим, что в этих признаках процент пропусков составляет от 86-93%
Поэтому просто отбросим эти признаки, а признак регистрации ОГРН сделаем бинарным - зарегестрировано/незарегистрированно


In [36]:
train_df.drop(["max_end_plan_non_fin_deals",
               "min_end_plan_non_fin_deals",
               "max_start_non_fin_deals",
               "min_start_non_fin_deals",
               "max_end_fact_fin_deals",
               "max_start_fin_deals",
               "min_end_fact_fin_deals",
               "min_start_fin_deals",
               ], axis=1, inplace=True)

test_df.drop(["max_end_plan_non_fin_deals",
              "min_end_plan_non_fin_deals",
              "max_start_non_fin_deals",
              "min_start_non_fin_deals",
              "max_end_fact_fin_deals",
              "max_start_fin_deals",
              "min_end_fact_fin_deals",
              "min_start_fin_deals",
              ], axis=1, inplace=True)

Если значение min(max)_founderpres - NaN значит человек не регистрировал себе ОГРН, иначе регистрировал.

In [37]:
train_df["ogrn_reg"] = (train_df["max_founderpres"].isna()*1 - 1) * -1
train_df[["ogrn_reg", "max_founderpres"]]

Unnamed: 0,ogrn_reg,max_founderpres
0,1,-0.963860
1,0,
2,1,-0.271164
3,0,
4,0,
...,...,...
299995,1,-0.754824
299996,1,0.549400
299997,0,
299998,0,


Проделаем тоже самое для тестовых данных

In [38]:
test_df["ogrn_reg"] = (test_df["max_founderpres"].isna()*1 - 1) * -1
test_df.drop(["max_founderpres", "min_founderpres"], axis=1, inplace=True)

train_df["ogrn_reg"] = train_df["ogrn_reg"].astype("category")
test_df["ogrn_reg"] = test_df["ogrn_reg"].astype("category")
# train_df[cat_indexes] = train_df[cat_indexes].astype("category")
# test_df[cat_indexes] = test_df[cat_indexes].astype("category")

Проверим, остались ли пропуски после обработки

В данных есть признак - index_city_code	код города в почтовом индексе. Так как мы закодировали города, можно избавиться от этого признака, он будет создавать лишние зависимости

In [39]:
train_df.drop(['max_founderpres', 'min_founderpres'], axis=1, inplace=True)

In [40]:
train_df.drop(['index_city_code'], axis=1, inplace=True)

# повторим для тестовых данных
test_df.drop(['index_city_code'], axis=1, inplace=True)

Заметим, что в наших данных есть такие признаки как sum_a_oper_1m и cnt_a_oper_1m - сумма операций типа А за месяц и их количество. Это распространяется и на другие типы операций. Сконструируем новый признак sum/cnt - средняя сумма операции типа A в месяц. Распространим эту логику на все типы операций.

In [41]:
train_nans = train_df.isna().sum().sort_values(ascending=False).loc[lambda x: x > 0]
#в обучающих данных
train_nans = train_nans.loc[lambda x: x > 0]
smaller_train_nans = train_nans

for i in smaller_train_nans.index:
    if train_df[i].dtype == "object" or train_df[i].dtype == "category":
        train_df[i].loc[train_df[i].isna()] = train_df[i].value_counts().sort_values(ascending=False).index[0]
    else:
        train_df[i].loc[train_df[i].isna()] = train_df[i].median()


# в тестовых данных
test_nans = test_df.isna().sum().sort_values(ascending=False).loc[lambda x: x > 0]

smaller_test_nans = test_nans

for i in smaller_test_nans.index:
    if test_df[i].dtype == "category":
        test_df[i].loc[test_df[i].isna()] = test_df[i].value_counts().sort_values(ascending=False).index[0]
    else:
        test_df[i].loc[test_df[i].isna()] = test_df[i].median()

test_nans = test_df.isna().sum().sort_values(ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[i].loc[train_df[i].isna()] = train_df[i].median()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[i].loc[train_df[i].isna()] = train_df[i].median()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[i].loc[train_df[i].isna()] = train_df[i].median()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

Новые фичи


In [42]:
check_df = pd.DataFrame()
check_df['deb_e_oper_growth'] = train_df['sum_deb_e_oper_1m'] > train_df['sum_deb_e_oper_3m']
check_df['deb_f_oper_growth'] = train_df['sum_deb_f_oper_1m'] > train_df['sum_deb_f_oper_3m']
check_df['deb_h_oper_growth'] = train_df['sum_deb_h_oper_1m'] > train_df['sum_deb_h_oper_3m']

check_df['cred_e_oper_growth'] = train_df['sum_cred_e_oper_1m'] > train_df['sum_cred_e_oper_3m']
check_df['cred_f_oper_growth'] = train_df['sum_cred_f_oper_1m'] > train_df['sum_cred_f_oper_3m']
check_df['cred_h_oper_growth'] = train_df['sum_cred_h_oper_1m'] > train_df['sum_cred_h_oper_3m']

check_df[['deb_e_oper_growth', 'deb_f_oper_growth', 'deb_h_oper_growth']] = check_df[['deb_e_oper_growth', 'deb_f_oper_growth', 'deb_h_oper_growth']].astype('category')
check_df[['cred_e_oper_growth', 'cred_f_oper_growth', 'cred_h_oper_growth']] = check_df[['cred_e_oper_growth', 'cred_f_oper_growth', 'cred_h_oper_growth']].astype('category')


In [43]:
dumies = pd.get_dummies(train_df, columns=['segment'])

In [44]:
check_df = check_df.join(dumies)
train_df = check_df

In [45]:
check_df = pd.DataFrame()
check_df['deb_e_oper_growth'] = test_df['sum_deb_e_oper_1m'] > test_df['sum_deb_e_oper_3m']
check_df['deb_f_oper_growth'] = test_df['sum_deb_f_oper_1m'] > test_df['sum_deb_f_oper_3m']
check_df['deb_h_oper_growth'] = test_df['sum_deb_h_oper_1m'] > test_df['sum_deb_h_oper_3m']

check_df['cred_e_oper_growth'] = test_df['sum_cred_e_oper_1m'] > test_df['sum_cred_e_oper_3m']
check_df['cred_f_oper_growth'] = test_df['sum_cred_f_oper_1m'] > test_df['sum_cred_f_oper_3m']
check_df['cred_h_oper_growth'] = test_df['sum_cred_h_oper_1m'] > test_df['sum_cred_h_oper_3m']

check_df[['deb_e_oper_growth', 'deb_f_oper_growth', 'deb_h_oper_growth']] = check_df[['deb_e_oper_growth', 'deb_f_oper_growth', 'deb_h_oper_growth']].astype('category')
check_df[['cred_e_oper_growth', 'cred_f_oper_growth', 'cred_h_oper_growth']] = check_df[['cred_e_oper_growth', 'cred_f_oper_growth', 'cred_h_oper_growth']].astype('category')

dumies = pd.get_dummies(test_df, columns=['segment'])
check_df = check_df.join(dumies)
test_df = check_df

In [46]:
train_df.drop(['balance_amt_avg', 'balance_amt_min', 'ogrn_month','ogrn_year'], axis=1, inplace=True)
test_df.drop(['balance_amt_avg', 'balance_amt_min', 'ogrn_month','ogrn_year'], axis=1, inplace=True)

В данных есть признак - index_city_code	код города в почтовом индексе. Так как мы закодировали города, можно избавиться от этого признака, он будет создавать лишние зависимости

In [None]:
train_df.drop(['index_city_code'], axis=1, inplace=True)

# повторим для тестовых данных
test_df.drop(['index_city_code'], axis=1, inplace=True)

Заметим, что в наших данных есть такие признаки как sum_a_oper_1m и cnt_a_oper_1m - сумма операций типа А за месяц и их количество. Это распространяется и на другие типы операций. Сконструируем новый признак sum/cnt - средняя сумма операции типа A в месяц. Распространим эту логику на все типы операций.

In [47]:

# для операций типа A, B, C за 1 месяц
train_df['med_sum_oper_a_1m'] = train_df['sum_a_oper_1m']/train_df['cnt_a_oper_1m']
train_df.drop(['sum_a_oper_1m', 'cnt_a_oper_1m'], axis=1, inplace=True)

train_df['med_sum_oper_b_3m'] = train_df['sum_b_oper_3m']/train_df['cnt_b_oper_3m']
train_df.drop(['sum_b_oper_1m', 'cnt_b_oper_1m'], axis=1, inplace=True)

train_df['med_sum_oper_c_1m'] = train_df['sum_c_oper_1m']/train_df['cnt_c_oper_1m']
train_df.drop(['sum_c_oper_1m', 'cnt_c_oper_1m'], axis=1, inplace=True)


# для исходящих операций типа D, E, F, G, H за 1 месяц
train_df['med_sum_deb_d_oper_1m'] = train_df['sum_deb_d_oper_1m']/train_df['cnt_deb_d_oper_1m']
train_df.drop(['sum_deb_d_oper_1m', 'cnt_deb_d_oper_1m'], axis=1, inplace=True)

train_df['med_sum_deb_e_oper_1m'] = train_df['sum_deb_e_oper_1m']/train_df['cnt_deb_e_oper_1m']
train_df.drop(['sum_deb_e_oper_1m', 'cnt_deb_e_oper_1m'], axis=1, inplace=True)

train_df['med_sum_deb_f_oper_1m'] = train_df['sum_deb_f_oper_1m']/train_df['cnt_deb_f_oper_1m']
train_df.drop(['sum_deb_f_oper_1m', 'cnt_deb_f_oper_1m'], axis=1, inplace=True)

train_df['med_sum_deb_g_oper_1m'] = train_df['sum_deb_g_oper_1m']/train_df['cnt_deb_g_oper_1m']
train_df.drop(['sum_deb_g_oper_1m', 'cnt_deb_g_oper_1m'], axis=1, inplace=True)

train_df['med_sum_deb_h_oper_1m'] = train_df['sum_deb_h_oper_1m']/train_df['cnt_deb_h_oper_1m']
train_df.drop(['sum_deb_h_oper_1m', 'cnt_deb_h_oper_1m'], axis=1, inplace=True)

# для входящих операций типа D, E, F, G, H  за 1 месяц
train_df['med_sum_cred_d_oper_1m'] = train_df['sum_cred_d_oper_1m']/train_df['cnt_cred_d_oper_1m']
train_df.drop(['sum_cred_d_oper_1m', 'cnt_cred_d_oper_1m'], axis=1, inplace=True)

train_df['med_sum_cred_e_oper_1m'] = train_df['sum_cred_e_oper_1m']/train_df['cnt_cred_e_oper_1m']
train_df.drop(['sum_cred_e_oper_1m', 'cnt_cred_e_oper_1m'], axis=1, inplace=True)

train_df['med_sum_cred_f_oper_1m'] = train_df['sum_cred_f_oper_1m']/train_df['cnt_cred_f_oper_1m']
train_df.drop(['sum_cred_f_oper_1m', 'cnt_cred_f_oper_1m'], axis=1, inplace=True)

train_df['med_sum_cred_g_oper_1m'] = train_df['sum_cred_g_oper_1m']/train_df['cnt_cred_g_oper_1m']
train_df.drop(['sum_cred_g_oper_1m', 'cnt_cred_g_oper_1m'], axis=1, inplace=True)

train_df['med_sum_cred_h_oper_1m'] = train_df['sum_cred_h_oper_1m']/train_df['cnt_cred_h_oper_1m']
train_df.drop(['sum_cred_h_oper_1m', 'cnt_cred_h_oper_1m'], axis=1, inplace=True)

In [48]:
# повторим для тестовых данных

# для операций типа A, B, C за 1 месяц
test_df['med_sum_oper_a_1m'] = test_df['sum_a_oper_1m']/test_df['cnt_a_oper_1m']
test_df.drop(['sum_a_oper_1m', 'cnt_a_oper_1m'], axis=1, inplace=True)

test_df['med_sum_oper_b_3m'] = test_df['sum_b_oper_3m']/test_df['cnt_b_oper_3m']
test_df.drop(['sum_b_oper_1m', 'cnt_b_oper_1m'], axis=1, inplace=True)

test_df['med_sum_oper_c_1m'] = test_df['sum_c_oper_1m']/test_df['cnt_c_oper_1m']
test_df.drop(['sum_c_oper_1m', 'cnt_c_oper_1m'], axis=1, inplace=True)

# для исходящих операций типа D, E, F, G, H за 1 месяц
test_df['med_sum_deb_d_oper_1m'] = test_df['sum_deb_d_oper_1m']/test_df['cnt_deb_d_oper_1m']
test_df.drop(['sum_deb_d_oper_1m', 'cnt_deb_d_oper_1m'], axis=1, inplace=True)

test_df['med_sum_deb_e_oper_1m'] = test_df['sum_deb_e_oper_1m']/test_df['cnt_deb_e_oper_1m']
test_df.drop(['sum_deb_e_oper_1m', 'cnt_deb_e_oper_1m'], axis=1, inplace=True)

test_df['med_sum_deb_f_oper_1m'] = test_df['sum_deb_f_oper_1m']/test_df['cnt_deb_f_oper_1m']
test_df.drop(['sum_deb_f_oper_1m', 'cnt_deb_f_oper_1m'], axis=1, inplace=True)

test_df['med_sum_deb_g_oper_1m'] = test_df['sum_deb_g_oper_1m']/test_df['cnt_deb_g_oper_1m']
test_df.drop(['sum_deb_g_oper_1m', 'cnt_deb_g_oper_1m'], axis=1, inplace=True)

test_df['med_sum_deb_h_oper_1m'] = test_df['sum_deb_h_oper_1m']/test_df['cnt_deb_h_oper_1m']
test_df.drop(['sum_deb_h_oper_1m', 'cnt_deb_h_oper_1m'], axis=1, inplace=True)

# для входящих операций типа D, E, F, G, H за 1 месяц
test_df['med_sum_cred_d_oper_1m'] = test_df['sum_cred_d_oper_1m']/test_df['cnt_cred_d_oper_1m']
test_df.drop(['sum_cred_d_oper_1m', 'cnt_cred_d_oper_1m'], axis=1, inplace=True)

test_df['med_sum_cred_e_oper_1m'] = test_df['sum_cred_e_oper_1m']/test_df['cnt_cred_e_oper_1m']
test_df.drop(['sum_cred_e_oper_1m', 'cnt_cred_e_oper_1m'], axis=1, inplace=True)

test_df['med_sum_cred_f_oper_1m'] = test_df['sum_cred_f_oper_1m']/test_df['cnt_cred_f_oper_1m']
test_df.drop(['sum_cred_f_oper_1m', 'cnt_cred_f_oper_1m'], axis=1, inplace=True)

test_df['med_sum_cred_g_oper_1m'] = test_df['sum_cred_g_oper_1m']/test_df['cnt_cred_g_oper_1m']
test_df.drop(['sum_cred_g_oper_1m', 'cnt_cred_g_oper_1m'], axis=1, inplace=True)

test_df['med_sum_cred_h_oper_1m'] = test_df['sum_cred_h_oper_1m']/test_df['cnt_cred_h_oper_1m']
test_df.drop(['sum_cred_h_oper_1m', 'cnt_cred_h_oper_1m'], axis=1, inplace=True)



Аналогично заменим признаки суммы и количества операций разного типа для 3 месяцев

In [49]:
# для операций типа A, B, C
train_df['med_sum_oper_a_3m'] = train_df['sum_a_oper_3m']/train_df['cnt_a_oper_3m']
train_df.drop(['sum_a_oper_3m', 'cnt_a_oper_3m'], axis=1, inplace=True)

train_df['med_sum_oper_b_3m'] = train_df['sum_b_oper_3m']/train_df['cnt_b_oper_3m']
train_df.drop(['sum_b_oper_3m', 'cnt_b_oper_3m'], axis=1, inplace=True)

train_df['med_sum_oper_c_3m'] = train_df['sum_c_oper_3m']/train_df['cnt_c_oper_3m']
train_df.drop(['sum_c_oper_3m', 'cnt_c_oper_3m'], axis=1, inplace=True)

# для исходящих операций типа D, E, F, G, H
train_df['med_sum_deb_d_oper_3m'] = train_df['sum_deb_d_oper_3m']/train_df['cnt_deb_d_oper_3m']
train_df.drop(['sum_deb_d_oper_3m', 'cnt_deb_d_oper_3m'], axis=1, inplace=True)

train_df['med_sum_deb_e_oper_3m'] = train_df['sum_deb_e_oper_3m']/train_df['cnt_deb_e_oper_3m']
train_df.drop(['sum_deb_e_oper_3m', 'cnt_deb_e_oper_3m'], axis=1, inplace=True)

train_df['med_sum_deb_f_oper_3m'] = train_df['sum_deb_f_oper_3m']/train_df['cnt_deb_f_oper_3m']
train_df.drop(['sum_deb_f_oper_3m', 'cnt_deb_f_oper_3m'], axis=1, inplace=True)

train_df['med_sum_deb_g_oper_3m'] = train_df['sum_deb_g_oper_3m']/train_df['cnt_deb_g_oper_3m']
train_df.drop(['sum_deb_g_oper_3m', 'cnt_deb_g_oper_3m'], axis=1, inplace=True)

train_df['med_sum_deb_h_oper_3m'] = train_df['sum_deb_h_oper_3m']/train_df['cnt_deb_h_oper_3m']
train_df.drop(['sum_deb_h_oper_3m', 'cnt_deb_h_oper_3m'], axis=1, inplace=True)

# для входящих операций типа D, E, F, G, H
train_df['med_sum_cred_d_oper_3m'] = train_df['sum_cred_d_oper_3m']/train_df['cnt_cred_d_oper_3m']
train_df.drop(['sum_cred_d_oper_3m', 'cnt_cred_d_oper_3m'], axis=1, inplace=True)

train_df['med_sum_cred_e_oper_3m'] = train_df['sum_cred_e_oper_3m']/train_df['cnt_cred_e_oper_3m']
train_df.drop(['sum_cred_e_oper_3m', 'cnt_cred_e_oper_3m'], axis=1, inplace=True)

train_df['med_sum_cred_f_oper_3m'] = train_df['sum_cred_f_oper_3m']/train_df['cnt_cred_f_oper_3m']
train_df.drop(['sum_cred_f_oper_3m', 'cnt_cred_f_oper_3m'], axis=1, inplace=True)

train_df['med_sum_cred_g_oper_3m'] = train_df['sum_cred_g_oper_3m']/train_df['cnt_cred_g_oper_3m']
train_df.drop(['sum_cred_g_oper_3m', 'cnt_cred_g_oper_3m'], axis=1, inplace=True)

train_df['med_sum_cred_h_oper_3m'] = train_df['sum_cred_h_oper_3m']/train_df['cnt_cred_h_oper_3m']
train_df.drop(['sum_cred_h_oper_3m', 'cnt_cred_h_oper_3m'], axis=1, inplace=True)

In [50]:
# для операций типа A, B, C за 3 месяца
test_df['med_sum_oper_a_3m'] = test_df['sum_a_oper_3m']/test_df['cnt_a_oper_3m']
test_df.drop(['sum_a_oper_3m', 'cnt_a_oper_3m'], axis=1, inplace=True)

test_df['med_sum_oper_b_3m'] = test_df['sum_b_oper_3m']/test_df['cnt_b_oper_3m']
test_df.drop(['sum_b_oper_3m', 'cnt_b_oper_3m'], axis=1, inplace=True)

test_df['med_sum_oper_c_3m'] = test_df['sum_c_oper_3m']/test_df['cnt_c_oper_3m']
test_df.drop(['sum_c_oper_3m', 'cnt_c_oper_3m'], axis=1, inplace=True)

# для исходящих операций типа D, E, F, G, H за 3 месяца
test_df['med_sum_deb_d_oper_3m'] = test_df['sum_deb_d_oper_3m']/test_df['cnt_deb_d_oper_3m']
test_df.drop(['sum_deb_d_oper_3m', 'cnt_deb_d_oper_3m'], axis=1, inplace=True)

test_df['med_sum_deb_e_oper_3m'] = test_df['sum_deb_e_oper_3m']/test_df['cnt_deb_e_oper_3m']
test_df.drop(['sum_deb_e_oper_3m', 'cnt_deb_e_oper_3m'], axis=1, inplace=True)

test_df['med_sum_deb_f_oper_3m'] = test_df['sum_deb_f_oper_3m']/test_df['cnt_deb_f_oper_3m']
test_df.drop(['sum_deb_f_oper_3m', 'cnt_deb_f_oper_3m'], axis=1, inplace=True)

test_df['med_sum_deb_g_oper_3m'] = test_df['sum_deb_g_oper_3m']/test_df['cnt_deb_g_oper_3m']
test_df.drop(['sum_deb_g_oper_3m', 'cnt_deb_g_oper_3m'], axis=1, inplace=True)

test_df['med_sum_deb_h_oper_3m'] = test_df['sum_deb_h_oper_3m']/test_df['cnt_deb_h_oper_3m']
test_df.drop(['sum_deb_h_oper_3m', 'cnt_deb_h_oper_3m'], axis=1, inplace=True)

# для входящих операций типа D, E, F, G, H за 3 месяца
test_df['med_sum_cred_d_oper_3m'] = test_df['sum_cred_d_oper_3m']/test_df['cnt_cred_d_oper_3m']
test_df.drop(['sum_cred_d_oper_3m', 'cnt_cred_d_oper_3m'], axis=1, inplace=True)

test_df['med_sum_cred_e_oper_3m'] = test_df['sum_cred_e_oper_3m']/test_df['cnt_cred_e_oper_3m']
test_df.drop(['sum_cred_e_oper_3m', 'cnt_cred_e_oper_3m'], axis=1, inplace=True)

test_df['med_sum_cred_f_oper_3m'] = test_df['sum_cred_f_oper_3m']/test_df['cnt_cred_f_oper_3m']
test_df.drop(['sum_cred_f_oper_3m', 'cnt_cred_f_oper_3m'], axis=1, inplace=True)

test_df['med_sum_cred_g_oper_3m'] = test_df['sum_cred_g_oper_3m']/test_df['cnt_cred_g_oper_3m']
test_df.drop(['sum_cred_g_oper_3m', 'cnt_cred_g_oper_3m'], axis=1, inplace=True)

test_df['med_sum_cred_h_oper_3m'] = test_df['sum_cred_h_oper_3m']/test_df['cnt_cred_h_oper_3m']
test_df.drop(['sum_cred_h_oper_3m', 'cnt_cred_h_oper_3m'], axis=1, inplace=True)

Обработаем выбросы с помощью изолированного дерева

In [51]:
from sklearn.ensemble import IsolationForest

# Выберем признаки, в которых не будет происходить поиск аномалий (категориальные признаки)
cat_cols = [
    'channel_code', 'city', 'city_type',
    'branch_code', 'okved'
]

# Отфильтруем категориальные признаки из всех признаков
no_anomaly_features = [col for col in train_df.columns if col not in cat_cols]


df_to_filter = train_df[no_anomaly_features]
iso_clf = IsolationForest(random_state=42, contamination=0.2).fit(df_to_filter)
anomaly = iso_clf.predict(df_to_filter)

# Отфильтруем только те строки, которые не являются аномалиями
train_df_filtered = train_df[anomaly == 1]

# Присоединим отфильтрованные числовые данные к категориальным данным
train_df_filtered_with_categorical = train_df_filtered[cat_cols].join(train_df_filtered[no_anomaly_features])

print(train_df_filtered_with_categorical.shape)

train_df = train_df_filtered_with_categorical

(240000, 70)


Теперь отберем признаки
Для начала выделим в отдельные переменные столбцы target_1, target_2 и target

In [52]:
target_1 = train_df['target_1']
target_2 = train_df['target_2']
target = train_df['total_target']

train_df.drop(['target_1', 'target_2', 'total_target'], axis=1, inplace=True)

Теперь определим меру взаимной информации для признаков, чтобы понять какие из признаков действительно оказывают влияние на модель

In [53]:
# from sklearn.feature_selection import mutual_info_classif

# names = [column for column in train_df]
# X = train_df[names[:90]]

# # для target_1
# y1 = target_1.to_numpy().ravel()

# # mi_1 = mutual_info_classif(X, y1)

# mi_sc_1 = pd.Series(mi_1, name="MI Scores", index=X.columns)
# mi_sc_1 = mi_sc_1.sort_values(ascending=False)

# # head используем, чтобы получить самые влиятельные, tail - наименее влиятельные
# print('Наиболее влиятельные признаки для target_1')
# mi_sc_1

In [54]:
# # для target_2
# y2 = target_2.to_numpy().ravel()

# mi_2 = mutual_info_classif(X, y2)

# mi_sc_2 = pd.Series(mi_2, name="MI Scores", index=X.columns)
# mi_sc_2 = mi_sc_2.sort_values(ascending=False)


# # head используем чтобы получить самые влиятельные, tail - наименее влиятельные
# print('Наиболее влиятельные признаки для target_2')
# mi_sc_2

Теперь, когда мы знаем какие признаки не влияют на подсчет первого и второго таргета, мы можем их удалить и получить соответствующие датасеты для target_1 и target_2

In [55]:
# zero_mi_cols_1 = mi_sc_1[mi_sc_1 <= 0.015].index.tolist()
# X1_train = train_df.drop(columns=zero_mi_cols_1)

# zero_mi_cols_2 = mi_sc_2[mi_sc_2 <= 0.015].index.tolist()
# X2_train = train_df.drop(columns=zero_mi_cols_2)

Повторим эти действия для тестовой выборки

In [56]:
# X1_test = test_df.drop(columns=zero_mi_cols_1)
# X2_test = test_df.drop(columns=zero_mi_cols_2)

### Обучение модели

In [57]:
from sklearn.model_selection import cross_val_score

from xgboost import XGBClassifier
# import optuna


In [58]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import optuna
# train_df = train_df.drop(['okved_47'],axis=1,inplace=True)
# Для target_1
x1_train, x1_test, y1_train, y1_test = train_test_split(train_df, target_1, test_size=0.2, random_state=42)

# Для target_2
x2_train, x2_test, y2_train, y2_test = train_test_split(train_df, target_2, test_size=0.2, random_state=42)

In [59]:
def objective_xgb(trial):
    params = {
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'eval_metric':trial.suggest_categorical('eval_metric', ['mlogloss']),
    }


    model_xgb = XGBClassifier(device='cuda', enable_categorical=True, n_jobs=-1, **params)
    model_xgb.fit(x1_train, y1_train)
    y_pred_1 = model_xgb.predict_proba(x1_test)[:,1]
    scores = roc_auc_score(y1_test,y_pred_1)


    return scores

In [60]:
def objective_xgb_t2(trial):
    params = {
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'eval_metric':trial.suggest_categorical('eval_metric', ['mlogloss']),
    }


    model_t2 = XGBClassifier(device='cuda', enable_categorical=True, n_jobs=-1, **params)
    model_t2.fit(x2_train, y2_train)
    y_pred_2 = model_t2.predict_proba(x2_test)[:,1]
    scores = roc_auc_score(y2_test,y_pred_2)


    return scores


In [61]:
study_xgb = optuna.create_study(study_name='target_1', direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=100,show_progress_bar=True, n_jobs=-1)
study_xgb.best_params

[I 2023-10-08 11:18:28,575] A new study created in memory with name: target_1


  0%|          | 0/100 [00:00<?, ?it/s]

  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),


[W 2023-10-08 11:18:28,594] Trial 2 failed with parameters: {'booster': 'gbtree', 'max_depth': 8, 'learning_rate': 0.9859907634696853, 'n_estimators': 3563, 'min_child_weight': 4, 'eval_metric': 'mlogloss'} because of the following error: ValueError('Experimental support for categorical data is not implemented for current tree method yet.').
Traceback (most recent call last):
  File "E:\shizofrenia\Anaconda\Lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\serpr\AppData\Local\Temp\ipykernel_9924\3575804054.py", line 13, in objective_xgb
    model_xgb.fit(x1_train, y1_train)
  File "E:\shizofrenia\Anaconda\Lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "E:\shizofrenia\Anaconda\Lib\site-packages\xgboost\sklearn.py", line 1468, in fit
    ) = self._configure_fit(
        ^^^^^^^^^^^^^^^^^^^^
  File "E:\shizofrenia\Ana

ValueError: Experimental support for categorical data is not implemented for current tree method yet.

In [None]:
study_xgb_t2 = optuna.create_study(study_name='target_2',direction='maximize')
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_xgb_t2.optimize(objective_xgb_t2, n_trials=100,show_progress_bar=True)


Target_1

In [None]:
model = XGBClassifier(enable_categorical=True, device='cuda', **study_xgb.best_params)
model.fit(x1_train, y1_train)

y_pred_1 = model.predict_proba(x1_test)[:,1]

accuracy = roc_auc_score(y1_test, y_pred_1)
print("Xgboost Accuracy:", accuracy)

Target_2

In [None]:
model_t2 = XGBClassifier(enable_categorical=True, device='cuda', **study_xgb_t2.best_params)
model_t2.fit(x2_train, y2_train)

y_pred_2 = model_t2.predict_proba(x2_test)[:,1]
accuracy = roc_auc_score(y2_test, y_pred_2)
print("Xgboost Accuracy:", accuracy)

# Запуск на тестовых данных


In [None]:
df = test_df[[x for x in x1_test.columns]]
df2= test_df[[x for x in train_df.columns]]

In [None]:
# target_1
y_pred_1 = model.predict_proba(df)[:,1]
y_pred_1

In [None]:
y_pred_2 =  model_t2.predict_proba(df2)[:,1]
len(y_pred_2)

In [None]:
df = pd.DataFrame(columns=['id', 'target1', 'target2', 'score'])
df['id'] = test_df['id']
df['target1'] = y_pred_1
df['target2'] = y_pred_2
score = []

for index, item in enumerate(y_pred_1):
  score.append(max(item, y_pred_2[index]))

df['score'] = score
df.drop(['target1', 'target2'], axis=1, inplace=True)
df.to_csv('/content/drive/MyDrive/out15.csv', index=False)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')