In [None]:
# Copy the competition train.csv and test.csv into the current working directory.
# NOTE(review): the /content/drive/MyDrive prefix is presumably a mounted Google Drive in Colab —
# confirm the drive is mounted before running this cell.
!cp /content/drive/MyDrive/Colab/Data/'Готовые задачи'/Калининград/participants/train/train.csv ./ 
!cp /content/drive/MyDrive/Colab/Data/'Готовые задачи'/Калининград/participants/test/test.csv ./ 

In [2]:
#import необходимых модулей

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Machine Learning libraries
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer, MaxAbsScaler, StandardScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
# metrics
from sklearn.metrics import  r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [3]:
# Notebook-wide constants

# Seed handed to estimators that take `random_state`, so results are repeatable
RANDOM_STATE = 0

# Forecast target: calendar year and month number
PREDICT_YEAR = 2022
PREDICT_MONTH = 4

# Number of months immediately before the forecast month that the model is trained on
# (value chosen empirically)
TRAIN_PERIOD = 3

In [4]:
# Read the train and test CSV files into DataFrames

# Columns that are read as text in both files
_text_columns = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'VISIT_MONTH_YEAR', 'AGE_CATEGORY']
_text_dtypes = {column: str for column in _text_columns}

# train additionally carries the integer target column PATIENT_ID_COUNT
train = pd.read_csv('train.csv', sep=';', index_col=None,
                    dtype={**_text_dtypes, 'PATIENT_ID_COUNT': int})
test = pd.read_csv('test.csv', sep=';', index_col=None, dtype=_text_dtypes)

In [5]:
%%time
# Преобразование формата VISIT_MONTH_YEAR в дату

def add_hundreds_for_year(x):
    """Expand a ``month.YY`` string to ``month.20YY``.

    The text before the first dot is kept as the month and the last two
    characters after it are taken as the year, so the month may have one or
    two digits (``'1.22'`` -> ``'1.2022'``) and an already expanded value
    (``'04.2022'``) is returned unchanged.

    Parameters
    ----------
    x : str
        Month-year string such as ``'04.22'``.

    Returns
    -------
    str
        The same month followed by a four-digit year starting with ``20``.
    """
    month, dot, year = x.partition('.')
    if not dot:
        # No separator found: fall back to fixed-position slicing so such
        # input is treated exactly as it was before.
        return x[:3] + '20' + x[-2:]
    return month + '.20' + year[-2:]

# For both frames: expand the two-digit year, then parse the 'MM.YYYY' text into timestamps
for _frame in (train, test):
    _month_year_text = _frame['VISIT_MONTH_YEAR'].apply(add_hundreds_for_year)
    _frame['VISIT_MONTH_YEAR'] = pd.to_datetime(_month_year_text, format="%m.%Y")

Wall time: 6.45 s


In [6]:
%%time
# Month-start dates used by the notebook: TRAIN_PERIOD training months followed by
# the forecast month itself (the last element)
_forecast_month = f'{PREDICT_YEAR}-{PREDICT_MONTH}'
train_dates = pd.date_range(end=_forecast_month, periods=TRAIN_PERIOD + 1, freq='MS')

# Restrict train to the training months, i.e. every date except the last one
_in_training_window = train['VISIT_MONTH_YEAR'].isin(train_dates[:-1])
train = train.loc[_in_training_window]

Wall time: 181 ms


In [7]:
# Aggregate train into lookup dictionaries that are later turned into features

def _patient_sum_by(keys):
    """Return {group key: sum of PATIENT_ID_COUNT} for train grouped by `keys`."""
    return train.groupby(keys)['PATIENT_ID_COUNT'].sum().to_dict()


def _row_count_by(keys):
    """Return {group key: number of non-null MKB_CODE rows} for train grouped by `keys`."""
    return train.groupby(keys)['MKB_CODE'].count().to_dict()


# Patient totals per diagnosis code, alone and combined with address / age / sex
mkb_adres_patient_counts = _patient_sum_by(['MKB_CODE', 'ADRES'])
mkb_age_patient_counts = _patient_sum_by(['MKB_CODE', 'AGE_CATEGORY'])
mkb_sex_patient_counts = _patient_sum_by(['MKB_CODE', 'PATIENT_SEX'])
mkb_patient_counts = _patient_sum_by(['MKB_CODE'])

# Row counts per address, age category and sex
adres_counts = _row_count_by(['ADRES'])
age_counts = _row_count_by(['AGE_CATEGORY'])
sex_counts = _row_count_by(['PATIENT_SEX'])

In [8]:
# Stack train and test into one DataFrame so that features are added to both at once
train_test = pd.concat([train, test])

# Helper columns holding (code, other) tuples; they are the keys of the lookup dictionaries
train_test['MKB_ADRES'] = list(zip(train_test['MKB_CODE'], train_test['ADRES']))
train_test['MKB_SEX'] = list(zip(train_test['MKB_CODE'], train_test['PATIENT_SEX']))
train_test['MKB_AGE'] = list(zip(train_test['MKB_CODE'], train_test['AGE_CATEGORY']))

# Split the code into its leading letter, the digits before the dot and the part after the dot
mkb_code_parts = train_test['MKB_CODE'].str.split('.')
train_test['MKB_CODE_ALPHA'] = train_test['MKB_CODE'].str[0]
train_test['MKB_CODE_FIRST'] = mkb_code_parts.str[0].str[1:]
# Codes without a dot have no second part; mark them with -1.
# The filled Series is assigned back instead of calling fillna(..., inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas and does not modify
# the DataFrame once Copy-on-Write is active.
train_test['MKB_CODE_SECOND'] = mkb_code_parts.str[1].fillna(-1)

In [9]:
# Columns that identify one time series (one row per month within each combination)
_series_keys = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY']

# Order the rows by series and, inside each series, by month
train_test = train_test.sort_values(_series_keys + ['VISIT_MONTH_YEAR'])

# Previous row's count within the same series (the first row of each series gets NaN)
_counts_per_series = train_test.groupby(_series_keys)['PATIENT_ID_COUNT']
train_test['PREV_PATIENT_ID_COUNT'] = _counts_per_series.shift()

# Rolling mean of the shifted counts over TRAIN_PERIOD-1 rows.
# It is computed over the whole column rather than per group: the shifted column is NaN on
# the first row of every series, so any window that would reach into the preceding series
# also contains that NaN and yields NaN instead of mixing values from two series.
_previous_counts = train_test['PREV_PATIENT_ID_COUNT']
train_test['AVG_PATIENT_ID_COUNT'] = _previous_counts.rolling(window=TRAIN_PERIOD-1).mean()

In [10]:
# Features read from the lookup dictionaries: (new column, key column, dictionary).
# Keys missing from a dictionary map to 0.
_lookup_features = [
    ('PATIENT_MKB_COUNT', 'MKB_CODE', mkb_patient_counts),
    ('PATIENT_MKB_ADRES_COUNT', 'MKB_ADRES', mkb_adres_patient_counts),
    ('PATIENT_MKB_SEX_COUNT', 'MKB_SEX', mkb_sex_patient_counts),
    ('PATIENT_MKB_AGE_COUNT', 'MKB_AGE', mkb_age_patient_counts),
    ('PATIENT_ADRES_COUNT', 'ADRES', adres_counts),
    ('PATIENT_SEX_COUNT', 'PATIENT_SEX', sex_counts),
    ('PATIENT_AGE_COUNT', 'AGE_CATEGORY', age_counts),
]
for _feature_name, _key_column, _lookup in _lookup_features:
    train_test[_feature_name] = train_test[_key_column].apply(
        lambda key, table=_lookup: table.get(key, 0))

# Ratio of the code-level total to the code-and-address total; a division by zero (inf) becomes 0
_mkb_to_mkb_adres = train_test['PATIENT_MKB_COUNT'] / train_test['PATIENT_MKB_ADRES_COUNT']
train_test['COUNT_MKB_VS_MKB_ADRES'] = _mkb_to_mkb_adres.replace(np.inf, 0)

# Every remaining missing value in the frame is replaced by 0
train_test = train_test.fillna(0)

In [11]:
# Separate the combined frame again: rows of the forecast month (the last entry of
# train_dates) form test_, all other rows form train_
_is_forecast_month = train_test['VISIT_MONTH_YEAR'] == train_dates[-1]
train_ = train_test.loc[~_is_forecast_month].copy()
# test_ is put back into index order after the earlier sort
test_ = train_test.loc[_is_forecast_month].copy().sort_index()

In [12]:
# Find bin edges from the mean PATIENT_ID_COUNT of every series over the training months
# (the original author notes that this peeks slightly into the future)
bins_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY']
train_test_patient_avg = train_.groupby(bins_features)[['PATIENT_ID_COUNT']].transform('mean')
# The tree is fitted on the rows of the last training month only: mean count -> actual count
tree_X = train_test_patient_avg[train_['VISIT_MONTH_YEAR']==train_dates[-2]]
tree_y = train_[train_['VISIT_MONTH_YEAR']==train_dates[-2]]['PATIENT_ID_COUNT']

tree = DecisionTreeRegressor(max_depth=2, criterion="absolute_error", random_state=RANDOM_STATE)
tree.fit(tree_X, tree_y)

bin_min = -1

# The two inner bin edges are the split thresholds of the root's left and right child.
# The children are looked up through the tree structure instead of the fixed node ids 1 and 4,
# which are only correct when the left child is itself split.
root_left = tree.tree_.children_left[0]
root_right = tree.tree_.children_right[0]
bin_1 = tree.tree_.threshold[root_left]
bin_2 = tree.tree_.threshold[root_right]
# A leaf has no real threshold; fail here with a clear message rather than inside pd.cut
assert bin_min < bin_1 < bin_2, 'the depth-2 tree did not yield two increasing split thresholds'

bin_max = np.inf
bins = [bin_min, bin_1, bin_2, bin_max]

# Bin feature: the interval that each row's series mean falls into
train_['PATIENT_ID_COUNT_BINS'] = pd.cut(train_test_patient_avg.values.flatten(), bins=bins)

# Carry each series' bin over to the test rows of the same series
test_ = test_.merge(train_[bins_features+['PATIENT_ID_COUNT_BINS']].drop_duplicates(), 
                  how='left', on=bins_features, suffixes=('', '_BINS'))
bins_ = train_['PATIENT_ID_COUNT_BINS'].cat.categories
# Test series without a match in train_ go to the lowest bin.
# The result is assigned back instead of using fillna(..., inplace=True) on the selected
# column: that chained in-place form is deprecated and leaves the DataFrame unchanged once
# pandas Copy-on-Write is active.
test_['PATIENT_ID_COUNT_BINS'] = test_['PATIENT_ID_COUNT_BINS'].fillna(bins_[0])
bins_

IntervalIndex([(-1.0, 3.167], (3.167, 254.667], (254.667, inf]], dtype='interval[float64, right]')

In [13]:
# Keep only the last training month in train_
train_ = train_[train_['VISIT_MONTH_YEAR']==train_dates[-2]]

# Split train and test by bin (each bin is trained and predicted separately).
# observed=False is passed explicitly: it is the default the original call relied on, and
# newer pandas versions warn about, and later change, that default for categorical groupers.
train_bins_groups = {interval: rows for interval, rows
                     in train_.groupby('PATIENT_ID_COUNT_BINS', observed=False)}
train_bin_1 = train_bins_groups[bins_[0]]
train_bin_2 = train_bins_groups[bins_[1]]
train_bin_max_ = train_bins_groups[bins_[2]]

test_bins_groups = {interval: rows for interval, rows
                    in test_.groupby('PATIENT_ID_COUNT_BINS', observed=False)}
test_bin_1 = test_bins_groups[bins_[0]]
test_bin_2 = test_bins_groups[bins_[1]]
test_bin_max_ = test_bins_groups[bins_[2]]
train_.columns

Index(['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'VISIT_MONTH_YEAR', 'AGE_CATEGORY',
       'PATIENT_ID_COUNT', 'MKB_ADRES', 'MKB_SEX', 'MKB_AGE', 'MKB_CODE_ALPHA',
       'MKB_CODE_FIRST', 'MKB_CODE_SECOND', 'PREV_PATIENT_ID_COUNT',
       'AVG_PATIENT_ID_COUNT', 'PATIENT_MKB_COUNT', 'PATIENT_MKB_ADRES_COUNT',
       'PATIENT_MKB_SEX_COUNT', 'PATIENT_MKB_AGE_COUNT', 'PATIENT_ADRES_COUNT',
       'PATIENT_SEX_COUNT', 'PATIENT_AGE_COUNT', 'COUNT_MKB_VS_MKB_ADRES',
       'PATIENT_ID_COUNT_BINS'],
      dtype='object')

In [14]:
# Categorical feature columns
cat_features = [
    'PATIENT_SEX',
    'MKB_CODE_ALPHA',
    'MKB_CODE_FIRST',
    'MKB_CODE_SECOND',
    'ADRES',
    'AGE_CATEGORY',
]

# Numeric feature columns, assembled from thematic groups (the order is kept as listed)
_lag_features = ['PREV_PATIENT_ID_COUNT', 'AVG_PATIENT_ID_COUNT']
_mkb_total_features = ['PATIENT_MKB_ADRES_COUNT', 'PATIENT_MKB_SEX_COUNT',
                       'PATIENT_MKB_AGE_COUNT', 'PATIENT_MKB_COUNT']
_row_count_features = ['PATIENT_ADRES_COUNT', 'PATIENT_SEX_COUNT', 'PATIENT_AGE_COUNT']
_ratio_features = ['COUNT_MKB_VS_MKB_ADRES']

features = _lag_features + _mkb_total_features + _row_count_features + _ratio_features


In [15]:
# Separate targets from features for every bin

# Bins 1 and 2 use the numeric features only
X_bin_1, y_bin_1 = train_bin_1[features], train_bin_1['PATIENT_ID_COUNT']
test_bin_1 = test_bin_1[features]

X_bin_2, y_bin_2 = train_bin_2[features], train_bin_2['PATIENT_ID_COUNT']
test_bin_2 = test_bin_2[features]

# The last bin additionally uses the categorical features
_bin_max_columns = cat_features + features
X_bin_max = train_bin_max_[_bin_max_columns]
y_bin_max = train_bin_max_['PATIENT_ID_COUNT']
test_bin_max = test_bin_max_[_bin_max_columns]

# One-hot encode train and test of the last bin together so both get the same columns,
# then cut the combined frame back into its two parts by row position.
# NOTE(review): all dummy columns share the '_' prefix, so equal values coming from
# different source columns produce identical column names.
_n_train_bin_max = X_bin_max.shape[0]
train_test_bin_max = pd.get_dummies(pd.concat([X_bin_max, test_bin_max]),
                                    drop_first=True, columns=cat_features, prefix='_')
X_bin_max = train_test_bin_max[:_n_train_bin_max]
test_bin_max = train_test_bin_max[_n_train_bin_max:]


# Check that the bins together cover every train / test row
X_bins_shape = X_bin_max.shape[0] + X_bin_2.shape[0] + X_bin_1.shape[0]
test_bins_shape = test_bin_max.shape[0] + test_bin_2.shape[0] + test_bin_1.shape[0]

print('X_bins shape', X_bins_shape, '==',  train_.shape[0], 'train shape')
print()
# Size of every train bin and its share of train_ in percent
for _label, _part in (('X_bin_1:', X_bin_1), ('X_bin_2:', X_bin_2), ('X_bin_max:', X_bin_max)):
    print(_label, _part.shape[0], '-', np.round(_part.shape[0]/train_.shape[0] * 100, 1), '%')

print()
print('test_bins shape', test_bins_shape, '==', test.shape[0], 'test shape')
print()
# Size of every test bin and its share of test_ in percent
for _label, _part in (('test_bin_1:', test_bin_1), ('test_bin_2:', test_bin_2),
                      ('test_bin_max:', test_bin_max)):
    print(_label, _part.shape[0], '-', np.round(_part.shape[0]/test_.shape[0] * 100, 1), '%')

assert X_bins_shape == train_.shape[0]
assert test_bins_shape == test_.shape[0]

assert y_bin_max.shape[0] + y_bin_2.shape[0] + y_bin_1.shape[0] == train_['PATIENT_ID_COUNT'].shape[0]

X_bins shape 36298 == 36298 train shape

X_bin_1: 28867 - 79.5 %
X_bin_2: 7342 - 20.2 %
X_bin_max: 89 - 0.2 %

test_bins shape 39373 == 39373 test shape

test_bin_1: 32062 - 81.4 %
test_bin_2: 7222 - 18.3 %
test_bin_max: 89 - 0.2 %


In [16]:
# Train one model per bin and write its predictions into test

def _predict_bin(model, X_train, y_train, X_test, label):
    """Fit `model` on one bin and return its predictions for that bin's test rows.

    Negative predictions are printed and then replaced by the smallest target value seen
    in the bin's training data. The printed values indicate how well balanced the bin is:
    there should be few of them and they should not be strongly negative.

    Parameters
    ----------
    model : estimator with `fit` and `predict`
    X_train, y_train : training features and targets of the bin
    X_test : test features of the bin
    label : str
        Bin name used in the printed message, e.g. 'bin_1'.

    Returns
    -------
    numpy.ndarray
        Predictions for `X_test` with negative values replaced.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f'y_pred_solution_{label}<0:', np.round(predictions[predictions<0], 1))
    predictions[predictions<0] = y_train.min()
    return predictions


# Predictions are converted with astype(int), which truncates the fractional part.

# ______________________bin_1_________________

model_solution_bin_1 = make_pipeline(MaxAbsScaler(), LinearRegression())
y_pred_solution_bin_1 = _predict_bin(model_solution_bin_1, X_bin_1, y_bin_1, test_bin_1, 'bin_1')
test.loc[test_bin_1.index, 'PATIENT_ID_COUNT'] = y_pred_solution_bin_1.astype(int)

# ______________________bin_2_________________

model_solution_bin_2 = KNeighborsRegressor(1, p=3)
y_pred_solution_bin_2 = _predict_bin(model_solution_bin_2, X_bin_2, y_bin_2, test_bin_2, 'bin_2')
test.loc[test_bin_2.index, 'PATIENT_ID_COUNT'] = y_pred_solution_bin_2.astype(int)

# ______________________bin_max_________________

model_solution_bin_max = make_pipeline(MaxAbsScaler(), PolynomialFeatures(3), LinearRegression())
y_pred_solution_bin_max = _predict_bin(model_solution_bin_max, X_bin_max, y_bin_max,
                                       test_bin_max, 'bin_max')
test.loc[test_bin_max.index, 'PATIENT_ID_COUNT'] = y_pred_solution_bin_max.astype(int)

# A column that is created by assigning to only part of the rows holds NaN for the other rows
# and is therefore stored as float, so the integer predictions would be written as e.g. "3.0".
# Once every row has a prediction, restore the integer dtype.
if test['PATIENT_ID_COUNT'].notna().all():
    test['PATIENT_ID_COUNT'] = test['PATIENT_ID_COUNT'].astype(int)

y_pred_solution_bin_1<0: [-1.7 -2.6 -0.2 -0.4 -0.9 -0.3 -1.1 -0.3 -0.2]
y_pred_solution_bin_2<0: []
y_pred_solution_bin_max<0: []


### Значимость признаков по DecisionTreeRegressor

In [17]:
# Bin 1: feature importances (in percent) of a decision tree fitted on this bin.
# The tree is seeded with RANDOM_STATE so the importances are the same on every run.
tree_bin_1 = DecisionTreeRegressor(random_state=RANDOM_STATE)
tree_bin_1.fit(X_bin_1, y_bin_1)
pd.Series(np.round(tree_bin_1.feature_importances_ * 100, 3), index=X_bin_1.columns)

PREV_PATIENT_ID_COUNT       5.116
AVG_PATIENT_ID_COUNT        9.569
PATIENT_MKB_ADRES_COUNT    24.085
PATIENT_MKB_SEX_COUNT      11.142
PATIENT_MKB_AGE_COUNT      15.550
PATIENT_MKB_COUNT           9.242
PATIENT_ADRES_COUNT         5.253
PATIENT_SEX_COUNT           1.694
PATIENT_AGE_COUNT           4.491
COUNT_MKB_VS_MKB_ADRES     13.856
dtype: float64

In [18]:
# Bin 2: feature importances (in percent) of a decision tree fitted on this bin.
# The tree is seeded with RANDOM_STATE so the importances are the same on every run.
tree_bin_2 = DecisionTreeRegressor(random_state=RANDOM_STATE)
tree_bin_2.fit(X_bin_2, y_bin_2)
pd.Series(np.round(tree_bin_2.feature_importances_ * 100, 3), index=X_bin_2.columns)

PREV_PATIENT_ID_COUNT       4.086
AVG_PATIENT_ID_COUNT       64.860
PATIENT_MKB_ADRES_COUNT     2.328
PATIENT_MKB_SEX_COUNT       4.121
PATIENT_MKB_AGE_COUNT       5.920
PATIENT_MKB_COUNT           1.909
PATIENT_ADRES_COUNT        10.589
PATIENT_SEX_COUNT           0.567
PATIENT_AGE_COUNT           1.423
COUNT_MKB_VS_MKB_ADRES      4.198
dtype: float64

In [19]:
# Bin 3 (the last bin): feature importances (in percent) of a decision tree fitted on this bin.
# The tree is seeded with RANDOM_STATE so the importances are the same on every run.
tree_bin_max = DecisionTreeRegressor(random_state=RANDOM_STATE)
tree_bin_max.fit(X_bin_max, y_bin_max)
pd.Series(np.round(tree_bin_max.feature_importances_ * 100, 3), index=X_bin_max.columns)

PREV_PATIENT_ID_COUNT       9.852
AVG_PATIENT_ID_COUNT       40.388
PATIENT_MKB_ADRES_COUNT     0.421
PATIENT_MKB_SEX_COUNT       0.198
PATIENT_MKB_AGE_COUNT       2.706
PATIENT_MKB_COUNT           0.365
PATIENT_ADRES_COUNT         0.000
PATIENT_SEX_COUNT           0.020
PATIENT_AGE_COUNT          31.562
COUNT_MKB_VS_MKB_ADRES      4.827
__1                         0.277
__J                         0.000
__K                         0.000
__M                         0.000
__N                         0.000
__O                         0.000
__Z                         0.005
__01                        0.000
__02                        0.000
__04                        0.000
__06                        7.270
__07                        0.000
__11                        1.503
__25                        0.000
__32                        0.000
__34                        0.000
__35                        0.000
__40                        0.000
__42                        0.013
__59          

In [84]:
# Write test, including the predicted PATIENT_ID_COUNT column, to a ';'-separated CSV file
# without the index column
test.to_csv(path_or_buf='solution.csv', sep=';', index=None)