# Import

In [2]:
import pandas as pd 
import numpy as np 
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, recall_score, precision_score, f1_score, accuracy_score

In [3]:
# VARS 

PATH = './data/'
SOLUTIONS = './submissions/'

In [4]:
train = pd.read_csv(
    PATH + 'train_dataset_train.csv',
    sep=';',
    dtype={
        'PATIENT_SEX':str, 
        'MKB_CODE':str, 
        'ADRES':str, 
        'VISIT_MONTH_YEAR':str, 
        'AGE_CATEGORY':str, 
        'PATIENT_ID_COUNT':int}
    )

test = pd.read_csv(
    PATH+'test_dataset_test.csv',
    sep=';', 
    dtype={
        'PATIENT_SEX':str,
        'MKB_CODE':str,
        'ADRES':str,
        'VISIT_MONTH_YEAR':str,
        'AGE_CATEGORY':str}
    )
orig_test = test.copy()
orig_train = train.copy()

# Baseline №1 - Простая аггрегация 

Предложенный Михаилом Марьиным в чате способ на простой аггрегации и среднем. Выдает значение **0.66** на **публичном лидерборде**

In [5]:
test.merge(train[train['VISIT_MONTH_YEAR'].isin(['01.22', '02.22', '03.22'])].groupby(['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY'], as_index=False)['PATIENT_ID_COUNT'].mean(),
    on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY'],
    how='left').fillna(1).astype({'PATIENT_ID_COUNT': int}).to_csv(SOLUTIONS + 'baseline_mm.csv', sep=';', index=None)

Попробуем убрать из этого датасета непопулярные коды мкб 

In [6]:
agg_df = train[train['VISIT_MONTH_YEAR'].isin(['01.22', '02.22', '03.22'])]
agg_func_math = ['sum', 'count']
mkb_info_df = agg_df.groupby('MKB_CODE')['PATIENT_ID_COUNT'].agg(agg_func_math).round(2).sort_values(by='sum', ascending=False)
unpopular = mkb_info_df[mkb_info_df['count'] == 1].index.tolist()
agg_df = agg_df.query(f'MKB_CODE not in {unpopular}')

In [7]:
test.merge(agg_df.groupby(['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY'], as_index=False)['PATIENT_ID_COUNT'].mean(),
    on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY'],
    how='left').fillna(1).astype({'PATIENT_ID_COUNT': int}).to_csv(SOLUTIONS + 'baseline_threshold.csv', sep=';', index=None)

Не влияет на итоговый скор на паблике. Будем пытаться бить 0,66 на кросс-валидации 

# CatBoostClassifier, пытаемся предсказать Target_Range на полном датасете

In [8]:
train_df = pd.read_csv(
    PATH + 'prepared_df.csv',
    index_col=0,
    dtype={
        'PATIENT_SEX':str, 
        'MKB_CODE':str, 
        'ADRES':str, 
        'VISIT_MONTH_YEAR':str, 
        'AGE_CATEGORY':str, 
        'PATIENT_ID_COUNT':int}
        )


In [9]:
train_df = train_df.sort_values(by='DATE').reset_index(drop=True)
train_df.head(5)

Unnamed: 0,PATIENT_SEX,ADRES,MKB_CODE,CHAPTER,AGE_CATEGORY,MONTH,YEAR,DATE,IS_COVID,PATIENT_ID_COUNT,TARGET_RANGE
0,Male,Комсомольск,Z01.7,XXI,0-18,1,2018,2018-01-01,False,2,1-10
1,Male,Калининград,E27.8,IV,18-44,1,2018,2018-01-01,False,1,1-10
2,Female,Пионерский,C34,II,75-90,1,2018,2018-01-01,False,1,1-10
3,Male,Калининград,S62.6,XIX,60-74,1,2018,2018-01-01,False,5,1-10
4,Male,Калининград,E27.8,IV,60-74,1,2018,2018-01-01,False,1,1-10


In [20]:
tscv = TimeSeriesSplit()
classifier = CatBoostClassifier(
    task_type='CPU', 
    random_seed=42,
)
regressor = CatBoostRegressor(
    task_type='CPU', 
    random_seed=42,
)

Напишем функции для обучения модели и кросс-валидации 

In [10]:
def make_a_regression(df, tscv, model, target, columns_to_drop, cat_features):
    fold = 0
    for train_index, test_index in tscv.split(df):
        fold += 1 
        X_train = df.drop(columns_to_drop, axis=1).iloc[train_index]
        X_test = train_df.drop(columns_to_drop, axis=1).iloc[test_index]
        y_train = train_df[target].iloc[train_index]
        y_test = train_df[target].iloc[test_index]

        pool_train = Pool(X_train, y_train, cat_features=cat_features)
        pool_test = Pool(X_test, cat_features=cat_features)

        model.fit(pool_train, silent=True)
        y_pred = model.predict(pool_test)
        y_pred = [1 if value <= 0 else int(value) for value in y_pred]
        
        print(f'fold {fold}')
        print('R2: ', r2_score(y_test, y_pred))
        print('#'*50)

In [21]:
def make_a_classification(df, tscv, model, target, columns_to_drop, cat_features, average=None):
    columns_to_drop.append(target)
    fold = 0
    for train_index, test_index in tscv.split(train_df):
        fold += 1 
        X_train = df.drop(columns_to_drop, axis=1).iloc[train_index]
        X_test = train_df.drop(columns_to_drop, axis=1).iloc[test_index]
        y_train = train_df[target].iloc[train_index]
        y_test = train_df[target].iloc[test_index]

        pool_train = Pool(X_train, y_train, cat_features=cat_features)
        pool_test = Pool(X_test, cat_features=cat_features)

        model.fit(pool_train, silent=True)
        y_pred = model.predict(pool_test)
        
        print(f'fold {fold}')
        print('Accuracy: ',accuracy_score(y_test, y_pred))
        print('Recall: ', recall_score(y_test, y_pred, average=average, zero_division=0)) 
        print('Precision: ', precision_score(y_test, y_pred, average=average, zero_division=0))
        print('F1 score: ', f1_score(y_test, y_pred, average=average, zero_division=0))
        print('#'*50)
    

In [22]:
make_a_classification(
    df=train_df,
    tscv=tscv,
    model=classifier,
    target='TARGET_RANGE',
    columns_to_drop=['MONTH', 'YEAR', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'],
    cat_features=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'],
)

KeyboardInterrupt: 

Как и ожидалось, модель лучше всего предсказывает значения в пределах 1-10 и плохо справляется со значениями в других пределах, что вызвано в том числе и общей несбалансированностью в датасете и во времени. 

Попробуем оттрешхолдить непопулярные МКБ и посмотреть что получится 

In [56]:
thr_df = train_df.query(f'MKB_CODE not in {unpopular}') 

In [65]:
thr_df.TARGET_RANGE.value_counts()

1-10          2029361
10-100         132999
100-1000        13791
1000-10000        905
10000+              4
Name: TARGET_RANGE, dtype: int64

In [58]:
fold = 0
for train_index, test_index in tscv.split(thr_df):
    fold += 1 
    X_train = thr_df.drop(['TARGET_RANGE', 'MONTH', 'YEAR', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[train_index]
    X_test = thr_df.drop(['TARGET_RANGE', 'MONTH', 'YEAR', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[test_index]
    y_train = thr_df.TARGET_RANGE.iloc[train_index]
    y_test = thr_df.TARGET_RANGE.iloc[test_index]

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])

    model = CatBoostClassifier(
    task_type='GPU', 
    random_seed=42,
    )

    model.fit(pool_train, silent=True)
    y_pred = model.predict(pool_test)
    
    print(f'fold {fold}')
    print('Accuracy: ',accuracy_score(y_test, y_pred))
    print('Recall: ', recall_score(y_test, y_pred, average=None, zero_division=0)) 
    print('Precision: ', precision_score(y_test, y_pred, average=None, zero_division=0))
    print('F1 score: ', f1_score(y_test, y_pred, average=None, zero_division=0))
    print('#'*50)

fold 1
Accuracy:  0.9600901767431093
Recall:  [0.99056752 0.41797011 0.30353929 0.70588235]
Precision:  [0.97094595 0.64402392 0.64705882 0.73469388]
F1 score:  [0.98065859 0.50693835 0.41322989 0.72      ]
##################################################
fold 2
Accuracy:  0.960112224846559
Recall:  [0.99114317 0.42615039 0.29494712 0.91566265]
Precision:  [0.97044158 0.66294195 0.70013947 0.68468468]
F1 score:  [0.98068314 0.51880444 0.41504754 0.78350515]
##################################################
fold 3
Accuracy:  0.9519241104279261
Recall:  [0.98794063 0.43308471 0.32248521 0.63716814]
Precision:  [0.96542607 0.64310406 0.66531027 0.64864865]
F1 score:  [0.9765536  0.51760164 0.43440717 0.64285714]
##################################################
fold 4
Accuracy:  0.9251742489175759
Recall:  [0.99058341 0.25831493 0.20659191 0.08520179]
Precision:  [0.93511443 0.64090053 0.73429952 0.79166667]
F1 score:  [0.96205004 0.36821915 0.32246089 0.15384615]
####################

Нет существенного влияния. Попробуем убрать значения 1-10 и посмотреть как модель справляется в этом случае. 

In [61]:
lil_df = train_df[train_df.TARGET_RANGE != '1-10']
lil_df.head()

Unnamed: 0,PATIENT_SEX,ADRES,MKB_CODE,CHAPTER,AGE_CATEGORY,MONTH,YEAR,DATE,IS_COVID,PATIENT_ID_COUNT,TARGET_RANGE
6,Male,Калининград,S62.6,XIX,45-59,1,2018,2018-01-01,False,13,10-100
15,Male,Калининград,S62.6,XIX,18-44,1,2018,2018-01-01,False,13,10-100
19,Female,Калининград,N85.4,XIV,18-44,1,2018,2018-01-01,False,13,10-100
31,Female,Калининград,E34.9,IV,60-74,1,2018,2018-01-01,False,22,10-100
32,Female,Калининград,E34.9,IV,45-59,1,2018,2018-01-01,False,14,10-100


In [63]:
lil_df.TARGET_RANGE.value_counts()

10-100        133136
100-1000       13791
1000-10000       905
10000+             4
Name: TARGET_RANGE, dtype: int64

In [62]:
fold = 0
for train_index, test_index in tscv.split(lil_df):
    fold += 1 
    X_train = lil_df.drop(['TARGET_RANGE', 'MONTH', 'YEAR', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[train_index]
    X_test = lil_df.drop(['TARGET_RANGE', 'MONTH', 'YEAR', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[test_index]
    y_train = lil_df.TARGET_RANGE.iloc[train_index]
    y_test = lil_df.TARGET_RANGE.iloc[test_index]

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])

    model = CatBoostClassifier(
    task_type='GPU', 
    random_seed=42,
    )

    model.fit(pool_train, silent=True)
    y_pred = model.predict(pool_test)
    
    print(f'fold {fold}')
    print('Accuracy: ',accuracy_score(y_test, y_pred))
    print('Recall: ', recall_score(y_test, y_pred, average=None, zero_division=0)) 
    print('Precision: ', precision_score(y_test, y_pred, average=None, zero_division=0))
    print('F1 score: ', f1_score(y_test, y_pred, average=None, zero_division=0))
    print('#'*50)

fold 1
Accuracy:  0.9444376801006534
Recall:  [0.97955166 0.59879963 0.65322581]
Precision:  [0.96249725 0.72377232 0.79411765]
F1 score:  [0.97094957 0.65538151 0.71681416]
##################################################
fold 2
Accuracy:  0.9478063233085758
Recall:  [0.98230958 0.60140187 0.6754386 ]
Precision:  [0.96388024 0.7486911  0.71962617]
F1 score:  [0.97300766 0.66701218 0.69683258]
##################################################
fold 3
Accuracy:  0.9298672835748204
Recall:  [0.96861728 0.54904695 0.52941176]
Precision:  [0.95833518 0.62027311 0.5       ]
F1 score:  [0.9634488  0.58249075 0.51428571]
##################################################
fold 4
Accuracy:  0.9246722675433257
Recall:  [0.98748976 0.40828888 0.36818182]
Precision:  [0.93641344 0.72627737 0.92045455]
F1 score:  [0.96127361 0.5227213  0.52597403]
##################################################
fold 5
Accuracy:  0.9135516863509071
Recall:  [0.97117517 0.48989899 0.59069767 0.        ]
Precisio

Такая модель в свою очередь хорошо справляется с предсказанием периода 10-100. Попробуем научить модель бинарной классификации и решить задачу по тому .входит ли значение в период 1-10 или нет. Для этого нужно будет создать дополнительный признак

In [71]:
train_df['IS_1-10'] = [True if range == '1-10' else False for range in train_df.TARGET_RANGE]

In [72]:
train_df.head()

Unnamed: 0,PATIENT_SEX,ADRES,MKB_CODE,CHAPTER,AGE_CATEGORY,MONTH,YEAR,DATE,IS_COVID,PATIENT_ID_COUNT,TARGET_RANGE,IS_1-10
0,Male,Комсомольск,Z01.7,XXI,0-18,1,2018,2018-01-01,False,2,1-10,True
1,Male,Калининград,E27.8,IV,18-44,1,2018,2018-01-01,False,1,1-10,True
2,Female,Пионерский,C34,II,75-90,1,2018,2018-01-01,False,1,1-10,True
3,Male,Калининград,S62.6,XIX,60-74,1,2018,2018-01-01,False,5,1-10,True
4,Male,Калининград,E27.8,IV,60-74,1,2018,2018-01-01,False,1,1-10,True


Точность модели должна быть не ниже простой угадайки, то есть не ниже: 

In [78]:
train_df['IS_1-10'].value_counts(normalize=True)

True     0.933178
False    0.066822
Name: IS_1-10, dtype: float64

In [79]:
fold = 0
for train_index, test_index in tscv.split(train_df):
    fold += 1 
    X_train = train_df.drop(['IS_1-10', 'TARGET_RANGE', 'MONTH', 'YEAR', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[train_index]
    X_test = train_df.drop(['IS_1-10', 'TARGET_RANGE', 'MONTH', 'YEAR', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[test_index]
    y_train = train_df['IS_1-10'].iloc[train_index]
    y_test = train_df['IS_1-10'].iloc[test_index]

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])

    model = CatBoostClassifier(
    task_type='GPU', 
    random_seed=42,
    )

    model.fit(pool_train, silent=True)
    y_pred = model.predict(pool_test)
    
    print(f'fold {fold}')
    print('Accuracy: ',accuracy_score(y_test, y_pred))
    print('Recall: ', recall_score(y_test, y_pred, average='binary', pos_label='True')) 
    print('Precision: ', precision_score(y_test, y_pred, average='binary', pos_label='True'))
    print('F1 score: ', f1_score(y_test, y_pred, average='binary', pos_label='True'))
    print('#'*50)

fold 1
Accuracy:  0.9791393206990443
Recall:  0.9922266456174068
Precision:  0.9858657846279751
F1 score:  0.9890359879584018
##################################################
fold 2
Accuracy:  0.9781304578935378
Recall:  0.9928876285411475
Precision:  0.9841542497615046
F1 score:  0.9885016497508976
##################################################
fold 3
Accuracy:  0.9707972185760932
Recall:  0.9891605740120059
Precision:  0.9798638667798243
F1 score:  0.9844902732926769
##################################################
fold 4
Accuracy:  0.9466414631765075
Recall:  0.9911021984551396
Precision:  0.952381632099576
F1 score:  0.9713561949962511
##################################################
fold 5
Accuracy:  0.955390364817808
Recall:  0.9858051723208349
Precision:  0.9656421917406294
F1 score:  0.9756195168368957
##################################################


На всех фолдах модель показывает скор лучше простого угадывания. Возможно стоит попробовать предсказывать нахождение таргета в промежутке 1-10, а после уже уточнять это значение. 

In [88]:
train_df.drop(['IS_1-10', 'TARGET_RANGE', 'MONTH', 'YEAR', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).columns

Index(['PATIENT_SEX', 'ADRES', 'MKB_CODE', 'AGE_CATEGORY', 'IS_COVID'], dtype='object')

In [83]:
model.get_feature_importance()

array([ 7.93552474, 36.90936516, 31.45416346, 20.60131861,  3.09962804])

Согласно модели наибольшее значение имеют фичи: адрес, возраст и код МКБ соответственно. Запомним сделанные выводы и попробуем реализовать простую регрессию на датасете с кросс-валидацией по времени

# CatBoostRegressor. Кросс-валидация по времени

In [4]:
train_df = pd.read_csv(
    PATH + 'prepared_df.csv',
    index_col=0,
    dtype={
        'PATIENT_SEX':str, 
        'MKB_CODE':str, 
        'ADRES':str, 
        'VISIT_MONTH_YEAR':str, 
        'AGE_CATEGORY':str, 
        'PATIENT_ID_COUNT':int}
        )

In [11]:
train_df = train_df.sort_values(by='DATE').reset_index(drop=True)
train_df.head(5)

Unnamed: 0,PATIENT_SEX,ADRES,MKB_CODE,CHAPTER,AGE_CATEGORY,MONTH,YEAR,DATE,IS_COVID,PATIENT_ID_COUNT,TARGET_RANGE
0,Male,Комсомольск,Z01.7,XXI,0-18,1,2018,2018-01-01,False,2,1-10
1,Male,Калининград,E27.8,IV,18-44,1,2018,2018-01-01,False,1,1-10
2,Female,Пионерский,C34,II,75-90,1,2018,2018-01-01,False,1,1-10
3,Male,Калининград,S62.6,XIX,60-74,1,2018,2018-01-01,False,5,1-10
4,Male,Калининград,E27.8,IV,60-74,1,2018,2018-01-01,False,1,1-10


In [7]:
tscv = TimeSeriesSplit()

Попробуем модель из коробки. 

In [13]:
fold = 0
for train_index, test_index in tscv.split(train_df):
    fold += 1 
    X_train = train_df.drop(['TARGET_RANGE', 'MONTH', 'YEAR', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[train_index]
    X_test = train_df.drop(['TARGET_RANGE', 'MONTH', 'YEAR', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[test_index]
    y_train = train_df.PATIENT_ID_COUNT.iloc[train_index]
    y_test = train_df.PATIENT_ID_COUNT.iloc[test_index]

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])

    model = CatBoostRegressor(
    task_type='GPU', 
    random_seed=42,
    )

    model.fit(pool_train, silent=True)
    y_pred = model.predict(pool_test)
    y_pred = [1 if value <= 0 else int(value) for value in y_pred]
    
    
    print(f'fold {fold}')
    print('R2: ', r2_score(y_test, y_pred))
    print('#'*50)

fold 1
R2:  0.8965563099344622
##################################################
fold 2
R2:  0.8067120839749432
##################################################
fold 3
R2:  0.8007594506184642
##################################################
fold 4
R2:  0.3670802574590035
##################################################
fold 5
R2:  0.3802074389296831
##################################################


Видно, что значения R2 падают на последних кусках датасета, что, скорее всего, связано с тем, что именно эти периоды выпадают на ковид. Попробуем оставить в трейне год и месяц для обучения 

In [14]:
fold = 0
for train_index, test_index in tscv.split(train_df):
    fold += 1 
    X_train = train_df.drop(['TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[train_index]
    X_test = train_df.drop(['TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[test_index]
    y_train = train_df.PATIENT_ID_COUNT.iloc[train_index]
    y_test = train_df.PATIENT_ID_COUNT.iloc[test_index]

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID','MONTH', 'YEAR'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID', 'MONTH', 'YEAR'])

    model = CatBoostRegressor(
    task_type='GPU', 
    random_seed=42,
    )

    model.fit(pool_train, silent=True)
    y_pred = model.predict(pool_test)
    y_pred = [1 if value <= 0 else int(value) for value in y_pred]
    
    
    print(f'fold {fold}')
    print('R2: ', r2_score(y_test, y_pred))
    print('#'*50)

fold 1
R2:  0.8184054524582086
##################################################
fold 2
R2:  0.9038615816692268
##################################################
fold 3
R2:  0.8305199331836094
##################################################
fold 4
R2:  0.39835105468754417
##################################################
fold 5
R2:  0.4068905121190348
##################################################


Особого эффекта не возымело. Попробуем поучиться только на данных ковидного периода

In [17]:
covid_df = train_df[train_df.IS_COVID == True]
covid_df.head()

Unnamed: 0,PATIENT_SEX,ADRES,MKB_CODE,CHAPTER,AGE_CATEGORY,MONTH,YEAR,DATE,IS_COVID,PATIENT_ID_COUNT,TARGET_RANGE
1466433,Male,Калининград,L82,XII,45-59,4,2020,2020-04-01,True,1,1-10
1466434,Male,Мамоново,J18,X,0-18,4,2020,2020-04-01,True,1,1-10
1466435,Female,Калининград,S01.1,XIX,18-44,4,2020,2020-04-01,True,1,1-10
1466436,Male,Гурьевск,C41.0,II,75-90,4,2020,2020-04-01,True,1,1-10
1466437,Male,Калининград,Z01.7,XXI,18-44,4,2020,2020-04-01,True,76,10-100


In [20]:
fold = 0
for train_index, test_index in tscv.split(covid_df):
    fold += 1 
    X_train = covid_df.drop(['TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE', 'IS_COVID'], axis=1).iloc[train_index]
    X_test = covid_df.drop(['TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE', 'IS_COVID'], axis=1).iloc[test_index]
    y_train = covid_df.PATIENT_ID_COUNT.iloc[train_index]
    y_test = covid_df.PATIENT_ID_COUNT.iloc[test_index]

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY','MONTH', 'YEAR'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'MONTH', 'YEAR'])

    model = CatBoostRegressor(
    task_type='GPU', 
    random_seed=42,
    )

    model.fit(pool_train, silent=True)
    y_pred = model.predict(pool_test)
    y_pred = [1 if value <= 0 else int(value) for value in y_pred]
    
    
    print(f'fold {fold}')
    print('R2: ', r2_score(y_test, y_pred))
    print('#'*50)

fold 1
R2:  0.4111437325048717
##################################################
fold 2
R2:  0.47771769732646185
##################################################
fold 3
R2:  0.3185584140738842
##################################################
fold 4
R2:  0.6182943391215374
##################################################
fold 5
R2:  0.644015112909648
##################################################


Попробуем оттрешхолдить коды МКБ и поучить пару моделей

In [27]:
agg_func_math = ['sum', 'count']
mkb_info_df = train_df.groupby('MKB_CODE')['PATIENT_ID_COUNT'].agg(agg_func_math).round(2).sort_values(by='sum', ascending=False)
unpopular = mkb_info_df[mkb_info_df['count'] < 10].index.tolist()
thr_df = train_df.query(f'MKB_CODE not in {unpopular}')

In [None]:
fold = 0
for train_index, test_index in tscv.split(thr_df):
    fold += 1 
    X_train = thr_df.drop(['TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[train_index]
    X_test = thr_df.drop(['TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[test_index]
    y_train = thr_df.PATIENT_ID_COUNT.iloc[train_index]
    y_test = thr_df.PATIENT_ID_COUNT.iloc[test_index]

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY','MONTH', 'YEAR', 'IS_COVID'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'MONTH', 'YEAR', 'IS_COVID'])

    model = CatBoostRegressor(
    task_type='GPU', 
    random_seed=42,
    )

    model.fit(pool_train, silent=True)
    y_pred = model.predict(pool_test)
    y_pred = [1 if int(value) <= 0 else int(value) for value in y_pred]
    
    print(f'fold {fold}')
    print('R2: ', r2_score(y_test, y_pred))
    print('#'*50)

Попробуем нафитить регрессию на промежутке 1-10

In [36]:
df110 = train_df[train_df.TARGET_RANGE == '1-10']
df110.head(5)

Unnamed: 0,PATIENT_SEX,ADRES,MKB_CODE,CHAPTER,AGE_CATEGORY,MONTH,YEAR,DATE,IS_COVID,PATIENT_ID_COUNT,TARGET_RANGE
0,Male,Комсомольск,Z01.7,XXI,0-18,1,2018,2018-01-01,False,2,1-10
1,Male,Калининград,E27.8,IV,18-44,1,2018,2018-01-01,False,1,1-10
2,Female,Пионерский,C34,II,75-90,1,2018,2018-01-01,False,1,1-10
3,Male,Калининград,S62.6,XIX,60-74,1,2018,2018-01-01,False,5,1-10
4,Male,Калининград,E27.8,IV,60-74,1,2018,2018-01-01,False,1,1-10


In [47]:
fold = 0
for train_index, test_index in tscv.split(df110):
    fold += 1 
    X_train = df110.drop(['TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[train_index]
    X_test = df110.drop(['TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE'], axis=1).iloc[test_index]
    y_train = df110.PATIENT_ID_COUNT.iloc[train_index]
    y_test = df110.PATIENT_ID_COUNT.iloc[test_index]

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY','MONTH', 'YEAR', 'IS_COVID'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'MONTH', 'YEAR', 'IS_COVID'])

    model = CatBoostRegressor(
    task_type='GPU', 
    random_seed=42,
    )

    model.fit(pool_train, silent=True)
    y_pred = model.predict(pool_test)
    y_pred = [1 if int(value) <= 0 else int(value) for value in y_pred]
    
    
    print(f'fold {fold}')
    print('R2: ', r2_score(y_test, y_pred))
    print('#'*50)

fold 1
[1 2 3 4 5 6 7 8 9]
R2:  0.2941824695683256
##################################################
fold 2
[1 2 3 4 5 6 7 8 9]
R2:  0.41538716614103577
##################################################
fold 3
[1 2 3 4 5 6 7 8 9]
R2:  0.38160301852933187
##################################################
fold 4
[1 2 3 4 5 6 7 8]
R2:  0.265119882956812
##################################################
fold 5
[1 2 3 4 5 6 7]
R2:  0.24436123356832584
##################################################


Видно, что даже в такой задаче Регрессор не может адекватно воспринять и предсказать значения. Попробуем дату как фичу без разбиения на месяц и год

In [88]:
fold = 0
for train_index, test_index in tscv.split(thr_df):
    fold += 1 
    X_train = thr_df.drop(['TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT','MONTH', 'YEAR'], axis=1).iloc[train_index]
    X_test = thr_df.drop(['TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT','MONTH', 'YEAR'], axis=1).iloc[test_index]
    y_train = thr_df.PATIENT_ID_COUNT.iloc[train_index]
    y_test = thr_df.PATIENT_ID_COUNT.iloc[test_index]

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY','DATE', 'IS_COVID'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'DATE','IS_COVID'])

    model = CatBoostRegressor(
    task_type='GPU', 
    random_seed=42,
    )

    model.fit(pool_train, silent=True)
    y_pred = model.predict(pool_test)
    y_pred = [1 if int(value) <= 0 else int(value) for value in y_pred]
    
    print(f'fold {fold}')
    print('R2: ', r2_score(y_test, y_pred))
    print('#'*50)

fold 1
R2:  0.8402995952261442
##################################################
fold 2
R2:  0.7706844824255662
##################################################
fold 3
R2:  0.7188984471078972
##################################################
fold 4
R2:  0.3377114736277882
##################################################
fold 5
R2:  0.23582957521505998
##################################################


In [89]:
model.get_feature_importance()

array([ 1.58119038, 34.97668918, 34.42858083, 22.82421749,  5.46975674,
        0.71956538])

Попробуем зафитить только на 1000 популярных мкб кодов 

In [105]:
popular = train_df.MKB_CODE.value_counts()[:1000].index.tolist()
thr_df_pop = train_df.query(f'MKB_CODE in {popular}')

In [106]:
fold = 0
for train_index, test_index in tscv.split(thr_df_pop):
    fold += 1 
    X_train = thr_df_pop.drop(['TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT',], axis=1).iloc[train_index]
    X_test = thr_df_pop.drop(['TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT',], axis=1).iloc[test_index]
    y_train = thr_df_pop.PATIENT_ID_COUNT.iloc[train_index]
    y_test = thr_df_pop.PATIENT_ID_COUNT.iloc[test_index]

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY','DATE', 'MONTH', 'YEAR', 'IS_COVID'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'DATE', 'MONTH', 'YEAR', 'IS_COVID'])

    model = CatBoostRegressor(
    task_type='GPU', 
    random_seed=42,
    )

    model.fit(pool_train, silent=True)
    y_pred = model.predict(pool_test)
    y_pred = [1 if int(value) <= 0 else int(value) for value in y_pred]
    
    print(f'fold {fold}')
    print('R2: ', r2_score(y_test, y_pred))
    print('#'*50)

fold 1
R2:  0.8348191254286731
##################################################
fold 2
R2:  0.8231318943031158
##################################################
fold 3
R2:  0.7823620628030944
##################################################
fold 4
R2:  0.570306015145128
##################################################
fold 5
R2:  0.2581190593329701
##################################################


In [135]:
df_2022 = train_df[train_df.YEAR == 2022]
tscv_2022 = TimeSeriesSplit(n_splits=3)

In [139]:
fold = 0
for train_index, test_index in tscv_2022.split(df_2022):
    fold += 1 
    X_train = df_2022.drop(['PATIENT_SEX', 'TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE', 'MONTH', 'YEAR', 'IS_COVID'], axis=1).iloc[train_index]
    X_test = df_2022.drop(['PATIENT_SEX', 'TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE', 'MONTH', 'YEAR', 'IS_COVID'], axis=1).iloc[test_index]
    y_train = df_2022.PATIENT_ID_COUNT.iloc[train_index]
    y_test = df_2022.PATIENT_ID_COUNT.iloc[test_index]

    pool_train = Pool(X_train, y_train, cat_features = ['MKB_CODE', 'ADRES', 'AGE_CATEGORY'])
    pool_test = Pool(X_test, cat_features = ['MKB_CODE', 'ADRES', 'AGE_CATEGORY'])

    model = CatBoostRegressor(
    task_type='GPU', 
    random_seed=42,
    )

    model.fit(pool_train, silent=True)
    y_pred = model.predict(pool_test)
    y_pred = [1 if int(value) <= 0 else int(value) for value in y_pred]
    
    print(f'fold {fold}')
    print('R2: ', r2_score(y_test, y_pred))
    print('#'*50)

fold 1
R2:  0.8061538912091056
##################################################
fold 2
R2:  0.050113250205142124
##################################################
fold 3
R2:  0.3834520501201071
##################################################


In [141]:
model.get_feature_importance()

array([29.13376653, 49.25536951, 21.61086396])

2020 год и 1-10

In [154]:
df_2022_1_10 = train_df[(train_df.YEAR == 2022) & (train_df.TARGET_RANGE == '1-10')]
tscv_2022 = TimeSeriesSplit(n_splits=3)

In [None]:
fold = 0
for train_index, test_index in tscv_2022.split(df_2022_1_10):
    fold += 1 
    X_train = df_2022_1_10.drop(['PATIENT_SEX', 'TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE', 'MONTH', 'YEAR', 'IS_COVID'], axis=1).iloc[train_index]
    X_test = df_2022_1_10.drop(['PATIENT_SEX', 'TARGET_RANGE', 'CHAPTER', 'PATIENT_ID_COUNT', 'DATE', 'MONTH', 'YEAR', 'IS_COVID'], axis=1).iloc[test_index]
    y_train = df_2022_1_10.PATIENT_ID_COUNT.iloc[train_index]
    y_test = df_2022_1_10.PATIENT_ID_COUNT.iloc[test_index]

    pool_train = Pool(X_train, y_train, cat_features = ['MKB_CODE', 'ADRES', 'AGE_CATEGORY'])
    pool_test = Pool(X_test, cat_features = ['MKB_CODE', 'ADRES', 'AGE_CATEGORY'])

    model = CatBoostRegressor(
    task_type='GPU', 
    random_seed=42,
    loss_function = 'RMSE',
    iterations = 500, 
    l2_leaf_reg = 2
    )

    model.fit(pool_train, silent=True)
    y_pred = model.predict(pool_test)
    y_pred = [1 if int(value) <= 0 else int(value) for value in y_pred]
    
    print(f'fold {fold}')
    print('R2: ', r2_score(y_test, y_pred))
    print('#'*50)

# Make a submission

In [116]:
test.PATIENT_SEX = test.PATIENT_SEX.map({'1':'Male', '0':'Female'})
test['AGE_CATEGORY'] = test['AGE_CATEGORY'].map({
    'children': '0-18',
    'young': '18-44',
    'middleage': '45-59',
    'elderly': '60-74', 
    'old': '75-90',
    'centenarians': '90+'
})
# TODO FIX 
test['IS_COVID'] = [True] * test.shape[0]

test['DAY'] = [1] * test.shape[0]
test['MONTH'] = [value[0] for value in test.VISIT_MONTH_YEAR.astype(str).apply(lambda x: x.split('.'))]
test['MONTH'] = test['MONTH'].astype('int64')
test['YEAR'] = ['20' + value[1] if len(value[1]) > 1 else '20' + value[1] + '0' for value in test.VISIT_MONTH_YEAR.astype(str).apply(lambda x: x.split('.'))]
test['YEAR'] = test['YEAR'].astype('int64')
test['DATE'] = pd.to_datetime(test[['DAY', 'MONTH', 'YEAR']])
test = test.drop('DAY', axis=1)

In [121]:
test.head()

Unnamed: 0,PATIENT_SEX,MKB_CODE,ADRES,VISIT_MONTH_YEAR,AGE_CATEGORY,IS_COVID,MONTH,YEAR,DATE
0,Female,A00,Калининград,4.22,0-18,True,4,2022,2022-04-01
1,Female,A00,Калининград,4.22,60-74,True,4,2022,2022-04-01
2,Female,A00,Калининград,4.22,45-59,True,4,2022,2022-04-01
3,Female,A00,Калининград,4.22,18-44,True,4,2022,2022-04-01
4,Female,A01,Калининград,4.22,45-59,True,4,2022,2022-04-01


In [120]:
test.DATE = test.DATE.astype(str)

In [149]:
pool_test = Pool(test.drop(['VISIT_MONTH_YEAR','PATIENT_SEX','DATE', 'MONTH', 'YEAR', 'IS_COVID'], axis=1), cat_features = ['AGE_CATEGORY', 'MKB_CODE', 'ADRES'])

In [150]:
solution_prediction = model.predict(pool_test)
solution_prediction = [1 if int(value) <= 0 else int(value) for value in solution_prediction]

In [151]:
orig_test['PATIENT_ID_COUNT'] = solution_prediction

In [152]:
orig_test

Unnamed: 0,PATIENT_SEX,MKB_CODE,ADRES,VISIT_MONTH_YEAR,AGE_CATEGORY,PATIENT_ID_COUNT
0,0,A00,Калининград,04.22,children,1
1,0,A00,Калининград,04.22,elderly,1
2,0,A00,Калининград,04.22,middleage,1
3,0,A00,Калининград,04.22,young,2
4,0,A01,Калининград,04.22,middleage,1
...,...,...,...,...,...,...
39368,1,Z96.6,Балтийск,04.22,elderly,1
39369,1,Z96.6,Гусев,04.22,middleage,1
39370,1,Z96.7,Гусев,04.22,young,1
39371,1,Z98.8,Озерск,04.22,children,1


In [153]:
orig_test.to_csv(SOLUTIONS + 'solution_23_08_22_2.csv', sep=';', index=None)