In [None]:
import sys
import pandas as pd 
import numpy as np 
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, recall_score, precision_score, f1_score, accuracy_score

In [None]:
# VARS 

PATH = '.\\data\\' if sys.platform == 'win32' else './data/'
SOLUTIONS = '.\\submissions\\' if sys.platform == 'win32' else './submissions/'

In [None]:
train_df = pd.read_csv(
    PATH + 'prepared_df.csv',
    index_col=0,
    dtype={
        'PATIENT_SEX':str, 
        'MKB_CODE':str, 
        'ADRES':str, 
        'VISIT_MONTH_YEAR':str, 
        'AGE_CATEGORY':str, 
        'PATIENT_ID_COUNT':int}
        )

In [None]:
train_df.DATE = pd.to_datetime(train_df.DATE)

In [None]:
train_df.sort_values(by='DATE').reset_index(drop=True)

Unnamed: 0,PATIENT_SEX,ADRES,MKB_CODE,CHAPTER,AGE_CATEGORY,MONTH,YEAR,DATE,IS_COVID,PATIENT_ID_COUNT,TARGET_RANGE
0,Male,Комсомольск,Z01.7,XXI,0-18,1,2018,2018-01-01,False,2,1-10
1,Male,Калининград,E27.8,IV,18-44,1,2018,2018-01-01,False,1,1-10
2,Female,Пионерский,C34,II,75-90,1,2018,2018-01-01,False,1,1-10
3,Male,Калининград,S62.6,XIX,60-74,1,2018,2018-01-01,False,5,1-10
4,Male,Калининград,E27.8,IV,60-74,1,2018,2018-01-01,False,1,1-10
...,...,...,...,...,...,...,...,...,...,...,...
2212388,Female,Гусев,J20,X,60-74,3,2022,2022-03-01,True,2,1-10
2212389,Female,Балтийск,J20,X,75-90,3,2022,2022-03-01,True,1,1-10
2212390,Female,Балтийск,J20,X,0-18,3,2022,2022-03-01,True,3,1-10
2212391,Female,Калининград,J20,X,45-59,3,2022,2022-03-01,True,1,1-10


In [None]:
g = train_df.groupby(pd.Grouper(key='DATE', freq='M'))

In [None]:
dfs = [group for _,group in g]

In [None]:
dfs[0].reset_index(drop=True)

Unnamed: 0,PATIENT_SEX,ADRES,MKB_CODE,CHAPTER,AGE_CATEGORY,MONTH,YEAR,DATE,IS_COVID,PATIENT_ID_COUNT,TARGET_RANGE
0,Female,Калининград,A02.0,I,0-18,1,2018,2018-01-01,False,3,1-10
1,Female,Калининград,A02.0,I,60-74,1,2018,2018-01-01,False,1,1-10
2,Female,Калининград,A02.0,I,18-44,1,2018,2018-01-01,False,2,1-10
3,Female,СТ Искра ул. Тюльпановая,A02.0,I,0-18,1,2018,2018-01-01,False,1,1-10
4,Female,Калининград,A02,I,0-18,1,2018,2018-01-01,False,1,1-10
...,...,...,...,...,...,...,...,...,...,...,...
50765,Male,Калининград,Z98.8,XXI,18-44,1,2018,2018-01-01,False,2,1-10
50766,Male,Прибрежный,Z98.8,XXI,45-59,1,2018,2018-01-01,False,1,1-10
50767,Male,Приморье,Z98.8,XXI,45-59,1,2018,2018-01-01,False,1,1-10
50768,Male,Светлогорск,Z98.8,XXI,60-74,1,2018,2018-01-01,False,3,1-10


In [None]:
model = CatBoostRegressor(
    task_type='GPU',
    random_state=42,
    early_stopping_rounds=20
)

In [None]:
for index, dataframe in enumerate(dfs, start=1):
    
    try:
        traindf = pd.concat(dfs[:index])
        testdf = dfs[index]
    except:
        traindf = dataframe
    
    X_train = traindf.drop(['PATIENT_ID_COUNT', 'DATE', 'TARGET_RANGE', 'CHAPTER'], axis=1)
    y_train = traindf.PATIENT_ID_COUNT
    X_test = testdf.drop(['PATIENT_ID_COUNT', 'DATE', 'TARGET_RANGE', 'CHAPTER'], axis=1)
    y_test = testdf.PATIENT_ID_COUNT

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])

    model.fit(pool_train, silent=True)

    y_pred = model.predict(pool_test)
    y_pred = [1 if value <= 0 else int(value) for value in y_pred]
    
    
    print(f'fold {index}')
    print('R2: ', r2_score(y_test, y_pred))
    print('#'*50)


fold 1
R2:  0.9096895612552429
##################################################
fold 2
R2:  0.9507799439096344
##################################################
fold 3
R2:  0.9087663701399205
##################################################
fold 4
R2:  0.7577398013651049
##################################################
fold 5
R2:  0.8851970524582206
##################################################
fold 6
R2:  0.9271653899927161
##################################################
fold 7
R2:  0.9016627279788647
##################################################
fold 8
R2:  0.8676108639181367
##################################################
fold 9
R2:  0.8899388205706918
##################################################
fold 10
R2:  0.9393393709243315
##################################################
fold 11
R2:  0.9296665773640801
##################################################
fold 12
R2:  0.8393100232777904
##################################################
fold 13
R2:  

In [None]:
g110 = train_df[train_df.TARGET_RANGE == '1-10'].groupby(pd.Grouper(key='DATE', freq='M'))

In [None]:
dfs110 = [group for _,group in g110]

In [None]:
for index, dataframe in enumerate(dfs110, start=1):
    
    try:
        traindf = pd.concat(dfs110[:index]).reset_index(drop=True)
        testdf = dfs110[index].reset_index(drop=True)
    except:
        traindf = dataframe.reset_index(drop=True)
    
    X_train = traindf.drop(['PATIENT_ID_COUNT', 'DATE', 'TARGET_RANGE', 'CHAPTER'], axis=1)
    y_train = traindf.PATIENT_ID_COUNT
    X_test = testdf.drop(['PATIENT_ID_COUNT', 'DATE', 'TARGET_RANGE', 'CHAPTER'], axis=1)
    y_test = testdf.PATIENT_ID_COUNT

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])

    print('Current test date is', testdf.MONTH[0],testdf.YEAR[0])
    model.fit(pool_train, silent=True)

    y_pred = model.predict(pool_test)
    y_pred = [1 if round(value) <= 0 else round(value) for value in y_pred]
    
    
    print(f'fold {index}')
    print('R2: ', r2_score(y_test, y_pred))
    print('#'*50)

Current test date is 2 2018
fold 1
R2:  0.3503573331837476
##################################################
Current test date is 3 2018
fold 2
R2:  0.4117713303959133
##################################################
Current test date is 4 2018
fold 3
R2:  0.44988181973769736
##################################################
Current test date is 5 2018
fold 4
R2:  0.43956052611289653
##################################################
Current test date is 6 2018
fold 5
R2:  0.4551893173499202
##################################################
Current test date is 7 2018
fold 6
R2:  0.46109906829863867
##################################################
Current test date is 8 2018
fold 7
R2:  0.4568410022004099
##################################################
Current test date is 9 2018
fold 8
R2:  0.449755096705003
##################################################
Current test date is 10 2018
fold 9
R2:  0.4694301484882559
##################################################
Current

In [None]:
df110 = train_df[train_df.TARGET_RANGE == '1-10']

In [None]:
y_test_110 = df110.PATIENT_ID_COUNT
X_test_110 = df110.drop(['PATIENT_ID_COUNT', 'DATE', 'TARGET_RANGE', 'CHAPTER'], axis=1)

pool_test_110 = Pool(X_test_110, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])

In [None]:
preds = model.predict(pool_test_110)

In [None]:
r2_score(y_test_110, preds)

0.23502018454721507

In [None]:
df110['PREDS'] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
df110

Unnamed: 0,PATIENT_SEX,ADRES,MKB_CODE,CHAPTER,AGE_CATEGORY,MONTH,YEAR,DATE,IS_COVID,PATIENT_ID_COUNT,TARGET_RANGE,PREDS
0,Female,Гурьевск,A00.0,I,18-44,8,2021,2021-08-01,True,1,1-10,1.140746
1,Female,Калининград,A00.0,I,0-18,3,2020,2020-03-01,False,1,1-10,1.301365
2,Female,Гусев,A00,I,0-18,3,2019,2019-03-01,False,1,1-10,1.160958
3,Female,Калининград,A00,I,0-18,1,2022,2022-01-01,True,1,1-10,2.310950
4,Female,Калининград,A00,I,0-18,2,2018,2018-02-01,False,1,1-10,2.310950
...,...,...,...,...,...,...,...,...,...,...,...,...
2212388,Male,Гурьевск,Z99.1,XXI,0-18,12,2021,2021-12-01,True,1,1-10,1.128315
2212389,Male,Калининград,Z99.8,XXI,18-44,10,2021,2021-10-01,True,1,1-10,1.422220
2212390,Male,Калининград,Z99.9,XXI,0-18,4,2019,2019-04-01,False,2,1-10,1.341057
2212391,Male,Калининград,Z99.9,XXI,0-18,8,2019,2019-08-01,False,1,1-10,1.341057


In [None]:
df110.PATIENT_ID_COUNT.value_counts()

1     1366778
2      318988
3      136250
4       77480
5       50492
6       36202
7       26606
8       21021
9       16873
10      13867
Name: PATIENT_ID_COUNT, dtype: int64

In [None]:
df110.PREDS.value_counts()

1    1158326
2     513933
3     213281
4     114157
5      44541
6      16246
7       3989
8         84
Name: PREDS, dtype: int64

In [None]:
df110.to_csv(PATH + 'df110_boosted.csv')

In [None]:
g110_2 = df110.groupby(pd.Grouper(key='DATE', freq='M'))

In [None]:
dfs110_2 = [group for _,group in g110_2]

In [None]:
for index, dataframe in enumerate(dfs110_2, start=1):
    
    try:
        traindf = pd.concat(dfs110_2[:index]).reset_index(drop=True)
        testdf = dfs110_2[index].reset_index(drop=True)
    except:
        traindf = dataframe.reset_index(drop=True)
    
    X_train = traindf.drop(['PATIENT_ID_COUNT', 'DATE', 'TARGET_RANGE', 'CHAPTER'], axis=1)
    y_train = traindf.PATIENT_ID_COUNT
    X_test = testdf.drop(['PATIENT_ID_COUNT', 'DATE', 'TARGET_RANGE', 'CHAPTER'], axis=1)
    y_test = testdf.PATIENT_ID_COUNT

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])

    print('Current test date is', testdf.MONTH[0],testdf.YEAR[0])
    model.fit(pool_train, silent=True)

    y_pred = model.predict(pool_test)
    y_pred = [1 if round(value) <= 0 else round(value) for value in y_pred]
    
    
    print(f'fold {index}')
    print('R2: ', r2_score(y_test, y_pred))
    print('#'*50)

Make a classification model

In [None]:
binary_classification_model = CatBoostClassifier(
    task_type='GPU',
    random_state=42,
    early_stopping_rounds=20
)

In [None]:
train_df['IS_1-10'] = [True if range == '1-10' else False for range in train_df.TARGET_RANGE]

In [None]:
g_binary = train_df[train_df.YEAR == 2022].groupby(pd.Grouper(key='DATE', freq='M'))

In [None]:
dfs_binary = [group for _, group in g_binary]

In [None]:
dfs_binary[0]['IS_1-10'].value_counts(normalize=True)

True     0.913375
False    0.086625
Name: IS_1-10, dtype: float64

In [None]:
for index, dataframe in enumerate(dfs_binary, start=1):
    
    try:
        traindf = pd.concat(dfs_binary[:index]).reset_index(drop=True)
        testdf = dfs_binary[index].reset_index(drop=True)
    except:
        traindf = dataframe.reset_index(drop=True)
    
    X_train = traindf.drop(['PATIENT_ID_COUNT', 'DATE', 'TARGET_RANGE', 'CHAPTER'], axis=1)
    y_train = traindf['IS_1-10']
    X_test = testdf.drop(['PATIENT_ID_COUNT', 'DATE', 'TARGET_RANGE', 'CHAPTER'], axis=1)
    y_test = testdf['IS_1-10']

    pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])
    pool_test = Pool(X_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'IS_COVID'])

    print('Current test date is', testdf.MONTH[0],testdf.YEAR[0])
    binary_classification_model.fit(pool_train, silent=True)

    y_pred = binary_classification_model.predict(pool_test)
    y_pred = y_pred.astype(bool)
    print('Accuracy: ',accuracy_score(y_test, y_pred))
    print('Recall: ', recall_score(y_test, y_pred, average='binary', pos_label=True)) 
    print('Precision: ', precision_score(y_test, y_pred, average='binary', pos_label=True))
    print('F1 score: ', f1_score(y_test, y_pred, average='binary', pos_label=True))
    print('#'*50)

Current test date is 2 2022
Accuracy:  0.9135452603909973
Recall:  1.0
Precision:  0.9135452603909973
F1 score:  0.9548196003519993
##################################################
Current test date is 3 2022
Accuracy:  0.9041544988704612
Recall:  1.0
Precision:  0.9041544988704612
F1 score:  0.9496650606941853
##################################################
Current test date is 3 2022
Accuracy:  0.9041544988704612
Recall:  1.0
Precision:  0.9041544988704612
F1 score:  0.9496650606941853
##################################################


In [None]:
np.unique(y_pred)

array([ True])

Well, it doesn't work... I have to try something else soon