In [1]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sys import stdout
from time import sleep

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)

import warnings
warnings.filterwarnings('ignore')

# DATA

In [2]:
train = pd.read_parquet('data/train.parquet')
test = pd.read_parquet('data/test.parquet')
sample_sub = pd.read_csv('data/sample_submission.csv')

train.head(10)

Unnamed: 0,id,dates,values,label
0,19114,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-1.86, 0.79, 1.4, 0.15, 0.0, -1.24, -1.46, 3....",0.0
1,22769,"[2016-05-01, 2016-06-01, 2016-07-01, 2016-08-0...","[-1.04, -3.48, 0.05, -0.13, -0.01, 0.03, 0.27,...",1.0
2,76935,"[2017-03-01, 2017-04-01, 2017-05-01, 2017-06-0...","[0.28, 0.63, 0.06, 0.96, -1.4, -0.3, 1.62, 1.1...",0.0
3,66297,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-0.33, 0.58, 1.1, -0.56, -0.95, -0.61, -0.7, ...",0.0
4,2191,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[1.31, 0.5, -0.54, 0.95, 0.65, 0.83, -1.55, -0...",0.0
5,59504,"[2016-03-01, 2016-04-01, 2016-05-01, 2016-06-0...","[0.08, 0.88, 1.46, 0.59, -0.97, -0.41, 0.78, -...",0.0
6,49554,"[2016-04-01, 2016-05-01, 2016-06-01, 2016-07-0...","[1.05, -0.28, 1.09, -0.69, 1.26, 1.79, 1.38, -...",0.0
7,58344,"[2016-12-01, 2017-01-01, 2017-02-01, 2017-03-0...","[-0.36, -0.45, -0.4, -0.4, -0.45, -0.45, -0.45...",0.0
8,87449,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[2.92, 4.11, 3.39, -1.17, -0.79, 2.15, -0.22, ...",1.0
9,43415,"[2016-02-01, 2016-03-01, 2016-04-01, 2016-05-0...","[-0.86, -0.05, -0.35, -0.67, -0.25, -0.8, -0.6...",1.0


In [3]:
sample_sub.head(10)

Unnamed: 0,id,score
0,6125.0,0.757097
1,26781.0,0.346173
2,13333.0,0.431305
3,53218.0,0.847472
4,84204.0,0.065298
5,69997.0,0.689232
6,99301.0,0.025703
7,4361.0,0.461261
8,46607.0,0.293316
9,29836.0,0.233549


Посмотрим, какого размера данные и есть ли в них пропуски

In [4]:
test['label'] = -1
data = pd.concat([train, test], ignore_index=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      100000 non-null  int64  
 1   dates   100000 non-null  object 
 2   values  100000 non-null  object 
 3   label   100000 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 3.1+ MB


Пропусков не найдено, но есть интересная деталь. Поскольку столбцы с датами и значениями содержат массивы, то можно реструктурировать данные и создать альтернативную таблицу, где под каждую дату можно выделить отдельный признак. Одна строка такой таблицы будет выглядеть примерно так.

In [5]:
pd.DataFrame({col: data.loc[0, col] for col in train.columns}) 

Unnamed: 0,id,dates,values,label
0,19114,2016-01-01,-1.86,0.0
1,19114,2016-02-01,0.79,0.0
2,19114,2016-03-01,1.40,0.0
3,19114,2016-04-01,0.15,0.0
4,19114,2016-05-01,0.00,0.0
...,...,...,...,...
57,19114,2020-10-01,-0.28,0.0
58,19114,2020-11-01,0.89,0.0
59,19114,2020-12-01,0.35,0.0
60,19114,2021-01-01,-0.51,0.0


In [6]:
def restructure_data(data):
    newdf = pd.DataFrame()
    
    print('Rows restructured:')
    for i in data.index: 
        row = pd.DataFrame({
            col: data.loc[i, col] 
            for col in data.columns
        })
        
        newdf = pd.concat([newdf, row], ignore_index=True)
        
        stdout.write(f"\r{i}")
        stdout.flush()
        sleep(0.0001)
    print('\n')
    
    newdf['year'] = pd.to_datetime(newdf['dates']).dt.year
    newdf['month'] = pd.to_datetime(newdf['dates']).dt.month
    newdf.drop('dates', axis=1, inplace=True)
    
    newdf.to_csv('data/restructured_data.csv', index=False)
    
    return newdf

In [7]:
%%time
re_data = restructure_data(data) 

re_data

Rows restructured: 
99999CPU times: total: 16min 25s
Wall time: 40min 1s


Unnamed: 0,id,values,label,year,month
0,19114,-1.86,0.0,2016,1
1,19114,0.79,0.0,2016,2
2,19114,1.40,0.0,2016,3
3,19114,0.15,0.0,2016,4
4,19114,0.00,0.0,2016,5
...,...,...,...,...,...
6442359,73528,-0.53,-1.0,2023,2
6442360,73528,1.65,-1.0,2023,3
6442361,73528,0.07,-1.0,2023,4
6442362,73528,0.62,-1.0,2023,5


In [None]:
aesdfghgfg

In [8]:
re_data = pd.read_csv('data/restructured_data.csv')

re_data

Unnamed: 0,id,values,label,year,month
0,19114,-1.86,0.0,2016,1
1,19114,0.79,0.0,2016,2
2,19114,1.40,0.0,2016,3
3,19114,0.15,0.0,2016,4
4,19114,0.00,0.0,2016,5
...,...,...,...,...,...
6442359,73528,-0.53,-1.0,2023,2
6442360,73528,1.65,-1.0,2023,3
6442361,73528,0.07,-1.0,2023,4
6442362,73528,0.62,-1.0,2023,5


Потом в таблицу будут добавляться все новые значения. Так будут выглядеть на графике первые 2 строки.

Как видим, первая и вторая строки имеют неодинаковую длину, так как наблюдения начинаются и заканчиваются в разное время. Следовательно второй временной ряд нужно дополнить новыми наблюдениями, то есть спрогнозировать их.

In [None]:
re_train = re_data[re_data['label'] >= 0]
re_test = re_data[re_data['label'] < 0].drop('label', axis=1)

re_train

In [None]:
re_train['label'].value_counts()

In [None]:
y = re_train.pop('label')

X_train, X_valid, y_train, y_valid = train_test_split(
    re_train, y, 
    test_size=0.2, 
    random_state=33, 
    stratify=y
)

data_pool = X_train, X_valid, y_train, y_valid

X_train

In [None]:
def calc_rocauc(model, X, y):
    proba = model.predict_proba(X)[:, 1]
    rocauc = roc_auc_score(y, proba)
    
    return round(rocauc, 2)


def estimate_model(model, pool=data_pool):
    X_train, X_valid, y_train, y_valid = pool
    model.fit(X_train, y_train)

    rocauc_train = calc_rocauc(model, X_train, y_train)
    rocauc_valid = calc_rocauc(model, X_valid, y_valid)

    print('Train ROC AUC: ', rocauc_train)
    print('Valid ROC AUC: ', rocauc_valid)
    
    
baseline = LogisticRegression()

estimate_model(baseline)

In [None]:
forest = RandomForestClassifier(random_state=33)

estimate_model(forest)

In [None]:
cat = CatBoostClassifier(
        eval_metric='AUC', 
        allow_writing_files=False, 
        random_state=33
)

cat.fit(X_train, 
        y_train, 
        eval_set=(X_valid, y_valid), 
        early_stopping_rounds=10, 
        verbose=10)

rocauc_train = calc_rocauc(cat, X_train, y_train)
rocauc_valid = calc_rocauc(cat, X_valid, y_valid)

print('Train ROC AUC: ', rocauc_train)
print('Valid ROC AUC: ', rocauc_valid)