In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('dataset_quarts_even.csv')

In [3]:
df = df.set_index('author')

In [4]:
df

Unnamed: 0_level_0,period1,period2,period3,adf_c,adf_ct,adf_ctt,pp_c,pp_ct,kpss_c,kpss_ct,...,2,3,4,daily0,daily1,daily2,weekly0,weekly1,weekly2,target
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
author314_2023-07-01_2023-09-30,11,27,29,0,0,0,0,0,0,0,...,0.00,23.0,0.0,1,1,1,1,1,1,0.679866
author363_2023-07-01_2023-09-30,28,10,19,0,0,0,0,0,0,0,...,0.75,0.0,0.0,1,1,1,1,1,1,0.719305
author286_2023-07-01_2023-09-30,22,13,30,1,1,1,0,0,0,0,...,0.00,0.0,40.0,1,1,1,1,1,1,0.719847
author912_2023-07-01_2023-09-30,31,30,1,0,0,0,0,0,0,0,...,12.75,14.0,2.0,1,0,0,1,1,1,0.722459
author15_2023-07-01_2023-09-30,1,2,28,1,1,1,0,0,0,1,...,4.00,8.0,0.0,0,0,0,1,1,1,0.723281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
author288_2024-07-01_2024-09-30,9,29,21,0,0,0,0,0,0,0,...,0.00,0.0,0.0,1,1,1,1,1,1,0.568182
author255_2024-07-01_2024-09-30,17,21,28,0,0,0,0,0,0,0,...,168.00,0.0,0.0,1,1,1,1,1,1,0.562500
author898_2024-07-01_2024-09-30,1,14,21,1,1,1,0,0,1,0,...,0.00,0.0,0.0,1,1,1,1,1,1,0.981061
author596_2024-07-01_2024-09-30,12,25,19,0,0,0,0,0,0,0,...,0.00,64.0,80.0,1,1,1,1,1,1,0.579545


In [5]:
X, y = df.drop(['target'], axis=1), df['target']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **1. Linear regression (baseline)**

In [7]:
model = LinearRegression()

In [8]:
model.fit(X_train, y_train)

In [9]:
preds = model.predict(X_test)

In [10]:
mean_squared_error(y_test, preds)

0.004252871904983765

In [11]:
model.coef_

array([ 6.57198402e-05, -6.44259556e-05,  1.50792316e-04, -1.16054498e-04,
        4.83712612e-03, -1.11176232e-02, -2.38271643e-02,  3.02778175e-02,
        5.37105300e-03,  2.53688523e-03, -3.62267395e-04,  6.25784867e-03,
       -8.66883545e-03, -3.07892015e-06, -1.81316075e-04, -1.88004511e-04,
       -2.37401357e-04, -2.33644157e-04, -1.56176423e-04,  1.92378036e-02,
       -1.35054934e-02, -1.31594413e-02, -8.07141012e-03, -8.07141012e-03,
       -8.07141012e-03])

In [12]:
model.score(X_train, y_train)

0.19222823705301695

## **2. Support Vector Machines**

In [13]:
from sklearn.svm import SVR

In [14]:
svr_model = SVR()

In [15]:
svr_model.fit(X_train, y_train)

In [16]:
svr_preds = svr_model.predict(X_test)

In [17]:
mean_squared_error(y_test, svr_preds)

0.00480366447168498

In [18]:
svr_param_grid = {'C': [0.001, 0.01, 0.1, 0.5, 1.0],
                  'epsilon': [0.001, 0.01, 0.1, 0.5, 1.0],
                 }

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
svr_grid_search = GridSearchCV(estimator=svr_model, param_grid=svr_param_grid, cv=3, scoring='neg_mean_squared_error', verbose=3)

In [21]:
svr_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV 1/3] END ...........C=0.001, epsilon=0.001;, score=-0.005 total time=   0.7s
[CV 2/3] END ...........C=0.001, epsilon=0.001;, score=-0.005 total time=   0.8s
[CV 3/3] END ...........C=0.001, epsilon=0.001;, score=-0.005 total time=   1.2s
[CV 1/3] END ............C=0.001, epsilon=0.01;, score=-0.005 total time=   1.1s
[CV 2/3] END ............C=0.001, epsilon=0.01;, score=-0.005 total time=   1.0s
[CV 3/3] END ............C=0.001, epsilon=0.01;, score=-0.005 total time=   0.6s
[CV 1/3] END .............C=0.001, epsilon=0.1;, score=-0.005 total time=   0.1s
[CV 2/3] END .............C=0.001, epsilon=0.1;, score=-0.005 total time=   0.1s
[CV 3/3] END .............C=0.001, epsilon=0.1;, score=-0.005 total time=   0.1s
[CV 1/3] END .............C=0.001, epsilon=0.5;, score=-0.006 total time=   0.0s
[CV 2/3] END .............C=0.001, epsilon=0.5;, score=-0.009 total time=   0.0s
[CV 3/3] END .............C=0.001, epsilon=0.5;,

In [22]:
svr_grid_search.best_estimator_.fit(X_train, y_train)

In [23]:
svr_preds = svr_grid_search.best_estimator_.predict(X_test)

In [24]:
mean_squared_error(y_test, svr_preds)

0.004579642307590202

## **3. Random Forest**

In [25]:
from sklearn.ensemble import RandomForestRegressor

In [26]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

In [27]:
rf_model.fit(X_train, y_train)

In [28]:
rf_preds = rf_model.predict(X_test)

In [29]:
mean_squared_error(y_test, rf_preds)

0.003968965167952964

In [33]:
rf_param_grid = {'n_estimators': [1, 10, 100, 1000],
                 'max_depth': [1, 2, 3, 5, 8, 15],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 5, 10, 15],
                 'max_features': ['sqrt', 'log2', 1.0]
                }

In [34]:
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=3, scoring='neg_mean_squared_error', verbose=3)

In [35]:
rf_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 864 candidates, totalling 2592 fits
[CV 1/3] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1;, score=-0.005 total time=   0.0s
[CV 2/3] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1;, score=-0.005 total time=   0.0s
[CV 3/3] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1;, score=-0.005 total time=   0.0s
[CV 1/3] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=-0.005 total time=   0.0s
[CV 2/3] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=-0.005 total time=   0.0s
[CV 3/3] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=-0.005 total time=   0.0s
[CV 1/3] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-0.005 total

  _data = np.array(data, dtype=dtype, copy=copy,


In [36]:
rf_preds = rf_grid_search.best_estimator_.predict(X_test)

In [37]:
mean_squared_error(y_test, rf_preds)

0.0038337692878756455

In [38]:
rf_grid_search.best_estimator_.feature_importances_

array([8.28380408e-02, 6.68670622e-02, 6.41642993e-02, 6.48409733e-03,
       7.97156698e-03, 8.74611319e-03, 1.39390297e-03, 1.38577129e-03,
       1.09693702e-02, 7.76277644e-03, 1.58996240e-01, 1.30929702e-02,
       1.93183331e-01, 1.68052931e-01, 4.76943330e-02, 3.42248937e-02,
       4.16905213e-02, 3.33624173e-02, 3.68733624e-02, 1.96238671e-03,
       3.05834674e-03, 8.98887826e-03, 8.20876583e-05, 7.17666303e-05,
       8.25337653e-05])

In [39]:
pd.DataFrame({'features': X.columns, 'feature_importances': rf_grid_search.best_estimator_.feature_importances_})

Unnamed: 0,features,feature_importances
0,period1,0.082838
1,period2,0.066867
2,period3,0.064164
3,adf_c,0.006484
4,adf_ct,0.007972
5,adf_ctt,0.008746
6,pp_c,0.001394
7,pp_ct,0.001386
8,kpss_c,0.010969
9,kpss_ct,0.007763


## 4. **CatBoost**

In [40]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [41]:
from catboost import CatBoostRegressor

In [42]:
cb_model = CatBoostRegressor(random_state=42)

In [43]:
cb_model.fit(X_train, y_train)

Learning rate set to 0.050931
0:	learn: 0.0738356	total: 54.3ms	remaining: 54.2s
1:	learn: 0.0731907	total: 59.7ms	remaining: 29.8s
2:	learn: 0.0726017	total: 64.6ms	remaining: 21.5s
3:	learn: 0.0720271	total: 71.6ms	remaining: 17.8s
4:	learn: 0.0715314	total: 78.8ms	remaining: 15.7s
5:	learn: 0.0710610	total: 87.4ms	remaining: 14.5s
6:	learn: 0.0706376	total: 91.8ms	remaining: 13s
7:	learn: 0.0702521	total: 100ms	remaining: 12.4s
8:	learn: 0.0698429	total: 108ms	remaining: 11.8s
9:	learn: 0.0694267	total: 115ms	remaining: 11.4s
10:	learn: 0.0690810	total: 124ms	remaining: 11.1s
11:	learn: 0.0687145	total: 132ms	remaining: 10.9s
12:	learn: 0.0684633	total: 139ms	remaining: 10.5s
13:	learn: 0.0681646	total: 155ms	remaining: 10.9s
14:	learn: 0.0679182	total: 162ms	remaining: 10.6s
15:	learn: 0.0676276	total: 167ms	remaining: 10.3s
16:	learn: 0.0673612	total: 175ms	remaining: 10.1s
17:	learn: 0.0670742	total: 184ms	remaining: 10.1s
18:	learn: 0.0668678	total: 190ms	remaining: 9.82s
19:	le

<catboost.core.CatBoostRegressor at 0x78e7f64ccdc0>

In [44]:
cb_preds = cb_model.predict(X_test)

In [45]:
mean_squared_error(y_test, cb_preds)

0.003801810588274822

In [46]:
cb_param_grid = {'iterations': [100, 500, 1000, 2000, 2500]}

In [47]:
cb_grid_search = GridSearchCV(estimator=cb_model, param_grid=cb_param_grid, cv=2, scoring='neg_mean_squared_error', verbose=3)

In [48]:
cb_grid_search.fit(X_train, y_train)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1004:	learn: 0.0481521	total: 2.51s	remaining: 3.73s
1005:	learn: 0.0481458	total: 2.51s	remaining: 3.73s
1006:	learn: 0.0481315	total: 2.52s	remaining: 3.73s
1007:	learn: 0.0481223	total: 2.52s	remaining: 3.74s
1008:	learn: 0.0481062	total: 2.53s	remaining: 3.74s
1009:	learn: 0.0480857	total: 2.54s	remaining: 3.74s
1010:	learn: 0.0480767	total: 2.54s	remaining: 3.74s
1011:	learn: 0.0480651	total: 2.55s	remaining: 3.75s
1012:	learn: 0.0480470	total: 2.55s	remaining: 3.75s
1013:	learn: 0.0480336	total: 2.56s	remaining: 3.75s
1014:	learn: 0.0480166	total: 2.56s	remaining: 3.75s
1015:	learn: 0.0480062	total: 2.57s	remaining: 3.75s
1016:	learn: 0.0479912	total: 2.57s	remaining: 3.75s
1017:	learn: 0.0479732	total: 2.58s	remaining: 3.75s
1018:	learn: 0.0479600	total: 2.58s	remaining: 3.76s
1019:	learn: 0.0479504	total: 2.59s	remaining: 3.76s
1020:	learn: 0.0479303	total: 2.6s	remaining: 3.76s
1021:	learn: 0.0479154	total: 2.6s	

In [49]:
cb_grid_search.best_estimator_.get_params()

{'loss_function': 'RMSE', 'random_state': 42, 'iterations': 1000}

In [50]:
cb_grid_search.best_estimator_.fit(X_train, y_train)

Learning rate set to 0.050931
0:	learn: 0.0738356	total: 2.54ms	remaining: 2.54s
1:	learn: 0.0731907	total: 5.13ms	remaining: 2.56s
2:	learn: 0.0726017	total: 7.8ms	remaining: 2.59s
3:	learn: 0.0720271	total: 10.7ms	remaining: 2.65s
4:	learn: 0.0715314	total: 13.2ms	remaining: 2.62s
5:	learn: 0.0710610	total: 15.7ms	remaining: 2.6s
6:	learn: 0.0706376	total: 18.5ms	remaining: 2.62s
7:	learn: 0.0702521	total: 21.2ms	remaining: 2.63s
8:	learn: 0.0698429	total: 23.7ms	remaining: 2.61s
9:	learn: 0.0694267	total: 26.3ms	remaining: 2.6s
10:	learn: 0.0690810	total: 29.2ms	remaining: 2.63s
11:	learn: 0.0687145	total: 31.9ms	remaining: 2.62s
12:	learn: 0.0684633	total: 34.4ms	remaining: 2.61s
13:	learn: 0.0681646	total: 37ms	remaining: 2.61s
14:	learn: 0.0679182	total: 39.9ms	remaining: 2.62s
15:	learn: 0.0676276	total: 42.5ms	remaining: 2.61s
16:	learn: 0.0673612	total: 45ms	remaining: 2.6s
17:	learn: 0.0670742	total: 47.9ms	remaining: 2.61s
18:	learn: 0.0668678	total: 50.5ms	remaining: 2.61s


<catboost.core.CatBoostRegressor at 0x78e807e77df0>

In [51]:
cb_preds = cb_grid_search.best_estimator_.predict(X_test)

In [52]:
mean_squared_error(y_test, cb_preds)

0.003801810588274822

In [53]:
cb_grid_search.best_estimator_.get_feature_importance()

array([1.20214847e+01, 8.45540473e+00, 6.39970097e+00, 1.87596611e-01,
       5.48322183e-01, 7.76341072e-01, 4.10051961e-02, 7.37091355e-03,
       1.73919371e-01, 5.38653911e-01, 1.13122507e+01, 1.19025535e+00,
       1.90673982e+01, 1.28994817e+01, 6.09500788e+00, 4.40109346e+00,
       5.85012409e+00, 4.40584339e+00, 5.24893974e+00, 2.34297612e-01,
       2.27466103e-02, 1.22311243e-01, 7.01878841e-05, 3.72365280e-05,
       3.42954381e-04])

In [54]:
pd.DataFrame({'features': X.columns, 'feature_importances': cb_grid_search.best_estimator_.get_feature_importance()})

Unnamed: 0,features,feature_importances
0,period1,12.021485
1,period2,8.455405
2,period3,6.399701
3,adf_c,0.187597
4,adf_ct,0.548322
5,adf_ctt,0.776341
6,pp_c,0.041005
7,pp_ct,0.007371
8,kpss_c,0.173919
9,kpss_ct,0.538654
