In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sps
sns.set()

plt.rc('font', size=30)
plt.rc('axes', titlesize=30)
plt.rc('axes', labelsize=30)
plt.rc('xtick', labelsize=30)
plt.rc('ytick', labelsize=30)
plt.rc('legend', fontsize=30)
plt.rc('figure', titlesize=30)

from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.covariance import MinCovDet
import cvxpy as cvx
from tqdm import tqdm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit

Датафрейм returns - это таблица доходностей, то есть $\frac{p_{new} - p_{old}}{p_{old}}$.

In [2]:
returns = pd.read_csv('work/data/returns.csv')
returns = returns.fillna(0)
returns['date'] = returns['date'].astype(np.datetime64)
returns = returns.set_index('date')

train_size = 1500

returns_train = returns[:train_size]
returns_test = returns[train_size:]

In [59]:
from portfolio_optimizer import PortfolioOptimizer
from helper_functions import plot_results

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [81]:
tscv = TimeSeriesSplit(n_splits=5)

In [108]:
top_comp_grid = np.linspace(20, 30, 3, dtype=int).tolist()
n_iter = 5000
size_of_window_grid = [None] + np.linspace(200, 1800, 9, dtype=int).tolist()
kept_dim_grid = np.linspace(3, 20, 18, dtype=int)

param_grid_pre = {
    'size_of_window': size_of_window_grid,
    'n_top_companies': top_comp_grid,
    'period_change_portfolio': [None, 120, 240, 360, 480, 720, 900],
    'R': np.linspace(1e-3, 3e-3, 21),
    'preprocessing_method': [None, 'PCA', 'to norm PCA'], 
    'preprocessing_kept_dim': kept_dim_grid
}
po_pre = RandomizedSearchCV(
    estimator=PortfolioOptimizer(risk_free_return=1.02), 
    param_distributions=param_grid_pre,
    cv=tscv,
    verbose=0,
    n_jobs=-2,  # кол-во параллельных процессов
    n_iter=n_iter,  # кол-во итераций случайного выбора гиперпараметров
    random_state=16  # seed для фиксации генератора случайных чисел
)

In [109]:
%%time
po_pre.fit(returns_train)

CPU times: user 2min 35s, sys: 5.02 s, total: 2min 40s
Wall time: 9min 5s



One or more of the test scores are non-finite: [1.16233542 1.42670278        nan ... 0.87283576 0.27096079        nan]



RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=PortfolioOptimizer(risk_free_return=1.02),
                   n_iter=5000, n_jobs=-2,
                   param_distributions={'R': array([0.001 , 0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017,
       0.0018, 0.0019, 0.002 , 0.0021, 0.0022, 0.0023, 0.0024, 0.0025,
       0.0026, 0.0027, 0.0028, 0.0029, 0.003 ]),
                                        'n_top_companies': [20, 25, 30],
                                        'period_change_portfolio': [None, 120,
                                                                    240, 360,
                                                                    480, 720,
                                                                    900],
                                        'preprocessing_kept_dim': array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20]),
                  

In [110]:
po_pre.best_params_

{'size_of_window': 200,
 'preprocessing_method': 'to norm PCA',
 'preprocessing_kept_dim': 4,
 'period_change_portfolio': 120,
 'n_top_companies': 20,
 'R': 0.0028000000000000004}

In [121]:
np.seterr('raise')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [126]:
po = PortfolioOptimizer(preprocessing_method='MPPCA', preprocessing_kept_dim=5, n_models_MPPCA=4, n_top_companies=15)
po.fit(returns_train)

PortfolioOptimizer(n_models_MPPCA=4, n_top_companies=15,
                   preprocessing_kept_dim=5, preprocessing_method='MPPCA')

In [127]:
n_iter = 5000
size_of_window_grid = [None] + np.linspace(200, 1800, 9, dtype=int).tolist()
kept_dim_grid = np.linspace(3, 15, 13, dtype=int)

param_grid_pre_MPPCA = {
    'size_of_window': size_of_window_grid,
    'n_top_companies': top_comp_grid,
    'period_change_portfolio': [None, 120, 240, 360, 480, 720, 900],
    'R': np.linspace(1e-3, 3e-3, 21),
    'preprocessing_kept_dim': kept_dim_grid,
    'n_models_MPPCA': np.linspace(2, 4, 3, dtype=int),
    'is_PCA': [False, True],
    'n_components': kept_dim_grid
}
po_pre_MPPCA = RandomizedSearchCV(
    estimator=PortfolioOptimizer(risk_free_return=1.02, n_top_companies=15,
                                 preprocessing_method='MPPCA'), 
    param_distributions=param_grid_pre_MPPCA,
    cv=tscv,
    verbose=0,
    n_jobs=-2,  # кол-во параллельных процессов
    n_iter=n_iter,  # кол-во итераций случайного выбора гиперпараметров
    random_state=16  # seed для фиксации генератора случайных чисел
)

In [128]:
po_pre_MPPCA.fit(returns_train)


One or more of the test scores are non-finite: [       nan        nan        nan ... 1.14234129 0.9547715  1.17250746]



RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=PortfolioOptimizer(n_top_companies=15,
                                                preprocessing_method='MPPCA',
                                                risk_free_return=1.02),
                   n_iter=5000, n_jobs=-2,
                   param_distributions={'R': array([0.001 , 0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017,
       0.0018, 0.0019, 0.002 , 0.0021, 0.0022, 0.0023, 0.0024, 0....
       0.0026, 0.0027, 0.0028, 0.0029, 0.003 ]),
                                        'is_PCA': [False, True],
                                        'n_components': array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]),
                                        'n_models_MPPCA': array([2, 3, 4]),
                                        'n_top_companies': [20, 25, 30],
                                        'period_change_portfolio': [None, 120,


In [129]:
po_pre_MPPCA.best_params_

{'size_of_window': 400,
 'preprocessing_kept_dim': 4,
 'period_change_portfolio': 480,
 'n_top_companies': 30,
 'n_models_MPPCA': 2,
 'n_components': 3,
 'is_PCA': False,
 'R': 0.0027}

In [130]:
top_comp_grid = np.linspace(20, 30, 3, dtype=int).tolist()
n_iter = 5000
size_of_window_grid = [None] + np.linspace(200, 1800, 9, dtype=int).tolist()
kept_dim_grid = np.linspace(3, 20, 18, dtype=int)

param_grid_pre_and_PCA = {
    'size_of_window': size_of_window_grid,
    'n_top_companies': top_comp_grid,
    'period_change_portfolio': [None, 120, 240, 360, 480, 720, 900],
    'R': np.linspace(1e-3, 3e-3, 21),
    'preprocessing_method': [None, 'PCA', 'to norm PCA'], 
    'preprocessing_kept_dim': kept_dim_grid,
    'n_components': kept_dim_grid
}
po_pre_and_PCA = RandomizedSearchCV(
    estimator=PortfolioOptimizer(risk_free_return=1.02, is_PCA=True), 
    param_distributions=param_grid_pre_and_PCA,
    cv=tscv,
    verbose=0,
    n_jobs=-2,  # кол-во параллельных процессов
    n_iter=n_iter,  # кол-во итераций случайного выбора гиперпараметров
    random_state=16  # seed для фиксации генератора случайных чисел
)

In [131]:
%%time
po_pre_and_PCA.fit(returns_train)

CPU times: user 2min 34s, sys: 4.83 s, total: 2min 38s
Wall time: 8min 43s



One or more of the test scores are non-finite: [       nan        nan 0.88994178 ... 1.28910558 0.77402026        nan]



RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=PortfolioOptimizer(is_PCA=True,
                                                risk_free_return=1.02),
                   n_iter=5000, n_jobs=-2,
                   param_distributions={'R': array([0.001 , 0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017,
       0.0018, 0.0019, 0.002 , 0.0021, 0.0022, 0.0023, 0.0024, 0.0025,
       0.0026, 0.0027, 0.0028, 0.0029, 0.0...
                                        'n_components': array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20]),
                                        'n_top_companies': [20, 25, 30],
                                        'period_change_portfolio': [None, 120,
                                                                    240, 360,
                                                                    480, 720,
                                            

In [132]:
po_pre_and_PCA.best_params_

{'size_of_window': 400,
 'preprocessing_method': 'PCA',
 'preprocessing_kept_dim': 5,
 'period_change_portfolio': None,
 'n_top_companies': 25,
 'n_components': 5,
 'R': 0.0016}

In [157]:
params = {'size_of_window': 735,
 'preprocessing_kept_dim': 14,
 'period_change_portfolio': 900,
 'n_top_companies': 15,
 'n_models_MPPCA': 2,
 'R': 0.0028000000000000004}
po_MPPCA_pre = PortfolioOptimizer(**params)
po_MPPCA_pre.fit(returns_train)
po_MPPCA_pre.best_estimator_ = po_MPPCA_pre

In [147]:
sp_data_1 = pd.read_csv('Download Data - INDEX_US_S&P US_SPX (4).csv')
sp_data_2 = pd.read_csv('Download Data - INDEX_US_S&P US_SPX (3).csv')
sp_data_3 = pd.read_csv('Download Data - INDEX_US_S&P US_SPX (2).csv')
sp_data_4 = pd.read_csv('Download Data - INDEX_US_S&P US_SPX (1).csv')
sp_data_5 = pd.read_csv('Download Data - INDEX_US_S&P US_SPX (5).csv')

sp_data = sp_data_1.set_index('Date')
sp_data = sp_data['Open']

for sp in [sp_data_2, sp_data_3, sp_data_4, sp_data_5]:
    sp = sp.set_index('Date')
    sp = sp['Open']
    sp_data = pd.concat((sp_data, sp))

sp_data = pd.DataFrame(sp_data)
sp_data.index = pd.to_datetime(sp_data.index)
sp_data['Open'] = sp_data['Open'].replace(to_replace=',', value='', regex=True).astype(float)
sp_data = sp_data.sort_index()
sp_data = sp_data['Open']
sp_shrunk = sp_data.loc[returns_test.index]
sp_shrunk = sp_shrunk / sp_shrunk[0]

In [158]:
names = ['PCA + to norm PCA preproc', 'PCA + PCA preproc', 'MPPCA preproc']
optimizers = [po_pre, po_pre_and_PCA, po_MPPCA_pre]
plot_results(names=names, optimizers=optimizers, df_train=returns_train, df_test=returns_test, 
             title='Return лучших моделей из каждого метода', sp=sp_shrunk)

3it [00:00,  3.75it/s]


Дальше случайные эксперименты.

In [5]:
top_comp_grid = np.linspace(20, 35, 4, dtype=int).tolist()
n_iter = 10
size_of_window_grid = [None] + np.linspace(200, 1800, 321, dtype=int).tolist()
kept_dim_grid = np.linspace(2, 20, 19, dtype=int)

param_grid = {
    'size_of_window': size_of_window_grid,
    'n_top_companies': top_comp_grid,
    'period_change_portfolio': [None, 120, 240, 360, 480, 720, 900],
    'R': np.linspace(1e-3, 3e-3, 21),
}
po_randomsearch = RandomizedSearchCV(
    estimator=PortfolioOptimizer(risk_free_return=1.02), 
    param_distributions=param_grid,
    cv=tscv,
    verbose=0,
    n_jobs=-2,  # кол-во параллельных процессов
    n_iter=n_iter,  # кол-во итераций случайного выбора гиперпараметров
    random_state=16  # seed для фиксации генератора случайных чисел
)

param_grid_MPPCA_preproc = {
    'size_of_window': size_of_window_grid,
    'n_top_companies': [15], 
    'period_change_portfolio': [None, 120, 240, 360, 480, 720, 900],
    'R': np.linspace(1e-3, 3e-3, 21),
    'preprocessing_kept_dim': kept_dim_grid,
    'n_models_MPPCA': np.linspace(2, 4, 3, dtype=int)
}
po_randomsearch_MPPCA_preproc = RandomizedSearchCV(
    estimator=PortfolioOptimizer(risk_free_return=1.02, is_MPPCA_preprocessing=True), 
    param_distributions=param_grid_MPPCA_preproc,
    cv=tscv,
    verbose=0,
    n_jobs=-2,  # кол-во параллельных процессов
    n_iter=n_iter,  # кол-во итераций случайного выбора гиперпараметров
    random_state=16,  # seed для фиксации генератора случайных чисел
    error_score=np.nan
)

param_grid_PCA_preproc = {
    'size_of_window': size_of_window_grid,
    'n_top_companies': top_comp_grid,
    'period_change_portfolio': [None, 120, 240, 360, 480, 720, 900],
    'R': np.linspace(1e-3, 3e-3, 21),
    'preprocessing_kept_dim': kept_dim_grid
}
po_randomsearch_PCA_preproc = RandomizedSearchCV(
    estimator=PortfolioOptimizer(risk_free_return=1.02, is_PCA_preprocessing=True), 
    param_distributions=param_grid_PCA_preproc,
    cv=tscv,
    verbose=0,
    n_jobs=-2,  # кол-во параллельных процессов
    n_iter=n_iter,  # кол-во итераций случайного выбора гиперпараметров
    random_state=16  # seed для фиксации генератора случайных чисел
)

param_grid_PCA = {
    'size_of_window': size_of_window_grid,
    'n_top_companies': top_comp_grid,
    'period_change_portfolio': [None, 120, 240, 360, 480, 720, 900],
    'R': np.linspace(1e-3, 3e-3, 21),
    'n_components': kept_dim_grid
}
po_randomsearch_PCA = RandomizedSearchCV(
    estimator=PortfolioOptimizer(risk_free_return=1.02, is_PCA=True), 
    param_distributions=param_grid_PCA,
    cv=tscv,
    verbose=0,
    n_jobs=-2,  # кол-во параллельных процессов
    n_iter=n_iter,  # кол-во итераций случайного выбора гиперпараметров
    random_state=16  # seed для фиксации генератора случайных чисел
)

param_grid_PCA_PCA_preproc = {
    'size_of_window': size_of_window_grid,
    'n_top_companies': top_comp_grid,
    'period_change_portfolio': [None, 120, 240, 360, 480, 720, 900],
    'R': np.linspace(1e-3, 3e-3, 21),
    'n_components': kept_dim_grid,
    'preprocessing_kept_dim': kept_dim_grid
}
po_randomsearch_PCA_PCA_preproc = RandomizedSearchCV(
    estimator=PortfolioOptimizer(risk_free_return=1.02, is_PCA=True, is_PCA_preprocessing=True), 
    param_distributions=param_grid_PCA_PCA_preproc,
    cv=tscv,
    verbose=0,
    n_jobs=-2,  # кол-во параллельных процессов
    n_iter=n_iter,  # кол-во итераций случайного выбора гиперпараметров
    random_state=16  # seed для фиксации генератора случайных чисел
)



In [77]:
%%time
po_randomsearch.fit(returns_train)

CPU times: user 894 ms, sys: 176 ms, total: 1.07 s
Wall time: 11.4 s



One or more of the test scores are non-finite: [0.82609751 1.38192253        nan 0.85017728 0.83527326 0.81897287
 1.25368694 1.25057056 0.86582736 1.32713175 0.86896582 0.95288309
 1.32891384 1.31122574 0.82194739 1.3815774  1.31234012 0.81986884
 1.31376901 1.45443459 0.73223156 1.49612401 0.95501026 1.22917345
 0.76747729 0.82649727 0.8867902         nan 0.93472793 0.80109597
 0.82338938 0.77196611 1.13298555 1.11952471 0.92100704 0.81788447
 0.7278893  1.39791994 0.88452311 1.09741747 1.01102829 0.83008912
 1.1945957  0.82720312 1.06480328 1.32713175 0.85699528 1.4630585
        nan 0.85620657        nan 0.85300085 1.15007278 0.87089909
 0.80985288 0.71706991 1.19183174 0.83902436 1.09417686 1.19566117
 0.98925971        nan 0.98674398 1.16739418 0.83569057 1.1791037
 0.98895382 1.28993088 0.8585688  1.20511695 1.49783558 1.08998486
 0.9560273  0.76730412 1.27192651 0.91690623 0.90338698        nan
 0.80286275 1.34578261 0.83908162 0.9587744  0.93912682 0.81216136
 0.94265243 1.47

RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=PortfolioOptimizer(risk_free_return=1.02),
                   n_iter=100, n_jobs=-2,
                   param_distributions={'R': array([0.001 , 0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017,
       0.0018, 0.0019, 0.002 , 0.0021, 0.0022, 0.0023, 0.0024, 0.0025,
       0.0026, 0.0027, 0.0028, 0.0029, 0.003 ]),
                                        'n_top_companies': [25, 26, 27, 28, 29,
                                                            30, 31, 32, 33, 34,
                                                            35],
                                        'period_change_portfolio': [None, 120,
                                                                    240, 360,
                                                                    480, 720,
                                                                    900],
                     

In [12]:
po_randomsearch.best_params_

{'size_of_window': 590,
 'period_change_portfolio': 360,
 'n_top_companies': 32,
 'R': 0.0022}

In [153]:
params = {'size_of_window': 735,
 'preprocessing_kept_dim': 14,
 'period_change_portfolio': 900,
 'n_top_companies': 15,
 'n_models_MPPCA': 2,
 'R': 0.0028000000000000004}
po_MPPCA_pre = PortfolioOptimizer(**params)
po_MPPCA_pre.fit(returns_train)
po_MPPCA_pre.predict(returns_test)

2017-12-18    1.026519
2017-12-19    1.022419
2017-12-20    1.017837
2017-12-21    1.013022
2017-12-22    1.011817
                ...   
2021-04-26    2.915047
2021-04-27    2.954828
2021-04-28    2.963952
2021-04-29    2.919947
2021-04-30    2.879538
Length: 847, dtype: float64

In [149]:
po_randomsearch_MPPCA_preproc.best_params_

{'size_of_window': 735,
 'preprocessing_kept_dim': 14,
 'period_change_portfolio': 900,
 'n_top_companies': 15,
 'n_models_MPPCA': 2,
 'R': 0.0028000000000000004}

In [30]:
%%time
po_randomsearch_MPPCA_preproc.fit(returns_train)


One or more of the test scores are non-finite: [ 1.04501959  1.377825    1.3015501   1.09260113         nan  1.54522443
         nan         nan  1.36038838         nan  0.2902859          nan
  0.94050363  1.22777029  1.32532131  1.08625823  0.80352032  1.24315858
  1.42739327  1.60181807         nan         nan  1.18935512  1.32829884
  1.20885376  1.42829679  1.24629502  0.99401347  1.19205716         nan
         nan  1.19902108  0.86265666  0.91568626  1.27240907  1.23402135
         nan  1.36702417         nan         nan         nan         nan
         nan  1.18671099  1.35876125  1.27845566  1.25163392         nan
  0.78785977  1.21808298  1.24954181  1.44895688  1.30778175  1.64387112
  1.00430527  1.30778175  0.95203769         nan  1.21301324  1.62207951
         nan  1.39514132  1.25122428         nan  0.96791884         nan
         nan  0.86835253  1.271843    1.25706833         nan         nan
  1.41727293  1.39602626  0.89083165  1.4617805   1.19556616  1.18378799
  1

CPU times: user 11 s, sys: 356 ms, total: 11.3 s
Wall time: 1min 13s


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=PortfolioOptimizer(is_MPPCA_preprocessing=True,
                                                risk_free_return=1.02),
                   n_iter=200, n_jobs=-2,
                   param_distributions={'R': array([0.001 , 0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017,
       0.0018, 0.0019, 0.002 , 0.0021, 0.0022, 0.0023, 0.0024, 0.0025,
       0.0026, 0.0027, 0....
                                        'n_models_MPPCA': array([2, 3, 4]),
                                        'n_top_companies': [15],
                                        'period_change_portfolio': [None, 120,
                                                                    240, 360,
                                                                    480, 720,
                                                                    900],
                                        'preproc

In [17]:
%%time
po_randomsearch_PCA_preproc.fit(returns_train)

CPU times: user 5.45 s, sys: 185 ms, total: 5.64 s
Wall time: 17.6 s



One or more of the test scores are non-finite: [ 1.20421719  0.55977927  0.35187046  0.55649823  0.68838802  1.00463611
  1.15796372  1.25633531  1.25989346         nan         nan  0.8313692
  1.09767494         nan  1.18365291  1.05562768  1.122419    1.08318081
  0.83456217         nan         nan  1.29617494  1.11113805  1.1248371
  0.93265281  1.09635699  1.27244786         nan  1.52336894         nan
  1.1184265   1.16875565         nan  1.31986558  1.3899825   0.9579912
  1.10848607  0.41232285  0.95193054  0.97290334  1.2939293   1.35180573
  0.94083266  1.31097633  1.33018942  0.75184019  1.08319326  1.47573969
  1.31097633  1.46522074  0.77796348         nan  0.66620826  0.52872085
         nan  1.09767494         nan  1.34998346  1.30581628  1.36043782
  1.31844856         nan         nan  0.82833656  1.06814491  1.20199904
  1.1248371   1.41376743  1.38724618  0.55586735  1.43407185  0.97877854
  1.29357334  1.11414347         nan         nan  1.40907064  1.04157736
  0.70

RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=PortfolioOptimizer(is_PCA_preprocessing=True,
                                                risk_free_return=1.02),
                   n_iter=200, n_jobs=-2,
                   param_distributions={'R': array([0.001 , 0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017,
       0.0018, 0.0019, 0.002 , 0.0021, 0.0022, 0.0023, 0.0024, 0.0025,
       0.0026, 0.0027, 0.00...
                                        'n_top_companies': [25, 26, 27, 28, 29,
                                                            30, 31, 32, 33, 34,
                                                            35],
                                        'period_change_portfolio': [None, 120,
                                                                    240, 360,
                                                                    480, 720,
                                      

In [34]:
%%time
po_randomsearch_PCA.fit(returns_train)

CPU times: user 2.89 s, sys: 103 ms, total: 2.99 s
Wall time: 9.7 s



One or more of the test scores are non-finite: [0.97847667 0.88567269 0.9477853  1.01734516 0.91297841 0.58987059
 1.75157675 0.98964436 0.92162091 1.10133026 1.13251641 0.96691552
 0.82738308 0.8906148  0.81436565 1.43137514 0.95537085 0.88512368
 0.95765067 1.05079052 0.94976575 0.82933274 0.82967368 0.98499168
 0.994081   1.0391683  0.9646633  0.9849416  1.08805354 0.88961559
        nan 1.08587905 1.2597969  1.03029017 0.93809603 0.94174278
 1.22165158 1.19793496 0.90720811 0.91866868 0.89138689 0.97393798
 0.82976983 1.04580004 1.42387453 0.96729955 0.98490861 0.95327042
 0.87099075 1.04886721 0.79566668 0.87401802 1.26214351 1.01750792
 1.19693966 0.80006001 1.42222991 0.94200948 0.40732441 1.6839044
 0.17682062 1.00258389 0.6755639  1.66690259 0.44451185 1.26222603
 0.97377047 0.82122921 0.8614227  0.90894021 1.34849986 0.41984561
 1.00633136 1.14865444 0.85773967 0.82180958 1.04381145 0.37168469
 0.94454954 0.95337485 0.7122131  1.2246683  0.99393189 0.94400766
 1.31763959 0.9

RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=PortfolioOptimizer(is_PCA=True,
                                                risk_free_return=1.02),
                   n_iter=100, n_jobs=-2,
                   param_distributions={'R': array([0.001 , 0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017,
       0.0018, 0.0019, 0.002 , 0.0021, 0.0022, 0.0023, 0.0024, 0.0025,
       0.0026, 0.0027, 0.0028, 0.0029, 0.003 ]),
                                        'n_components': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20]),
                                        'n_top_companies': [20, 25, 30, 35, 40],
                                        'period_change_portfolio': [None, 120,
                                                                    240, 360,
                                                                    480, 720,
                          

In [37]:
po_randomsearch_PCA_PCA_preproc.fit(returns_train)


One or more of the test scores are non-finite: [0.99744757 1.17021957        nan 1.48638726 0.89585915 1.22252033
        nan 0.03787086 1.12043334 1.44257466 1.04961626 1.15607581
 1.25065645 1.37840726 1.10178732 1.1448811  1.46390168 1.22647961
 0.90561664 1.56087044 1.04169684 1.27333375 0.87040287 1.06877374
 1.10901531 1.17389216 1.27681129 0.85180003 0.77991091 0.81618988
 1.2738831  0.46330525 1.48612764 1.4452139  0.86527514 0.6774801
        nan 1.09336932 1.31758375 1.22373177 1.30185574 1.36825618
 0.87207083 0.77603739        nan 0.91256651 1.04153882 0.7695321
 1.18616604 0.94287047 1.15374999 1.04439294 1.12174837 1.26406568
 0.89277928 1.14191926        nan 1.09787715 1.16270601 1.5240741
 1.09287221 1.02540871 1.29407388 1.38058533 0.92610542 1.66487864
 0.84827401 1.01069402 1.0646849  1.65436796 0.93883207 0.98691527
 1.1448811  1.32195313 1.08383562 1.46579228        nan 1.74765428
 0.82808909        nan 0.97823238 0.81161569 1.10752747 1.0996778
 1.34790121 0.9928

RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=PortfolioOptimizer(is_PCA=True,
                                                is_PCA_preprocessing=True,
                                                risk_free_return=1.02),
                   n_iter=200, n_jobs=-2,
                   param_distributions={'R': array([0.001 , 0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017,
       0.0018, 0.0019, 0.002 , 0.0021, 0.0022, 0.0023, 0.0024, 0.0025,
       0.0026...
                                        'n_top_companies': [25, 26, 27, 28, 29,
                                                            30, 31, 32, 33, 34,
                                                            35],
                                        'period_change_portfolio': [None, 120,
                                                                    240, 360,
                                                                    4

In [38]:
param_grid_no_window = {
    'n_top_companies': np.linspace(1, 100, 100, dtype=int).tolist(),
    'period_change_portfolio': [None, 120, 240, 360, 480, 720, 900],
    'R': np.linspace(1e-3, 3e-3, 21),
}

In [39]:
po_randomsearch_no_window = RandomizedSearchCV(
    estimator=PortfolioOptimizer(risk_free_return=1.02), 
    param_distributions=param_grid_no_window,
    cv=tscv,
    verbose=0,
    n_jobs=-2,  # кол-во параллельных процессов
    n_iter=100,  # кол-во итераций случайного выбора гиперпараметров
    random_state=16  # seed для фиксации генератора случайных чисел
)

In [40]:
%%time
po_randomsearch_no_window.fit(returns_train)

CPU times: user 2.89 s, sys: 97.6 ms, total: 2.99 s
Wall time: 13.6 s



One or more of the test scores are non-finite: [0.48346908 0.73433954 0.74571108        nan 0.74225624        nan
 0.77562365 0.78747676        nan 0.44014606 0.69082709 0.79373038
 1.3062386  1.05671445 1.24589237 1.46941715 1.08148303 0.44014606
 0.43050246 0.8686399         nan 0.64408219 0.90327992 0.77568293
 1.04782358 0.81136505 0.85192335 0.67553613 0.82447609 1.15645442
        nan 0.81319338 1.24113202 0.81603392 0.84035529 0.91426218
 0.87490267 0.83077789 0.81976074 0.78243545 0.7184371  0.76578212
 0.90731449 0.7153566  0.8286175  0.76684354 0.76424049 0.86802987
 0.75894516 0.54065844 0.84403949 0.55508198 0.98299565 0.69722614
 0.97309777        nan 0.77364361 0.80102204 0.76669837 0.82485335
 0.82905943 0.73672797 1.02187278 0.7320942  0.3759742  0.80094925
 0.78243545 1.00239951 0.77548272 0.99587456 0.80595459 0.79317825
 1.30173629 0.87901425 0.75105806 0.75916811 1.31087002 1.0917389
 0.70165401 0.80336386 0.76324176 0.74140492 0.82609751        nan
 0.84553928 0.8

RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=PortfolioOptimizer(risk_free_return=1.02),
                   n_iter=100, n_jobs=-2,
                   param_distributions={'R': array([0.001 , 0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017,
       0.0018, 0.0019, 0.002 , 0.0021, 0.0022, 0.0023, 0.0024, 0.0025,
       0.0026, 0.0027, 0.0028, 0.0029, 0.003 ]),
                                        'n_top_companies': [1, 2, 3, 4, 5, 6, 7,
                                                            8, 9, 10, 11, 12,
                                                            13, 14, 15, 16, 17,
                                                            18, 19, 20, 21, 22,
                                                            23, 24, 25, 26, 27,
                                                            28, 29, 30, ...],
                                        'period_change_portfolio': [None, 120,

In [98]:
names = ['Без оптимизаций', 'PCA preprocessing', 'PCA', 'PCA + PCA preproc', 'No window', 'MPPCA']
optimizers = [po_randomsearch, po_randomsearch_PCA_preproc, po_randomsearch_PCA, 
              po_randomsearch_PCA_PCA_preproc, po_randomsearch_no_window, po_randomsearch_MPPCA_preproc]
plot_results(names=names, optimizers=optimizers, df_train=returns_train, df_test=returns_test, 
             title='Return лучших моделей из каждого метода')

0it [00:00, ?it/s]


AttributeError: 'PortfolioOptimizer' object has no attribute 'preprocessing_method'

In [251]:
from mppca import MPPCA

mean = returns_train[returns_train.columns].mean()
top_returns = returns_train[mean.sort_values(ascending=False)[:20].index]

mppca = MPPCA(5, 4)
mppca.fit(top_returns.to_numpy())
mppca_top_returns = mppca.transform(top_returns.to_numpy())

np.mean(mppca_top_returns - top_returns.to_numpy()**2)

....................................................................................................

0.0011072023521745541