In [19]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import numpy as np
import lightgbm as lgbm
import catboost as ctb
import os
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import time
import pickle

from tqdm import tqdm

from utils import load_json
from features import QuarterlyFeatures, QuarterlyDiffFeatures, BaseCompanyFeatures, FeatureMerger
from targets import QuarterlyTarget, QuarterlyDiffTarget
from models import GroupedOOFModel, TimeSeriesOOFModel, AnsambleModel, LogExpModel
from pipelines import BasePipeline
from data import SF1Data
from metrics import median_absolute_relative_error
sns.set()

config = load_json("config.json")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
data_loader = SF1Data(config['sf1_data_path'])
tickers_df = data_loader.load_tickers(
    currency='USD',
    scalemarketcap=['4 - Mid', '5 - Large', '6 - Mega'])
ticker_list = tickers_df['ticker'].unique().tolist()

In [4]:
config['sf1_data_path'] = '../clever_investment_data/cf1_13_01_2021'

In [20]:
columns = ['revenue', 'netinc', 'ncf', 'assets', 'ebitda', 'debt', 'fcf',
           'gp', 'workingcapital', 'cashneq', 'rnd', 'sgna', 'ncfx',
           'divyield', 'currentratio', 'netinccmn']

cat_columns = ['sector', 'sicindustry']

fc1 = QuarterlyFeatures(columns=columns,
                        quarter_counts=[2, 4, 10],
                        max_back_quarter=10)

fc2 = BaseCompanyFeatures(cat_columns=cat_columns)

fc3 = QuarterlyDiffFeatures(columns=columns,
                            compare_idxs=[1, 4],
                            max_back_quarter=10)

feature = FeatureMerger(fc1, fc2, on='ticker')

# feature = FeatureMerger(feature, fc3, on=['ticker', 'date'])


target = QuarterlyTarget(col='marketcap', quarter_shift=0)
#target = QuarterlyDiffTarget(col='marketcap')

base_model = AnsambleModel(base_models=[LogExpModel(lgbm.sklearn.LGBMRegressor()),
                                        LogExpModel(ctb.CatBoostRegressor(verbose=False))], 
                           bagging_fraction=0.7, model_cnt=20)

model = GroupedOOFModel(ctb.CatBoostRegressor(verbose=False),
                        group_column='ticker', fold_cnt=5)

model = TimeSeriesOOFModel(ctb.CatBoostRegressor(verbose=False),
                           time_column='date', fold_cnt=5)

mc_pipeline = BasePipeline(feature, target, model, median_absolute_relative_error)
mc_pipeline.fit(config, ticker_list)
#mc_pipeline.export_core('models_data/diff')

3874it [00:27, 139.11it/s]
2401it [00:03, 675.93it/s]


nan


In [23]:
x = np.array([np.nan, 10])
x[~np.isnan(x)]


array([], dtype=float64)

In [None]:
0.7191348902981789 ans
0.7667987731362407
0.7691278337267149

In [None]:
0.3238006278631045 ans
0.3226811547779296 log_exp_ctb
0.34013409255339005 log_exp_lgb
0.3731881490521315 ctb
0.38325054127451996 lgb

In [4]:
X = feature.calculate(config['sf1_data_path'], ticker_list)
y = target.calculate(config['sf1_data_path'], 
                                  X.index.to_frame(index=False))
leave_mask = (y['y'].isnull() == False)
y = y[leave_mask]#.reset_index(drop=True)
X = X[leave_mask]

3874it [00:29, 132.40it/s]
2401it [00:03, 620.00it/s]


In [5]:
model.fit(X, y['y'])

0
1
2
3
4


In [6]:
    with open('{}.pickle'.format('tmp'), 'wb') as f:
        pickle.dump(model, f)

In [18]:
model.predict(X)

array([8.76774927e+08, 8.59081231e+08, 8.56702809e+08, ...,
       1.00961997e+09, 1.42488678e+09, 3.82320800e+09])

In [8]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,quarter2_revenue_mean,quarter2_revenue_median,quarter2_revenue_max,quarter2_revenue_min,quarter2_revenue_std,quarter2_revenue_diff_mean,quarter2_revenue_diff_median,quarter2_revenue_diff_max,quarter2_revenue_diff_min,quarter2_revenue_diff_std,...,compare4_fcf,compare4_gp,compare4_workingcapital,compare4_cashneq,compare4_rnd,compare4_sgna,compare4_ncfx,compare4_divyield,compare4_currentratio,compare4_netinccmn
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
DQWS,2020-11-13,235307.5,235307.5,249705.0,220910.0,14397.5,28795.0,28795.0,28795.0,28795.0,0.0,...,-1.402517,0.736181,-0.058636,1.166008,,0.372814,-0.888710,,0.124283,-0.871435
DQWS,2020-08-14,374226.0,374226.0,527542.0,220910.0,153316.0,-306632.0,-306632.0,-306632.0,-306632.0,0.0,...,-0.357245,4.561310,0.694273,1.308895,,-0.516866,0.579729,,-0.192248,-1.180638
DQWS,2020-05-15,359828.0,359828.0,527542.0,192114.0,167714.0,335428.0,335428.0,335428.0,335428.0,0.0,...,3.586985,5.350196,2.816845,0.512900,,0.132189,-0.168724,,-0.274536,-0.913871
DQWS,2020-04-03,192114.0,192114.0,192114.0,192114.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.647455,3.873477,-6.466597,-0.442308,,-0.542161,-76.354132,,-0.583599,-0.490364
DQWS,2020-03-30,115209.5,115209.5,192114.0,38305.0,76904.5,153809.0,153809.0,153809.0,153809.0,0.0,...,0.108555,3.453074,-2.234651,-0.743831,,-0.247272,0.695825,,-0.756291,-0.278100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LSPD,2021-02-04,51552000.0,51552000.0,57611000.0,45493000.0,6059000.0,12118000.0,12118000.0,12118000.0,12118000.0,0.0,...,,,,,,,,,,
FROG,2021-02-12,40787500.0,40787500.0,42689000.0,38886000.0,1901500.0,3803000.0,3803000.0,3803000.0,3803000.0,0.0,...,,,,,,,,,,
STEP,2021-02-11,245031500.0,245031500.0,247150000.0,242913000.0,2118500.0,4237000.0,4237000.0,4237000.0,4237000.0,0.0,...,,,,,,,,,,
ALGM,2021-02-02,150549000.0,150549000.0,164449000.0,136649000.0,13900000.0,27800000.0,27800000.0,27800000.0,27800000.0,0.0,...,,,,,,,,,,


In [11]:
times = X.reset_index()['date'].astype(np.datetime64).values

In [21]:
fold_cnt = 20

max_time = max(times)
min_time = min(times)
delta = (max_time - min_time) // fold_cnt

time_bounds = []
for fold_id in range(1, fold_cnt):
    time_bounds.append(min_time + fold_id * delta)
time_bounds.append(max_time)

In [54]:
delta

numpy.timedelta64(7374240000000000,'ns')

In [58]:
min_time + np.timedelta64(10000, 'D')

numpy.datetime64('2043-11-03T00:00:00.000000000')

In [48]:
min_time 

numpy.datetime64('2016-06-17T00:00:00.000000000')

In [47]:
X[:0]

Unnamed: 0_level_0,Unnamed: 1_level_0,quarter2_revenue_mean,quarter2_revenue_median,quarter2_revenue_max,quarter2_revenue_min,quarter2_revenue_std,quarter2_revenue_diff_mean,quarter2_revenue_diff_median,quarter2_revenue_diff_max,quarter2_revenue_diff_min,quarter2_revenue_diff_std,...,compare4_fcf,compare4_gp,compare4_workingcapital,compare4_cashneq,compare4_rnd,compare4_sgna,compare4_ncfx,compare4_divyield,compare4_currentratio,compare4_netinccmn
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1


In [45]:
for fold_id in range(fold_cnt - 1):
    time_bounds[fold_id]
    curr_mask = (times > time_bounds[fold_id]) * (times <= time_bounds[fold_id + 1])
    
    X[curr_mask]

221
160
254
204
190
34
220
183
202
1343
1962
1974
1962
2010
1885
2034
1960
1944
815


IndexError: list index out of range

In [38]:
time_bounds[fold_id]

numpy.datetime64('2016-09-10T08:24:00.000000000')

In [33]:
times.shape

(19768,)

In [42]:
221 * 20

4420

In [44]:
len(times[(times > time_bounds[fold_id]) * (times <= time_bounds[fold_id + 1])])

1962

In [40]:
len(times[times <= time_bounds[fold_id]])

211

In [35]:
X[times <= time_bounds[fold_id]]

Unnamed: 0_level_0,Unnamed: 1_level_0,quarter2_revenue_mean,quarter2_revenue_median,quarter2_revenue_max,quarter2_revenue_min,quarter2_revenue_std,quarter2_revenue_diff_mean,quarter2_revenue_diff_median,quarter2_revenue_diff_max,quarter2_revenue_diff_min,quarter2_revenue_diff_std,...,compare4_fcf,compare4_gp,compare4_workingcapital,compare4_cashneq,compare4_rnd,compare4_sgna,compare4_ncfx,compare4_divyield,compare4_currentratio,compare4_netinccmn
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
YHOO,2016-08-08,1.197394e+09,1.197394e+09,1.307637e+09,1.087152e+09,110242500.0,220485000.0,220485000.0,220485000.0,220485000.0,0.0,...,,,,,,,,,,
VNRSQ,2016-07-29,6.903250e+07,6.903250e+07,1.131990e+08,2.486600e+07,44166500.0,-88333000.0,-88333000.0,-88333000.0,-88333000.0,0.0,...,,,,,,,,,,
BATS,2016-08-05,4.884000e+08,4.884000e+08,5.109000e+08,4.659000e+08,22500000.0,-45000000.0,-45000000.0,-45000000.0,-45000000.0,0.0,...,,,,,,,,,,
MEG1,2016-08-05,3.530845e+08,3.530845e+08,3.627060e+08,3.434630e+08,9621500.0,19243000.0,19243000.0,19243000.0,19243000.0,0.0,...,,,,,,,,,,
ACTA,2016-08-05,3.565950e+07,3.565950e+07,3.670900e+07,3.461000e+07,1049500.0,2099000.0,2099000.0,2099000.0,2099000.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CSC,2016-08-09,1.868500e+09,1.868500e+09,1.930000e+09,1.807000e+09,61500000.0,123000000.0,123000000.0,123000000.0,123000000.0,0.0,...,,,,,,,,,,
CLC,2016-06-17,3.406200e+08,3.406200e+08,3.649680e+08,3.162720e+08,24348000.0,48696000.0,48696000.0,48696000.0,48696000.0,0.0,...,,,,,,,,,,
BCR,2016-07-27,9.025000e+08,9.025000e+08,9.315000e+08,8.735000e+08,29000000.0,58000000.0,58000000.0,58000000.0,58000000.0,0.0,...,,,,,,,,,,
ATW,2016-08-02,2.620740e+08,2.620740e+08,2.963510e+08,2.277970e+08,34277000.0,-68554000.0,-68554000.0,-68554000.0,-68554000.0,0.0,...,,,,,,,,,,


In [36]:
y[times <= time_bounds[fold_id]]

Unnamed: 0_level_0,Unnamed: 1_level_0,y
ticker,date,Unnamed: 2_level_1
YHOO,2016-08-08,0.051074
VNRSQ,2016-07-29,0.048364
BATS,2016-08-05,0.036982
MEG1,2016-08-05,0.007214
ACTA,2016-08-05,0.048425
...,...,...
CSC,2016-08-09,-0.062971
CLC,2016-06-17,0.041345
BCR,2016-07-27,0.045956
ATW,2016-08-02,-0.148395


In [26]:
time_bounds[fold_id]

numpy.datetime64('2021-02-18T00:00:00.000000000')

In [None]:
times

In [23]:
len(time_bounds)

20

In [22]:
time_bounds

[numpy.datetime64('2016-09-10T08:24:00.000000000'),
 numpy.datetime64('2016-12-04T16:48:00.000000000'),
 numpy.datetime64('2017-02-28T01:12:00.000000000'),
 numpy.datetime64('2017-05-24T09:36:00.000000000'),
 numpy.datetime64('2017-08-17T18:00:00.000000000'),
 numpy.datetime64('2017-11-11T02:24:00.000000000'),
 numpy.datetime64('2018-02-04T10:48:00.000000000'),
 numpy.datetime64('2018-04-30T19:12:00.000000000'),
 numpy.datetime64('2018-07-25T03:36:00.000000000'),
 numpy.datetime64('2018-10-18T12:00:00.000000000'),
 numpy.datetime64('2019-01-11T20:24:00.000000000'),
 numpy.datetime64('2019-04-07T04:48:00.000000000'),
 numpy.datetime64('2019-07-01T13:12:00.000000000'),
 numpy.datetime64('2019-09-24T21:36:00.000000000'),
 numpy.datetime64('2019-12-19T06:00:00.000000000'),
 numpy.datetime64('2020-03-13T14:24:00.000000000'),
 numpy.datetime64('2020-06-06T22:48:00.000000000'),
 numpy.datetime64('2020-08-31T07:12:00.000000000'),
 numpy.datetime64('2020-11-24T15:36:00.000000000'),
 numpy.datet

In [211]:
time[time <= time_bounds[0]]

array(['2016-08-08T00:00:00.000000000', '2016-07-29T00:00:00.000000000',
       '2016-08-05T00:00:00.000000000', '2016-08-05T00:00:00.000000000',
       '2016-08-05T00:00:00.000000000', '2016-08-12T00:00:00.000000000',
       '2016-08-11T00:00:00.000000000', '2016-08-11T00:00:00.000000000',
       '2016-08-02T00:00:00.000000000', '2016-07-26T00:00:00.000000000',
       '2016-08-05T00:00:00.000000000', '2016-08-11T00:00:00.000000000',
       '2016-08-09T00:00:00.000000000', '2016-08-05T00:00:00.000000000',
       '2016-08-04T00:00:00.000000000', '2016-07-28T00:00:00.000000000',
       '2016-08-04T00:00:00.000000000', '2016-09-09T00:00:00.000000000',
       '2016-08-03T00:00:00.000000000', '2016-08-08T00:00:00.000000000',
       '2016-08-05T00:00:00.000000000', '2016-07-29T00:00:00.000000000',
       '2016-08-04T00:00:00.000000000', '2016-08-02T00:00:00.000000000',
       '2016-09-09T00:00:00.000000000', '2016-08-08T00:00:00.000000000',
       '2016-08-04T00:00:00.000000000', '2016-07-28

In [209]:
len(time_bounds)

19

In [208]:
time_bounds

[numpy.datetime64('2016-09-10T08:24:00.000000000'),
 numpy.datetime64('2016-12-04T16:48:00.000000000'),
 numpy.datetime64('2017-02-28T01:12:00.000000000'),
 numpy.datetime64('2017-05-24T09:36:00.000000000'),
 numpy.datetime64('2017-08-17T18:00:00.000000000'),
 numpy.datetime64('2017-11-11T02:24:00.000000000'),
 numpy.datetime64('2018-02-04T10:48:00.000000000'),
 numpy.datetime64('2018-04-30T19:12:00.000000000'),
 numpy.datetime64('2018-07-25T03:36:00.000000000'),
 numpy.datetime64('2018-10-18T12:00:00.000000000'),
 numpy.datetime64('2019-01-11T20:24:00.000000000'),
 numpy.datetime64('2019-04-07T04:48:00.000000000'),
 numpy.datetime64('2019-07-01T13:12:00.000000000'),
 numpy.datetime64('2019-09-24T21:36:00.000000000'),
 numpy.datetime64('2019-12-19T06:00:00.000000000'),
 numpy.datetime64('2020-03-13T14:24:00.000000000'),
 numpy.datetime64('2020-06-06T22:48:00.000000000'),
 numpy.datetime64('2020-08-31T07:12:00.000000000'),
 numpy.datetime64('2020-11-24T15:36:00.000000000')]

In [186]:
X.join(y)

Unnamed: 0_level_0,Unnamed: 1_level_0,quarter2_revenue_mean,quarter2_revenue_median,quarter2_revenue_max,quarter2_revenue_min,quarter2_revenue_std,quarter2_revenue_diff_mean,quarter2_revenue_diff_median,quarter2_revenue_diff_max,quarter2_revenue_diff_min,quarter2_revenue_diff_std,...,quarter10_netinccmn_diff_mean,quarter10_netinccmn_diff_median,quarter10_netinccmn_diff_max,quarter10_netinccmn_diff_min,quarter10_netinccmn_diff_std,sector,sicindustry,y,pred,metric
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
DQWS,2020-11-13,235307.5,235307.5,249705.0,220910.0,14397.5,28795.0,28795.0,28795.0,28795.0,0.0,...,1.223100e+04,10495.0,95618.0,-56532.0,4.359313e+04,3,195,0.412941,1.188729,1.878689
DQWS,2020-08-14,374226.0,374226.0,527542.0,220910.0,153316.0,-306632.0,-306632.0,-306632.0,-306632.0,0.0,...,1.499575e+04,16976.5,95618.0,-56532.0,4.548749e+04,3,195,3.250000,1.764145,0.457186
DQWS,2020-05-15,359828.0,359828.0,527542.0,192114.0,167714.0,335428.0,335428.0,335428.0,335428.0,0.0,...,1.563871e+04,23458.0,95618.0,-56532.0,4.859416e+04,3,195,0.000000,1.227492,
DQWS,2020-04-03,192114.0,192114.0,192114.0,192114.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.905667e+03,11729.0,95618.0,-56532.0,4.937226e+04,3,195,-0.476440,0.927045,2.945775
DQWS,2020-03-30,115209.5,115209.5,192114.0,38305.0,76904.5,153809.0,153809.0,153809.0,153809.0,0.0,...,1.068680e+04,23458.0,95618.0,-56532.0,5.390834e+04,3,195,0.469231,1.287880,1.744661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LSPD,2021-02-04,51552000.0,51552000.0,57611000.0,45493000.0,6059000.0,12118000.0,12118000.0,12118000.0,12118000.0,0.0,...,-2.318500e+07,-23185000.0,-23185000.0,-23185000.0,0.000000e+00,10,307,0.841238,0.180177,0.785819
FROG,2021-02-12,40787500.0,40787500.0,42689000.0,38886000.0,1901500.0,3803000.0,3803000.0,3803000.0,3803000.0,0.0,...,1.551000e+06,1551000.0,1551000.0,1551000.0,0.000000e+00,10,307,0.023568,0.060123,1.551090
STEP,2021-02-11,245031500.0,245031500.0,247150000.0,242913000.0,2118500.0,4237000.0,4237000.0,4237000.0,4237000.0,0.0,...,4.101550e+07,41015500.0,55663000.0,26368000.0,1.464750e+07,5,121,0.322992,0.112765,0.650873
ALGM,2021-02-02,150549000.0,150549000.0,164449000.0,136649000.0,13900000.0,27800000.0,27800000.0,27800000.0,27800000.0,0.0,...,-4.957500e+06,-4957500.0,4764000.0,-14679000.0,9.721500e+06,10,263,0.404578,0.067306,0.833638


In [182]:
pd.merge(X, y, on='ticker', how='left')

Unnamed: 0,ticker,quarter2_revenue_mean,quarter2_revenue_median,quarter2_revenue_max,quarter2_revenue_min,quarter2_revenue_std,quarter2_revenue_diff_mean,quarter2_revenue_diff_median,quarter2_revenue_diff_max,quarter2_revenue_diff_min,...,quarter10_netinccmn_diff_median,quarter10_netinccmn_diff_max,quarter10_netinccmn_diff_min,quarter10_netinccmn_diff_std,sector,sicindustry,date,y,pred,metric
0,DQWS,235307.5,235307.5,249705.0,220910.0,14397.5,28795.0,28795.0,28795.0,28795.0,...,10495.0,95618.0,-56532.0,4.359313e+04,3,195,2020-11-13,0.412941,1.188729,1.878689
1,DQWS,235307.5,235307.5,249705.0,220910.0,14397.5,28795.0,28795.0,28795.0,28795.0,...,10495.0,95618.0,-56532.0,4.359313e+04,3,195,2020-08-14,3.250000,1.764145,0.457186
2,DQWS,235307.5,235307.5,249705.0,220910.0,14397.5,28795.0,28795.0,28795.0,28795.0,...,10495.0,95618.0,-56532.0,4.359313e+04,3,195,2020-05-15,0.000000,1.227492,
3,DQWS,235307.5,235307.5,249705.0,220910.0,14397.5,28795.0,28795.0,28795.0,28795.0,...,10495.0,95618.0,-56532.0,4.359313e+04,3,195,2020-04-03,-0.476440,0.927045,2.945775
4,DQWS,235307.5,235307.5,249705.0,220910.0,14397.5,28795.0,28795.0,28795.0,28795.0,...,10495.0,95618.0,-56532.0,4.359313e+04,3,195,2020-03-30,0.469231,1.287880,1.744661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172697,LSPD,51552000.0,51552000.0,57611000.0,45493000.0,6059000.0,12118000.0,12118000.0,12118000.0,12118000.0,...,-23185000.0,-23185000.0,-23185000.0,0.000000e+00,10,307,2021-02-04,0.841238,0.180177,0.785819
172698,FROG,40787500.0,40787500.0,42689000.0,38886000.0,1901500.0,3803000.0,3803000.0,3803000.0,3803000.0,...,1551000.0,1551000.0,1551000.0,0.000000e+00,10,307,2021-02-12,0.023568,0.060123,1.551090
172699,STEP,245031500.0,245031500.0,247150000.0,242913000.0,2118500.0,4237000.0,4237000.0,4237000.0,4237000.0,...,41015500.0,55663000.0,26368000.0,1.464750e+07,5,121,2021-02-11,0.322992,0.112765,0.650873
172700,ALGM,150549000.0,150549000.0,164449000.0,136649000.0,13900000.0,27800000.0,27800000.0,27800000.0,27800000.0,...,-4957500.0,4764000.0,-14679000.0,9.721500e+06,10,263,2021-02-02,0.404578,0.067306,0.833638


In [181]:
y

Unnamed: 0,ticker,date,y,pred,metric
0,DQWS,2020-11-13,0.412941,1.188729,1.878689
1,DQWS,2020-08-14,3.250000,1.764145,0.457186
2,DQWS,2020-05-15,0.000000,1.227492,
3,DQWS,2020-04-03,-0.476440,0.927045,2.945775
4,DQWS,2020-03-30,0.469231,1.287880,1.744661
...,...,...,...,...,...
19763,LSPD,2021-02-04,0.841238,0.180177,0.785819
19764,FROG,2021-02-12,0.023568,0.060123,1.551090
19765,STEP,2021-02-11,0.322992,0.112765,0.650873
19766,ALGM,2021-02-02,0.404578,0.067306,0.833638


In [18]:
y = y[leave_mask].reset_index(drop=True)
X = X[leave_mask].reset_index(drop=True)

mc_pipeline._eval(X=X.drop(['ticker', 'date'], axis=1),
           y=y['y'].values,
           groups=X['ticker'])     

nan


In [None]:
0.39497680805725816

In [17]:
mc_pipeline.export_core('models_data/diff')

In [6]:
imp_df = pd.DataFrame()
imp_df['column'] = mc_pipeline.model.columns
imp_df['importance'] = mc_pipeline.model.base_models[0].feature_importances_
imp_df = imp_df.sort_values('importance', ascending=False)[:]
imp_df[:10]

Unnamed: 0,column,importance
135,quarter2_divyield_diff_mean,40
134,quarter2_divyield_std,38
481,sicindustry,34
145,quarter2_currentratio_diff_mean,32
105,quarter2_rnd_diff_mean,32
256,quarter4_cashneq_diff_median,31
44,quarter2_ebitda_std,30
84,quarter2_workingcapital_std,27
404,quarter10_workingcapital_std,24
465,quarter10_currentratio_diff_mean,24


In [11]:
mc_pipeline = BasePipeline.load('models_data/ctb.pickle')
mc_df = mc_pipeline.execute(config, ticker_list)

In [8]:
mc_df

Unnamed: 0,ticker,date,y
0,DQWS,2020-11-13,4.195258e+09
1,DQWS,2020-08-14,4.008153e+09
2,DQWS,2020-05-15,4.053623e+09
3,DQWS,2020-04-03,4.149051e+09
4,DQWS,2020-03-30,4.065268e+09
...,...,...,...
20071,AVIR,2020-12-10,7.639594e+09
20072,AVIR,2020-10-26,1.071128e+10
20073,LESL,2021-02-08,2.427790e+09
20074,LESL,2020-12-23,7.391225e+09
