In [2]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import numpy as np
import lightgbm as lgbm
import xgboost as xgb
import catboost as ctb
import os
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import time
import pickle

from tqdm import tqdm

from utils import load_json
from features import QuarterlyFeatures, BaseCompanyFeatures, FeatureMerger
from targets import QuarterlyTarget, QuarterlyDiffTarget
from models import GroupedOOFModel, AnsambleModel
import pipelines.marketcap
from data import SF1Data
sns.set()

config = load_json("config.json")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Open the SF1 dataset located at config['sf1_data_path'].
data_loader = SF1Data(config['sf1_data_path'])
# Request ticker metadata for currency 'USD' and the mid / large / mega
# market-cap scale buckets (presumably these act as filters — confirm in
# SF1Data.load_tickers).
tickers_df = data_loader.load_tickers(
    currency='USD',
    scalemarketcap=['4 - Mid', '5 - Large', '6 - Mega'])
# Unique ticker symbols; this list is what the pipelines are fitted/executed on.
ticker_list = tickers_df['ticker'].unique().tolist()

In [23]:
# Override the data path loaded from config.json with a hard-coded relative
# directory.
# NOTE(review): this cell was executed as In[23] but sits after the first use
# of config['sf1_data_path'] in the notebook, so on a fresh "Restart & Run All"
# the SF1Data loader created earlier still sees the original path while every
# later cell sees this one. Consider moving the path into config.json.
config['sf1_data_path'] = '../clever_investment_data/cf1_13_01_2021'

In [5]:
# Quarterly fundamentals that are aggregated into features.
columns = ['revenue', 'netinc', 'ncf', 'assets', 'ebitda', 'debt', 'fcf',
           'gp', 'workingcapital', 'cashneq', 'rnd', 'sgna', 'ncfx',
           'divyield', 'currentratio', 'netinccmn']

# Company-level categorical attributes.
cat_columns = ['sector', 'sicindustry']

# Quarterly aggregate features over 2, 4 and 10 quarter windows, looking back
# at most 10 quarters per company.
fc1 = QuarterlyFeatures(columns=columns,
                        quarter_counts=[2, 4, 10],
                        max_back_quarter=10)

# Per-company categorical features.
fc2 = BaseCompanyFeatures(cat_columns=cat_columns)

# Combine both feature sets, joined on the ticker symbol.
feature = FeatureMerger(fc1, fc2, on='ticker')

# Target used for this fit. The previous
# `target = QuarterlyTarget(col='marketcap', quarter_shift=0)` assignment was
# a dead store (immediately overwritten by the line below) and has been
# removed; swap it back in here to fit on that target instead.
target = QuarterlyDiffTarget(col='marketcap')

# LightGBM + CatBoost ensemble. It is constructed here but NOT passed to the
# pipeline below; kept because other cells may reference `base_model`.
base_model = AnsambleModel(base_models=[lgbm.sklearn.LGBMRegressor(),
                                        ctb.CatBoostRegressor(verbose=False)],
                           bagging_fraction=0.7, model_cnt=20)

# Model actually fitted: a default LightGBM regressor wrapped in
# GroupedOOFModel with 5 folds.
model = GroupedOOFModel(lgbm.sklearn.LGBMRegressor(), fold_cnt=5)

# Fit the market-cap pipeline on all selected tickers.
mc_pipeline = pipelines.marketcap.MarketcapPipeline(feature, target, model)
mc_pipeline.fit(config, ticker_list)

3874it [00:28, 135.35it/s]
2401it [00:03, 683.04it/s]
2401it [00:03, 682.66it/s]


0.5005838592441726


In [17]:
# Export the current pipeline under 'models_data/diff' via export_core
# (what exactly is written is defined in pipelines.marketcap, not visible here).
# NOTE(review): executed as In[17], i.e. out of order relative to the
# surrounding cells — confirm `mc_pipeline` was the diff-target pipeline then.
mc_pipeline.export_core('models_data/diff')

In [9]:
# Spot check on a single ticker: compute the feature frame for AAPL ...
X = feature.calculate(config['sf1_data_path'], ['AAPL'])
# ... and the target values for exactly the (ticker, date) rows present in X.
y = target.calculate(config['sf1_data_path'], X[['ticker', 'date']])

1it [00:00, 10.93it/s]
1it [00:00, 73.13it/s]
1it [00:00, 71.86it/s]


In [10]:
# Display the AAPL feature frame.
# NOTE(review): in the saved output every *_min column shows
# -9223372036854775808 (the int64 minimum), e.g. quarter2_revenue_min, while
# the matching *_max values look plausible. That looks like a NaN/missing value
# cast to integer somewhere upstream — TODO investigate in the features module.
X

Unnamed: 0,ticker,date,quarter2_revenue_mean,quarter2_revenue_median,quarter2_revenue_max,quarter2_revenue_min,quarter2_revenue_std,quarter2_revenue_diff_mean,quarter2_revenue_diff_median,quarter2_revenue_diff_max,...,quarter10_netinccmn_max,quarter10_netinccmn_min,quarter10_netinccmn_std,quarter10_netinccmn_diff_mean,quarter10_netinccmn_diff_median,quarter10_netinccmn_diff_max,quarter10_netinccmn_diff_min,quarter10_netinccmn_diff_std,sector,sicindustry
0,AAPL,2021-01-28,88068500000.0,88068500000.0,111439000000,-9223372036854775808,23370500000.0,46741000000.0,46741000000.0,46741000000,...,28755000000,-9223372036854775808,5792580000.0,1625556000.0,1420000000.0,16082000000,-9223372036854775808,7824322000.0,0,0
1,AAPL,2020-10-30,62191500000.0,62191500000.0,64698000000,-9223372036854775808,2506500000.0,5013000000.0,5013000000.0,5013000000,...,22236000000,-9223372036854775808,3971161000.0,-181500000.0,712000000.0,8550000000,-9223372036854775808,6283588000.0,0,0
2,AAPL,2020-07-31,58999000000.0,58999000000.0,59685000000,-9223372036854775808,686000000.0,1372000000.0,1372000000.0,1372000000,...,22236000000,-9223372036854775808,4178495000.0,-410285700.0,4000000.0,8550000000,-9223372036854775808,6686197000.0,0,0
3,AAPL,2020-05-01,75066000000.0,75066000000.0,91819000000,-9223372036854775808,16753000000.0,-33506000000.0,-33506000000.0,-33506000000,...,22236000000,-9223372036854775808,4298027000.0,-479333300.0,1062500000.0,8550000000,-9223372036854775808,7219607000.0,0,0
4,AAPL,2020-01-29,77929500000.0,77929500000.0,91819000000,-9223372036854775808,13889500000.0,27779000000.0,27779000000.0,27779000000,...,22236000000,-9223372036854775808,4386647000.0,1622200000.0,3642000000.0,8550000000,-9223372036854775808,6004050000.0,0,0
5,AAPL,2019-10-31,58924500000.0,58924500000.0,64040000000,-9223372036854775808,5115500000.0,10231000000.0,10231000000.0,10231000000,...,19965000000,-9223372036854775808,3382823000.0,-109750000.0,1062500000.0,5840000000,-9223372036854775808,5482930000.0,0,0
6,AAPL,2019-07-31,55912000000.0,55912000000.0,58015000000,-9223372036854775808,2103000000.0,-4206000000.0,-4206000000.0,-4206000000,...,19965000000,-9223372036854775808,3780617000.0,-1360333000.0,-1517000000.0,5840000000,-9223372036854775808,5816144000.0,0,0
7,AAPL,2019-05-01,71162500000.0,71162500000.0,84310000000,-9223372036854775808,13147500000.0,-26295000000.0,-26295000000.0,-26295000000,...,19965000000,-9223372036854775808,3516736000.0,-1282000000.0,-1282000000.0,5840000000,-9223372036854775808,7122000000.0,0,0
8,AAPL,2019-01-30,73605000000.0,73605000000.0,84310000000,-9223372036854775808,10705000000.0,21410000000.0,21410000000.0,21410000000,...,19965000000,-9223372036854775808,2920000000.0,5840000000.0,5840000000.0,5840000000,-9223372036854775808,0.0,0,0


In [11]:
# Display the AAPL target frame (columns: ticker, date, y). In the saved
# output the first two y values equal the differences between consecutive
# quarterly marketcap values computed by hand further down in the notebook.
y

Unnamed: 0,ticker,date,y
0,AAPL,2021-01-28,450663914920
1,AAPL,2020-10-30,33500690360
2,AAPL,2020-07-31,564389256910
3,AAPL,2020-05-01,-166216964750
4,AAPL,2020-01-29,313836581800
5,AAPL,2019-10-31,142540494200
6,AAPL,2019-07-31,-5852201800
7,AAPL,2019-05-01,189418289000
8,AAPL,2019-01-30,-177424762820


In [14]:
# Load the quarterly data for AAPL and show only date and marketcap, to
# cross-check the target values against the underlying series.
pd.DataFrame(SF1Data(config['sf1_data_path']).load_quarterly_data(['AAPL']))[['date', 'marketcap']]

Unnamed: 0,date,marketcap
0,2021-01-28,2301480080640
1,2020-10-30,1850816165720
2,2020-07-31,1817315475360
3,2020-05-01,1252926218450
4,2020-01-29,1419143183200
5,2019-10-31,1105306601400
6,2019-07-31,962766107200
7,2019-05-01,968618309000
8,2019-01-30,779200020000
9,2018-11-05,956624782820


In [15]:
# Manual check: marketcap(2021-01-28) - marketcap(2020-10-30), using the values
# shown in the marketcap table; the result should equal y for 2021-01-28.
2301480080640 - 1850816165720

450663914920

In [16]:
# Manual check: marketcap(2020-10-30) - marketcap(2020-07-31), using the values
# shown in the marketcap table; the result should equal y for 2020-10-30.
1850816165720 - 1817315475360

33500690360

In [None]:
# NOTE(review): bare float literals kept in an unexecuted code cell —
# presumably scores noted from earlier experiment runs (TODO confirm and move
# to a markdown cell stating which configuration produced each value).
0.3729494787734865
0.4280805656394533
0.3859319795936919
0.38879646304551746

In [None]:
# NOTE(review): bare float literal in an unexecuted code cell — presumably a
# score noted from an earlier run (TODO confirm and label it in markdown).
0.3651129822271524

In [11]:
# Switch the target to QuarterlyTarget on marketcap with quarter_shift=1
# (the export name below suggests this means next quarter's market cap —
# confirm in the targets module).
target = QuarterlyTarget(col='marketcap', quarter_shift=1)

# Rebuild the pipeline with the same `feature` and `model` objects defined in
# an earlier cell, refit on all tickers, and export the result.
# NOTE(review): the saved output of this cell contains KeyboardInterrupt
# tracebacks from ForkPoolWorker processes, so the run was interrupted at some
# point — verify that the export actually completed.
mc_pipeline = pipelines.marketcap.MarketcapPipeline(feature, target, model)
mc_pipeline.fit(config, ticker_list)
mc_pipeline.export_core('models_data/next_quarter_marketcap')

3874it [00:28, 135.96it/s]
2400it [00:03, 687.36it/s]


0.4158734292526798


Process ForkPoolWorker-73:
Process ForkPoolWorker-61:
Process ForkPoolWorker-69:
Process ForkPoolWorker-70:
Process ForkPoolWorker-80:
Process ForkPoolWorker-76:
Process ForkPoolWorker-65:
Process ForkPoolWorker-77:
Process ForkPoolWorker-66:
Process ForkPoolWorker-78:
Traceback (most recent call last):
Process ForkPoolWorker-79:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-74:
Process ForkPoolWorker-62:
Process ForkPoolWorker-67:
Process ForkPoolWorker-72:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-68:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-64:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-71:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", lin

  File "/usr/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
KeyboardInterrupt
  File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:


In [6]:
# Feature-importance table for the first base model of the fitted pipeline:
# one row per model input column, ordered from most to least important.
imp_df = pd.DataFrame({
    'column': mc_pipeline.model.columns,
    'importance': mc_pipeline.model.base_models[0].feature_importances_,
}).sort_values('importance', ascending=False)
# Show the ten most important features.
imp_df.head(10)

Unnamed: 0,column,importance
135,quarter2_divyield_diff_mean,40
134,quarter2_divyield_std,38
481,sicindustry,34
145,quarter2_currentratio_diff_mean,32
105,quarter2_rnd_diff_mean,32
256,quarter4_cashneq_diff_median,31
44,quarter2_ebitda_std,30
84,quarter2_workingcapital_std,27
404,quarter10_workingcapital_std,24
465,quarter10_currentratio_diff_mean,24


In [11]:
# Load a previously exported pipeline from disk, replacing the in-memory
# `mc_pipeline`.
# NOTE(review): the directory name is hard-coded and contains ':' characters,
# which are not valid in file names on Windows.
pipeline_path = 'models_data/marketcap_pipeline_16.02.21_15:11'
mc_pipeline = pipelines.marketcap.load(pipeline_path)

In [7]:
# Run the pipeline over all selected tickers; the result is a frame with
# ticker, date and y columns (see the displayed output of `mc_df`).
mc_df = mc_pipeline.execute(config, ticker_list)

3874it [00:28, 136.31it/s]


In [8]:
# Display the pipeline output (pandas truncates the view for long frames).
mc_df

Unnamed: 0,ticker,date,y
0,DQWS,2020-11-13,4.195258e+09
1,DQWS,2020-08-14,4.008153e+09
2,DQWS,2020-05-15,4.053623e+09
3,DQWS,2020-04-03,4.149051e+09
4,DQWS,2020-03-30,4.065268e+09
...,...,...,...
20071,AVIR,2020-12-10,7.639594e+09
20072,AVIR,2020-10-26,1.071128e+10
20073,LESL,2021-02-08,2.427790e+09
20074,LESL,2020-12-23,7.391225e+09
