In [13]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import numpy as np
import lightgbm as lgbm
import os
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import time
import pickle

from tqdm import tqdm

from utils import load_json
from df_utils import form_last_quarter_df, form_pred_df
from features import QuarterlyFeatures, BaseCompanyFeatures, FeatureMerger
from targets import QuarterlyTarget
from data import load_quarterly_data_cf1, load_cf1_df, translate_currency_cf1
from models import GroupedOOFModel
from pipelines import MarketcapPipeline
# Project settings, read from config.json relative to the working directory.
config = load_json("config.json")

# Apply seaborn's default plot styling to all later figures.
sns.set()


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Read the ticker metadata table stored under <data_path>/cf1/.
tickers_df = pd.read_csv('{}/cf1/tickers.csv'.format(config['data_path']))

# Keep only USD tickers whose `scalemarketcap` bucket is Mid, Large or Mega.
# `isin` is a vectorised membership test; it replaces the per-row Python lambda
# and, like the lambda, evaluates to False for missing values.
tickers_df = tickers_df[
    (tickers_df['currency'] == 'USD')
    & tickers_df['scalemarketcap'].isin(['4 - Mid', '5 - Large', '6 - Mega'])
]

# De-duplicated ticker symbols, in order of first appearance.
ticker_list = tickers_df['ticker'].unique().tolist()

In [21]:
# Quarterly report fields handed to QuarterlyFeatures.
columns = [
    'revenue', 'netinc', 'ncf', 'assets',
    'ebitda', 'debt', 'fcf', 'gp',
    'workingcapital', 'cashneq', 'rnd', 'sgna',
    'ncfx', 'divyield', 'currentratio', 'netinccmn',
]

# Company-level categorical fields handed to BaseCompanyFeatures.
cat_columns = ['sector', 'sicindustry']

# Feature calculators: one over quarterly data, one over base company data.
fc1 = QuarterlyFeatures(
    config=config,
    columns=columns,
    quarter_counts=[2, 4, 10],
    max_back_quarter=10,
)
fc2 = BaseCompanyFeatures(config=config, cat_columns=cat_columns)

# Both feature sets are joined on the ticker symbol.
feature = FeatureMerger(fc1, fc2, on='ticker')

# Target is the `marketcap` column with no quarter shift.
target = QuarterlyTarget(config=config, col='marketcap', quarter_shift=0)

# LightGBM regressor wrapped in the project's grouped out-of-fold model (5 folds).
model = GroupedOOFModel(lgbm.LGBMRegressor(), fold_cnt=5)

marketcap_pipeline = MarketcapPipeline(config, feature, target, model)

# Last expression of the cell: the value returned by `fit` is shown as the cell output.
marketcap_pipeline.fit(ticker_list)

3874it [00:31, 124.57it/s]
2396it [00:03, 658.53it/s]


0.3943818017783071


In [None]:
# NOTE(review): bare literal in a never-executed cell; presumably a value printed by
# an earlier `fit` run and kept for comparison — confirm, then move it to markdown or delete.
0.3651129822271524

In [44]:
# Run the pipeline (fitted in the cell above) over the same ticker list and keep the result.
# NOTE(review): the return type of `execute` is not visible here; the `_df` suffix
# suggests a DataFrame — confirm.
mp_df = marketcap_pipeline.execute(ticker_list)

3874it [00:31, 123.92it/s]


In [19]:
# Pair each model input column with the importance reported by the first base model,
# ordered from most to least important.
imp_df = pd.DataFrame({
    'column': marketcap_pipeline.model.columns,
    'importance': marketcap_pipeline.model.base_models[0].feature_importances_,
}).sort_values('importance', ascending=False)

# Show the ten highest-ranked features.
imp_df.head(10)

Unnamed: 0,column,importance
481,sicindustry,91
455,quarter10_divyield_diff_mean,46
461,quarter10_currentratio_median,34
471,quarter10_netinccmn_median,31
376,quarter10_debt_diff_median,30
404,quarter10_workingcapital_std,29
361,quarter10_ebitda_median,27
366,quarter10_ebitda_diff_median,24
480,sector,23
456,quarter10_divyield_diff_median,22


In [10]:
# Persist the fitted model and the feature calculator for use outside this notebook.
#
# The names previously dumped here (`marketcap_model`, `feat_calculator`) are not
# defined in any visible cell, so this cell raised NameError on a fresh kernel.
# The objects built in the pipeline cell are written instead.
# NOTE(review): assumes consumers of these files expect the pipeline's model and the
# FeatureMerger instance — confirm against whatever loads the pickles.

# open(..., 'wb') does not create missing directories.
os.makedirs('models_data', exist_ok=True)

with open('models_data/marketcap_model.pickle', 'wb') as f:
    pickle.dump(marketcap_pipeline.model, f)

with open('models_data/feat_calculator.pickle', 'wb') as f:
    pickle.dump(feature, f)
