In [8]:
from processor.processor import Processor as processor
from database.adatabase import ADatabase
from xgboost import XGBRegressor
from statistics import mean
import math
import pandas as pd
import numpy as np
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from tqdm import tqdm
import warnings
import pytz
import copy
warnings.simplefilter(action="ignore")
import pickle

In [2]:
# Feature columns handed to the model, in the order they are passed to `fit`.
factors = [
    "assets",
    "liabilities",
    "netincomeloss",
    "adjclose",
    "rf",
    "spy",
]

# Key columns followed by every factor.
required = ["year", "quarter", "ticker", *factors]

# One handle per database; later cells call connect()/cloud_connect() on these
# before use. ("market" was previously instantiated twice — once is enough.)
market = ADatabase("market")
sec = ADatabase("sec")
fred = ADatabase("fred")
db = ADatabase("sapling")

In [3]:
# Read the table with id="constituents" from the Wikipedia S&P 500 page and
# expose its "Symbol" column under the name "ticker".
sp500 = (
    pd.read_html(
        "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies",
        attrs={"id": "constituents"},
    )[0]
    .rename(columns={"Symbol": "ticker"})
)

In [4]:
# Fetch both FRED series first, and always release the connection — even when
# a retrieve call raises.
fred.connect()
try:
    market_yield = fred.retrieve("market_yield")
    spy = fred.retrieve("sp500")
finally:
    fred.disconnect()

# Risk-free rate: "." marks a missing observation; drop those rows, then convert
# the remaining values from percent to a fraction.
market_yield = (
    market_yield
    .rename(columns={"value": "rf"})
    .assign(rf=lambda frame: frame["rf"].replace(".", np.nan))
    .dropna()
    .assign(rf=lambda frame: frame["rf"].astype(float) / 100)
    # Each row takes the date found five rows further down; the final five rows
    # are left without a date.
    # NOTE(review): presumably a publication lag to avoid look-ahead, and it
    # assumes rows are in ascending date order — confirm.
    .assign(date=lambda frame: frame["date"].shift(-5))
)
market_yield = processor.column_date_processing(market_yield)

# Index level: same missing-value handling, no rescaling and no date shift.
spy = (
    spy
    .rename(columns={"value": "spy"})
    .assign(spy=lambda frame: frame["spy"].replace(".", np.nan))
    .dropna()
    .assign(spy=lambda frame: frame["spy"].astype(float))
)
spy = processor.column_date_processing(spy)

In [5]:
# Build one quarterly feature/target frame per S&P 500 ticker and collect them
# in `data`.
data = []

# The quarterly means of the market-wide series are the same for every ticker,
# so compute them once instead of on every loop iteration.
rf_quarterly = market_yield[["year", "quarter", "rf"]].groupby(["year", "quarter"]).mean().reset_index()
spy_quarterly = spy[["year", "quarter", "spy"]].groupby(["year", "quarter"]).mean().reset_index()

sec.connect()
market.connect()
try:
    for ticker in tqdm(sp500["ticker"]):
        try:
            cik = int(sp500.loc[sp500["ticker"] == ticker, "CIK"].item())
            filing = sec.query("filings", {"cik": cik}).drop("date", axis=1)
            prices = processor.column_date_processing(
                market.query("prices", {"ticker": ticker})
            ).drop("date", axis=1)

            filing["ticker"] = ticker
            # Filings are joined to the prices of the following year.
            filing["year"] = filing["year"] + 1

            ticker_data = prices.merge(filing, on=["year", "quarter", "ticker"], how="left")
            ticker_data = ticker_data.merge(rf_quarterly, on=["year", "quarter"], how="left")
            ticker_data = ticker_data.merge(spy_quarterly, on=["year", "quarter"], how="left")

            # Collapse to one row per (year, quarter, ticker).
            ticker_data = ticker_data.groupby(["year", "quarter", "ticker"]).mean().reset_index()
            ticker_data = ticker_data.sort_values(["year", "quarter"])

            # Target: next quarter's adjusted close. It is computed before the
            # gap filling and re-attached afterwards so that it is never filled;
            # otherwise the last quarter would receive its own adjclose as a
            # made-up label. Rows without a real target are dropped below.
            next_adjclose = ticker_data["adjclose"].shift(-1)
            ticker_data = ticker_data.bfill().ffill()
            ticker_data["y"] = next_adjclose

            data.append(ticker_data.dropna())
        except Exception as e:
            # Best effort: report the ticker that could not be assembled and
            # carry on with the rest.
            print(ticker, str(e))
            continue
finally:
    # Release both connections even if the loop is interrupted.
    sec.disconnect()
    market.disconnect()

 13%|████████████████▊                                                                                                                   | 64/503 [00:06<00:38, 11.29it/s]

BRK.B 'date'


 16%|████████████████████▋                                                                                                               | 79/503 [00:07<00:34, 12.19it/s]

BF.B 'date'


 27%|███████████████████████████████████▋                                                                                               | 137/503 [00:13<00:32, 11.37it/s]

CRWD 'date'


 43%|████████████████████████████████████████████████████████▎                                                                          | 216/503 [00:21<00:20, 13.94it/s]

GEV "['date'] not found in axis"


 45%|██████████████████████████████████████████████████████████▊                                                                        | 226/503 [00:22<00:24, 11.12it/s]

GDDY 'date'


 56%|█████████████████████████████████████████████████████████████████████████▋                                                         | 283/503 [00:27<00:17, 12.79it/s]

KKR 'date'


 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 423/503 [00:41<00:07, 11.32it/s]

SOLV "['date'] not found in axis"


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 503/503 [00:49<00:00, 10.14it/s]

ZBH 'date'
ZTS 'date'





In [6]:
# Stack the per-ticker frames, order them by (year, quarter), and attach each
# ticker's "GICS Sector" via a merge on "ticker" (default inner join).
training_data = (
    pd.concat(data)
    .sort_values(["year", "quarter"])
    .merge(sp500[["ticker", "GICS Sector"]], on="ticker")
)

In [7]:
# `fit_intercept` is a scikit-learn linear-model option, not an XGBoost
# parameter, so it is not passed here.
model = XGBRegressor()

# Train on complete rows from 2016 through 2023 (both bounds inclusive).
in_training_window = training_data["year"].between(2016, 2023)
model_data = training_data[in_training_window].dropna()
model.fit(model_data[factors], model_data["y"])

In [15]:
# Serialise the fitted model before opening the connection, so a pickling error
# never leaves a connection open.
model_record = pd.DataFrame([{"model": pickle.dumps(model), "date": datetime.now()}])

db.cloud_connect()
try:
    db.store("model", model_record)
finally:
    # Always release the connection, even if the store call raises.
    db.disconnect()