In [1]:
import pandas as pd
from database.market import Market
from database.strategy import Strategy
from database.sec import SEC
from modeler.modeler import Modeler as m
from datetime import datetime, timedelta, timezone
import numpy as np
import math
from tqdm import tqdm
import pickle
from sklearn.preprocessing import OneHotEncoder

In [2]:
sec = SEC()
market = Market()
strat = Strategy()

In [3]:
market.connect()
sp5 = market.retrieve("sp500")
prices = market.retrieve("prices")
market.disconnect()

In [4]:
prices["date"] = pd.to_datetime(prices["date"])
prices["year"] = [x.year for x in prices["date"]]
prices["quarter"] = [x.quarter for x in prices["date"]]

In [5]:
quarterly_grouped = prices.groupby(["year","quarter","ticker"]).mean()
quarterly_grouped["category"] = [math.ceil(x/100) * 100 for x in quarterly_grouped["adjClose"]]
quarterly_grouped["category"] = [1000 if x > 100 else x for x in quarterly_grouped["category"]]

In [6]:
quarterly_grouped.reset_index(inplace=True)
groups = quarterly_grouped.merge(sp5.rename(columns={"Symbol":"ticker"}),on="ticker",how="left")
g = groups[["year","quarter","ticker","adjClose","category","GICS Sector","CIK"]]

In [7]:
g["string_category"]  = [str(x) for x in g["category"]]
g["classification"] = g["string_category"] + g["GICS Sector"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  g["string_category"]  = [str(x) for x in g["category"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  g["classification"] = g["string_category"] + g["GICS Sector"]


In [8]:
numberss = len(g["classification"].unique())

In [9]:
enc = OneHotEncoder(handle_unknown="ignore")
transformed = [[x] for x in g["classification"]]
encoding = enc.fit_transform(transformed)
df_encoding = pd.DataFrame(encoding.toarray())
for col in df_encoding.columns:
    g[col] = df_encoding[col]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  g[col] = df_encoding[col]


In [10]:
yearly_gap = 1 
training_years = 1
fails = []
filings = []
columns = []
sec.connect()
for cik in tqdm(list(g["CIK"].unique())):
    try:
        filing = sec.retrieve_filing_data(cik)
        symbols = sp5[sp5["CIK"]==cik]["Symbol"]
        if symbols.index.size > 1:
            ticker = str(list(symbols)[0])
        else:
            ticker = symbols.item()
        funds = filing.copy()
        drop_columns = ["adsh","cik","_id"]
        for column in funds.columns:
            if str(column).islower() and str(column) != "filed" and str(column) not in ["year","quarter","ticker"]:
                drop_columns.append(column)
        funds["filed"] = [datetime.strptime(str(x),"%Y%m%d").replace(tzinfo=timezone.utc) if "-" not in str(x) else \
                         datetime.strptime(str(x).split(" ")[0],"%Y-%m-%d").replace(tzinfor=timezone.utc) for x in funds["filed"]]
        funds["quarter"] = [x.quarter for x in funds["filed"]]
        funds["year"] = [x.year + yearly_gap for x in funds["filed"]]
        funds["ticker"] = ticker
        funds.drop(drop_columns,axis=1,inplace=True,errors="ignore")
        qa = funds.copy()
        for col in qa.columns:
            test = qa[col].fillna(-99999)
            availability = 1 - (len([x for x in test if x == -99999]) / qa.index.size)
            if availability < 0.95:
                funds.drop(col,inplace=True,axis=1)
        filings.append(funds)
    except Exception as e:
        print("prep",ticker,str(e))
        fails.append([ticker,str(e)])
sec.disconnect()

  8%|███▏                                     | 38/491 [00:21<03:27,  2.19it/s]

prep BEN can only convert an array of size 1 to a Python scalar


 90%|███████████████████████████████████▊    | 440/491 [03:51<00:17,  2.88it/s]

prep FRC 'filed'


100%|████████████████████████████████████████| 491/491 [04:11<00:00,  1.95it/s]


In [11]:
try:
    f = pd.concat(filings)
    for col in tqdm(f.columns):
        test = f[col].fillna(-99999)
        availability = len([x for x in test != -99999 if x == True]) / test.index.size
        if availability < 0.7:
            f.drop(col,axis=1,inplace=True)
except Exception as e:
    print(str(e))

100%|██████████████████████████████████████| 2240/2240 [02:42<00:00, 13.77it/s]


In [12]:
g.columns

Index([           'year',         'quarter',          'ticker',
              'adjClose',        'category',     'GICS Sector',
                   'CIK', 'string_category',  'classification',
                       0,                 1,                 2,
                       3,                 4,                 5,
                       6,                 7,                 8,
                       9,                10,                11,
                      12,                13,                14,
                      15,                16,                17,
                      18,                19,                20,
                      21,                22],
      dtype='object')

In [13]:
try:
    data = f.merge(g.drop(["string_category","classification","adjClose","category","GICS Sector","CIK"],axis=1), \
                  on=["year","quarter","ticker"],how="left")
    factors = list(data.columns)
    factors = [x for x in factors if x not in ["year","quarter","ticker"]]
    for i in range(numberss):
        factors.remove(i)
    for col in factors:
        data[col].replace([np.inf,-np.inf,np.nan,np.NaN],f[col].mean(),inplace=True)
except Exception as e:
    print(str(e))

In [14]:
for col in data.columns:
    data.rename(columns= {col:str(col)},inplace=True)

In [15]:
# market.connect()
# data = market.retrieve("financial_categorization_data")
# market.disconnect()

In [16]:
data.drop(["_id","filed"],axis=1,inplace=True,errors="ignore")

In [17]:
factors

['AccumulatedOtherComprehensiveIncomeLossNetOfTax',
 'Assets',
 'AssetsCurrent',
 'CashAndCashEquivalentsAtCarryingValue',
 'EarningsPerShareBasic',
 'EarningsPerShareDiluted',
 'EntityCommonStockSharesOutstanding',
 'IncomeTaxExpenseBenefit',
 'LiabilitiesAndStockholdersEquity',
 'LiabilitiesCurrent',
 'NetIncomeLoss',
 'OtherAssetsNoncurrent',
 'RetainedEarningsAccumulatedDeficit',
 'StockholdersEquity',
 'filed',
 'CommonStockValue',
 'Goodwill',
 'PropertyPlantAndEquipmentNet']

In [18]:
year_range = range(2013,2021)
yearly_gap = 1
training_years = 1
for year in tqdm(year_range):
    try:
        training_data = data[(data["year"] < year) & (data["year"] >= year - yearly_gap)]
        factors = list(data.columns)
        factors = [x for x in factors if x not in ["year","quarter","ticker"]]
        for i in range(numberss):
            try:
                factors.remove(str(i))
            except:
                continue
        for col in factors:
            training_data[col].replace([np.inf,-np.inf,np.nan,np.NaN],training_data[col].mean(),inplace=True)
        training_data.dropna(inplace=True)
        x = training_data[factors]
        y = training_data[[str(x) for x in range(numberss)]]
        prediction_data = data[data["year"]==year]
        refined_data = {"X":x.reset_index(drop=True),"y":y.reset_index(drop=True)}
        models = m.xgb_classify(refined_data.copy(),multioutput=True)
        model = models["model"]
        for col in factors:
            prediction_data[col].replace([np.inf,-np.inf,np.nan,np.NaN],prediction_data[col].mean(),inplace=True)
        prediction_data.dropna(inplace=True)
        predictions = enc.inverse_transform(model.predict(prediction_data[factors]))
        prediction_data["prediction"] = [x[0] for x in predictions]
        prediction_data["score"] = models["score"].item()
        sim = prediction_data[["year","quarter","ticker","prediction","score"]]
#         models["model"] = [pickle.dumps(x) for x in models["model"]]
#         models["year"] = year
        strat.connect()
#         strat.store("stock_category_models",models)
        strat.store("predicted_stock_categories",sim)
        strat.disconnect()
    except Exception as e:
        print(year,str(e))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy 

In [19]:
sim

Unnamed: 0,year,quarter,ticker,prediction,score
41,2020,1,AAPL,100Information Technology,0.713992
42,2020,2,AAPL,100Information Technology,0.713992
43,2020,3,AAPL,100Information Technology,0.713992
44,2020,4,AAPL,100Information Technology,0.713992
81,2020,1,ABMD,1000Health Care,0.713992
...,...,...,...,...,...
21809,2020,3,AMCR,100Real Estate,0.713992
21810,2020,4,AMCR,100Real Estate,0.713992
21817,2020,2,CTVA,,0.713992
21818,2020,3,CTVA,,0.713992
