In [99]:
import numpy as np
import asyncio
import pandas as pd
import sqlite3
import json
import pickle
from datetime import datetime, timedelta, date
from collections import namedtuple, defaultdict
from typing import Tuple
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from typing import List, Tuple, Dict
from sklearn.compose import ColumnTransformer
import warnings
import plotly.express as px
import plotly.graph_objects as go
from xgboost import XGBRegressor
from copy import deepcopy
import os
import sys
import copy
import yfinance_ez as yf

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from finance_ml.variants.linear_model.main import main
from finance_ml.variants.linear_model.hyperparams import Hyperparams
from finance_ml.variants.linear_model.preprocessing import preprocess_data, preprocess_quarterly_data
from finance_ml.variants.linear_model.backtest import compute_performance
from finance_ml.utils.constants import (
    QuarterlyColumns as QC, StockPupColumns, STOCKPUP_TABLE_NAME, QUARTERLY_DB_FILE_PATH,
    YF_QUARTERLY_TABLE_NAME, INDEX_COLUMNS, MISSING_SECTOR, MISSING_INDUSTRY,
    STOCK_GENERAL_INFO_CSV, FORMULAE, Q_DELTA_PREFIX, YOY_DELTA_PREFIX,
    QUARTER, YEAR, VS_MKT_IDX, CATEGORICAL_COLUMNS, TICKER_SYMBOL, TARGET_COLUMN
)
from finance_ml.utils import QuarterlyIndex

warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_columns', None)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [97]:
hp = Hyperparams()

df, prediction_candidate_df = preprocess_data(hp)
_, market_index_df = preprocess_quarterly_data(hp)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Initial combined data size: (64205, 37)
Preprocessed quarterly df: (64205, 73)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [89]:
portfolio = await compute_performance(df=df,
                          start_date=date(2020,1,1),
                          hyperparams=hp,
                          end_date=None)


Fitting data for: QuarterlyIndex  Q4 2019
Filtering data between 4Q1999 and 4Q2019
QuarterFilter removed 11269 rows. Output size: (49790, 73)
Train Size: (39832, 72), Test Size: (9958, 72)
[1]	valid_0's l1: 28.6246	valid_0's l2: 4255.03
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l1: 28.4678	valid_0's l2: 4139.8
[3]	valid_0's l1: 28.3202	valid_0's l2: 4039
[4]	valid_0's l1: 28.2499	valid_0's l2: 4024.82
[5]	valid_0's l1: 28.168	valid_0's l2: 3984.09
[6]	valid_0's l1: 28.0979	valid_0's l2: 3932.1
[7]	valid_0's l1: 28.0357	valid_0's l2: 3936.97
[8]	valid_0's l1: 28.0055	valid_0's l2: 3927.71
[9]	valid_0's l1: 27.9536	valid_0's l2: 3906.58
[10]	valid_0's l1: 27.917	valid_0's l2: 3888.06
[11]	valid_0's l1: 27.8914	valid_0's l2: 3845.64
[12]	valid_0's l1: 27.8693	valid_0's l2: 3836.33
[13]	valid_0's l1: 27.843	valid_0's l2: 3829.07
[14]	valid_0's l1: 27.8223	valid_0's l2: 3820.51
[15]	valid_0's l1: 27.7919	valid_0's l2: 3814.45
[16]	valid_0's l1: 27.7801	valid

[1]	valid_0's l1: 28.5204	valid_0's l2: 4267.68
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l1: 28.4008	valid_0's l2: 4202.33
[3]	valid_0's l1: 28.2978	valid_0's l2: 4102.92
[4]	valid_0's l1: 28.1941	valid_0's l2: 4050.32
[5]	valid_0's l1: 28.1043	valid_0's l2: 3953.35
[6]	valid_0's l1: 28.027	valid_0's l2: 3896.23
[7]	valid_0's l1: 27.9444	valid_0's l2: 3839.8
[8]	valid_0's l1: 27.8962	valid_0's l2: 3792.44
[9]	valid_0's l1: 27.8814	valid_0's l2: 3742.79
[10]	valid_0's l1: 27.8511	valid_0's l2: 3722.34
[11]	valid_0's l1: 27.8046	valid_0's l2: 3681.78
[12]	valid_0's l1: 27.7988	valid_0's l2: 3698.41
[13]	valid_0's l1: 27.7953	valid_0's l2: 3678.54
[14]	valid_0's l1: 27.7587	valid_0's l2: 3669.01
[15]	valid_0's l1: 27.7587	valid_0's l2: 3636
[16]	valid_0's l1: 27.7653	valid_0's l2: 3639.52
[17]	valid_0's l1: 27.726	valid_0's l2: 3620.04
[18]	valid_0's l1: 27.6737	valid_0's l2: 3612.17
[19]	valid_0's l1: 27.6663	valid_0's l2: 3583.22
[20]	valid_0's l1: 27.64

[67]	valid_0's l1: 28.0932	valid_0's l2: 8656.63
[68]	valid_0's l1: 28.0771	valid_0's l2: 8643.31
[69]	valid_0's l1: 28.0666	valid_0's l2: 8663.1
[70]	valid_0's l1: 28.0593	valid_0's l2: 8653.55
[71]	valid_0's l1: 28.0403	valid_0's l2: 8653.71
[72]	valid_0's l1: 28.031	valid_0's l2: 8638.04
[73]	valid_0's l1: 28.0392	valid_0's l2: 8641.28
[74]	valid_0's l1: 28.0302	valid_0's l2: 8635.6
[75]	valid_0's l1: 28.0248	valid_0's l2: 8626.12
[76]	valid_0's l1: 28.0089	valid_0's l2: 8604.76
[77]	valid_0's l1: 28.0137	valid_0's l2: 8630.2
[78]	valid_0's l1: 28.0006	valid_0's l2: 8622.15
[79]	valid_0's l1: 27.9848	valid_0's l2: 8596.34
[80]	valid_0's l1: 27.9855	valid_0's l2: 8585.76
[81]	valid_0's l1: 27.9779	valid_0's l2: 8575.64
[82]	valid_0's l1: 27.9662	valid_0's l2: 8547.91
[83]	valid_0's l1: 27.9827	valid_0's l2: 8556.56
[84]	valid_0's l1: 27.9807	valid_0's l2: 8565.71
[85]	valid_0's l1: 27.969	valid_0's l2: 8562.32
[86]	valid_0's l1: 27.9613	valid_0's l2: 8573.25
[87]	valid_0's l1: 27.959

In [90]:
portfolio.performance

OrderedDict([(QuarterlyIndex  Q4 2019, 1000),
             (QuarterlyIndex  Q1 2020, 963.54),
             (QuarterlyIndex  Q2 2020, 766.94),
             (QuarterlyIndex  Q3 2020, 611.35),
             (QuarterlyIndex  Q4 2020, 647.91),
             (QuarterlyIndex  Q1 2021, 1119.35),
             (QuarterlyIndex  Q2 2021, 1673.4)])

In [143]:
starting_cash = 1000
q_indexes = [QuarterlyIndex("^DJI", q.quarter, q.year) for q in portfolio.performance]
dow_prices = [market_index_df.loc[idx.to_tuple()][QC.PRICE_AVG] for idx in q_indexes]
dow_prices = {q_idx.to_xQyyyy(): price / dow_prices[0] * starting_cash for q_idx, price in zip(q_indexes, dow_prices)}

In [144]:
dow_prices

{'4Q2019': 1000.0,
 '1Q2020': 1058.329094998045,
 '2Q2020': 865.053327819581,
 '3Q2020': 1013.4810336028744,
 '4Q2020': 1079.700461356067,
 '1Q2021': 1115.3784091930547,
 '2Q2021': 1208.4453686422391}

In [141]:
import seaborn as sns

data = pd.DataFrame([portfolio.performance, dow_prices], index=['Model', 'DOW'])
data_T = data.transpose().reset_index()
sns.lineplot(data=data_T)
data_T

TypeError: float() argument must be a string or a number, not 'QuarterlyIndex'