In [43]:
import numpy as np
import asyncio
import pandas as pd
import sqlite3
import json
import pickle
from datetime import datetime, timedelta, date
from collections import namedtuple, defaultdict
from typing import Tuple
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from typing import List, Tuple, Dict
from sklearn.compose import ColumnTransformer
import warnings
import plotly.express as px
import plotly.graph_objects as go
from xgboost import XGBRegressor
from copy import deepcopy
import os
import sys
import copy
import yfinance_ez as yf

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from finance_ml.variants.linear_model.main import main
from finance_ml.variants.linear_model.hyperparams import Hyperparams
from finance_ml.variants.linear_model.preprocessing import preprocess_data, preprocess_quarterly_data
from finance_ml.variants.linear_model.backtest import compute_performance
from finance_ml.utils.constants import (
    QuarterlyColumns as QC, StockPupColumns, STOCKPUP_TABLE_NAME, QUARTERLY_DB_FILE_PATH,
    YF_QUARTERLY_TABLE_NAME, INDEX_COLUMNS, MISSING_SECTOR, MISSING_INDUSTRY,
    STOCK_GENERAL_INFO_CSV, FORMULAE, Q_DELTA_PREFIX, YOY_DELTA_PREFIX,
    QUARTER, YEAR, VS_MKT_IDX, CATEGORICAL_COLUMNS, TICKER_SYMBOL
)

warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_columns', None)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
hp = Hyperparams()

df, prediction_candidate_df = preprocess_data(hp)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Initial combined data size: (64205, 37)
Preprocessed quarterly df: (64205, 75)


In [11]:
portfolio = await compute_performance(df=df,
                          start_date=date(2020,1,1),
                          hyperparams=hp,
                          end_date=None)


Fitting data for: QuarterlyIndex  Q4 2019
Filtering data between 4Q1999 and 4Q2019
QuarterFilter removed 11269 rows. Output size: (49790, 73)
Train Size: (39832, 72), Test Size: (9958, 72)
[1]	valid_0's l1: 28.6246	valid_0's l2: 4255.03
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l1: 28.4678	valid_0's l2: 4139.8
[3]	valid_0's l1: 28.3202	valid_0's l2: 4039
[4]	valid_0's l1: 28.2499	valid_0's l2: 4024.82
[5]	valid_0's l1: 28.168	valid_0's l2: 3984.09
[6]	valid_0's l1: 28.0979	valid_0's l2: 3932.1
[7]	valid_0's l1: 28.0357	valid_0's l2: 3936.97
[8]	valid_0's l1: 28.0055	valid_0's l2: 3927.71
[9]	valid_0's l1: 27.9536	valid_0's l2: 3906.58
[10]	valid_0's l1: 27.917	valid_0's l2: 3888.06
[11]	valid_0's l1: 27.8914	valid_0's l2: 3845.64
[12]	valid_0's l1: 27.8693	valid_0's l2: 3836.33
[13]	valid_0's l1: 27.843	valid_0's l2: 3829.07
[14]	valid_0's l1: 27.8223	valid_0's l2: 3820.51
[15]	valid_0's l1: 27.7919	valid_0's l2: 3814.45
[16]	valid_0's l1: 27.7801	valid

[34]	valid_0's l1: 27.5221	valid_0's l2: 3446.52
[35]	valid_0's l1: 27.5131	valid_0's l2: 3442.04
[36]	valid_0's l1: 27.4987	valid_0's l2: 3447.77
[37]	valid_0's l1: 27.4746	valid_0's l2: 3395.45
[38]	valid_0's l1: 27.4641	valid_0's l2: 3371.93
[39]	valid_0's l1: 27.4672	valid_0's l2: 3359.1
[40]	valid_0's l1: 27.4452	valid_0's l2: 3353.58
[41]	valid_0's l1: 27.4488	valid_0's l2: 3347.06
[42]	valid_0's l1: 27.4742	valid_0's l2: 3346.49
[43]	valid_0's l1: 27.472	valid_0's l2: 3352.98
[44]	valid_0's l1: 27.498	valid_0's l2: 3358.69
[45]	valid_0's l1: 27.4753	valid_0's l2: 3339.91
Early stopping, best iteration is:
[40]	valid_0's l1: 27.4452	valid_0's l2: 3353.58
Filtering data between 3Q2020 and 4Q2020
QuarterFilter removed 60978 rows. Output size: (81, 72)
Failed to find data for QuarterlyIndex ENDP Q4 2020
Failed to find data for QuarterlyIndex GCI Q4 2020
Failed to find data for QuarterlyIndex LIFE Q4 2020
Failed to find data for QuarterlyIndex SIG Q4 2020
Failed to find data for Quar

ValueError: Input data must be 2 dimensional and non empty.

In [37]:
idx = pd.IndexSlice
df.loc[idx['AAPL', :, 2020]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PriceAvg,Ebit,TotalStockholderEquity,DividendPerShare,Industry,Volatility,AssetsToLiabilitiesRatio,AvgPriceToEarningsRatio,DebtToEquityRatio,ReturnOnEquity,PriceToBookRatio,ProfitMargin,RnDtoRevenue,CashToRevenue,ExpensesToRevenue,Q_Delta_PriceAvg,Q_Delta_PriceHigh,Q_Delta_PriceLow,Q_Delta_PriceEoQ,Q_Delta_Ebit,Q_Delta_GrossProfit,Q_Delta_TotalRevenue,Q_Delta_ResearchDevelopment,Q_Delta_TotalOperatingExpenses,Q_Delta_IncomeBeforeTax,Q_Delta_IncomeTaxExpense,Q_Delta_OperatingIncome,Q_Delta_NetIncome,Q_Delta_DividendsPaid,Q_Delta_RepurchaseOfStock,Q_Delta_Depreciation,Q_Delta_IssuanceOfStock,Q_Delta_Cash,Q_Delta_CommonStock,Q_Delta_TotalAssets,Q_Delta_TotalLiab,Q_Delta_LongTermDebt,Q_Delta_ShortLongTermDebt,Q_Delta_TotalStockholderEquity,Q_Delta_Volume,Q_Delta_Earnings,Q_Delta_RnDtoRevenue,Q_Delta_CashToRevenue,YOY_Delta_PriceAvg,YOY_Delta_PriceHigh,YOY_Delta_PriceLow,YOY_Delta_PriceEoQ,YOY_Delta_Ebit,YOY_Delta_GrossProfit,YOY_Delta_TotalRevenue,YOY_Delta_ResearchDevelopment,YOY_Delta_TotalOperatingExpenses,YOY_Delta_IncomeBeforeTax,YOY_Delta_IncomeTaxExpense,YOY_Delta_OperatingIncome,YOY_Delta_NetIncome,YOY_Delta_DividendsPaid,YOY_Delta_RepurchaseOfStock,YOY_Delta_Depreciation,YOY_Delta_IssuanceOfStock,YOY_Delta_Cash,YOY_Delta_CommonStock,YOY_Delta_TotalAssets,YOY_Delta_TotalLiab,YOY_Delta_LongTermDebt,YOY_Delta_ShortLongTermDebt,YOY_Delta_TotalStockholderEquity,YOY_Delta_Volume,YOY_Delta_Earnings,YOY_Delta_RnDtoRevenue,YOY_Delta_CashToRevenue,Q_Delta_PriceAvg_vs_^DJI,YOY_Delta_PriceAvg_vs_^DJI,Volatility_vs_^DJI,PredictedAppreciation
TickerSymbol,Quarter,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1
AAPL,1,2020,73.568,12853000000.0,78425000000.0,0.048125,Consumer Electronics,0.348521,1.324104,7.324572e-09,1.268448,0.171106,0.050048,0.192907,0.078284,0.688937,0.779586,0.15823,3.525309,0.030203,-0.198592,-0.497321,-0.364795,-0.364914,0.025612,-0.313811,-0.493209,-0.487778,-0.497321,-0.494109,-0.046341,-0.150509,-0.010653,213.0,0.010133,0.04481,-0.059357,-0.03629,-0.042889,0.016432,-0.124046,0.396201,-0.266111,0.614918,0.590544,0.732642,0.650665,0.575775,,-6.672109,,600.164948,,,,,0.462554,-0.026987,-5.019311,,,,0.008966,40.757198,-0.063152,0.024719,-0.012361,inf,-0.259163,,-0.131217,,-0.998322,2.712714,4.353237,4.03998,75.16
AAPL,2,2020,76.4068,13091000000.0,72282000000.0,0.05125,Consumer Electronics,0.412398,1.294954,5.582844e-09,1.405011,0.239921,0.036781,0.18854,0.079719,0.55932,0.780665,0.038587,0.1212,0.071818,0.527632,0.018517,0.013858,0.023528,0.042278,0.024945,0.000152,-0.00106,0.018517,0.000356,0.083259,-0.064069,-0.012204,0.0,-0.16904,0.013824,-0.009538,0.012758,0.055699,-0.277425,-0.07833,0.005093,0.362605,0.018319,-0.188141,0.589722,0.755193,0.396862,0.804745,0.134009,0.121274,0.109201,0.117689,0.102425,0.10293,0.009106,0.134009,0.12037,0.00744,-0.032775,-0.061712,427.0,-0.339343,0.122778,-0.015191,0.085387,0.107281,-0.44497,-0.250622,0.649412,0.216641,0.007652,-0.404385,-0.211295,-5.912096,1.135617,68.63
AAPL,3,2020,107.487619,14775000000.0,65339000000.0,0.205,Consumer Electronics,0.405721,1.252714,8.481624e-09,1.510078,0.247693,0.027661,0.195879,0.076942,0.587592,0.771631,0.406781,0.457567,0.496226,0.264172,0.128638,0.08858,0.083991,0.046238,0.071447,0.134277,0.18259,0.128638,0.126189,-0.039661,-0.9374,-0.018169,-0.934287,0.138783,-0.934827,0.020621,0.055035,0.049113,-1.0,-0.096054,1.922805,-0.074017,-0.034828,0.050547,1.078421,1.404437,0.878153,1.055837,-0.0544,0.015465,0.010275,0.211192,0.031147,-0.076022,-0.087259,-0.0544,-0.074017,0.009198,-0.93767,-0.150047,-0.927885,-0.221685,-0.929745,-0.043212,0.042419,0.074722,-1.0,-0.277926,4.915959,-0.430068,0.198874,-0.229601,2.370764,27.561837,3.277298,
AAPL,4,2020,118.476508,33534000000.0,66224000000.0,0.205,Consumer Electronics,0.196495,1.23008,4.120205e-09,1.499169,0.488765,0.028035,0.258034,0.04633,0.323136,0.699082,0.102234,-0.014776,0.204659,0.177393,1.269645,0.795455,0.722449,0.037164,0.560503,1.253473,1.165171,1.269645,1.268997,0.029052,0.571388,-0.013323,0.246667,-0.052767,0.019004,0.093137,0.113251,0.006223,,0.013545,-0.633754,1.268997,-0.397855,-0.450066,0.865255,6.286091,0.990239,0.822218,0.31151,0.25871,0.213681,0.159964,0.175925,0.295586,0.310158,0.31151,0.293173,0.02091,-0.921791,-0.053267,16.53125,-0.094566,-0.929653,0.039446,0.146336,0.066643,-1.0,-0.260323,0.502197,1.101052,-0.04426,-0.253977,1.56468,10.856334,1.398268,


In [44]:
def add_target_column(df: pd.DataFrame, hyperparams: Hyperparams) -> pd.DataFrame:
    """Adds a target column of predicted appreciation"""
    tickers = set(df.index.levels[TICKER_SYMBOL])

    new_df = pd.DataFrame()
    for ticker in tickers:
        ticker_df = df[df.index.isin([ticker], level=TICKER_SYMBOL)][[
            QC.PRICE_AVG, QC.DIVIDEND_PER_SHARE]]
        ticker_df = ticker_df.sort_index(level=YEAR, ascending=True)

        ticker_df["FuturePrice"] = ticker_df[QC.PRICE_AVG].shift(
            periods=-hyperparams.N_QUARTERS_OUT_TO_PREDICT)

        if hyperparams.INCLUDE_DIVIDENDS_IN_PREDICTED_PRICE:
            ticker_df["TotalDividendsPaid"] = df[QC.DIVIDEND_PER_SHARE].rolling(
                min_periods=1, window=hyperparams.N_QUARTERS_OUT_TO_PREDICT).sum().shift(
                1 - hyperparams.N_QUARTERS_OUT_TO_PREDICT).fillna(0)

            ticker_df["FuturePrice"] = ticker_df["FuturePrice"] + ticker_df["TotalDividendsPaid"]

        # Target column is predicted appreciation (as a percentage. 2.0 = 2% increase)
        ticker_df[TARGET_COLUMN] = round(
            100.0 * (ticker_df["FuturePrice"] - ticker_df[QC.PRICE_AVG]
                     ) / ticker_df[QC.PRICE_AVG], 2)
        new_df = new_df.append(ticker_df[[TARGET_COLUMN]])

    output = df.join(new_df)
    return output

df_aapl = df.loc[idx['AAPL', :, :]]
df_aapl.drop(columns=['PredictedAppreciation'], inplace=True)
add_target_column(df_aapl, hp)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



NameError: name 'QC' is not defined