.Rhistory

model_type <- errors$model[errors$model != "naive_average"][which.min(errors$mean_error[errors$model != "naive_average"])]
use_model <- case_when(errors$model[which.min(errors$mean_error)] == "naive_average" ~ F,
T ~ T)
varlist <- varlist[str_detect(model_type,varlist)]
#Retrain the optimal model on the full training data set
models <- map(timeframes_to_optimize,function(t){
lm(yieldspread,formula = paste0("return_",t,"_yr ~ ",model_type))
})
#Forecast returns on test set and convert to tidy data frame
test_set <- tidify(test_set)
#Choose focus variable for charting, then create chart
focus_var <- varlist[1]
chartit(df = test_set,insample=F)
#See if our model forecast performed better than a naive average
compare_errors(df = test_set) %>% kable(caption = paste0("Comparing model forecast to naive forecast ",global_optimizer[[2]]))
# use_model <- case_when(compare_errors(df = test_set)[nrow(compare_errors(df = test_set)),2] >= compare_errors(df = test_set)[nrow(compare_errors(df = test_set)),3]~F,
#                        T ~ use_model)
#Calculate model standard deviations
sds <- get_sds(df=test_set,use_model=use_model)
#Retrain yield spread model on full data set (without partitioning), chart
#historical forecast against historical data, and forecast returns and sharpe
#ratio for the current date
#Filter dataset to keep only the data we're interested in
yieldspread <- yieldspread_full %>%
filter(symbol == ticker_to_study) %>%
mutate(em_ig_discount = us_hy_yield - em_ig_yield) %>%
full_join(yieldspread_full %>%
filter(symbol %in% c(ticker_to_study,"VOO")) %>%
select(date,symbol,div_yield) %>%
spread(symbol,div_yield) %>%
mutate(div_premium = IEMG - VOO) %>%
mutate(div_premium = (div_premium-min(div_premium,na.rm=T))/(max(div_premium,na.rm=T)-min(div_premium,na.rm=T))+1) %>%
select(date,div_premium)) %>%
.[c("date","symbol",
paste0("return_",timeframes_to_optimize,"_yr"),
varlist)]
#Retrain the optimal model on the full training data set
models <- map(timeframes_to_optimize,function(t){
lm(yieldspread,formula = paste0("return_",t,"_yr ~ ",model_type))
})
#Forecast returns on test set and convert to tidy data frame
yieldspread <- tidify(yieldspread)
#Choose focus variable for charting, then create chart
focus_var <- varlist[1]
chartit(df = yieldspread,insample=T)
#If sds are higher than when we plotted out of sample, use the higher value
sds <- sds %>%
mutate(comparer = get_sds(df=yieldspread,print=F,use_model=use_model)$Model) %>%
mutate(Model = case_when(Model > comparer~Model,
T~comparer)) %>%
select(-comparer)
#Based on forecast returns, standard deviations, and risk-free rates, calculate
#expected sharpe ratios
sharpes <- bind_rows(sharpes,get_sharpes())
sharpes
rm(sds)
# Analyze VGK Returns ----------------------------------------------------
#Define a ticker and a list of variables to study
ticker_to_study <- "VGK"
varlist <- c("eur_spread","dxy","div_yield","raw_materials","industrial_materials",
"brent_crude","usd_eur","nat_gas","eur_hy_yield","ecb_bs",
"eur_10yr_yield","eur_cli","eur_cpi_12mo","eur_cpi_3mo"
# ,"change_in_ecb_bs","change_in_usd_eur","change_in_dxy",
# "change_in_industrial_materials","change_in_raw_materials",
# "change_in_brent_crude","change_in_nat_gas"
)
#Filter dataset to keep only the data we're interested in
yieldspread <- yieldspread_full %>%
filter(symbol == ticker_to_study) %>%
.[c("date","symbol",
paste0("return_",timeframes_to_optimize,"_yr"),
varlist)]
#Partition time series for training, testing, and validation
test_set <- yieldspread[yieldspread$date >= max(yieldspread$date) - years(4),]
yieldspread <- yieldspread[yieldspread$date < max(yieldspread$date) - years(4),]
#Calculate correlation coefficients between variables of interest and next-period return
cors <- corchecker(df = yieldspread,
ticker = ticker_to_study,
vars_to_test = varlist)
#Print correlation coefficients in readable format
cors %>%
knitr::kable(caption = paste("Correlation coefficients for various variables and next-period",ticker_to_study,"returns"))
# #Winnow list of variables to study, keeping only the top 5
# varlist <- row.names(cors[1:5,])
#Delete correlation coefficients table from workspace
rm(cors)
#Call function to use five-fold cross-validation to compare error rates on linear
#models that combine the selected variables
errors <- cross_validator(varlist)
#Choose model that minimizes error on average
errors <- errors %>%
group_by(model) %>%
summarize(mean_error = mean(error_measure)) %>%
arrange(mean_error)
errors %>% knitr::kable(caption = paste("Mean",global_optimizer[[2]],"by model"))
model_type <- errors$model[errors$model != "naive_average"][which.min(errors$mean_error[errors$model != "naive_average"])]
use_model <- case_when(errors$model[which.min(errors$mean_error)] == "naive_average" ~ F,
T ~ T)
varlist <- varlist[str_detect(model_type,varlist)]
#Retrain the optimal model on the full training data set
models <- map(timeframes_to_optimize,function(t){
lm(yieldspread,formula = paste0("return_",t,"_yr ~ ",model_type))
})
#Forecast returns on test set and convert to tidy data frame
test_set <- tidify(test_set)
#Choose focus variable for charting, then create chart
focus_var <- varlist[1]
chartit(df = test_set,insample=F)
#See if our model forecast performed better than a naive average
compare_errors(df = test_set) %>% kable(caption = paste0("Comparing model forecast to naive forecast ",global_optimizer[[2]]))
# use_model <- case_when(compare_errors(df = test_set)[nrow(compare_errors(df = test_set)),2] >= compare_errors(df = test_set)[nrow(compare_errors(df = test_set)),3]~F,
#                        T ~ use_model)
#Calculate model standard deviations
sds <- get_sds(df=test_set,use_model=use_model)
#Retrain yield spread model on full data set (without partitioning), chart
#historical forecast against historical data, and forecast returns and sharpe
#ratio for the current date
#Filter dataset to keep only the data we're interested in
yieldspread <- yieldspread_full %>%
filter(symbol == ticker_to_study) %>%
.[c("date","symbol",
paste0("return_",timeframes_to_optimize,"_yr"),
varlist)]
#Retrain the optimal model on the full training data set
models <- map(timeframes_to_optimize,function(t){
lm(yieldspread,formula = paste0("return_",t,"_yr ~ ",model_type))
})
#Forecast returns on test set and convert to tidy data frame
yieldspread <- tidify(yieldspread)
#Choose focus variable for charting, then create chart
focus_var <- varlist[1]
chartit(df = yieldspread,insample=T)
#If sds are higher than when we plotted out of sample, use the higher value
sds <- sds %>%
mutate(comparer = get_sds(df=yieldspread,print=F,use_model=use_model)$Model) %>%
mutate(Model = case_when(Model > comparer~Model,
T~comparer)) %>%
select(-comparer)
#Based on forecast returns, standard deviations, and risk-free rates, calculate
#expected sharpe ratios
sharpes <- bind_rows(sharpes,get_sharpes())
sharpes
rm(sds)
# Cross-Asset Comparison -----------------------------------------------
#For comparison purposes, let's focus on one-year forecasts
sharpes <- sharpes %>%
filter(Timeframe == "1-year forward return")
sharpes %>%
mutate(`Asset class` = case_when(Ticker=="EMHY"~"Med-Term EM HY Bonds",
Ticker=="GOVT"~"Med-Term US Govt Bonds",
Ticker=="TLT"~"Long-Term US Govt Bonds",
Ticker=="IEMG"~"EM Stocks",
Ticker=="VCLT"~"Long-Term US Corp Bonds",
Ticker=="VGK"~"EU Stocks",
Ticker=="VIOO"~"US Small Caps",
Ticker=="VOO"~"US Stocks",
Ticker=="EMHY"~"EM HY Bonds",
Ticker=="VWOB"~"EM Govt Bonds",
Ticker=="BSV"~"Short-Term US Govt Bonds",
Ticker=="SPAXX"~"Money-Market")) %>%
select(`Asset class`,Ticker,Model,`Expected return`,`Expected standard deviation`,`Expected Sharpe`) %>%
arrange(desc(`Expected Sharpe`)) %>%
kable(caption = "Sharpe ratios, 1-year timeframe")
#Plot forecasted one-year returns and standard deviations
sharpes %>%
mutate(`Index ETF` = case_when(Ticker=="EMHY"~"EMHY\nMed-Term\nEM HY\nBonds",
Ticker=="GOVT"~"GOVT\nMed-Term\nUS Govt\nBonds",
Ticker=="TLT"~"TLT\nLong-Term\nUS Govt\nBonds",
Ticker=="IEMG"~"IEMG\nEM Stocks",
Ticker=="VCLT"~"VCLT\nLong-Term\nUS Corp\nBonds",
Ticker=="VGK"~"VGK\nEU Stocks",
Ticker=="VOO"~"VOO\nUS Stocks",
Ticker=="VIOO"~"VIOO\nUS Small Caps",
Ticker=="EMHY"~"EMHY\nMed-Term\nEM HY\nBonds",
Ticker=="VWOB"~"VWOB\nMed-Term\nEM Govt\nBonds",
Ticker=="BSV"~"BSV\nShort-Term\nUS Govt\nBonds",
Ticker=="SPAXX"~"Money-\nMarket")) %>%
mutate(`Index ETF` = fct_reorder(`Index ETF`,`Expected return`,.desc=T)) %>%
ggplot(aes(x=`Index ETF`,y=`Expected return`)) +
geom_col(fill="lightblue") +
geom_errorbar(aes(ymin=`Expected return`-`Expected standard deviation`,ymax=`Expected return`+`Expected standard deviation`)) +
scale_y_continuous(labels = percent) +
ggtitle("Forecasted 1-year returns and standard deviations for select stock\nand bond index ETFs") +
labs(caption = "Copyright 2022 Christopher C. Smith") +
#scale_x_discrete(guide = guide_axis(angle = 45)) +
theme_minimal()
#save the plot
ggsave(filename = "multi-asset-model.jpg",
scale = 1,
width = 1920/300,
height = 1080/300,
units = "in")
# Development Notes -------------------------------------------------------
# Near-term development ideas:
# FRED download sometimes fails randomly for different variables.
# Add a check-and-redownload step.
# Try adding unemployment as an additional variable. Also look at US Census
# Household Survey for possible variables
# cur_data() has been deprecated; look into replacing with pick()
# Quite a few FRED series update on a lag from the actual release (e.g., commodities
# prices). Try to figure out how to scrape the actual release to get a faster/
# fresher signal.
# https://datahelp.imf.org/knowledgebase/articles/1968408-how-to-use-the-api-python-and-r
# Maybe just remove the VGK special dividend?
# Europe natural gas data series lags by 1-2 months, unfortunately. Either add a month
# to all dates, use Henry Hub spot price from FRED instead, or find a series of
# TTF gas spot prices that I can scrape with rvest and convert to US dollars
# Since Yahoo! Finance dividend data starts at 2013, go back to fetching from Nasdaq.com
# Try out the spreads between various dividend/bond yields?
# See if I can get data on dividend yield futures?
# I need to handle tuning the change variables at the same time that I do
# cross-validation for model building, so as to avoid overtraining. First train
# these variables, then train bivariate models. Maybe take the top few bivariate
# models and try multivariate models with those variables?
# My change variables all perform very poorly, which makes me a little suspicious
# of how they're being calculated. Make sure they're being calculated correctly.
# Explore using P/E or earnings yield rather than div yield to predict index returns?
# (Dividend yield is problematic because it's a poor proxy for shareholder yield)
# Can I find a higher-frequency P/E series than multpl.com? Can I find series for
# Europe & EM? Earnings yield preferable to P/E.
# Try reverse repo, maybe with a stochastic transformation
# Future development ideas:
# Try optimizing a rebalance model, maybe using decision trees?
# Would be interesting to see how different US sectors react to yields and yield spreads
#Keep only assets with an expected Sharpe ratio greater than 1
#Calculate portfolio allocations (medium risk) as a proportion of Sharpes > 1
sharpes %>%
filter(`Expected Sharpe` >= 1,
) %>%
mutate(`Hacked Sharpe` = `Expected return` - `Expected standard deviation`) %>%
mutate(`Allocation1` = (`Hacked Sharpe`)/(sum(`Hacked Sharpe`))) %>%
mutate(`Allocation2` = (`Expected Sharpe`-1)/(sum(`Expected Sharpe`-1))) %>%
mutate(`Allocation` = (1.75*`Allocation1`+0.25*`Allocation2`)/2) %>%
select(-`Allocation1`,-`Allocation2`) %>%
select(-Timeframe) %>%
mutate(`Expected return` = scales::percent(`Expected return`,accuracy=0.1),
`Expected standard deviation` = scales::percent(`Expected standard deviation`,accuracy=0.1),
Allocation = scales::percent(Allocation,accuracy=0.1)) %>%
arrange(Ticker)
#Create treemap of medium-risk allocation
sharpes %>%
filter(`Expected Sharpe` >= 1,
) %>%
mutate(`Ticker` = case_when(Ticker=="EMHY"~"Medium-Term\nEM HY\nBonds",
Ticker=="GOVT"~"Medium-Term\nUS Govt\nBonds",
Ticker=="TLT"~"Long-Term\nUS Govt\nBonds",
Ticker=="IEMG"~"EM Stocks",
Ticker=="VCLT"~"Long-Term\nUS Corp\nBonds",
Ticker=="VGK"~"EU Stocks",
Ticker=="VOO"~"US Stocks",
Ticker=="EMHY"~"Medium-Term\nEM HY\nBonds",
Ticker=="VWOB"~"Medium-Term\nEM Govt\nBonds",
Ticker=="BSV"~"Short-Term\nUS Govt\nBonds",
Ticker=="SPAXX"~"Money-\nMarket")) %>%
mutate(`Hacked Sharpe` = `Expected return` - `Expected standard deviation`) %>%
mutate(`Allocation1` = (`Hacked Sharpe`)/(sum(`Hacked Sharpe`))) %>%
mutate(`Allocation2` = (`Expected Sharpe`-1)/(sum(`Expected Sharpe`-1))) %>%
mutate(`Allocation` = (1.75*`Allocation1`+0.25*`Allocation2`)/2) %>%
select(-`Allocation1`,-`Allocation2`) %>%
select(-Timeframe) %>%
ggplot(aes(area = Allocation,fill=Ticker,label=paste0(Ticker,"\n",scales::percent(Allocation)))) +
treemapify::geom_treemap() +
treemapify::geom_treemap_text() +
theme(legend.position = "none")
#Calculate total expected portfolio return given this allocation (medium risk)
sharpes %>%
filter(`Expected Sharpe` >= 1,
) %>%
mutate(`Hacked Sharpe` = `Expected return` - `Expected standard deviation`) %>%
mutate(`Allocation1` = (`Hacked Sharpe`)/(sum(`Hacked Sharpe`))) %>%
mutate(`Allocation2` = (`Expected Sharpe`-1)/(sum(`Expected Sharpe`-1))) %>%
mutate(`Allocation` = (1.75*`Allocation1`+0.25*`Allocation2`)/2) %>%
summarize(`Expected portfolio return` = scales::percent(sum(`Expected return`*Allocation)))
#Calculate how much I need to buy/sell for rebalancing purposes (medium risk)
sharpes %>%
filter(`Expected Sharpe` >= 1,
) %>%
mutate(`Hacked Sharpe` = `Expected return` - `Expected standard deviation`) %>%
mutate(`Allocation1` = (`Hacked Sharpe`)/(sum(`Hacked Sharpe`))) %>%
mutate(`Allocation2` = (`Expected Sharpe`-1)/(sum(`Expected Sharpe`-1))) %>%
mutate(`Allocation` = (1.75*`Allocation1`+0.25*`Allocation2`)/2) %>%
select(-`Allocation1`,-`Allocation2`) %>%
mutate(Ticker = case_when(Ticker == "SPAXX"~".SPAXX",
T~Ticker)) %>%
arrange(Ticker) %>%
mutate(cash = Allocation*(9963)) %>%
mutate(change=cash-c(1621,451,1526,0,0,561,1124)) %>%
select(Ticker,cash,change)
#Calculate how much I need to buy/sell for rebalancing purposes (high risk)
sharpes %>%
filter(`Expected Sharpe` >= 1) %>%
slice_max(`Expected return`,n=3) %>%
mutate(`Allocation` = (`Expected return`)/(sum(`Expected return`))) %>%
mutate(Ticker = case_when(Ticker == "SPAXX"~".SPAXX",
T~Ticker)) %>%
arrange(Ticker) %>%
mutate(cash = Allocation*(4325)) %>%
mutate(change=cash-c(sum(492,465,898),0,1085)) %>%
select(Ticker,cash,change)
#Calculate how much I need to buy/sell for rebalancing purposes (high risk)
sharpes %>%
filter(`Expected Sharpe` >= 1) %>%
slice_max(`Expected return`,n=3) %>%
mutate(`Allocation` = (`Expected return`)/(sum(`Expected return`))) %>%
mutate(Ticker = case_when(Ticker == "SPAXX"~".SPAXX",
T~Ticker)) %>%
arrange(Ticker) %>%
mutate(cash = Allocation*(4434)) %>%
mutate(change=cash-c(0,1125,0)) %>%
select(Ticker,cash,change)
library(devtools)
create("multi_asset_model")
create("asset-forecaster")
create("assetforecaster")
wd()
install.packages("gitcreds")
library(gitcreds)
gitcreds_set()
#Script to predict 10-year yields from PCE inflation rate, factoring in how
#10-year yields have fallen over time
#NOTE: The script will only work if you have a valid FRED API key in .Renviron!
#The easiest way to edit .Renviron is by calling usethis::edit_r_environ().
#Then add the line "FRED_API_KEY = yourAPIkeyhere", without quotes, and save file.
#You'll also want to set img_save_location in either the console or .Rprofile.
#The easiest way to edit .Rprofile is by calling file.edit("~/.Rprofile").
#Load libraries
library(tidyverse)
library(lubridate)
library(fredr)
library(rvest)
#Set FRED API key from .Renviron file
readRenviron("~/.Renviron")
#Get FRED series for 10-year yields using fredr()
#Calculate average yield for each month
tenyr_df <- fredr("DGS10")
tenyr_df <- tenyr_df %>%
select(date,value) %>%
filter(!is.na(value)) %>%
mutate(year = year(date),month = month(date)) %>%
group_by(year,month) %>%
summarize(mean_yield = mean(value,na.rm=T))
#Get FRED series for PCE inflation rate using fredr()
#Since PCE inflation is reported with a two-month lag, add two months to all dates
pce_df <- fredr("PCETRIM12M159SFRBDAL")
pce_df <- pce_df %>%
mutate(twelve_month_pce = value,
date=date %m+% months(2)) %>%
filter(!is.na(twelve_month_pce)) %>%
select(date,twelve_month_pce) %>%
mutate(year = year(date),month = month(date))
#Join the two tables
joined_df <- inner_join(tenyr_df,pce_df) %>%
ungroup() %>%
mutate(date = as.Date(paste(as.character(year),as.character(month),"1",sep="/"),format="%Y/%m/%d"),
sequence = row_number())
#Predict 10-year yield based on this simple log regression model using PCE rate
fit <- joined_df %>% lm(mean_yield ~ twelve_month_pce, data = .)
initial_predicted_yield <- predict(fit,newdata=data.frame(twelve_month_pce = last(joined_df$twelve_month_pce)))
#Plot PCE rate against 10-year yield, with date shown as color
joined_df %>%
ggplot(aes(x=twelve_month_pce,y=mean_yield,col=date)) +
geom_point() +
geom_smooth(method = "lm",formula = y~log(x)) +
geom_point(data=data.frame(twelve_month_pce = last(joined_df$twelve_month_pce),mean_yield = last(joined_df$mean_yield),date = last(joined_df$date)),color="red") +
scale_x_continuous(breaks = seq(0,10,by = .5)) +
scale_y_continuous(limits=c(0,16),breaks = seq(0,16.25,by = 1.25)) +
labs(x="TTM PCE Inflation Rate",y="Monthly Average 10-Year Yield",title="10-Year Yield by PCE Inflation Rate")
#Plot yield vs. date
joined_df %>%
mutate(twelve_month_pce_strat = factor(round(twelve_month_pce))) %>%
ggplot(aes(x=date,y=mean_yield)) +
geom_point() +
geom_smooth(method="lm",formula=y~log(x)) +
labs(x="Date",y="Monthly Average 10-Year Yield",title="10-Year Yield over Time")
#Stratify by PCE inflation rate and plot yield vs. date
joined_df %>%
mutate(twelve_month_pce_strat = factor(round(twelve_month_pce))) %>%
group_by(twelve_month_pce_strat) %>%
mutate(n=n()) %>%
filter(n>20) %>%
ggplot(aes(x=date,y=mean_yield)) +
geom_point() +
geom_smooth(method="lm",formula=y~log(x)) +
facet_wrap( ~ twelve_month_pce_strat) +
labs(x="Date",y="Monthly Average 10-Year Yield",title="10-Year Yield over Time, Stratified by PCE Inflation Rate")
#Subtract PCE inflation model from the data and regress for time effect
joined_df %>%
mutate(pce_model = predict(lm(mean_yield ~ log(twelve_month_pce), data = .)),
mean_yield_residual = mean_yield - pce_model) %>%
ggplot(aes(x=date,y=mean_yield_residual)) +
geom_point() +
geom_smooth(method="lm",formula=y~x) +
labs(x="Date",y="Residual Monthly Average 10-Year Yield",title="Residual 10-Year Yield over Time after Subtracting PCE Inflation-Rate\nLog Regression Model")
#Partition data for 5-fold cross-validation
library(caret)
set.seed(1)
test_index <- createFolds(joined_df$mean_yield,k=5)
#Define function to get root mean squared error of predictions
RMSE <- function(true_ratings, predicted_ratings){
sqrt(mean((true_ratings - predicted_ratings)^2))
}
#Perform five-fold cross-validation and calculate RMSEs
rmses <- map_dfr(1:5,function(x){
#Partition data
train_set <- joined_df[-test_index[[x]],]
test_set <- joined_df[test_index[[x]],]
#Do a multiple regression, accounting for both PCE inflation and date
fit <- train_set %>% lm(mean_yield ~ sequence + twelve_month_pce, data = .)
fit1 <- train_set %>% lm(mean_yield ~ sequence + log(twelve_month_pce), data = .)
fit2 <- train_set %>% lm(mean_yield ~ log(sequence) + twelve_month_pce, data = .)
fit3 <- train_set %>% lm(mean_yield ~ log(sequence) + log(twelve_month_pce), data = .)
#Choose the model that minimizes RMSE
return(data.frame(Model=c("Double linear","Linear date, log PCE","Log date, linear PCE","Double log"),
RMSE = c(RMSE(test_set$mean_yield,predict(fit,test_set)),
RMSE(test_set$mean_yield,predict(fit1,test_set)),
RMSE(test_set$mean_yield,predict(fit2,test_set)),
RMSE(test_set$mean_yield,predict(fit3,test_set))),
Trial = rep(x,times=4)))
})
#Choose model that minimizes RMSE on average
rmses %>% group_by(Model) %>% summarize(Mean_RMSE = mean(RMSE)) %>% knitr::kable()
#Retrain the winning model on the whole data set
fit <- joined_df %>% lm(mean_yield ~ sequence + log(twelve_month_pce), data = .)
#Predict the model for rates today
df <- data.frame(twelve_month_pce = c(seq(1,8.5,by=0.5),last(joined_df$twelve_month_pce)),sequence=rep(last(joined_df$sequence),times=17)) %>%
mutate(predicted_yield = predict(fit,newdata = .)) %>%
mutate(highlight = twelve_month_pce %in% last(joined_df$twelve_month_pce))
final_predicted_yield <- predict(fit,newdata=data.frame(twelve_month_pce = last(joined_df$twelve_month_pce),sequence = last(joined_df$sequence)))
#Plot the rate prediction curve for the current date
df %>%
ggplot(aes(x=twelve_month_pce,y=predicted_yield,col=highlight)) +
geom_point(size=2,show.legend = FALSE) +
geom_line(show.legend = FALSE) +
geom_text(data=df[df$highlight==TRUE,],aes(x=twelve_month_pce,y=predicted_yield,label=round(predicted_yield,1)),nudge_x=-0.3) +
scale_x_continuous(breaks=seq(0,10,by=.5)) +
labs(x="TTM PCE Inflation Rate",y="Monthly Average 10-Year Yield",title="Predicted 10-Year Yield by PCE Inflation Rate for the Current Date") +
theme(legend.position = "none")
#Subtract all time effects prior to the present
#Re-plot PCE rate against 10-year yield
joined_df %>%
mutate(mean_yield = mean_yield - fit$coefficients["sequence"]*sequence + fit$coefficients["sequence"]*last(sequence)) %>%
ggplot(aes(x=twelve_month_pce,y=mean_yield)) +
geom_point() +
geom_smooth(method = "lm",formula = y~log(x)) +
geom_point(data=data.frame(twelve_month_pce = last(joined_df$twelve_month_pce),mean_yield = last(joined_df$mean_yield),date = last(joined_df$date)),color="red") +
scale_x_continuous(breaks = seq(0,10,by = .5)) +
labs(x="TTM PCE Inflation Rate",y="Monthly Average 10-Year Yield",title="10-Year Yield by PCE Inflation Rate, Controlling for Changes in Yields \nover Time")
#Navigate to Multpl and get P/E ratio history, clean up data
pe <- "https://www.multpl.com/s-p-500-pe-ratio/table/by-month" %>%
read_html() %>%
html_nodes(css = '#datatable') %>%
html_table() %>%
.[[1]] %>%
as.data.frame()
names(pe) <- c("date","pe")
pe <- pe %>%
mutate(pe = str_remove_all(pe,"[a-z]")) %>%
mutate(pe = as.numeric(pe)) %>%
mutate(date = as.Date(date,format = "%b %d, %Y"))
if(month(pe$date[1])==month(pe$date[2])){
pe <- pe[-2,]
}
#Join PE data with PCE inflation data
pe <- pe %>%
mutate(year = year(date),
month = month(date)) %>%
select(-date)
pe_joined <- inner_join(tenyr_df,pe) %>%
inner_join(.,pce_df) %>%
ungroup() %>%
mutate(date = as.Date(paste(as.character(year),as.character(month),"1",sep="/"),format="%Y/%m/%d"),
sequence = row_number())
#Partition data for five-fold cross-validation
test_index <- createFolds(pe_joined$pe,k=5)
#Perform five-fold cross-validation and calculate RMSEs
rmses <- map_dfr(1:5,function(x){
#Partition data
train_set <- pe_joined[-test_index[[x]],]
test_set <- pe_joined[test_index[[x]],]
#Do a multiple regression, accounting for both PCE inflation and yield
fit <- train_set %>% lm(pe ~ twelve_month_pce + mean_yield, data = .)
fit1 <- train_set %>% lm(pe ~ twelve_month_pce + log(mean_yield), data = .)
fit2 <- train_set %>% lm(pe ~ log(twelve_month_pce) + mean_yield, data = .)
fit3 <- train_set %>% lm(pe ~ log(twelve_month_pce) + log(mean_yield), data = .)
#Choose the model that minimizes RMSE
return(data.frame(Model=c("Double linear","Linear PCE, log yield","Log PCE, linear yield","Double log"),
RMSE = c(RMSE(test_set$pe,predict(fit,test_set)),
RMSE(test_set$pe,predict(fit1,test_set)),
RMSE(test_set$pe,predict(fit2,test_set)),
RMSE(test_set$pe,predict(fit3,test_set))),
Trial = rep(x,times=4)))
})
#Choose model that minimizes RMSE on average
rmses %>% group_by(Model) %>% summarize(Mean_RMSE = mean(RMSE)) %>% knitr::kable()
#Retrain the winning model on the whole data set
fit <- pe_joined %>% lm(pe ~ mean_yield + twelve_month_pce, data = .)
#Predict the model for rates and inflation today
predicted_pe <- predict(fit,newdata=data.frame(twelve_month_pce = last(pe_joined$twelve_month_pce),mean_yield = last(tenyr_df$mean_yield)))
#Predict the model for the predicted inflation rate
predicted_pe2 <- predict(fit,newdata=data.frame(twelve_month_pce = last(pe_joined$twelve_month_pce),mean_yield = final_predicted_yield))
#Plot current P/E vs. model prediction
pe_joined %>%
mutate(adj_predicted_pe = predict(fit,newdata=data.frame(twelve_month_pce = last(.$twelve_month_pce),mean_yield = .$mean_yield))) %>%
mutate(unadj_predicted_pe = predict(fit,newdata=.)) %>%
mutate(adj_pe = pe+(adj_predicted_pe-unadj_predicted_pe)) %>%
ggplot(aes(x=mean_yield,y=adj_pe)) +
geom_point() +
geom_line(aes(y=adj_predicted_pe),color="blue",linewidth=1) +
geom_point(aes(x=last(mean_yield),y=last(adj_pe)),color="red") +
geom_point(aes(x=final_predicted_yield,y=predicted_pe2),color="green") +
scale_y_continuous(limits = c(0,125))+
labs(x="10-year Treasury yield",y="S&P 500 P/E",title="S&P 500 P/E vs. 10-year Treasury yield, adjusted for PCE inflation\nrate")