-
Notifications
You must be signed in to change notification settings - Fork 0
/
.Rhistory
512 lines (512 loc) · 24.7 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
model_type <- errors$model[errors$model != "naive_average"][which.min(errors$mean_error[errors$model != "naive_average"])]
use_model <- case_when(errors$model[which.min(errors$mean_error)] == "naive_average" ~ F,
T ~ T)
varlist <- varlist[str_detect(model_type,varlist)]
#Retrain the optimal model on the full training data set
models <- map(timeframes_to_optimize,function(t){
lm(yieldspread,formula = paste0("return_",t,"_yr ~ ",model_type))
})
#Forecast returns on test set and convert to tidy data frame
test_set <- tidify(test_set)
#Choose focus variable for charting, then create chart
focus_var <- varlist[1]
chartit(df = test_set,insample=F)
#See if our model forecast performed better than a naive average
compare_errors(df = test_set) %>% kable(caption = paste0("Comparing model forecast to naive forecast ",global_optimizer[[2]]))
# use_model <- case_when(compare_errors(df = test_set)[nrow(compare_errors(df = test_set)),2] >= compare_errors(df = test_set)[nrow(compare_errors(df = test_set)),3]~F,
# T ~ use_model)
#Calculate model standard deviations
sds <- get_sds(df=test_set,use_model=use_model)
#Retrain yield spread model on full data set (without partitioning), chart
#historical forecast against historical data, and forecast returns and sharpe
#ratio for the current date
#Filter dataset to keep only the data we're interested in
yieldspread <- yieldspread_full %>%
filter(symbol == ticker_to_study) %>%
mutate(em_ig_discount = us_hy_yield - em_ig_yield) %>%
full_join(yieldspread_full %>%
filter(symbol %in% c(ticker_to_study,"VOO")) %>%
select(date,symbol,div_yield) %>%
spread(symbol,div_yield) %>%
mutate(div_premium = IEMG - VOO) %>%
mutate(div_premium = (div_premium-min(div_premium,na.rm=T))/(max(div_premium,na.rm=T)-min(div_premium,na.rm=T))+1) %>%
select(date,div_premium)) %>%
.[c("date","symbol",
paste0("return_",timeframes_to_optimize,"_yr"),
varlist)]
#Retrain the optimal model on the full training data set
models <- map(timeframes_to_optimize,function(t){
lm(yieldspread,formula = paste0("return_",t,"_yr ~ ",model_type))
})
#Forecast returns on test set and convert to tidy data frame
yieldspread <- tidify(yieldspread)
#Choose focus variable for charting, then create chart
focus_var <- varlist[1]
chartit(df = yieldspread,insample=T)
#If sds are higher than when we plotted out of sample, use the higher value
sds <- sds %>%
mutate(comparer = get_sds(df=yieldspread,print=F,use_model=use_model)$Model) %>%
mutate(Model = case_when(Model > comparer~Model,
T~comparer)) %>%
select(-comparer)
#Based on forecast returns, standard deviations, and risk-free rates, calculate
#expected sharpe ratios
sharpes <- bind_rows(sharpes,get_sharpes())
sharpes
rm(sds)
# Analyze VGK Returns ----------------------------------------------------
#Define a ticker and a list of variables to study
ticker_to_study <- "VGK"
varlist <- c("eur_spread","dxy","div_yield","raw_materials","industrial_materials",
"brent_crude","usd_eur","nat_gas","eur_hy_yield","ecb_bs",
"eur_10yr_yield","eur_cli","eur_cpi_12mo","eur_cpi_3mo"
# ,"change_in_ecb_bs","change_in_usd_eur","change_in_dxy",
# "change_in_industrial_materials","change_in_raw_materials",
# "change_in_brent_crude","change_in_nat_gas"
)
#Filter dataset to keep only the data we're interested in
yieldspread <- yieldspread_full %>%
filter(symbol == ticker_to_study) %>%
.[c("date","symbol",
paste0("return_",timeframes_to_optimize,"_yr"),
varlist)]
#Partition time series for training, testing, and validation
test_set <- yieldspread[yieldspread$date >= max(yieldspread$date) - years(4),]
yieldspread <- yieldspread[yieldspread$date < max(yieldspread$date) - years(4),]
#Calculate correlation coefficients between variables of interest and next-period return
cors <- corchecker(df = yieldspread,
ticker = ticker_to_study,
vars_to_test = varlist)
#Print correlation coefficients in readable format
cors %>%
knitr::kable(caption = paste("Correlation coefficients for various variables and next-period",ticker_to_study,"returns"))
# #Winnow list of variables to study, keeping only the top 5
# varlist <- row.names(cors[1:5,])
#Delete correlation coefficients table from workspace
rm(cors)
#Call function to use five-fold cross-validation to compare error rates on linear
#models that combine the selected variables
errors <- cross_validator(varlist)
#Choose model that minimizes error on average
errors <- errors %>%
group_by(model) %>%
summarize(mean_error = mean(error_measure)) %>%
arrange(mean_error)
errors %>% knitr::kable(caption = paste("Mean",global_optimizer[[2]],"by model"))
model_type <- errors$model[errors$model != "naive_average"][which.min(errors$mean_error[errors$model != "naive_average"])]
use_model <- case_when(errors$model[which.min(errors$mean_error)] == "naive_average" ~ F,
T ~ T)
varlist <- varlist[str_detect(model_type,varlist)]
#Retrain the optimal model on the full training data set
models <- map(timeframes_to_optimize,function(t){
lm(yieldspread,formula = paste0("return_",t,"_yr ~ ",model_type))
})
#Forecast returns on test set and convert to tidy data frame
test_set <- tidify(test_set)
#Choose focus variable for charting, then create chart
focus_var <- varlist[1]
chartit(df = test_set,insample=F)
#See if our model forecast performed better than a naive average
compare_errors(df = test_set) %>% kable(caption = paste0("Comparing model forecast to naive forecast ",global_optimizer[[2]]))
# use_model <- case_when(compare_errors(df = test_set)[nrow(compare_errors(df = test_set)),2] >= compare_errors(df = test_set)[nrow(compare_errors(df = test_set)),3]~F,
# T ~ use_model)
#Calculate model standard deviations
sds <- get_sds(df=test_set,use_model=use_model)
#Retrain yield spread model on full data set (without partitioning), chart
#historical forecast against historical data, and forecast returns and sharpe
#ratio for the current date
#Filter dataset to keep only the data we're interested in
yieldspread <- yieldspread_full %>%
filter(symbol == ticker_to_study) %>%
.[c("date","symbol",
paste0("return_",timeframes_to_optimize,"_yr"),
varlist)]
#Retrain the optimal model on the full training data set
models <- map(timeframes_to_optimize,function(t){
lm(yieldspread,formula = paste0("return_",t,"_yr ~ ",model_type))
})
#Forecast returns on test set and convert to tidy data frame
yieldspread <- tidify(yieldspread)
#Choose focus variable for charting, then create chart
focus_var <- varlist[1]
chartit(df = yieldspread,insample=T)
#If sds are higher than when we plotted out of sample, use the higher value
sds <- sds %>%
mutate(comparer = get_sds(df=yieldspread,print=F,use_model=use_model)$Model) %>%
mutate(Model = case_when(Model > comparer~Model,
T~comparer)) %>%
select(-comparer)
#Based on forecast returns, standard deviations, and risk-free rates, calculate
#expected sharpe ratios
sharpes <- bind_rows(sharpes,get_sharpes())
sharpes
rm(sds)
# Cross-Asset Comparison -----------------------------------------------
#For comparison purposes, let's focus on one-year forecasts
sharpes <- sharpes %>%
filter(Timeframe == "1-year forward return")
sharpes %>%
mutate(`Asset class` = case_when(Ticker=="EMHY"~"Med-Term EM HY Bonds",
Ticker=="GOVT"~"Med-Term US Govt Bonds",
Ticker=="TLT"~"Long-Term US Govt Bonds",
Ticker=="IEMG"~"EM Stocks",
Ticker=="VCLT"~"Long-Term US Corp Bonds",
Ticker=="VGK"~"EU Stocks",
Ticker=="VIOO"~"US Small Caps",
Ticker=="VOO"~"US Stocks",
Ticker=="EMHY"~"EM HY Bonds",
Ticker=="VWOB"~"EM Govt Bonds",
Ticker=="BSV"~"Short-Term US Govt Bonds",
Ticker=="SPAXX"~"Money-Market")) %>%
select(`Asset class`,Ticker,Model,`Expected return`,`Expected standard deviation`,`Expected Sharpe`) %>%
arrange(desc(`Expected Sharpe`)) %>%
kable(caption = "Sharpe ratios, 1-year timeframe")
#Plot forecasted one-year returns and standard deviations
sharpes %>%
mutate(`Index ETF` = case_when(Ticker=="EMHY"~"EMHY\nMed-Term\nEM HY\nBonds",
Ticker=="GOVT"~"GOVT\nMed-Term\nUS Govt\nBonds",
Ticker=="TLT"~"TLT\nLong-Term\nUS Govt\nBonds",
Ticker=="IEMG"~"IEMG\nEM Stocks",
Ticker=="VCLT"~"VCLT\nLong-Term\nUS Corp\nBonds",
Ticker=="VGK"~"VGK\nEU Stocks",
Ticker=="VOO"~"VOO\nUS Stocks",
Ticker=="VIOO"~"VIOO\nUS Small Caps",
Ticker=="EMHY"~"EMHY\nMed-Term\nEM HY\nBonds",
Ticker=="VWOB"~"VWOB\nMed-Term\nEM Govt\nBonds",
Ticker=="BSV"~"BSV\nShort-Term\nUS Govt\nBonds",
Ticker=="SPAXX"~"Money-\nMarket")) %>%
mutate(`Index ETF` = fct_reorder(`Index ETF`,`Expected return`,.desc=T)) %>%
ggplot(aes(x=`Index ETF`,y=`Expected return`)) +
geom_col(fill="lightblue") +
geom_errorbar(aes(ymin=`Expected return`-`Expected standard deviation`,ymax=`Expected return`+`Expected standard deviation`)) +
scale_y_continuous(labels = percent) +
ggtitle("Forecasted 1-year returns and standard deviations for select stock\nand bond index ETFs") +
labs(caption = "Copyright 2022 Christopher C. Smith") +
#scale_x_discrete(guide = guide_axis(angle = 45)) +
theme_minimal()
#save the plot
ggsave(filename = "multi-asset-model.jpg",
scale = 1,
width = 1920/300,
height = 1080/300,
units = "in")
# Development Notes -------------------------------------------------------
# Near-term development ideas:
# FRED download sometimes fails randomly for different variables.
# Add a check-and-redownload step.
# Try adding unemployment as an additional variable. Also look at US Census
# Household Survey for possible variables
# cur_data() has been deprecated; look into replacing with pick()
# Quite a few FRED series update on a lag from the actual release (e.g., commodities
# prices). Try to figure out how to scrape the actual release to get a faster/
# fresher signal.
# https://datahelp.imf.org/knowledgebase/articles/1968408-how-to-use-the-api-python-and-r
# Maybe just remove the VGK special dividend?
# Europe natural gas data series lags by 1-2 months, unfortunately. Either add a month
# to all dates, use Henry Hub spot price from FRED instead, or find a series of
# TTF gas spot prices that I can scrape with rvest and convert to US dollars
# Since Yahoo! Finance dividend data starts at 2013, go back to fetching from Nasdaq.com
# Try out the spreads between various dividend/bond yields?
# See if I can get data on dividend yield futures?
# I need to handle tuning the change variables at the same time that I do
# cross-validation for model building, so as to avoid overtraining. First train
# these variables, then train bivariate models. Maybe take the top few bivariate
# models and try multivariate models with those variables?
# My change variables all perform very poorly, which makes me a little suspicious
# of how they're being calculated. Make sure they're being calculated correctly.
# Explore using P/E or earnings yield rather than div yield to predict index returns?
# (Dividend yield is problematic because it's a poor proxy for shareholder yield)
# Can I find a higher-frequency P/E series than multpl.com? Can I find series for
# Europe & EM? Earnings yield preferable to P/E.
# Try reverse repo, maybe with a stochastic transformation
# Future development ideas:
# Try optimizing a rebalance model, maybe using decision trees?
# Would be interesting to see how different US sectors react to yields and yield spreads
#Keep only assets with an expected Sharpe ratio greater than 1
#Calculate portfolio allocations (medium risk) as a proportion of Sharpes > 1
sharpes %>%
filter(`Expected Sharpe` >= 1,
) %>%
mutate(`Hacked Sharpe` = `Expected return` - `Expected standard deviation`) %>%
mutate(`Allocation1` = (`Hacked Sharpe`)/(sum(`Hacked Sharpe`))) %>%
mutate(`Allocation2` = (`Expected Sharpe`-1)/(sum(`Expected Sharpe`-1))) %>%
mutate(`Allocation` = (1.75*`Allocation1`+0.25*`Allocation2`)/2) %>%
select(-`Allocation1`,-`Allocation2`) %>%
select(-Timeframe) %>%
mutate(`Expected return` = scales::percent(`Expected return`,accuracy=0.1),
`Expected standard deviation` = scales::percent(`Expected standard deviation`,accuracy=0.1),
Allocation = scales::percent(Allocation,accuracy=0.1)) %>%
arrange(Ticker)
#Create treemap of medium-risk allocation
sharpes %>%
filter(`Expected Sharpe` >= 1,
) %>%
mutate(`Ticker` = case_when(Ticker=="EMHY"~"Medium-Term\nEM HY\nBonds",
Ticker=="GOVT"~"Medium-Term\nUS Govt\nBonds",
Ticker=="TLT"~"Long-Term\nUS Govt\nBonds",
Ticker=="IEMG"~"EM Stocks",
Ticker=="VCLT"~"Long-Term\nUS Corp\nBonds",
Ticker=="VGK"~"EU Stocks",
Ticker=="VOO"~"US Stocks",
Ticker=="EMHY"~"Medium-Term\nEM HY\nBonds",
Ticker=="VWOB"~"Medium-Term\nEM Govt\nBonds",
Ticker=="BSV"~"Short-Term\nUS Govt\nBonds",
Ticker=="SPAXX"~"Money-\nMarket")) %>%
mutate(`Hacked Sharpe` = `Expected return` - `Expected standard deviation`) %>%
mutate(`Allocation1` = (`Hacked Sharpe`)/(sum(`Hacked Sharpe`))) %>%
mutate(`Allocation2` = (`Expected Sharpe`-1)/(sum(`Expected Sharpe`-1))) %>%
mutate(`Allocation` = (1.75*`Allocation1`+0.25*`Allocation2`)/2) %>%
select(-`Allocation1`,-`Allocation2`) %>%
select(-Timeframe) %>%
ggplot(aes(area = Allocation,fill=Ticker,label=paste0(Ticker,"\n",scales::percent(Allocation)))) +
treemapify::geom_treemap() +
treemapify::geom_treemap_text() +
theme(legend.position = "none")
#Calculate total expected portfolio return given this allocation (medium risk)
sharpes %>%
filter(`Expected Sharpe` >= 1,
) %>%
mutate(`Hacked Sharpe` = `Expected return` - `Expected standard deviation`) %>%
mutate(`Allocation1` = (`Hacked Sharpe`)/(sum(`Hacked Sharpe`))) %>%
mutate(`Allocation2` = (`Expected Sharpe`-1)/(sum(`Expected Sharpe`-1))) %>%
mutate(`Allocation` = (1.75*`Allocation1`+0.25*`Allocation2`)/2) %>%
summarize(`Expected portfolio return` = scales::percent(sum(`Expected return`*Allocation)))
#Calculate how much I need to buy/sell for rebalancing purposes (medium risk)
sharpes %>%
filter(`Expected Sharpe` >= 1,
) %>%
mutate(`Hacked Sharpe` = `Expected return` - `Expected standard deviation`) %>%
mutate(`Allocation1` = (`Hacked Sharpe`)/(sum(`Hacked Sharpe`))) %>%
mutate(`Allocation2` = (`Expected Sharpe`-1)/(sum(`Expected Sharpe`-1))) %>%
mutate(`Allocation` = (1.75*`Allocation1`+0.25*`Allocation2`)/2) %>%
select(-`Allocation1`,-`Allocation2`) %>%
mutate(Ticker = case_when(Ticker == "SPAXX"~".SPAXX",
T~Ticker)) %>%
arrange(Ticker) %>%
mutate(cash = Allocation*(9963)) %>%
mutate(change=cash-c(1621,451,1526,0,0,561,1124)) %>%
select(Ticker,cash,change)
#Calculate how much I need to buy/sell for rebalancing purposes (high risk)
sharpes %>%
filter(`Expected Sharpe` >= 1) %>%
slice_max(`Expected return`,n=3) %>%
mutate(`Allocation` = (`Expected return`)/(sum(`Expected return`))) %>%
mutate(Ticker = case_when(Ticker == "SPAXX"~".SPAXX",
T~Ticker)) %>%
arrange(Ticker) %>%
mutate(cash = Allocation*(4325)) %>%
mutate(change=cash-c(sum(492,465,898),0,1085)) %>%
select(Ticker,cash,change)
#Calculate how much I need to buy/sell for rebalancing purposes (high risk)
sharpes %>%
filter(`Expected Sharpe` >= 1) %>%
slice_max(`Expected return`,n=3) %>%
mutate(`Allocation` = (`Expected return`)/(sum(`Expected return`))) %>%
mutate(Ticker = case_when(Ticker == "SPAXX"~".SPAXX",
T~Ticker)) %>%
arrange(Ticker) %>%
mutate(cash = Allocation*(4434)) %>%
mutate(change=cash-c(0,1125,0)) %>%
select(Ticker,cash,change)
library(devtools)
create("multi_asset_model")
create("asset-forecaster")
create("assetforecaster")
wd()
install.packages("gitcreds")
library(gitcreds)
gitcreds_set()
#Script to predict 10-year yields from PCE inflation rate, factoring in how
#10-year yields have fallen over time
#NOTE: The script will only work if you have a valid FRED API key in .Renviron!
#The easiest way to edit .Renviron is by calling usethis::edit_r_environ().
#Then add the line "FRED_API_KEY = yourAPIkeyhere", without quotes, and save file.
#You'll also want to set img_save_location in either the console or .Rprofile.
#The easiest way to edit .Rprofile is by calling file.edit("~/.Rprofile").
#Load libraries
library(tidyverse)
library(lubridate)
library(fredr)
library(rvest)
#Set FRED API key from .Renviron file
readRenviron("~/.Renviron")
#Get FRED series for 10-year yields using fredr()
#Calculate average yield for each month
tenyr_df <- fredr("DGS10")
tenyr_df <- tenyr_df %>%
select(date,value) %>%
filter(!is.na(value)) %>%
mutate(year = year(date),month = month(date)) %>%
group_by(year,month) %>%
summarize(mean_yield = mean(value,na.rm=T))
#Get FRED series for PCE inflation rate using fredr()
#Since PCE inflation is reported with a two-month lag, add two months to all dates
pce_df <- fredr("PCETRIM12M159SFRBDAL")
pce_df <- pce_df %>%
mutate(twelve_month_pce = value,
date=date %m+% months(2)) %>%
filter(!is.na(twelve_month_pce)) %>%
select(date,twelve_month_pce) %>%
mutate(year = year(date),month = month(date))
#Join the two tables
joined_df <- inner_join(tenyr_df,pce_df) %>%
ungroup() %>%
mutate(date = as.Date(paste(as.character(year),as.character(month),"1",sep="/"),format="%Y/%m/%d"),
sequence = row_number())
#Predict 10-year yield based on this simple log regression model using PCE rate
fit <- joined_df %>% lm(mean_yield ~ twelve_month_pce, data = .)
initial_predicted_yield <- predict(fit,newdata=data.frame(twelve_month_pce = last(joined_df$twelve_month_pce)))
#Plot PCE rate against 10-year yield, with date shown as color
joined_df %>%
ggplot(aes(x=twelve_month_pce,y=mean_yield,col=date)) +
geom_point() +
geom_smooth(method = "lm",formula = y~log(x)) +
geom_point(data=data.frame(twelve_month_pce = last(joined_df$twelve_month_pce),mean_yield = last(joined_df$mean_yield),date = last(joined_df$date)),color="red") +
scale_x_continuous(breaks = seq(0,10,by = .5)) +
scale_y_continuous(limits=c(0,16),breaks = seq(0,16.25,by = 1.25)) +
labs(x="TTM PCE Inflation Rate",y="Monthly Average 10-Year Yield",title="10-Year Yield by PCE Inflation Rate")
#Plot yield vs. date
joined_df %>%
mutate(twelve_month_pce_strat = factor(round(twelve_month_pce))) %>%
ggplot(aes(x=date,y=mean_yield)) +
geom_point() +
geom_smooth(method="lm",formula=y~log(x)) +
labs(x="Date",y="Monthly Average 10-Year Yield",title="10-Year Yield over Time")
#Stratify by PCE inflation rate and plot yield vs. date
joined_df %>%
mutate(twelve_month_pce_strat = factor(round(twelve_month_pce))) %>%
group_by(twelve_month_pce_strat) %>%
mutate(n=n()) %>%
filter(n>20) %>%
ggplot(aes(x=date,y=mean_yield)) +
geom_point() +
geom_smooth(method="lm",formula=y~log(x)) +
facet_wrap( ~ twelve_month_pce_strat) +
labs(x="Date",y="Monthly Average 10-Year Yield",title="10-Year Yield over Time, Stratified by PCE Inflation Rate")
#Subtract PCE inflation model from the data and regress for time effect
joined_df %>%
mutate(pce_model = predict(lm(mean_yield ~ log(twelve_month_pce), data = .)),
mean_yield_residual = mean_yield - pce_model) %>%
ggplot(aes(x=date,y=mean_yield_residual)) +
geom_point() +
geom_smooth(method="lm",formula=y~x) +
labs(x="Date",y="Residual Monthly Average 10-Year Yield",title="Residual 10-Year Yield over Time after Subtracting PCE Inflation-Rate\nLog Regression Model")
#Partition data for 5-fold cross-validation
library(caret)
set.seed(1)
test_index <- createFolds(joined_df$mean_yield,k=5)
#Define function to get root mean squared error of predictions
RMSE <- function(true_ratings, predicted_ratings){
sqrt(mean((true_ratings - predicted_ratings)^2))
}
#Perform five-fold cross-validation and calculate RMSEs
rmses <- map_dfr(1:5,function(x){
#Partition data
train_set <- joined_df[-test_index[[x]],]
test_set <- joined_df[test_index[[x]],]
#Do a multiple regression, accounting for both PCE inflation and date
fit <- train_set %>% lm(mean_yield ~ sequence + twelve_month_pce, data = .)
fit1 <- train_set %>% lm(mean_yield ~ sequence + log(twelve_month_pce), data = .)
fit2 <- train_set %>% lm(mean_yield ~ log(sequence) + twelve_month_pce, data = .)
fit3 <- train_set %>% lm(mean_yield ~ log(sequence) + log(twelve_month_pce), data = .)
#Choose the model that minimizes RMSE
return(data.frame(Model=c("Double linear","Linear date, log PCE","Log date, linear PCE","Double log"),
RMSE = c(RMSE(test_set$mean_yield,predict(fit,test_set)),
RMSE(test_set$mean_yield,predict(fit1,test_set)),
RMSE(test_set$mean_yield,predict(fit2,test_set)),
RMSE(test_set$mean_yield,predict(fit3,test_set))),
Trial = rep(x,times=4)))
})
#Choose model that minimizes RMSE on average
rmses %>% group_by(Model) %>% summarize(Mean_RMSE = mean(RMSE)) %>% knitr::kable()
#Retrain the winning model on the whole data set
fit <- joined_df %>% lm(mean_yield ~ sequence + log(twelve_month_pce), data = .)
#Predict the model for rates today
df <- data.frame(twelve_month_pce = c(seq(1,8.5,by=0.5),last(joined_df$twelve_month_pce)),sequence=rep(last(joined_df$sequence),times=17)) %>%
mutate(predicted_yield = predict(fit,newdata = .)) %>%
mutate(highlight = twelve_month_pce %in% last(joined_df$twelve_month_pce))
final_predicted_yield <- predict(fit,newdata=data.frame(twelve_month_pce = last(joined_df$twelve_month_pce),sequence = last(joined_df$sequence)))
#Plot the rate prediction curve for the current date
df %>%
ggplot(aes(x=twelve_month_pce,y=predicted_yield,col=highlight)) +
geom_point(size=2,show.legend = FALSE) +
geom_line(show.legend = FALSE) +
geom_text(data=df[df$highlight==TRUE,],aes(x=twelve_month_pce,y=predicted_yield,label=round(predicted_yield,1)),nudge_x=-0.3) +
scale_x_continuous(breaks=seq(0,10,by=.5)) +
labs(x="TTM PCE Inflation Rate",y="Monthly Average 10-Year Yield",title="Predicted 10-Year Yield by PCE Inflation Rate for the Current Date") +
theme(legend.position = "none")
#Subtract all time effects prior to the present
#Re-plot PCE rate against 10-year yield
joined_df %>%
mutate(mean_yield = mean_yield - fit$coefficients["sequence"]*sequence + fit$coefficients["sequence"]*last(sequence)) %>%
ggplot(aes(x=twelve_month_pce,y=mean_yield)) +
geom_point() +
geom_smooth(method = "lm",formula = y~log(x)) +
geom_point(data=data.frame(twelve_month_pce = last(joined_df$twelve_month_pce),mean_yield = last(joined_df$mean_yield),date = last(joined_df$date)),color="red") +
scale_x_continuous(breaks = seq(0,10,by = .5)) +
labs(x="TTM PCE Inflation Rate",y="Monthly Average 10-Year Yield",title="10-Year Yield by PCE Inflation Rate, Controlling for Changes in Yields \nover Time")
#Navigate to Multpl and get P/E ratio history, clean up data
pe <- "https://www.multpl.com/s-p-500-pe-ratio/table/by-month" %>%
read_html() %>%
html_nodes(css = '#datatable') %>%
html_table() %>%
.[[1]] %>%
as.data.frame()
names(pe) <- c("date","pe")
pe <- pe %>%
mutate(pe = str_remove_all(pe,"[a-z]")) %>%
mutate(pe = as.numeric(pe)) %>%
mutate(date = as.Date(date,format = "%b %d, %Y"))
if(month(pe$date[1])==month(pe$date[2])){
pe <- pe[-2,]
}
#Join PE data with PCE inflation data
pe <- pe %>%
mutate(year = year(date),
month = month(date)) %>%
select(-date)
pe_joined <- inner_join(tenyr_df,pe) %>%
inner_join(.,pce_df) %>%
ungroup() %>%
mutate(date = as.Date(paste(as.character(year),as.character(month),"1",sep="/"),format="%Y/%m/%d"),
sequence = row_number())
#Partition data for five-fold cross-validation
test_index <- createFolds(pe_joined$pe,k=5)
#Perform five-fold cross-validation and calculate RMSEs
rmses <- map_dfr(1:5,function(x){
#Partition data
train_set <- pe_joined[-test_index[[x]],]
test_set <- pe_joined[test_index[[x]],]
#Do a multiple regression, accounting for both PCE inflation and yield
fit <- train_set %>% lm(pe ~ twelve_month_pce + mean_yield, data = .)
fit1 <- train_set %>% lm(pe ~ twelve_month_pce + log(mean_yield), data = .)
fit2 <- train_set %>% lm(pe ~ log(twelve_month_pce) + mean_yield, data = .)
fit3 <- train_set %>% lm(pe ~ log(twelve_month_pce) + log(mean_yield), data = .)
#Choose the model that minimizes RMSE
return(data.frame(Model=c("Double linear","Linear PCE, log yield","Log PCE, linear yield","Double log"),
RMSE = c(RMSE(test_set$pe,predict(fit,test_set)),
RMSE(test_set$pe,predict(fit1,test_set)),
RMSE(test_set$pe,predict(fit2,test_set)),
RMSE(test_set$pe,predict(fit3,test_set))),
Trial = rep(x,times=4)))
})
#Choose model that minimizes RMSE on average
rmses %>% group_by(Model) %>% summarize(Mean_RMSE = mean(RMSE)) %>% knitr::kable()
#Retrain the winning model on the whole data set
fit <- pe_joined %>% lm(pe ~ mean_yield + twelve_month_pce, data = .)
#Predict the model for rates and inflation today
predicted_pe <- predict(fit,newdata=data.frame(twelve_month_pce = last(pe_joined$twelve_month_pce),mean_yield = last(tenyr_df$mean_yield)))
#Predict the model for the predicted inflation rate
predicted_pe2 <- predict(fit,newdata=data.frame(twelve_month_pce = last(pe_joined$twelve_month_pce),mean_yield = final_predicted_yield))
#Plot current P/E vs. model prediction
pe_joined %>%
mutate(adj_predicted_pe = predict(fit,newdata=data.frame(twelve_month_pce = last(.$twelve_month_pce),mean_yield = .$mean_yield))) %>%
mutate(unadj_predicted_pe = predict(fit,newdata=.)) %>%
mutate(adj_pe = pe+(adj_predicted_pe-unadj_predicted_pe)) %>%
ggplot(aes(x=mean_yield,y=adj_pe)) +
geom_point() +
geom_line(aes(y=adj_predicted_pe),color="blue",linewidth=1) +
geom_point(aes(x=last(mean_yield),y=last(adj_pe)),color="red") +
geom_point(aes(x=final_predicted_yield,y=predicted_pe2),color="green") +
scale_y_continuous(limits = c(0,125))+
labs(x="10-year Treasury yield",y="S&P 500 P/E",title="S&P 500 P/E vs. 10-year Treasury yield, adjusted for PCE inflation\nrate")