# Project Part F: Better Model

![](banner_project.jpg)

In [1]:
analyst <- "Citlalli Villarreal" # Replace this with your name

In [2]:
# Locate setup.R by walking up the directory tree (checking at most 10
# candidate paths), then load it.
f <- "setup.R"
for (i in 1:10) {
  if (file.exists(f)) {
    break
  }
  f <- paste0("../", f)
}
source(f)

# Notebook display limits and default point size for plots.
options(repr.matrix.max.rows = 674, repr.matrix.max.cols = 200)
update_geom_defaults("point", list(size = 1))

## Directions

### Objective

Recommend a portfolio of 12 company investments that maximizes 12-month return of an overall \$1,000,000 investment.

### Approach

_<< Retrieve a transformed dataset about public company fundamentals and use it to reproduce the construction of a selected model.

Allocate the capital so that each investment opportunity, taken in ranking order, receives 25% of the remaining budget, with the last-ranked opportunity receiving whatever remains (ranking is determined by the size of the investment opportunity's predicted growth).

Retrieve an investment opportunities dataset, comprising fundamentals for some set of public companies over some one-year period.  Transform the representation of the investment opportunities to match the representation expected by the model, leveraging previous analysis.

Use the aggregation of multiple models to make predictions about the investment opportunities and accordingly recommend a portfolio of 12 company investments. >>_

## Business Model

_<< Below I initialize three business parameters (budget, portfolio size, and allocation). Allocation is the capital assigned to each company, which is determined by the company's ranking (ranking is determined by the size of the investment opportunity's predicted growth). Each company is allocated 25% of the remaining budget, and the last-ranked company receives whatever remains. >>_

In [3]:
# Business parameters: total capital to invest and number of companies to hold.
budget <- 1000000
portfolio_size <- 12

# Show both parameters side by side.
data.frame(budget, portfolio_size)

budget,portfolio_size
1000000,12


In [4]:
# Set any additional business parameters.
#
# Rank-weighted allocation: walking down the ranking, each position receives
# `allocation_rate` of whatever budget is still unallocated; the last position
# receives everything that remains, so the allocations sum to `budget`.
# (Equal-weight alternative: rep(budget / portfolio_size, portfolio_size).)
stopifnot(portfolio_size >= 1)

allocation_rate <- 0.25
new_budget <- budget

# Preallocate instead of growing the vector, and size it from portfolio_size
# so the allocation stays in step with the business parameter.
allocation <- numeric(portfolio_size)
for (i in seq_len(portfolio_size)) {
  if (i < portfolio_size) {
    allocation[i] <- new_budget * allocation_rate
  } else {
    # Last position: hand over the remaining budget.
    allocation[i] <- new_budget
  }
  new_budget <- new_budget - allocation[i]
}
fmt(allocation)

allocation
250000.0
187500.0
140625.0
105468.75
79101.56
59326.17
44494.63
33370.97
25028.23
18771.17


## Build Model

_<< A neural network and a linear regression model are constructed to predict growth given PC1, PC2, and PC3. The two models are used together in order to provide the best portfolio, composed of the top 12 growth companies. >>_

In [5]:
# Data source options:
#   "My Data.csv"   ... OR ...   "Company Fundamentals 2017.csv"
#
# This notebook uses "My Data.csv", the ORIGINAL model training data.
data <- read.csv("My Data.csv", header = TRUE)

In [6]:
# Transform representation of data, if necessary:
# encode big_growth as a factor whose first level is "YES".
big_growth_levels <- c("YES", "NO")
data$big_growth <- factor(data$big_growth, levels = big_growth_levels)

# Present a few rows ...
data[1:6, ]

gvkey,tic,conm,PC1,PC2,PC3,prccq,growth,big_growth
1004,AIR,AAR CORP,1.4097638,0.2124544,-0.18735809,43.69,0.0507455507,NO
1045,AAL,AMERICAN AIRLINES GROUP INC,-2.8093139,0.2246363,1.43661206,32.11,-0.3828560446,NO
1050,CECE,CECO ENVIRONMENTAL CORP,1.5247216,0.4396434,-0.16785608,6.75,0.3157894737,YES
1062,ASA,ASA GOLD AND PRECIOUS METALS,1.5736687,0.6384403,0.01227541,8.66,-0.2164739518,NO
1072,AVX,AVX CORP,1.2812646,0.4529129,0.09293832,15.25,-0.1184971098,NO
1075,PNW,PINNACLE WEST CAPITAL CORP,0.3697622,-0.4860613,-0.01283639,85.2,0.0002347969,NO


In [7]:
# Construct models to predict big_growth (neural network) and growth (linear regression).

# Fix the seed before fitting so the network's fit is reproducible.
set.seed(12345)

# Binary target for the network, derived from big_growth with "YES" as the reference value.
data$class.bin <- as.binary(data$big_growth, "YES")

# Neural network on all three principal components, two hidden layers (3 and 2 units).
model.nn <- neuralnet(
  class.bin ~ PC1 + PC2 + PC3,
  data,
  hidden = c(3, 2),
  algorithm = "rprop+",
  act.fct = "logistic",
  linear.output = FALSE,
  rep = 1
)

# Linear regression of growth on the first two principal components.
model.lm <- lm(formula = growth ~ PC1 + PC2, data = data)

## Evaluate Model (5-fold cross-validation)

_<< Below, the data is partitioned using 5-fold cross-validation, and then the RMSE and profit are calculated for each fold. Lastly, the cross-validation RMSE and profit are calculated by averaging the RMSE and profit, respectively, over the individual folds. >>_

In [8]:
# Calculate and present the model's estimated profit and profit rate.

# Seed the fold assignment, then split the rows into 5 folds based on growth.
set.seed(0)
fold <- createFolds(data$growth, k = 5, list = TRUE)

# Show the structure of the fold list.
str(fold)

# Root mean squared error between observed and predicted values.
RMSE <- function(actual, predicted) {
  residual <- actual - predicted
  sqrt(mean(residual^2))
}
# Profit of a portfolio: ending value of the allocated capital minus the
# capital invested.
#
# Args:
#   dataset:      data frame with a numeric `growth` column, one row per
#                 holding, in the same order as `allocations`.
#   allocations:  capital assigned to each holding; defaults to the
#                 notebook-level `allocation` vector.
#   total_budget: total capital invested; defaults to the notebook-level
#                 `budget`.
#
# Returns:
#   A single number: sum((1 + growth) * allocations) - total_budget.
profit <- function(dataset, allocations = allocation, total_budget = budget) {
  # Fail loudly instead of letting R recycle a shorter vector, which would
  # produce a plausible-looking but wrong profit.
  stopifnot(length(dataset$growth) == length(allocations))
  sum((1 + dataset$growth) * allocations) - total_budget
}

# Fit both models on one training fold and score them on the held-out fold.
#
# Args:
#   train: data frame used to fit the models; must contain class.bin, growth,
#          PC1, PC2 and PC3.
#   test:  held-out data frame; must contain growth and the same predictors.
#
# Returns:
#   Unnamed numeric vector of length 2:
#     [1] RMSE of the linear model's growth predictions on `test`
#     [2] profit of the top-ranked held-out companies, as computed by profit()
rmse_profit <- function(train, test) {
    model.nn <- neuralnet(class.bin ~ PC1 + PC2 + PC3, train, hidden = c(3, 2),
                          algorithm = "rprop+", act.fct = "logistic",
                          linear.output = FALSE, rep = 1)
    model.lm <- lm(formula = growth ~ PC1 + PC2, data = train)

    outcome.predicted.nn <- predict(model.nn, test)
    outcome.predicted.lm <- predict(model.lm, test)

    # Attach both predictions to the held-out rows, then rank the rows.
    # NOTE(review): the ranking uses only the linear model's prediction; the
    # neural-network prediction is attached but does not influence the result.
    new.test <- cbind(outcome.predicted.nn, test)
    new.test <- cbind(outcome.predicted.lm, new.test)
    new.test <- new.test[order(-new.test$outcome.predicted.lm), ]

    rmse <- RMSE(test$growth, outcome.predicted.lm)

    # Pick as many companies as there are allocations, so the portfolio
    # evaluated here always lines up with the `allocation` vector that
    # profit() multiplies against.
    n_picks <- length(allocation)
    prof <- profit(new.test[seq_len(n_picks), ])

    c(rmse, prof)
}

# Row indices of the full dataset; each training set is the complement of a fold.
row_ids <- seq_len(nrow(data))

# Held-out rows for each fold.
data.test.1 <- data[fold$Fold1, ]
data.test.2 <- data[fold$Fold2, ]
data.test.3 <- data[fold$Fold3, ]
data.test.4 <- data[fold$Fold4, ]
data.test.5 <- data[fold$Fold5, ]

# Training rows for each fold: everything not held out.
data.train.1 <- data[setdiff(row_ids, fold$Fold1), ]
data.train.2 <- data[setdiff(row_ids, fold$Fold2), ]
data.train.3 <- data[setdiff(row_ids, fold$Fold3), ]
data.train.4 <- data[setdiff(row_ids, fold$Fold4), ]
data.train.5 <- data[setdiff(row_ids, fold$Fold5), ]

# Score the folds in order; each result is c(rmse, profit).
rmse_profit1 <- rmse_profit(data.train.1, data.test.1)
rmse_profit2 <- rmse_profit(data.train.2, data.test.2)
rmse_profit3 <- rmse_profit(data.train.3, data.test.3)
rmse_profit4 <- rmse_profit(data.train.4, data.test.4)
rmse_profit5 <- rmse_profit(data.train.5, data.test.5)

# Per-fold results table.
fold_rmse <- c(rmse_profit1[1], rmse_profit2[1], rmse_profit3[1],
               rmse_profit4[1], rmse_profit5[1])
fold_profit <- c(rmse_profit1[2], rmse_profit2[2], rmse_profit3[2],
                 rmse_profit4[2], rmse_profit5[2])
data.frame(fold = c(1, 2, 3, 4, 5), rmse = fold_rmse, profit = fold_profit)



List of 5
 $ Fold1: int [1:862] 8 11 16 22 30 32 38 40 41 44 ...
 $ Fold2: int [1:860] 3 9 10 23 26 27 34 39 52 64 ...
 $ Fold3: int [1:862] 2 7 19 29 35 42 53 57 61 62 ...
 $ Fold4: int [1:861] 1 4 5 6 15 17 28 33 36 43 ...
 $ Fold5: int [1:860] 12 13 14 18 20 21 24 25 31 37 ...


fold,rmse,profit
1,0.4445404,-36823.83
2,0.4355719,-129386.25
3,0.5040684,-78544.34
4,0.399233,-46341.18
5,0.5459912,68050.63


In [9]:
# Present the model's 5-fold cross-validation estimated RMSE, profit, and profit rate.
# Each rmse_profitN holds c(rmse, profit) for fold N; the estimates are the
# averages of those values over the folds.
n_folds <- 5

rmse.cv <- sum(
  rmse_profit1[1], rmse_profit2[1], rmse_profit3[1], rmse_profit4[1], rmse_profit5[1]
) / n_folds

profit.cv <- sum(
  rmse_profit1[2], rmse_profit2[2], rmse_profit3[2], rmse_profit4[2], rmse_profit5[2]
) / n_folds

# Profit rate: each fold's profit as a fraction of the budget, averaged over folds.
fold_profits <- c(
  rmse_profit1[2], rmse_profit2[2], rmse_profit3[2], rmse_profit4[2], rmse_profit5[2]
)
profit_rate.cv <- sum(fold_profits / budget) / n_folds

fmt(data.frame(rmse.cv, profit.cv, profit_rate.cv), "5-Fold Cross-Validation Estimated Performance")

rmse.cv,profit.cv,profit_rate.cv
0.465881,-44608.99,-0.044609


## Investment Opportunities

_<< Below, the data is retrieved from the CSV file called "Investment Opportunities.csv". Once the data is retrieved, I partition it by calendar quarter. The datasets representing the different quarters are then joined into one dataset. This new dataset is filtered and transformed using various stored data objects (e.g., RDS files). The final resulting dataset, "data.filter", is used as the input to the model. >>_

### Retrieve Data

In [10]:
# Retrieve "Investment Opportunities.csv"
datax <- read.csv("Investment Opportunities.csv", header = TRUE)

# Present the dataset size ...
size(datax)

observations,variables
918,680


### Transform Representation of Data 

In [11]:
# Transform the representation of the investment opportunity data as required to match the
# representation of the ORIGINAL (model training) data.

# Derive a quarter number from the values in column 2, parsed as month-day-year.
# NOTE(review): assumes column 2 holds the report date and that quarter()/mdy()
# come from the environment loaded by setup.R - confirm.
datax$quarter = quarter(mdy(datax[,2]))

# One dataset per quarter: keep rows for that quarter with a non-missing prccq,
# and drop the last column (presumably the quarter column just added - confirm).
data.current.q1 = datax[(datax$quarter==1) & !is.na(datax$prccq), -ncol(datax)]
data.current.q2 = datax[(datax$quarter==2) & !is.na(datax$prccq), -ncol(datax)]
data.current.q3 = datax[(datax$quarter==3) & !is.na(datax$prccq), -ncol(datax)]
data.current.q4 = datax[(datax$quarter==4) & !is.na(datax$prccq), -ncol(datax)]

# Keep only the first row seen for each gvkey within a quarter.
data.current.q1 = data.current.q1[!duplicated(data.current.q1$gvkey),]
data.current.q2 = data.current.q2[!duplicated(data.current.q2$gvkey),]
data.current.q3 = data.current.q3[!duplicated(data.current.q3$gvkey),]
data.current.q4 = data.current.q4[!duplicated(data.current.q4$gvkey),]

# Suffix every column name with its quarter, except the columns at positions 1, 10 and 12.
# NOTE(review): those positions are presumably gvkey, tic and conm (the merge
# keys used below) - verify against the CSV layout.
colnames(data.current.q1)[-c(1, 10, 12)] = paste0(colnames(data.current.q1)[-c(1, 10, 12)], ".q1")
colnames(data.current.q2)[-c(1, 10, 12)] = paste0(colnames(data.current.q2)[-c(1, 10, 12)], ".q2")
colnames(data.current.q3)[-c(1, 10, 12)] = paste0(colnames(data.current.q3)[-c(1, 10, 12)], ".q3")
colnames(data.current.q4)[-c(1, 10, 12)] = paste0(colnames(data.current.q4)[-c(1, 10, 12)], ".q4")

# Join the four quarters into one wide row per company; all=TRUE keeps companies
# that are missing from some quarters.
m12 = merge(data.current.q1, data.current.q2, by=c("gvkey", "tic", "conm"), all=TRUE)
m34 = merge(data.current.q3, data.current.q4, by=c("gvkey", "tic", "conm"), all=TRUE)
data.current = merge(m12, m34, by=c("gvkey", "tic", "conm"), all=TRUE, sort=TRUE)

# Drop companies that have no fourth-quarter price.
data.current = data.current[!is.na(data.current$prccq.q4),]

# Apply the stored artifacts in sequence:
#   "My Filter.rds"     - column selection applied to data.current
#   "My Imputation.rds" - passed to put_impute() (helper not defined in this file)
#   "My PC.rds"         - object whose predict() method yields data.pc
#   "My Predictors.rds" - final column selection
# NOTE(review): these are presumably the objects saved during the earlier
# training-data analysis - confirm.
cn = readRDS("My Filter.rds")
data.current.filter=data.current[,cn]
ml = readRDS("My Imputation.rds")
data.current.imputed=put_impute(data.current.filter, ml)
pc = readRDS("My PC.rds")
data.pc = predict(pc, data.current.imputed)
prevars = readRDS("My Predictors.rds")
# Combine the imputed columns with the predict() output, then keep only the columns named in prevars.
data.filter = cbind(data.current.imputed, data.pc)
data.filter = data.filter[,prevars]
# Present a few rows.
data.filter[1:6,]


gvkey,tic,conm,PC1,PC2,PC3
1004,AIR,AAR CORP,1.419587,0.05796411,-0.2576737
1410,ABM,ABM INDUSTRIES INC,1.0563147,0.07293782,-0.160247
1562,AMSWA,AMERICAN SOFTWARE -CL A,1.6304006,0.32243636,-0.1278981
1618,AXR,AMREP CORP,0.8877064,0.14517578,-0.6410072
1632,ADI,ANALOG DEVICES,-1.6234366,-0.48540835,-0.9770837
1686,APOG,APOGEE ENTERPRISES INC,1.4219415,-0.15294429,-0.3697524


## Apply Model

_<< The neural network and linear regression models constructed above are used to predict the growth of each investment opportunity. Then a portfolio of 12 investment opportunities is presented (note that the investment opportunities in the portfolio have the highest predicted growth among all the investment opportunities). >>_

### Predict & Make Portfolio Recommendation

In [12]:
# Use the model to predict growths of each investment opportunity.
# Recommend a portfolio of allocations to the top-ranked investment
# opportunities: gvkey, tic, conm, allocation

outcome.predicted.nn <- predict(model.nn, data.filter)
outcome.predicted.lm <- predict(model.lm, data.filter)

# Attach both predictions, then rank by the linear model's predicted growth
# (highest first).
# NOTE(review): the neural-network prediction is attached but does not
# influence the ranking.
new.data.filter <- cbind(data.filter, outcome.predicted.nn)
new.data.filter <- cbind(new.data.filter, outcome.predicted.lm)
new.data.filter <- new.data.filter[order(-new.data.filter$outcome.predicted.lm), ]

# Take one company per allocation slot rather than a hard-coded 12, and stop
# if there are too few opportunities (indexing past the end would otherwise
# yield all-NA rows in the portfolio).
n_picks <- length(allocation)
stopifnot(nrow(new.data.filter) >= n_picks)
new.data.filter <- new.data.filter[seq_len(n_picks), c("gvkey", "tic", "conm")]

# Pair each selected company, in rank order, with its allocation.
portfolio <- cbind(new.data.filter, allocation)
fmt(portfolio, "portfolio")
    

gvkey,tic,conm,allocation
23809,AZO,AUTOZONE INC,250000.0
29692,WEBC,WEBCO INDUSTRIES INC,187500.0
3570,CBRL,CRACKER BARREL OLD CTRY STOR,140625.0
178704,ULTA,ULTA BEAUTY INC,105468.75
63172,FDS,FACTSET RESEARCH SYSTEMS INC,79101.56
65430,PLCE,CHILDRENS PLACE INC,59326.17
3504,COO,COOPER COS INC (THE),44494.63
125276,CRSS,CROSSROADS SYSTEMS INC,33370.97
1864,REX,REX AMERICAN RESOURCES CORP,25028.23
7921,NDSN,NORDSON CORP,18771.17


### Store Portfolio Recommendation

In [13]:
# Store portfolio recommendation in a CSV file named after the analyst.
portfolio_file <- paste0(analyst, ".csv")
write.csv(portfolio, file = portfolio_file, row.names = FALSE)

### Confirm That Format Is Correct

In [14]:
# Re-read the stored portfolio and the set of valid company keys.
portfolio.retrieved <- read.csv(paste0(analyst, ".csv"), header = TRUE)
opportunities <- unique(read.csv("Investment Opportunities.csv", header = TRUE)$gvkey)

# Column check: identical() compares the whole name vector at once, so a file
# with too many or too few columns yields FALSE instead of a recycled
# element-wise comparison.
columns <- identical(colnames(portfolio.retrieved), c("gvkey", "tic", "conm", "allocation"))

# Every recommended company must be one of the investment opportunities.
companies <- all(portfolio.retrieved$gvkey %in% opportunities)

# Allocations must add up to the budget (rounded to absorb CSV round-off).
allocations <- round(sum(portfolio.retrieved$allocation)) == budget

check <- data.frame(analyst, columns, companies, allocations)
fmt(check, "Portfolio Recommendation | Format Check")

analyst,columns,companies,allocations
Citlalli Villarreal,True,True,True


<font size=1;>
<p style="text-align: left;">
Copyright (c) Berkeley Data Analytics Group, LLC
<span style="float: right;">
Document revised January 22, 2021
</span>
</p>
</font>