-
Notifications
You must be signed in to change notification settings - Fork 56
/
CaseStudy-MY-RubberPlantation.R
302 lines (251 loc) · 11.7 KB
/
CaseStudy-MY-RubberPlantation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
# clear the workspace
rm(list=ls())
# Load the required libraries
library(ggplot2)
library(GGally) # for the ggpairs()
library(corrplot)
library(plyr)
library(mice)
library(car)
library(rpart)
library(randomForest)
library(gbm)
library(moments) # for skewness function
library(tidyr) # for the gather()
library(VIM) # for missing data visualization
# Data source
# Department of Statistics, Malaysia: http://www.dosm.gov.my/v1/index.php?r=column3/accordion&menu_id=aHhRYUpWS3B4VXlYaVBOeUF0WFpWUT09
# load the rubber estate data
df1<- read.csv("data/rubberestate/rubber-paidemployee.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
df2<- read.csv("data/rubberestate/rubber-plantedarea.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
df3<- read.csv("data/rubberestate/rubber-production.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
df4<- read.csv("data/rubberestate/rubber-taparea.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
df5<- read.csv("data/rubberestate/rubber-yield.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
# Exploratory Data Analysis
dim(df1)
dim(df2)
dim(df3)
dim(df4)
dim(df5)
names(df1) # observation: column name too long. rename them
names(df2) # additional space after column names. do formatting
names(df3)
names(df4)
names(df5)
head(df1) # You cannot have employees in decimals. Round this variable
names(df.master)
# Basic Data Management
# Renaming the column name
# Reference: http://stackoverflow.com/questions/9283171/named-columns-renaming
df1<- rename(df1, c("Total.Number.of.Paid.Employee.During.the.Last.Pay.Period..Estate." = "TotalPaidEmployee"))
df2<-rename(df2, c("Planted.Area..Estate....000..Hectare" = "AreaPlantedHect"))
df3<-rename(df3, c("Production..Estate....000..Tonne" = "ProduceTonne"))
df4<-rename(df4, c("Tapped.Area..Estate....000..Hectare" = "TapAreaHect"))
df5<-rename(df5, c("Yeild.per.Hectare..Estate...Kg." = "YieldperHectKg"))
# Rounding the column value for TotalPaidEmployee because there cant be for example 2.5 employees
df1$TotalPaidEmployee<- round(df1$TotalPaidEmployee)
# Inner Join the data frames on common column
df.m1<- merge(df1,df2, by="Year")
df.m2<- merge(df3,df4, by="Year")
df.m3<- merge(df.m2, df5, by="Year")
df.master<- merge(df.m1, df.m3, by="Year")
summary(df.master)
# Missing data treatment
aggr_plot <- aggr(df.master, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE,
labels=names(df.master), cex.axis=.7, gap=3,
ylab=c("Histogram of missing data","Pattern"))
colSums(is.na(df.master)) # There is only 1 missing row
#PMM (Predictive Mean Matching) – For numeric variables
tempData <- mice(df.master,m=5,maxit=50,meth='pmm',seed=1234)
df.master<- mice::complete(tempData,1)
colSums(is.na(df.master))
# Describe the data
library(Hmisc)
describe(df.master)
library(psych)
describe(df.master)
######### Data Visualization
# Univariate visualization- Histogram (Continuous data- use density plot, histogram)
ggplot(df.master) +
geom_histogram(aes(x=AreaPlantedHect), fill="gray", binwidth = 250, stat = "bin", position = "stack")
# density plot
d<- density(df.master$AreaPlantedHect)
plot(d, main = "Kernel Density of Area Planted per Hectare")
polygon(d, col="red", border = "blue")
rug(df.master$AreaPlantedHect, col = "brown")
ggplot(df.master) + geom_density(aes(x=TotalPaidEmployee, colour="red"))
ggplot(df.master) + geom_density(aes(x=AreaPlantedHect))
ggplot(df.master) + geom_density(aes(x=ProduceTonne))
ggplot(df.master) + geom_density(aes(x=TapAreaHect))
ggplot(df.master) + geom_density(aes(x=YieldperHectKg))
# boxplots for continuous variables
library(magrittr) # for the pipe operator
library(dplyr) # for select()
# Method 1: selecting individual predictor name
boxplot(df.master %>%
# Note that the ‘%>%’ (pipe) passes data from the command before to the one after.
select(AreaPlantedHect,YieldperHectKg,ProduceTonne,TapAreaHect,TotalPaidEmployee))
# Method 2: Use the minus sign before the predictor you dont want to plot such that the remaining predictors are plotted
boxplot(df.master %>%
select(-Year),
col = c("red","sienna","palevioletred1","royalblue2","brown"),
ylab="Count", xlab="Predictors"
)
# Line plots- visualizing relationship between two variables
ggplot(df.master)+ geom_line(aes(x=AreaPlantedHect, y=YieldperHectKg, color="red")) # interesting: the yield per hectare decline (after 600 hectares) as planted area size increases
ggplot(df.master)+ geom_line(aes(x=AreaPlantedHect, y=ProduceTonne, color="red")) # produce increases with area but then it begins to decline after 600 hectares
ggplot(df.master)+ geom_line(aes(x=AreaPlantedHect, y=TapAreaHect, color="red")) # there is a positive linear relationship between area planted and tap area
ggplot(df.master)+ geom_line(aes(x=AreaPlantedHect, y=TotalPaidEmployee, color="red")) # Again, a positive linear realtionship between area planted and paid employees but there is a sharp decline at 600 hectares
## Checking for High correlation
cor(df.master$AreaPlantedHect, df.master$YieldperHectKg) # negative correlation, proving the point above that the yield per hectare decreases as plantation size increases
cor(df.master$AreaPlantedHect, df.master$TapAreaHect) # very strong positive correlation
cor(df.master$AreaPlantedHect, df.master$ProduceTonne) # very strong positive correlation
cor(df.master$AreaPlantedHect, df.master$TotalPaidEmployee) # very strong positive correlation, as land size increases more labour is required
correlations<- cor(df.master)
# Visualizing the high correlations
corrplot(correlations, method="number")
# we can see that vars 'TotalPaidEmployee','AreaPlantedHect','ProduceTonee' and 'TapAreaHect' are high positive correlated.
# Year and YieldPerHect have low positive correlation; TotalPaidEmployee and YieldHect have semi-strong negative correlation;
# AreaPlantedHect and YieldPerHect has strong netaive correlation
# ProduceTonne and YieldperhectKg has low negative correlation
# YieldperHect and TapAreaHect has low negative correlation
highCor<- c("TotalPaidEmployee","AreaPlantedHect","ProduceTonne","TapAreaHect")
# Scatterplot for variables that have high correlations
attach(df.master)
plot(TotalPaidEmployee,AreaPlantedHect) # strong positive correlation
plot(TapAreaHect,ProduceTonne) # strong positive correlation
plot(TotalPaidEmployee, YieldperHectKg) #semi-strong negative correlation
plot(ProduceTonne,YieldperHectKg) # low negative correlation
plot(TapAreaHect,YieldperHectKg) # low negative correlation
plot(Year,AreaPlantedHect) # strong negative correlation
plot(Year,TotalPaidEmployee)# strong negative correlation
# Basic scatterplot matrix
pairs(~., data = df.master, lower.panel=panel.smooth,pch=20,
main="Rubber production Scatterplot Matrix") # where panel.smooth can be used to plot a loess curve for each plot in a scatterplot matrix
ggpairs(~., data=df.master)
pm= ggpairs(data = df.master,
columns = 1:ncol(df.master),
upper = list(continuous="density"),
title = "Correlations among predictors"
)
print(pm)
## Checking relationship between TotalPaidEmployee, AreaPlantedHect, ProduceTonne, TapAreaHect
plot1<-ggpairs(data=Training, columns=2:5,
mapping = aes(color = "dark green"),
axisLabels="show")
plot1
boxplot1=boxplot(YieldperHectKg~TapAreaHect, data = Training,
col=(c("gold","darkgreen")),
main="Yield (Hectare) vs. Rubber tapped area (Hectare)",
xlab="Rubber tapped area (Hectare)", ylab="Yield (Hectare)")
library(RColorBrewer)
#we will select the first 4 colors in the Set1 palette
cols<-brewer.pal(n=4,name="Set1")
#cols contain the names of four different colors
plot(Training$AreaPlantedHect, Training$YieldperHectKg, pch=16, col=cols,
main=" Does larger plantation area yield more rubber?",
xlab = "Area planted (in hectare)",
ylab = "Yield in Kg (per hectare)"
)
### Data Transformation: skewed variable treatment
# A variable is considered ‘highly skewed’ if its absolute value is
# greater than 1.
# A variable is considered ‘moderately skewed’ if its absolute value is
# greater than 0.5.
skewedVars <- NA
for(i in names(df.master)){
if(is.numeric(df.master[,i])){
if(i != "YieldperHectKg"){
# Enters this block if variable is non-categorical
skewVal <- skewness(df.master[,i])
print(paste(i, skewVal, sep = ": "))
if(abs(skewVal) > 0.5){
skewedVars <- c(skewedVars, i)
}
}
}
}
# None of the predictors are skewed
### Feature Importance
library(Boruta)
# run boruta analysis
set.seed(1234)
# pull out the response variable
response <- df.master$YieldperHectKg
bor.results <- Boruta(df.master,response,
maxRuns=101,
doTrace=0)
bor.results # basically, all the variables are considered important for response variable prediction
cat("\n\nRelevant Attributes:\n")
getSelectedAttributes(bor.results)
plot(bor.results)
######## Splitting the dataset into train and test#####
ratio = sample(1:nrow(df.master), size = 0.25*nrow(df.master))
Test = df.master[ratio,] #Test dataset 25% of total
Training = df.master[-ratio,] #Train dataset 75% of total
dim(Training)
dim(Test)
# Evaluation metric function
RMSE <- function(x,y){
a <- sqrt(sum((log(x)-log(y))^2)/length(y))
return(a)
}
# Multiple Linear regression
linear.mod<- lm(YieldperHectKg~., data = Training)
summary(linear.mod)
help(plot)
plot(linear.mod, pch=16, which = 1)
predict<- predict(linear.mod, Test)
RMSE0<- RMSE(predict, Test$YieldperHectKg)
RMSE0
# Calculate prediction accuracy and error rates
actuals_preds <- data.frame(cbind(actuals=Test$YieldperHectKg, predicteds=predict)) # make actuals_predicteds dataframe.
correlation_accuracy <- cor(actuals_preds)
correlation_accuracy
min_max_accuracy <- mean (apply(actuals_preds, 1, min) / apply(actuals_preds, 1, max))
min_max_accuracy
mape <- mean(abs((actuals_preds$predicteds - actuals_preds$actuals))/actuals_preds$actuals)
mape
# Model Diagnostics
# Check the AIC and BIC
AIC(linear.mod)
BIC(linear.mod)
## creating another model to compare the AIC and BIC
linear.mod1<- lm(YieldperHectKg~ProduceTonne+TapAreaHect, data = Training)
AIC(linear.mod1)
BIC(linear.mod1)
# Regression model with significant predictors only
linear.mod.1<- lm(YieldperHectKg~ TotalPaidEmployee+AreaPlantedHect+Year, data = Training)
summary(linear.mod.1)
predict<- predict(linear.mod, Test)
RMSE0<- RMSE(predict, Test$YieldperHectKg)
RMSE0
#### R, Regression Trees, function rpart(), method "anova" ####
model <- rpart(YieldperHectKg ~., data = Training, method = "anova")
predict <- predict(model, Test)
# RMSE
RMSE1 <- RMSE(predict, Test$YieldperHectKg)
RMSE1 <- round(RMSE1, digits = 3)
RMSE1 #0.098
plot1 <- predict-Test$YieldperHectKg
### R, Random Forests, function randomForest(), method "anova" ####
help("randomForest")
model.forest <- randomForest(YieldperHectKg ~., data = Training, method = "anova",
ntree = 300,
mtry = 2, #mtry is sqrt(6)
replace = F,
nodesize = 1,
importance = T)
varImpPlot(model.forest) # Look at the IncNodePurity plot. From this plot we see that important vars are `TotalPaidEmployee`, `ProduceTonne` and `TapAreaHect`
prediction <- predict(model.forest,Test)
rmse <- sqrt(mean((log(prediction)-log(Test$YieldperHectKg))^2))
round(rmse, digits = 3) # 0.049
# Alternative way to caluclaute RMSE
RMSE2 <- RMSE(predict, Test$YieldperHectKg)
RMSE2 <- round(RMSE2, digits = 3) #.098
plot2 <- predict-Test$YieldperHectKg
#### CONCLUSION: TO PREDICT Yield per Hectare in Kg
# Linear Regression: 0.04533296
#Regression Tree RMSE: 0.098
#Random Forest RMSE: 0.049
#Regression Tree gives the most accutate result in predicting