In [None]:
# load all the library
library(readxl)
library(psych)
library(ggplot2)
library(GGally)
library(lubridate)
library(dplyr)
library(bestNormalize)
library(zoo)
library(xts)
library(leaps)
library(car)
library(ggpubr)
library(TSA)
library(forecast)
#library(stl)

In [None]:
# Build the complete solar-array training table `d_ns`:
#   1. read production, solar-angle and weather CSVs,
#   2. join them on their shared columns,
#   3. impute missing values per (Month, Day, Hour) group,
#   4. write the result to df_solararray_complete.csv.
# Inputs are read relative to the notebook, so no setwd() is needed.
# setwd('C:/Users/Dhaval/Documents/CSC 672/Final_Project_Dhaval_Delvadia/2015contest_CSV')

########################### read solararray tables ###############################
d_sp <- read.csv("../input/solararray_production.csv", header = TRUE)
d_ss <- read.csv("../input/solararray_solarangle.csv", header = TRUE)
d_sw <- read.csv("../input/solararray_weather.csv", header = TRUE)
d_sw$Hour <- d_sw$Hour + 1 # shift weather hours from 0..23 to 1..24

# Drop exact duplicate rows from the solar-angle table.
d_ss <- d_ss[!duplicated(d_ss), ]

# Split the production date (month/day/year text) into numeric Year/Month/Day
# so it can be joined with the other tables.
d_sp$Date <- as.Date(d_sp$Date, format = "%m/%d/%Y")
d_sp <- data.frame(
  Year = as.numeric(format(d_sp$Date, format = "%Y")),
  Month = as.numeric(format(d_sp$Date, format = "%m")),
  Day = as.numeric(format(d_sp$Date, format = "%d")),
  Hour = d_sp$Hour,
  Electricity_KW_HR = d_sp$Electricity_KW_HR
)

# Natural joins (by = NULL): dplyr matches on every column name the two
# tables have in common. Weather rows are the left side, so every weather
# row is kept even when no solar-angle / production row matches.
# d.ss.sw <- right_join(d_sw, d_ss, by = NULL)
d.ss.sw <- left_join(d_sw, d_ss, by = NULL)
d.s <- left_join(d.ss.sw, d_sp, by = NULL)

# Keep a copy restricted to fully observed rows for later transformations.
ok <- complete.cases(d.s)
df_sNNA <- d.s[ok, ]

# Replace the NA entries of `x` with a summary statistic of its non-missing
# values. Called inside a grouped mutate(), `x` is one group's values.
#   x    - vector to fill
#   stat - summary function accepting `na.rm` (mean by default)
# Returns a vector of the same length as `x`. If every value is NA the
# statistic itself is NaN/NA, so the gaps stay missing.
impute_na <- function(x, stat = mean) {
  ifelse(is.na(x), stat(x, na.rm = TRUE), x)
}

# Impute every measured column from the rows that share the same Month, Day
# and Hour (i.e. the same calendar slot in other years). The original notes
# report 184 missing Wind_Speed, 142 Visibility, 150 Temperature,
# 270 Humidity_Fraction, 270 Dew_Point and 191 Cloud_Cover_Fraction values.
# Cloud_Cover_Fraction is filled with the group median; all others use the
# group mean.
d_ns <- d.s %>%
  group_by(Month, Day, Hour) %>%
  mutate(
    Solar_Elevation = impute_na(Solar_Elevation),
    Electricity_KW_HR = impute_na(Electricity_KW_HR),
    Wind_Speed = impute_na(Wind_Speed),
    Visibility = impute_na(Visibility),
    Temperature = impute_na(Temperature),
    Pressure = impute_na(Pressure),
    Precipitation = impute_na(Precipitation),
    Humidity_Fraction = impute_na(Humidity_Fraction),
    Dew_Point = impute_na(Dew_Point),
    Cloud_Cover_Fraction = impute_na(Cloud_Cover_Fraction, median)
  ) %>%
  ungroup() # do not leak the grouping into later cells

# Groups with no observed production at all come out as NaN. Per the original
# notes these are the night-time hours (0 to 8 and 17 to 23), when no solar
# electricity is generated, so they are set to 0.
d_ns$Electricity_KW_HR[is.nan(d_ns$Electricity_KW_HR)] <- 0

# Pressure gaps that the per-group mean could not fill (group entirely
# missing) fall back to the overall mean. A second per-group pass cannot help
# here: the group mean of an all-missing group is still NaN.
d_ns$Pressure[is.na(d_ns$Pressure)] <- mean(d_ns$Pressure, na.rm = TRUE)

# write.csv() always writes the header; passing col.names is ignored with a
# warning, so it is omitted.
write.csv(d_ns, 'df_solararray_complete.csv')



In [None]:
summary(d_ns)

In [None]:
# Histogram of hourly electricity production on its raw scale.
hist(d_ns$Electricity_KW_HR)

In [None]:
# Same histogram after log(x + 1); the +1 keeps the zero-production hours finite.
hist(log(d_ns$Electricity_KW_HR + 1))

In [None]:
# Let bestNormalize pick a normalizing transformation for the production
# column; standardize=FALSE asks it not to centre/scale the transformed values.
bestNormalize(d_ns$Electricity_KW_HR,standardize=FALSE)

In [None]:
# Count the missing values left in the imputed production column.
# (`isna` is not an R function; the base predicate is `is.na`.)
sum(is.na(d_ns$Electricity_KW_HR))

In [None]:
# orderNorm() returns a fitted transformation object, not a numeric vector;
# its transformed values live in the `x.t` element, which is what hist() needs.
hist(orderNorm(d_ns$Electricity_KW_HR)$x.t)

In [None]:
tst_Elec<-d_ns%>%group_by(Month,Day,Hour)

In [None]:
d_ns%>%group_by(Month,Day,Hour)

# Make the scenario (test) file complete

In [None]:
########################### read scenario tables ###############################
# Build the complete scenario (test) table `d_scenario` from the weather
# scenario, the calendar scenario and the solar-angle table, impute its
# missing weather values, and write it to scenario.csv.
d_pws <- read.csv("../input/powercity_weather_scenario.csv", header = TRUE)
d_cds <- read.csv("../input/calendar_days_scenario.csv", header = TRUE)
d_ss <- read.csv("../input/solararray_solarangle.csv", header = TRUE)

# Drop exact duplicate rows from the solar-angle table.
d_ss <- d_ss[!duplicated(d_ss), ]
# Shift scenario hours so they start at 1 instead of 0.
d_pws$Hour <- d_pws$Hour + 1

# Attach the calendar information to each weather row.
d_p_c <- left_join(d_pws, d_cds, by = c('Year', 'Month', 'Day'))

# One solar elevation per (Month, Day, Hour): the median over all rows of the
# solar-angle table that share that slot.
d_n <- d_ss %>%
  select(Month, Day, Hour, Solar_Elevation) %>%
  group_by(Month, Day, Hour) %>%
  summarise(Solar_Elevation = median(Solar_Elevation))

# Inner merge: only slots present in both tables survive.
d_scenario <- merge(d_n, d_p_c, by = c('Month', 'Day', 'Hour'))

# Fill missing weather values with the column-wide mean ...
mean_filled_cols <- c(
  'Cloud_Cover_Fraction', 'Dew_Point', 'Humidity_Fraction', 'Precipitation',
  'Visibility', 'Wind_Speed', 'Pressure'
)
for (col_name in mean_filled_cols) {
  col_values <- d_scenario[[col_name]]
  d_scenario[[col_name]] <- ifelse(
    is.na(col_values), mean(col_values, na.rm = TRUE), col_values
  )
}
# ... except Temperature, which uses the column-wide median.
d_scenario$Temperature <- ifelse(
  is.na(d_scenario$Temperature),
  median(d_scenario$Temperature, na.rm = TRUE),
  d_scenario$Temperature
)

# write.csv() always writes the header; passing col.names is ignored with a
# warning, so it is omitted.
write.csv(d_scenario, 'scenario.csv')

In [None]:
sum(is.na(d_scenario['Dew_Point']))

In [None]:
sum(is.na(d_scenario))
summary(d_scenario)

In [None]:
d_ns

# EDA

In [None]:
d_ns[2:15]

In [None]:
# Min-max scale a numeric vector to the interval [0, 1].
#   x     - numeric vector
#   na.rm - if TRUE, NAs are ignored when finding the range and stay NA in the
#           result; if FALSE (default, the original behaviour) any NA makes
#           the whole result NA.
# Returns a numeric vector the same length as `x`. A constant vector has zero
# range; it maps to 0 instead of the NaN that 0/0 would produce.
normalize <- function(x, na.rm = FALSE) {
  if (length(x) == 0) {
    return(x)
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (!is.na(span) && span == 0) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Apply normalize() to each of columns 2..15 of d_ns and collect the rescaled
# columns in a plain data frame.
minMaxNorm_d_ns <- as.data.frame(lapply(d_ns[2:15], normalize))

In [None]:
minMaxNorm_d_ns

In [None]:
# Draw 100 values from a Gamma(shape = 1, rate = 1) distribution and print them.
# No seed is set in this cell, so the values differ from run to run.
x <- rgamma(100, 1, 1)
x

In [None]:
# Column 15 of d_ns as a plain vector. The original call was missing its
# closing parenthesis, and `[` would have returned a one-column table rather
# than the values; `[[` extracts the column itself.
as.vector(d_ns[[15]])

In [None]:
# Fit a Yeo-Johnson transformation to column 15 of d_ns.
# yeojohnson() needs a numeric vector, so the column is extracted with `[[`
# (`d_ns[15]` is a one-column table, not a vector).
yeojohnson_obj <- yeojohnson(d_ns[[15]])
yeojohnson_obj
# p: the transformed values; x2: p mapped back through the inverse transform.
p <- predict(yeojohnson_obj)
x2 <- predict(yeojohnson_obj, newdata = p, inverse = TRUE)

In [None]:
# Fit an arcsinh transformation and check that it round-trips.
# NOTE(review): the original passed d_ns[1]; a later comment in this notebook
# describes column 1 as a constant 'solar array' label, which is not numeric.
# Column 15 (the same column used for the Yeo-Johnson fit) is assumed to be
# the intended input - confirm.
arcsinh_input <- d_ns[[15]]
arcsinh_x_obj <- arcsinh_x(arcsinh_input)
arcsinh_x_obj
# p: the transformed values; x2: p mapped back through the inverse transform.
p <- predict(arcsinh_x_obj)
x2 <- predict(arcsinh_x_obj, newdata = p, inverse = TRUE)

# The inverse should reproduce the data that was actually transformed (the
# original compared against the unrelated random sample `x`).
all.equal(x2, arcsinh_input)

In [None]:
# Quick histograms of the production column under different rescalings.
# Inside qplot() a bare colour string is treated as a data value to map (one
# legend entry named after the string), not as the colour to draw with;
# wrapping it in I() sets the fill literally.
# NOTE(review): qplot() is deprecated in recent ggplot2 releases.
qplot(log(Electricity_KW_HR + 1), data = d_ns, fill = I('red'), ylab = 'Count')
qplot(sqrt(Electricity_KW_HR), data = d_ns, fill = I("#AFE5AD"), ylab = 'Count')
qplot(as.numeric(Electricity_KW_HR), data = minMaxNorm_d_ns, fill = I('green'), ylab = 'Count')
# `p` holds the transformed values produced by the most recent predict() call.
qplot(p, data = d_ns, fill = I("#AFE5AD"), ylab = 'Count')


In [None]:
### Histograms of the response and every predictor ########

# Shared look for all histograms: light major grid, no minor grid, black axes.
hist_theme <- theme(
  panel.grid.minor = element_blank(),
  panel.grid.major = element_line(colour = "#E8E8E8"),
  axis.line = element_line(colour = "black")
)

# Build a 25-bin count histogram of one column.
#   data   - data frame holding the column
#   column - column name, as a string
#   fill   - bar fill colour
# Returns the ggplot object (nothing is printed).
plot_hist <- function(data, column, fill) {
  ggplot(data = data, aes(x = .data[[column]])) +
    geom_histogram(color = "black", fill = fill, bins = 25) +
    xlab(column) +
    hist_theme
}

h1 <- plot_hist(d_ns, "Electricity_KW_HR", "#AFE5AD")

# The five lines below were pasted from Python (scikit-learn / matplotlib).
# They do not parse as R and stopped the whole cell from running, so they are
# kept only as a comment. The dangling `h11<-` that followed them had no
# right-hand side of its own and has been removed.
# qt=QuantileTransformer(n_quantiles=10, random_state=0)
# a=qt.fit_transform(np.array(y.iloc[:,0]).reshape(-1,1))
# plt.hist(a, bins='auto',facecolor='red',)
# plt.ylabel('No of times')
# plt.show()

# Solar elevation with a normal curve on top. dnorm() is a density, while the
# bars are counts, so the curve is multiplied by (number of observations x
# approximate bin width) to put both on the same vertical scale.
elev_values <- d_ns$Solar_Elevation
elev_mean <- mean(elev_values, na.rm = TRUE)
elev_sd <- sd(elev_values, na.rm = TRUE)
elev_scale <- sum(!is.na(elev_values)) * diff(range(elev_values, na.rm = TRUE)) / 24
h2 <- plot_hist(d_ns, "Solar_Elevation", "cadetblue2") +
  stat_function(
    fun = function(x) dnorm(x, mean = elev_mean, sd = elev_sd) * elev_scale,
    color = 'red'
  )

h3 <- plot_hist(d_ns, "Wind_Speed", "azure")
h4 <- plot_hist(d_ns, "Visibility", "bisque")

# Arrange plots in sets of 4
ggarrange(h1, h2, h3, h4,
          ncol = 2, nrow = 2)

h5 <- plot_hist(d_ns, "Temperature", "blueviolet")
h6 <- plot_hist(d_ns, "Pressure", "coral2")
h7 <- plot_hist(d_ns, "Precipitation", "cornflowerblue")
h8 <- plot_hist(d_ns, "Humidity_Fraction", "chartreuse2")

# Arrange plots in sets of 4
ggarrange(h5, h6, h7, h8,
          ncol = 2, nrow = 2)

h9 <- plot_hist(d_ns, "Dew_Point", "cyan")
h10 <- plot_hist(d_ns, "Cloud_Cover_Fraction", "darkgoldenrod")

# Last two plots side by side (a single row; a second row would be empty).
ggarrange(h9, h10,
          ncol = 2, nrow = 1)

In [None]:
summary(log10(d_ns$Electricity_KW_HR+1))

In [None]:
########### CORRELATION MATRIX WITH HEATMAP ######################################
# Correlation heatmap of d_ns via GGally::ggcorr, with the coefficient printed
# in each cell (label = TRUE) on a red-yellow-green palette; the legend is
# titled "rho".
ggcorr(d_ns, palette = "RdYlGn", name = "rho", label = TRUE, label_size=3, label_color = "black", size = 3, hjust = 0.75)

Based on the correlation plot, we can see multicollinearity between Dew_Point and Temperature. Therefore, we can remove one of these variables since the other is redundant. When we fit the model with all variables, we will remove the Dew_Point variable.

In [None]:
library(ggplot2)
library(reshape2)

# Pearson correlations between columns 2..15 of d_ns, rounded to 2 decimals.
# This must be computed before it is melted; the original melted `cormat`
# first, which fails on a fresh session because the object does not exist yet.
cormat <- round(cor(d_ns[2:15]), 2)

# Long format: one row per matrix cell (Var1, Var2, value). The call is
# namespace-qualified because the `reshape` package, attached later in this
# notebook, also exports a `melt` that names the columns differently.
melted_cormat <- reshape2::melt(cormat)
melted_cormat

# Keep the lower triangle (diagonal included) of a matrix.
#   cormat - a matrix, e.g. a correlation matrix
# Returns the same matrix with every cell above the diagonal set to NA.
get_lower_tri <- function(cormat) {
  above_diagonal <- upper.tri(cormat)
  cormat[above_diagonal] <- NA
  cormat
}
# Keep the upper triangle (diagonal included) of a matrix.
#   cormat - a matrix, e.g. a correlation matrix
# Returns the same matrix with every cell below the diagonal set to NA.
get_upper_tri <- function(cormat) {
  below_diagonal <- lower.tri(cormat)
  cormat[below_diagonal] <- NA
  cormat
}

# Blank out the redundant lower half of the (symmetric) correlation matrix.
upper_tri <- get_upper_tri(cormat)
upper_tri

# Melt to long format (Var1, Var2, value), dropping the NA cells.
# reshape2::melt is called by its full name: once the `reshape` package is
# attached further down, a bare `melt` resolves to reshape's version, whose
# output columns are not named Var1/Var2, and re-running this cell would fail.
library(reshape2)
melted_cormat <- reshape2::melt(upper_tri, na.rm = TRUE)

# Tile heatmap: blue for -1, white for 0, red for +1.
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limit = c(-1, 1), space = "Lab",
    name = "Pearson\nCorrelation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)) +
  coord_fixed() # square tiles
# Print the plain heatmap
print(ggheatmap)

# Same heatmap with the coefficient written in each tile, axis titles and
# grid removed, and a horizontal legend placed inside the empty triangle.
ggheatmap +
  geom_text(aes(Var2, Var1, label = value), color = "black", size = 2) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.justification = c(1, 0),
    legend.position = c(0.6, 0.7),
    legend.direction = "horizontal"
  ) +
  guides(fill = guide_colorbar(
    barwidth = 7, barheight = 1,
    title.position = "top", title.hjust = 0.5
  )) +
  ggtitle("Correlation Plot of Solar Electricity Production Dataset")

In [None]:
d_ns

In [None]:
# Heatmap of the solar energy production: total electricity per month (rows)
# and year (columns).
# NOTE: attaching plyr and reshape after dplyr/reshape2 masks several of their
# functions (e.g. mutate, summarise, melt) for every cell run afterwards.
library(reshape) # used for cast function
library(plyr) # used for ddply function
library(RColorBrewer) # used to customize heatmap colors

# Total production for each (Year, Month) pair.
yearMonth <- ddply(d_ns, c("Year", "Month"), plyr::summarize, sum = sum(Electricity_KW_HR))

# Wide table: one row per year, one column per month, cells = monthly total.
yearMonth_matrix <- cast(yearMonth, Year ~ Month, value = "sum")
rownames(yearMonth_matrix) <- yearMonth_matrix[, 1]
yearMonth_matrix[, 1] <- NULL
# Transpose so months are rows and years are columns, then reverse the row
# order. drop = FALSE keeps a matrix even if there is a single year.
yearMonth_matrix <- as.matrix(t(yearMonth_matrix))
yearMonth_matrix <- yearMonth_matrix[nrow(yearMonth_matrix):1, , drop = FALSE]

# Preview at most the first 5 rows and columns; a fixed 1:5 index fails when
# the data covers fewer than five years.
preview_rows <- seq_len(min(5, nrow(yearMonth_matrix)))
preview_cols <- seq_len(min(5, ncol(yearMonth_matrix)))
yearMonth_matrix[preview_rows, preview_cols, drop = FALSE]

# Heatmap two
library(gplots)
# The "Oranges" palette offers at most 9 colours; asking for 11 only triggers
# a warning and still returns 9.
heatmap.2(yearMonth_matrix,
          dendrogram = "none", Colv = FALSE, Rowv = FALSE,
          scale = "none", col = brewer.pal(9, "Oranges"),
          key = TRUE, density.info = "none", key.title = NA, key.xlab = "Electricity (KWh)",
          trace = "none",
          main = "Solar Energy Production",
          xlab = "Year",
          ylab = "Month")

In [None]:
# Check correlations (as scatterplots), distribution and print corrleation coefficient 
#ggpairs(d_ns) 

In [None]:
# Add a single timestamp column built from Year/Month/Day/Hour.
# dplyr is named explicitly because plyr, attached in an earlier cell, masks
# mutate(); the grouping is dropped first since the timestamp is a row-wise
# computation and does not need it.
df <- d_ns %>%
  dplyr::ungroup() %>%
  dplyr::mutate(date = make_datetime(Year, Month, Day, Hour))
#head(df)

# Centre the title of every plot below.
centered_title <- theme(plot.title = element_text(hjust = 0.5))

# Electricity production over time, with a smoothed trend line
t1 <- ggplot(df, aes(date, Electricity_KW_HR)) +
  geom_line(col = 'orange') +
  geom_smooth() +
  ggtitle('Electricity Production (KWh)') + xlab('Date') + ylab('Electricity (KWh)') +
  centered_title

# Solar elevation over time
t2 <- ggplot(df, aes(date, Solar_Elevation)) +
  geom_line(col = 'yellow2') +
  geom_smooth(col = 'red') +
  ggtitle('Solar Elevation') + xlab('Date') + ylab('Solar_Elevation (Degree)') +
  centered_title

# Wind speed over time
t3 <- ggplot(df, aes(date, Wind_Speed)) +
  geom_line(col = 'skyblue1') +
  geom_smooth(col = 'magenta2') +
  ggtitle('Wind Speed over Time') + xlab('Date') + ylab('Wind_Speed (m/s)') +
  centered_title

# Visibility over time
t4 <- ggplot(df, aes(date, Visibility)) +
  geom_line(col = 'coral1') +
  geom_smooth(col = 'darkslategray2') +
  ggtitle('Visibility over Time') + xlab('Date') + ylab('Visibility (Km)') +
  centered_title

# Temperature over time
t5 <- ggplot(df, aes(date, Temperature)) +
  geom_line(col = 'green2') +
  geom_smooth(col = 'hotpink1') +
  ggtitle('Temperature') + xlab('Date') + ylab('Temperature (degree C)') +
  centered_title

# Dew point over time
t6 <- ggplot(df, aes(date, Dew_Point)) +
  geom_line(col = 'purple') +
  geom_smooth(col = 'hotpink1') +
  ggtitle('Dew Point over Time') + xlab('Date') + ylab('Dew Point') +
  centered_title

# Arrange the six plots on a 3 x 2 grid
ggarrange(t1, t2, t3, t4, t5, t6,
          ncol = 3, nrow = 2)

In [None]:
str(d_ns)

In [None]:
######### BOX PLOTS ###########
# One boxplot per variable, showing its distribution within each month.
new <- d_ns
# Month numbers 1..12 -> "JAN".."DEC". The levels are given explicitly so the
# labels still line up (and no error is raised) if some month is absent.
new$Month <- factor(
  new$Month,
  levels = 1:12,
  labels = c("JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC")
)

fill <- "gold1"
line <- "black"

# Boxplot of one column per month.
#   data        - data frame with a `Month` factor and the column to plot
#   column      - name of the column to plot, as a string
#   y_label     - y-axis title
#   title_label - text used in the plot title (defaults to y_label)
#   box_fill, box_line - box fill and outline colours
# Returns the ggplot object; outliers are drawn in red.
monthly_boxplot <- function(data, column, y_label, title_label = y_label,
                            box_fill = "gold1", box_line = "black") {
  ggplot(data, aes(x = Month, y = .data[[column]])) +
    geom_boxplot(fill = box_fill, colour = box_line, alpha = 0.7, outlier.colour = "red") +
    scale_x_discrete(name = "Month") +
    scale_y_continuous(name = y_label) +
    ggtitle(paste("Boxplot of", title_label, "Per month")) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5))
}

# Each plot is stored and then printed by naming it on its own line.
B1 <- monthly_boxplot(new, "Electricity_KW_HR", "Electricity (KWh)",
                      title_label = "Electricity Production (KWh)",
                      box_fill = fill, box_line = line)
B1

B2 <- monthly_boxplot(new, "Solar_Elevation", "Solar Elevation (degree)",
                      box_fill = fill, box_line = line)
B2

B3 <- monthly_boxplot(new, "Visibility", "Visibility", box_fill = fill, box_line = line)
B3

B4 <- monthly_boxplot(new, "Temperature", "Temperature", box_fill = fill, box_line = line)
B4

B5 <- monthly_boxplot(new, "Cloud_Cover_Fraction", "Cloud Cover Fraction",
                      box_fill = fill, box_line = line)
B5

B6 <- monthly_boxplot(new, "Dew_Point", "Dew Point", box_fill = fill, box_line = line)
B6

B7 <- monthly_boxplot(new, "Humidity_Fraction", "Humidity Fraction",
                      box_fill = fill, box_line = line)
B7

B8 <- monthly_boxplot(new, "Precipitation", "Precipitation", box_fill = fill, box_line = line)
B8

B9 <- monthly_boxplot(new, "Pressure", "Pressure", box_fill = fill, box_line = line)
B9

B10 <- monthly_boxplot(new, "Wind_Speed", "Wind Speed", box_fill = fill, box_line = line)
B10

In [None]:
new

**MODELS**
* **Multivariate Regression Models without transformation or normalization**

In [None]:
####### Linear Regression Models without transformation or normalization ########
# Random 80% / 20% split of d_ns into training and validation rows.
smp_size <- floor(0.8 * nrow(d_ns))

# Fixed seed so the same rows are drawn on every run.
set.seed(123)
train_ind <- sample(seq_len(nrow(d_ns)), size = smp_size)

# Rows drawn above form the training set, all remaining rows the validation
# set. The first column is dropped from both: it only holds the constant
# 'solar array' label and carries no information.
train <- d_ns[train_ind, -1]
validation <- d_ns[-train_ind, -1]

head(train)
head(validation)

In [None]:
#sum(is.na(train))
#indx<-apply(train, 2, function(x) any(is.na(x) | is.infinite(x)))
#colnames[indx]

In [None]:
#sapply(train, function(x) sum(is.na(x)))

**Fit All Variables**

In [None]:
# Baseline model: regress Electricity_KW_HR on every other column of `train`
# except Dew_Point, which is left out for its high collinearity.
fitAll <- lm(Electricity_KW_HR ~ . - Dew_Point, data = train)
summary(fitAll)

# Predict the held-out validation rows.
fitAll_predict_validation <- predict(fitAll, newdata = validation)

# RMSE between observed and predicted production.
library(Metrics)
validation_rmse <- rmse(validation$Electricity_KW_HR, fitAll_predict_validation)
print(paste("root-mean-square error between actual and predicted", validation_rmse))

# Distribution of the raw predictions.
cat("\n")
print("summary of predicted Electricity_KW_HR values")
print(summary(fitAll_predict_validation))

# The linear model can predict negative production, which is not meaningful
# for electricity output, so every non-positive prediction is set to 0.
Output2Mod <- replace(fitAll_predict_validation, fitAll_predict_validation <= 0, 0)

print("summary of predicted count values after replaced the negative values")
print(summary(Output2Mod))

# RMSE again, now on the clamped predictions.
print(paste("root-mean-square error between actual and predicted",rmse(validation$Electricity_KW_HR, Output2Mod)))

cat("\n")
# RMSLE penalizes under-prediction more than over-prediction; it is computed
# on the clamped predictions.
validaion_rmsle <- rmsle(validation$Electricity_KW_HR, Output2Mod)
print(paste("root-mean-square-log error value after replaced the negative values", validaion_rmsle))

It looks like 11 terms (including the intercept) come up as significant at a p-value of less than 0.05. The variables Day and Precipitation do not come up as significant, so we should remove them and check whether any other variable in the model then becomes non-significant. We start by removing Day first, to see whether Precipitation still comes up as not significant.

We can also see that the F-statistic comes out at 4037, which is really high. The multiple R-squared and the adjusted R-squared both come out at 0.61. Therefore, this seems to be a good model, since the total number of variables drops from 13 to 10. However, let's see if we can make this model more parsimonious by trying backward elimination, forward selection, and stepwise regression.

In [None]:
# Revised model 1
# All predictors except Dew_Point and Day (Day was not significant in the
# full model).
m1<-lm(Electricity_KW_HR~.-Dew_Point -Day, data=train)
summary(m1)

In the revised model, neither Precipitation nor Wind_Speed comes up as significant. Therefore, we will remove Precipitation and see if Wind_Speed still comes up as not significant.

In [None]:
# Revised model 2
# All predictors except Dew_Point, Day and Precipitation.
m2<-lm(Electricity_KW_HR~.-Dew_Point -Day -Precipitation, data=train)
summary(m2)

In revised model 2, Wind_Speed does not come up as significant. Therefore, we will remove Wind_Speed and see if any other variables come up as not significant.

In [None]:
# Revised model 3
# All predictors except Dew_Point, Day, Precipitation and Wind_Speed.
m3<-lm(Electricity_KW_HR~.-Dew_Point -Day -Precipitation -Wind_Speed, data=train)
summary(m3)

**Backward Elimination**

In [None]:
# Backward elimination by AIC, starting from the full model fitAll.
fitAllBW <- step(fitAll, direction = 'backward')
summary(fitAllBW)
#names(fitAllBW)
plot(fitAllBW) # standard lm diagnostic plots
# Variance inflation factors of the retained predictors.
fitAllBW_VIF <- vif(fitAllBW)
fitAllBW_VIF

# Predict the held-out validation rows.
fitAllBW_predict_validation <- predict(fitAllBW, newdata = validation)

# RMSE between observed and predicted production.
library(Metrics)
validation_rmse <- rmse(validation$Electricity_KW_HR, fitAllBW_predict_validation)
print("root-mean-square error between actual and predicted")
print(validation_rmse)

# Distribution of the raw predictions ...
cat("\n")
print("summary of predicted Electricity_KW_HR values")
summary(fitAllBW_predict_validation)

# ... next to the distribution of the observed values.
print("summary of actual Electricity_KW_HR values")
summary(validation$Electricity_KW_HR)

# Negative production is not meaningful for electricity output, so every
# non-positive prediction is set to 0.
Output2Mod <- replace(fitAllBW_predict_validation, fitAllBW_predict_validation <= 0, 0)

print("summary of predicted count values after replaced the negative values")
summary(Output2Mod)

# RMSE again, now on the clamped predictions.
print("root-mean-square error between actual and predicted")
print(rmse(validation$Electricity_KW_HR, Output2Mod))

cat("\n")
# RMSLE penalizes under-prediction more than over-prediction; it is computed
# on the clamped predictions.
validaion_rmsle <- rmsle(validation$Electricity_KW_HR, Output2Mod)
print("root-mean-square-log error value after replaced the negative values")
print(validaion_rmsle)


It looks like backward elimination also picked 10 of the 13 independent variables. The final AIC score is 684466.

**Forward Selection**

In [None]:
# Starting point for forward selection: an intercept-only model (it predicts
# the mean of Electricity_KW_HR). Variables are then added one at a time from
# the scope of the full model.
fitStart<-lm(Electricity_KW_HR ~ 1, data=train)
summary(fitStart)

In [None]:
# Forward selection by AIC: start from the intercept-only model and consider
# adding any term of the full model fitAll.
fitAllFW <- step(fitStart, direction = 'forward', scope = formula(fitAll))
summary(fitAllFW)
names(fitAllFW)
plot(fitAllFW) # standard lm diagnostic plots
# Variance inflation factors of the selected predictors.
fitAllFW_VIF <- vif(fitAllFW)
fitAllFW_VIF

# Predict the held-out validation rows.
fitAllFW_predict_validation <- predict(fitAllFW, newdata = validation)

# RMSE between observed and predicted production.
library(Metrics)
validation_rmse <- rmse(validation$Electricity_KW_HR, fitAllFW_predict_validation)
print("root-mean-square error between actual and predicted")
print(validation_rmse)

# Distribution of the raw predictions ...
cat("\n")
print("summary of predicted Electricity_KW_HR values")
summary(fitAllFW_predict_validation)

# ... next to the distribution of the observed values.
print("summary of actual Electricity_KW_HR values")
summary(validation$Electricity_KW_HR)

# Negative production is not meaningful for electricity output, so every
# non-positive prediction is set to 0.
Output2Mod <- replace(fitAllFW_predict_validation, fitAllFW_predict_validation <= 0, 0)

print("summary of predicted count values after replaced the negative values")
summary(Output2Mod)

# RMSE again, now on the clamped predictions.
print("root-mean-square error between actual and predicted")
print(rmse(validation$Electricity_KW_HR, Output2Mod))

cat("\n")
# RMSLE penalizes under-prediction more than over-prediction; it is computed
# on the clamped predictions.
validaion_rmsle <- rmsle(validation$Electricity_KW_HR, Output2Mod)
print("root-mean-square-log error value after replaced the negative values")
print(validaion_rmsle)

Forward selection also gave us the same 10 of the 13 independent variables. Next, we will try the stepwise method to see if we still get the same 10 independent variables.

**Stepwise (combination of forward selection and backward elimination) Method**
The AIC criterion is used for selection in the forward, backward, and stepwise methods.

In [None]:
# Stepwise selection by AIC (terms may be dropped and re-added), starting
# from the full model fitAll.
fitAllBoth <- step(fitAll, direction = 'both') #, scope=formula(fitAll))
summary(fitAllBoth)
names(fitAllBoth)
plot(fitAllBoth) # standard lm diagnostic plots
# Variance inflation factors of the retained predictors.
fitAllBoth_VIF <- vif(fitAllBoth)
fitAllBoth_VIF

# Predict the held-out validation rows.
fitAllBoth_predict_validation <- predict(fitAllBoth, newdata = validation)

# RMSE between observed and predicted production.
library(Metrics)
validation_rmse <- rmse(validation$Electricity_KW_HR, fitAllBoth_predict_validation)
print("root-mean-square error between actual and predicted")
print(validation_rmse)

# Distribution of the raw predictions ...
cat("\n")
print("summary of predicted Electricity_KW_HR values")
summary(fitAllBoth_predict_validation)

# ... next to the distribution of the observed values.
print("summary of actual Electricity_KW_HR values")
summary(validation$Electricity_KW_HR)

# Negative production is not meaningful for electricity output, so every
# non-positive prediction is set to 0.
Output2Mod <- replace(fitAllBoth_predict_validation, fitAllBoth_predict_validation <= 0, 0)

print("summary of predicted count values after replaced the negative values")
summary(Output2Mod)

# RMSE again, now on the clamped predictions.
print("root-mean-square error between actual and predicted")
print(rmse(validation$Electricity_KW_HR, Output2Mod))

cat("\n")
# RMSLE penalizes under-prediction more than over-prediction; it is computed
# on the clamped predictions.
validaion_rmsle <- rmsle(validation$Electricity_KW_HR, Output2Mod)
print("root-mean-square-log error value after replaced the negative values")
print(validaion_rmsle)

The stepwise method, which combines forward selection and backward elimination, also selected the same 10-variable model.

In [None]:
# Linear model with every main effect and every pairwise (two-way) interaction
# of the predictors in `train`, with Dew_Point left out of the predictor set.
fullInterTerm_m0<-lm(Electricity_KW_HR~(.-Dew_Point)^2, data=train)
# Display the fitted call and coefficients.
fullInterTerm_m0

In [None]:
# Coefficient table and fit statistics for the interaction model.
summary(fullInterTerm_m0)
# Names of the components stored on the fitted lm object.
names(fullInterTerm_m0)
# Model frame used for the fit (this prints every row).
fullInterTerm_m0$model

* **Exhaustive search using the leaps package**

In [None]:
# Structural checks on the training data before the subset search:
# first rows, then the dimensions of columns 1-13 and of column 14.
head(train)
dim(train[,1:13])
# NOTE(review): if `train` is a plain data.frame, train[,14] is a vector and
# dim() returns NULL here; confirm whether length() was intended.
dim(train[,14])
class(train[,14])

In [None]:
# Predictor matrix and response for a best-subset search with regsubsets().
# model.matrix() always emits an "(Intercept)" column and regsubsets() adds its
# own intercept, so that column is dropped here. The original call passed a
# stray positional -1, which landed in `contrasts.arg`, was ignored with a
# warning, and left the intercept column in x.
x <- model.matrix(Electricity_KW_HR ~ ., data = train)[, -1]
y <- train$Electricity_KW_HR

# nbest = 1 keeps the single best model of each size.
regfit.full <- regsubsets(x, y, nbest = 1)
summary(regfit.full, matrix.logical = TRUE)

#head(x)
#head(y)

In [None]:
# Sanity check: the number of rows in x should equal the length of y.
dim(x)
length(y)

In [None]:
# Inspect the `tol` component stored on the regsubsets fit.
regfit.full$tol

In [None]:
# Re-run the same best-subset search and print the returned object directly.
regsubsets(x, y, nbest=1)

In [None]:
# Exhaustive best-subset search over all predictors except Dew_Point,
# keeping the two best models of each size (nbest=2).
leaps<-regsubsets(Electricity_KW_HR ~ . -Dew_Point, data=train, nbest=2, method='exhaustive')
#summary(leaps)
# Plot BIC against subset size for the candidate models.
# NOTE(review): ylim is given as (upper, lower), presumably to flip the axis; confirm.
subsets(leaps, statistic="bic", ylim=c(-25000,-31400))

In [None]:
######################### leap model ###################################
# Two-predictor model (cloud cover + solar elevation) with its diagnostics
# and variance inflation factors.
L_BIC <- lm(Electricity_KW_HR ~ Cloud_Cover_Fraction + Solar_Elevation, data = train)
summary(L_BIC)
names(L_BIC)
plot(L_BIC)
L_BIC_VIF <- vif(L_BIC)
L_BIC_VIF

# Score the model on the validation split.
L_BIC_predict_validation <- predict(L_BIC, newdata = validation)

library(Metrics)
observed_kwh <- validation$Electricity_KW_HR
validation_rmse <- rmse(observed_kwh, L_BIC_predict_validation)
print("root-mean-square error between actual and predicted")
print(validation_rmse)

# Distribution of the predictions next to the observed values.
cat("\n")
print("summary of predicted Electricity_KW_HR values")
summary(L_BIC_predict_validation)

print("summary of actual Electricity_KW_HR values")
summary(observed_kwh)

# Predictions at or below zero are replaced by 0 before re-scoring.
Output2Mod <- pmax(L_BIC_predict_validation, 0)

print("summary of predicted count values after replaced the negative values")
summary(Output2Mod)

# RMSE on the clamped predictions.
print("root-mean-square error between actual and predicted")
print(rmse(observed_kwh, Output2Mod))

cat("\n")
# RMSLE penalises under-prediction more heavily than RMSE.
validaion_rmsle <- rmsle(observed_kwh, Output2Mod)
print("root-mean-square-log error value after replaced the negative values")
print(validaion_rmsle)

In [None]:
# Draw the Mallows' Cp subset plot for the `leaps` search, then pass the value
# returned by subsets() on to plot().
plot(subsets(leaps, statistic="cp", ylim=c(0,200)))

In [None]:
# Eight-predictor model fitted on the training split.
L_cp <- lm(
  Electricity_KW_HR ~ Year + Month + Hour + Cloud_Cover_Fraction +
    Humidity_Fraction + Pressure + Temperature + Solar_Elevation,
  data = train
)
L_cp
summary(L_cp)

# Score the model on the validation split.
L_cp_predict_validation <- predict(L_cp, newdata = validation)

library(Metrics)
kwh_observed <- validation$Electricity_KW_HR
validation_rmse <- rmse(kwh_observed, L_cp_predict_validation)
print("root-mean-square error between actual and predicted")
print(validation_rmse)

# Distribution of the predictions next to the observed values.
cat("\n")
print("summary of predicted Electricity_KW_HR values")
summary(L_cp_predict_validation)

print("summary of actual Electricity_KW_HR values")
summary(kwh_observed)

# Predictions at or below zero are replaced by 0 before re-scoring.
Output2Mod <- pmax(L_cp_predict_validation, 0)

print("summary of predicted count values after replaced the negative values")
summary(Output2Mod)

# RMSE on the clamped predictions.
print("root-mean-square error between actual and predicted")
print(rmse(kwh_observed, Output2Mod))

cat("\n")
# RMSLE penalises under-prediction more heavily than RMSE.
validaion_rmsle <- rmsle(kwh_observed, Output2Mod)
print("root-mean-square-log error value after replaced the negative values")
print(validaion_rmsle)

In [None]:
# Plot R-squared against subset size for the `leaps` search,
# restricted to the given x and y ranges.
subsets(leaps, statistic="rsq", xlim=c(0,6), ylim=c(0.55,0.7))

In [None]:
# Two-predictor model (humidity + solar elevation) fitted on the training split.
L_rsq <- lm(Electricity_KW_HR ~ Humidity_Fraction + Solar_Elevation, data = train)
L_rsq
summary(L_rsq)

# Apply the model to the validation split.
L_rsq_predict_validation <- predict(L_rsq, newdata = validation)

# RMSE between actual and predicted values.
library(Metrics)
L_rsq_rmse <- rmse(validation$Electricity_KW_HR, L_rsq_predict_validation)
print("root-mean-square error between actual and predicted")
# Print this model's RMSE. The original printed `validation_rmse`, which still
# held the value left over from a previously evaluated model.
print(L_rsq_rmse)

# Summary of the predicted Electricity_KW_HR values.
cat("\n")
print("summary of predicted Electricity_KW_HR values")
summary(L_rsq_predict_validation)

# Summary of the actual Electricity_KW_HR values.
print("summary of actual Electricity_KW_HR values")
summary(validation$Electricity_KW_HR)

# Predictions at or below zero are replaced by 0 before re-scoring.
Output2Mod <- L_rsq_predict_validation
Output2Mod[L_rsq_predict_validation <= 0] <- 0

# Summary of the predictions after clamping.
print("summary of predicted count values after replaced the negative values")
summary(Output2Mod)

# RMSE on the clamped predictions.
print("root-mean-square error between actual and predicted")
print(rmse(validation$Electricity_KW_HR, Output2Mod))

cat("\n")
# RMSLE penalises under-prediction more heavily than RMSE.
validaion_rmsle <- rmsle(validation$Electricity_KW_HR, Output2Mod)
print("root-mean-square-log error value after replaced the negative values")
print(validaion_rmsle)

In [None]:
# Plot adjusted R-squared against subset size for the `leaps` search,
# restricted to the given x and y ranges.
subsets(leaps, statistic="adjr2", xlim=c(0,6), ylim=c(0.55,0.6))

In [None]:
# Three-predictor model (cloud cover + humidity + solar elevation) fitted on
# the training split.
L_adjr2 <- lm(Electricity_KW_HR ~ Cloud_Cover_Fraction + Humidity_Fraction + Solar_Elevation, data = train)
L_adjr2
summary(L_adjr2)

# Apply the model to the validation split.
L_adjr2_predict_validation <- predict(L_adjr2, newdata = validation)

# RMSE between actual and predicted values.
library(Metrics)
L_adjr2_rmse <- rmse(validation$Electricity_KW_HR, L_adjr2_predict_validation)
print("root-mean-square error between actual and predicted")
# Print this model's RMSE. The original printed `validation_rmse`, which still
# held the value left over from a previously evaluated model.
print(L_adjr2_rmse)

# Summary of the predicted Electricity_KW_HR values.
cat("\n")
print("summary of predicted Electricity_KW_HR values")
summary(L_adjr2_predict_validation)

# Summary of the actual Electricity_KW_HR values.
print("summary of actual Electricity_KW_HR values")
summary(validation$Electricity_KW_HR)

# Predictions at or below zero are replaced by 0 before re-scoring.
Output2Mod <- L_adjr2_predict_validation
Output2Mod[L_adjr2_predict_validation <= 0] <- 0

# Summary of the predictions after clamping.
print("summary of predicted count values after replaced the negative values")
summary(Output2Mod)

# RMSE on the clamped predictions.
print("root-mean-square error between actual and predicted")
print(rmse(validation$Electricity_KW_HR, Output2Mod))

cat("\n")
# RMSLE penalises under-prediction more heavily than RMSE.
validaion_rmsle <- rmsle(validation$Electricity_KW_HR, Output2Mod)
print("root-mean-square-log error value after replaced the negative values")
print(validaion_rmsle)

In [None]:
# Plot the residual sum of squares against subset size for the `leaps` search,
# zoomed in on subset sizes between 5.5 and 8.5.
subsets(leaps, statistic="rss", xlim=c(5.5,8.5), ylim=c(3.27e+13,3.30e+13))

In [None]:
########### Takes long time to run ###############
## leaps1<-regsubsets(Electricity_KW_HR~.^2, data=train, nbest=5, method='exhaustive', really.big=T)
#summary(leaps1)

In [None]:
# BIC plot for the pairwise-interaction subset search. `leaps1` is only created
# by the slow regsubsets() call that is commented out in this notebook, so the
# plot is drawn only when that object exists instead of failing with
# "object 'leaps1' not found".
if (exists("leaps1")) {
  subsets(leaps1, statistic="bic", xlim=c(0,40), ylim=c(-42478,-42476))
} else {
  message("leaps1 is not defined: run the interaction-term regsubsets() search first to draw this BIC plot.")
}

What are the best predictors based on above evaluations? And why are we choosing these predictors?

Or should we choose model based on the one of the model above?

What is the accuracy of this model on the training, validation, and testing datasets? What is the MSE for each?





# Transformation and Normalization
We will first remove all the night-time values from Electricity_KW_HR, since they are zeros.
Then we will apply bestNormalize to find the best normalizing transformation for each variable.
Then we will apply scaling so that all values lie between 0 and 1.


In [None]:
# Per-column summary statistics of d_ns before any transformation.
summary(d_ns)

### Normalize using minmax

In [None]:
# Min-max rescaling of a numeric vector onto [0, 1].
#
# x:     numeric vector.
# na.rm: when TRUE (default) missing values are ignored when finding the
#        minimum and maximum, so NA entries stay NA but no longer turn the
#        whole result into NA.
# Returns a vector the same length as x. A constant input yields NaN
# (division by a zero range), as before.
normalize <- function(x, na.rm = TRUE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Histogram of the min-max scaled response.
x <- normalize(d_ns$Electricity_KW_HR)
new <- data.frame(x)
th1 <- ggplot(data = new, aes(x = x)) +
    geom_histogram(color = "black", fill = "darkgoldenrod", bins = 25) +
    theme(panel.grid.minor = element_blank(), panel.grid.major = element_line(colour = "#E8E8E8"), axis.line = element_line(colour = "black")) +
    labs(title = "Min Max Transformation")
# The plot was previously stored in th1 but never drawn.
th1

# Histogram of the Yeo-Johnson transformed response. yeojohnson() returns a
# list-like object whose `x` element is the ORIGINAL data and whose `x.t`
# element is the transformed data; wrapping the whole object in data.frame()
# made aes(x = x) plot the untransformed values, so x.t is extracted explicitly.
x <- yeojohnson(d_ns$Electricity_KW_HR)
new <- data.frame(x = x$x.t)
ggplot(data = new, aes(x = x)) +
    geom_histogram(color = "black", fill = "darkgoldenrod", bins = 25) +
    theme(panel.grid.minor = element_blank(), panel.grid.major = element_line(colour = "#B2182B"), axis.line = element_line(colour = "black")) +
    labs(title = "YeoJohnson Transformation")

In [None]:
# Fit the Yeo-Johnson transformation on the response and plot a histogram of
# the values returned by predict() on the fitted object.
yeojohnson_obj <- yeojohnson(d_ns$Electricity_KW_HR)
new <- data.frame(x = predict(yeojohnson_obj))

yj_theme <- theme(
  panel.grid.minor = element_blank(),
  panel.grid.major = element_line(colour = "#E8E8E8"),
  axis.line = element_line(colour = "black")
)
ggplot(new, aes(x = x)) +
  geom_histogram(colour = "black", fill = "darkgoldenrod") +
  yj_theme +
  labs(title = "YeoJohnson Transformation")

In [None]:
# Keep only the rows of d.s that have no missing value in any column, then
# report the size and per-column summary of the result.
rows_complete <- complete.cases(d.s)
df_cpt <- d.s[rows_complete, ]
dim(df_cpt)
summary(df_cpt)

In [None]:
### Histograms of the untransformed variables in d_ns ########
# Shared look for every panel: light major grid, no minor grid, black axes.
raw_hist_theme <- theme(
  panel.grid.minor = element_blank(),
  panel.grid.major = element_line(colour = "#E8E8E8"),
  axis.line = element_line(colour = "black")
)
# 25-bin histogram layer with a black outline and the given fill colour.
raw_hist_bars <- function(fill_colour) {
  geom_histogram(color = "black", fill = fill_colour, bins = 25)
}

# The response panel additionally carries a density layer.
h11 <- ggplot(d_ns, aes(x = Electricity_KW_HR)) +
  raw_hist_bars("#AFE5AD") +
  geom_density(alpha = 1, fill = "red") +
  raw_hist_theme
h12 <- ggplot(d_ns, aes(x = Solar_Elevation)) + raw_hist_bars("cadetblue2") + raw_hist_theme
h13 <- ggplot(d_ns, aes(x = Wind_Speed)) + raw_hist_bars("azure") + raw_hist_theme
h14 <- ggplot(d_ns, aes(x = Visibility)) + raw_hist_bars("bisque") + raw_hist_theme

# Arrange plots in sets of 4
ggarrange(h11, h12, h13, h14, ncol = 2, nrow = 2)

h15 <- ggplot(d_ns, aes(x = Temperature)) + raw_hist_bars("blueviolet") + raw_hist_theme
h16 <- ggplot(d_ns, aes(x = Pressure)) + raw_hist_bars("coral2") + raw_hist_theme
h17 <- ggplot(d_ns, aes(x = Precipitation)) + raw_hist_bars("cornflowerblue") + raw_hist_theme
h18 <- ggplot(d_ns, aes(x = Humidity_Fraction)) + raw_hist_bars("chartreuse2") + raw_hist_theme

# Arrange plots in sets of 4
ggarrange(h15, h16, h17, h18, ncol = 2, nrow = 2)

h19 <- ggplot(d_ns, aes(x = Dew_Point)) + raw_hist_bars("cyan") + raw_hist_theme
h20 <- ggplot(d_ns, aes(x = Cloud_Cover_Fraction)) + raw_hist_bars("darkgoldenrod") + raw_hist_theme

# Arrange plots in sets of 2
ggarrange(h19, h20, ncol = 2, nrow = 2)

In [None]:
# Display the d_ns table.
d_ns

In [None]:
# Reset df_cpt to the complete-case rows of d.s so the normalization cells
# below start from a clean table. The original reset it to an EMPTY
# data.frame, which makes every later df_cpt$<column> lookup NULL and breaks
# bestNormalize() and the column-append loop when the notebook runs top to bottom.
df_cpt <- d.s[complete.cases(d.s), ]

In [None]:
#d_ns_normalized <- d_ns %>% mutate_at(c(continuous_vars2), funs(c(scale(.))))

In [None]:
# Display the current contents of df_cpt.
df_cpt

In [None]:
# Histograms of Electricity_KW_HR under each candidate bestNormalize transform.
# Each transform returns a list-like object: `x` holds the ORIGINAL data and
# `x.t` the transformed data. The original code wrapped the whole object in
# data.frame(), so aes(x = x) plotted the untransformed values (or failed for
# objects that carry non-vector components). x.t is now extracted explicitly
# and each plot is titled so the five panels can be told apart.
transform_theme <- theme(
  panel.grid.minor = element_blank(),
  panel.grid.major = element_line(colour = "#E8E8E8"),
  axis.line = element_line(colour = "black")
)

x <- arcsinh_x(d_ns$Electricity_KW_HR)
new <- data.frame(x = x$x.t)
ggplot(data = new, aes(x = x)) +
  geom_histogram(color = "black", fill = "darkgoldenrod", bins = 25) +
  transform_theme +
  labs(title = "arcsinh Transformation")

x <- log_x(d_ns$Electricity_KW_HR)
new <- data.frame(x = x$x.t)
ggplot(data = new, aes(x = x)) +
  geom_histogram(color = "black", fill = "darkgoldenrod", bins = 25) +
  transform_theme +
  labs(title = "log Transformation")

x <- sqrt_x(d_ns$Electricity_KW_HR)
new <- data.frame(x = x$x.t)
ggplot(data = new, aes(x = x)) +
  geom_histogram(color = "black", fill = "darkgoldenrod", bins = 25) +
  transform_theme +
  labs(title = "sqrt Transformation")

x <- orderNorm(d_ns$Electricity_KW_HR)
new <- data.frame(x = x$x.t)
ggplot(data = new, aes(x = x)) +
  geom_histogram(color = "black", fill = "darkgoldenrod", bins = 25) +
  transform_theme +
  labs(title = "orderNorm Transformation")

x <- yeojohnson(d_ns$Electricity_KW_HR)
new <- data.frame(x = x$x.t)
ggplot(data = new, aes(x = x)) +
  geom_histogram(color = "black", fill = "darkgoldenrod", bins = 25) +
  transform_theme +
  labs(title = "YeoJohnson Transformation")

In [None]:


# "Cloud_Cover_Fraction", "Dew_Point", "Humidity_Fraction","Precipitation","Pressure","Temperature","Visibility","Wind_Speed",
# Names of the bestNormalize transforms to apply and of the columns to transform.
normalization_techniques <- c("arcsinh_x", "log_x", "sqrt_x", "orderNorm", "yeojohnson")
continuous_vars2 <- c("Solar_Elevation", "Electricity_KW_HR")
best_normalize_results <- list()

# Fail early with a clear message if df_cpt does not hold the expected columns
# (for example because it was reset to an empty table).
stopifnot(all(continuous_vars2 %in% names(df_cpt)))

# Dealing with skew
# Z-Score normalization (mean = 0, standard deviation = 1).
# A lambda replaces the deprecated funs(); c() drops the matrix shape that
# scale() returns.
d_ns_normalized <- df_cpt %>% mutate_at(continuous_vars2, ~ c(scale(.)))

# Run bestNormalize on each continuous variable and keep the result by name.
for (con_var in continuous_vars2) {
  print(paste(con_var," bestNormalize process:", sep=""))
  best_normalize_results[[con_var]] <- bestNormalize(df_cpt[[con_var]])
}

# For each technique and variable, append the transformed values (x.t) to
# df_cpt as "<technique>_<variable>". The transform is computed from df_cpt
# itself: the original evaluated it on d_ns, whose row count differs from
# df_cpt whenever d.s has incomplete rows, so the assignment could not line up.
# match.fun() replaces eval(parse(text = ...)).
for (norm_tech in normalization_techniques) {
  transform_fun <- match.fun(norm_tech)
  for (con_var in continuous_vars2) {
    col_name <- paste0(norm_tech, "_", con_var)
    df_cpt[[col_name]] <- transform_fun(df_cpt[[con_var]])$x.t
  }
}

In [None]:
# Size and per-column summary of df_cpt after the transformed columns were appended.
dim(df_cpt)
summary(df_cpt)

In [None]:
## Histograms of the variables in df_cpt
# Shared look for every panel: light major grid, no minor grid, black axes.
cpt_hist_theme <- theme(
  panel.grid.minor = element_blank(),
  panel.grid.major = element_line(colour = "#E8E8E8"),
  axis.line = element_line(colour = "black")
)
# 25-bin histogram layer with a black outline and the given fill colour.
cpt_hist_bars <- function(fill_colour) {
  geom_histogram(color = "black", fill = fill_colour, bins = 25)
}

# The response panel additionally carries a density layer.
h21 <- ggplot(df_cpt, aes(x = Electricity_KW_HR)) +
  cpt_hist_bars("#AFE5AD") +
  geom_density(alpha = 1, fill = "red") +
  cpt_hist_theme
h22 <- ggplot(df_cpt, aes(x = Solar_Elevation)) + cpt_hist_bars("cadetblue2") + cpt_hist_theme
h23 <- ggplot(df_cpt, aes(x = Wind_Speed)) + cpt_hist_bars("azure") + cpt_hist_theme
h24 <- ggplot(df_cpt, aes(x = Visibility)) + cpt_hist_bars("bisque") + cpt_hist_theme

# Arrange plots in sets of 4
ggarrange(h21, h22, h23, h24, ncol = 2, nrow = 2)

h25 <- ggplot(df_cpt, aes(x = Temperature)) + cpt_hist_bars("blueviolet") + cpt_hist_theme
h26 <- ggplot(df_cpt, aes(x = Pressure)) + cpt_hist_bars("coral2") + cpt_hist_theme
h27 <- ggplot(df_cpt, aes(x = Precipitation)) + cpt_hist_bars("cornflowerblue") + cpt_hist_theme
h28 <- ggplot(df_cpt, aes(x = Humidity_Fraction)) + cpt_hist_bars("chartreuse2") + cpt_hist_theme

# Arrange plots in sets of 4
ggarrange(h25, h26, h27, h28, ncol = 2, nrow = 2)

h29 <- ggplot(df_cpt, aes(x = Dew_Point)) + cpt_hist_bars("cyan") + cpt_hist_theme
h30 <- ggplot(df_cpt, aes(x = Cloud_Cover_Fraction)) + cpt_hist_bars("darkgoldenrod") + cpt_hist_theme

# Arrange plots in sets of 2
ggarrange(h29, h30, ncol = 2, nrow = 2)

In [None]:
library(gridExtra)

# Build one 25-bin histogram of `values` with the given fill colour and title;
# `x_label` names the x axis.
norm_hist <- function(values, fill, plot_title, x_label) {
  ggplot(data = data.frame(value = values), aes(x = value)) +
    geom_histogram(color = "black", fill = fill, bins = 25) +
    ggtitle(plot_title) +
    xlab(x_label) +
    theme(panel.grid.minor = element_blank(),
          panel.grid.major = element_line(colour = "#E8E8E8"),
          axis.line = element_line(colour = "black"))
}

# One comparison page per continuous variable: raw values, z-score, and each
# transformed column that was appended to df_cpt.
# Fixes relative to the original cell:
#   * the unclosed par(mfrow = c(3, 3) call (a syntax error, and par() has no
#     effect on grid graphics anyway) is removed;
#   * p4/p5 (exp and lambert panels) were commented out but still passed to
#     grid.arrange(); they are no longer referenced;
#   * the "z-score" panel plotted the raw values; it now plots scale()d values.
for (con_var in continuous_vars2) {
  page_title <- paste0(con_var, " Normalization Comparison")
  arcsinh_col <- paste0("arcsinh_x_", con_var)
  log_col <- paste0("log_x_", con_var)
  orderNorm_col <- paste0("orderNorm_", con_var)
  sqrt_col <- paste0("sqrt_x_", con_var)
  yj_col <- paste0("yeojohnson_", con_var)

  # Stop with a readable message when an expected column is absent.
  needed_cols <- c(con_var, arcsinh_col, log_col, orderNorm_col, sqrt_col, yj_col)
  missing_cols <- setdiff(needed_cols, names(df_cpt))
  if (length(missing_cols) > 0) {
    stop("df_cpt is missing column(s): ", paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  title <- paste0("Histogram - Distribution of Values\n", con_var)

  p1 <- norm_hist(df_cpt[[con_var]], "#74a9cf", title, con_var)
  p2 <- norm_hist(c(scale(df_cpt[[con_var]])), "#AFE5AD",
                  paste0(title, " with z-score normalization"), con_var)
  p3 <- norm_hist(df_cpt[[arcsinh_col]], "#377F69",
                  paste0(title, " with arcsinh normalization"), arcsinh_col)
  p6 <- norm_hist(df_cpt[[log_col]], "#FFEC4F",
                  paste0(title, " with log normalization"), log_col)
  p7 <- norm_hist(df_cpt[[orderNorm_col]], "#C8F71A",
                  paste0(title, " with orderNorm normalization"), orderNorm_col)
  p8 <- norm_hist(df_cpt[[sqrt_col]], "#16D047",
                  paste0(title, " with sqrt normalization"), sqrt_col)
  p9 <- norm_hist(df_cpt[[yj_col]], "#1C87BB",
                  paste0(title, " with yeo johnson normalization"), yj_col)

  # Seven panels laid out on a three-column grid, with the page title on top.
  grid.arrange(p1, p2, p3, p6, p7, p8, p9, ncol = 3, top = page_title)
}

In [None]:
# The orderNorm transform was judged the best normalization for
# Electricity_KW_HR, so the orderNorm_* columns are used for modelling below.
# List the column names of df_cpt to locate them.
names(df_cpt)

In [None]:
# Modelling table: every orderNorm-transformed column of df_cpt.
# Columns are selected by name rather than by the hard-coded positions 36:45,
# which silently pick different (or non-existent) columns whenever the set of
# transformed variables or techniques changes. The models below refer to these
# columns by their orderNorm_* names.
norm_df <- df_cpt[, grep("^orderNorm_", names(df_cpt)), drop = FALSE]
stopifnot("orderNorm_Electricity_KW_HR" %in% names(norm_df))

In [None]:
# Dealing with skew
# Z-Score normalization (mean =0, standard deviation = 1)
# df_cpt_normalized <- df_cpt %>% mutate_at(c(continuous_vars2), funs(c(scale(.))))
# summary(df_cpt_normalized)

# MODELS

## Multivariate Regression Models with transformation or normalization

In [None]:
####### Linear Regression Models with transformation or normalization ########
# 80/20 train/validation partition of norm_df. The seed is set immediately
# before sampling so the partition is reproducible.
n_obs <- nrow(norm_df)
smp_size <- floor(0.8 * n_obs)
set.seed(123)
train_ind <- sample(seq_len(n_obs), size = smp_size)

train1 <- norm_df[train_ind, ]       # rows drawn for training
validation1 <- norm_df[-train_ind, ] # remaining rows for validation

head(train1)
head(validation1)

In [None]:
# Correlation plot of the columns of norm_df, with the coefficient printed
# in each tile.
ggcorr(norm_df, palette = "RdYlGn", name = "rho", label = TRUE, label_color = "black")

In [None]:
# Fit on all transformed predictors except the highly collinear Dew_Point.
fitAll <- lm(orderNorm_Electricity_KW_HR ~ . - orderNorm_Dew_Point, data = train1)
summary(fitAll)

# Predictions for the validation split (still on the orderNorm scale).
fitAll_predict_validation <- predict(fitAll, newdata = validation1)

# RMSE on the transformed scale.
library(Metrics)
validation_rmse <- rmse(validation1$orderNorm_Electricity_KW_HR, fitAll_predict_validation)
print("root-mean-square error between actual and predicted")
print(validation_rmse)

# Summary of the predicted values (transformed scale).
cat("\n")
print("summary of predicted Electricity_KW_HR values")
summary(fitAll_predict_validation)

# Summary of the actual values (transformed scale).
print("summary of actual Electricity_KW_HR values")
summary(validation1$orderNorm_Electricity_KW_HR)

# On the orderNorm scale negative values are ordinary (the scale is centred
# near zero), so clamping at 0 there discards valid predictions and RMSLE is
# undefined for actual values below -1. Predictions and actuals are therefore
# mapped back to the original Electricity_KW_HR scale first, and only then are
# negative forecasts replaced with 0.
# NOTE(review): assumes orderNorm_Electricity_KW_HR was derived from
# df_cpt$Electricity_KW_HR; confirm.
electricity_orderNorm <- orderNorm(df_cpt$Electricity_KW_HR)
actual_kwh <- predict(electricity_orderNorm,
                      newdata = validation1$orderNorm_Electricity_KW_HR,
                      inverse = TRUE)
Output2Mod <- predict(electricity_orderNorm,
                      newdata = unname(fitAll_predict_validation),
                      inverse = TRUE)
Output2Mod <- pmax(Output2Mod, 0)

# Summary of the back-transformed, clamped predictions.
print("summary of predicted count values after replaced the negative values")
summary(Output2Mod)

# RMSE on the original scale with negative forecasts replaced by 0.
print("root-mean-square error between actual and predicted")
print(rmse(actual_kwh, Output2Mod))

cat("\n")
# RMSLE penalises under-prediction more heavily than RMSE.
validaion_rmsle <- rmsle(actual_kwh, Output2Mod)
print("root-mean-square-log error value after replaced the negative values")
print(validaion_rmsle)

In [None]:
# Stepwise (both directions) AIC search starting from fitAll on the
# transformed data, with diagnostics and variance inflation factors.
fitAllBoth <- step(fitAll, direction = "both")
summary(fitAllBoth)
names(fitAllBoth)
plot(fitAllBoth)
fitAllBoth_VIF <- vif(fitAllBoth)
fitAllBoth_VIF

# Predictions for the validation split (still on the orderNorm scale).
fitAllBoth_predict_validation <- predict(fitAllBoth, newdata = validation1)

# RMSE on the transformed scale.
library(Metrics)
validation_rmse <- rmse(validation1$orderNorm_Electricity_KW_HR, fitAllBoth_predict_validation)
print("root-mean-square error between actual and predicted")
print(validation_rmse)

# Summary of the predicted values (transformed scale).
cat("\n")
print("summary of predicted Electricity_KW_HR values")
summary(fitAllBoth_predict_validation)

# Summary of the actual values (transformed scale).
print("summary of actual Electricity_KW_HR values")
summary(validation1$orderNorm_Electricity_KW_HR)

# On the orderNorm scale negative values are ordinary (the scale is centred
# near zero), so clamping at 0 there discards valid predictions and RMSLE is
# undefined for actual values below -1. Predictions and actuals are therefore
# mapped back to the original Electricity_KW_HR scale first, and only then are
# negative forecasts replaced with 0.
# NOTE(review): assumes orderNorm_Electricity_KW_HR was derived from
# df_cpt$Electricity_KW_HR; confirm.
electricity_orderNorm <- orderNorm(df_cpt$Electricity_KW_HR)
actual_kwh <- predict(electricity_orderNorm,
                      newdata = validation1$orderNorm_Electricity_KW_HR,
                      inverse = TRUE)
Output2Mod <- predict(electricity_orderNorm,
                      newdata = unname(fitAllBoth_predict_validation),
                      inverse = TRUE)
Output2Mod <- pmax(Output2Mod, 0)

# Summary of the back-transformed, clamped predictions.
print("summary of predicted count values after replaced the negative values")
summary(Output2Mod)

# RMSE on the original scale with negative forecasts replaced by 0.
print("root-mean-square error between actual and predicted")
print(rmse(actual_kwh, Output2Mod))

cat("\n")
# RMSLE penalises under-prediction more heavily than RMSE.
validaion_rmsle <- rmsle(actual_kwh, Output2Mod)
print("root-mean-square-log error value after replaced the negative values")
print(validaion_rmsle)

In [None]:
# Refit the full transformed-scale model: every column of train1 as a
# predictor except orderNorm_Dew_Point, which is excluded for its very high
# multicollinearity. This overwrites any earlier `fitAll`.
fitAll<-lm(orderNorm_Electricity_KW_HR ~. - orderNorm_Dew_Point, data=train1)
summary(fitAll)

In [None]:
# Ridge regression on the transformed data, excluding Dew_Point as before.
library(ridge)
linRidgeMod <- linearRidge(orderNorm_Electricity_KW_HR ~ . - orderNorm_Dew_Point, data = train1)

# Coefficient of determination as 1 - SSE/SSTO. The original used SSR/SSTO,
# which equals R-squared only for an ordinary least-squares fit with an
# intercept evaluated on its own training data; for shrunken ridge
# coefficients, and for held-out data, SSR + SSE no longer adds up to SSTO.
r_squared <- function(actual, fitted) {
  1 - sum((actual - fitted)^2) / sum((actual - mean(actual))^2)
}

# Training-set fit.
predicted <- predict(linRidgeMod, train1)
r_sqrd <- r_squared(train1$orderNorm_Electricity_KW_HR, predicted)
train_rmse <- rmse(train1$orderNorm_Electricity_KW_HR, predicted)
print(paste("Train data R-squared :", r_sqrd))
print(paste("Train data root-mean-square error between actual and predicted: ", train_rmse))
print(paste("Train data mean-square error between actual and predicted: ", train_rmse^2))

# Validation-set fit. The labels previously said "Test data" although the
# scores are computed on validation1.
predicted <- predict(linRidgeMod, validation1)
r_sqrd <- r_squared(validation1$orderNorm_Electricity_KW_HR, predicted)
validation_rmse <- rmse(validation1$orderNorm_Electricity_KW_HR, predicted)
print(paste("Validation data R-squared :", r_sqrd))
print(paste("Validation data root-mean-square error between actual and predicted: ", validation_rmse))
print(paste("Validation data mean-square error between actual and predicted: ", validation_rmse^2))

In [None]:
# First rows of the transformed training split.
head(train1)

In [None]:
# Predictor matrix (intercept column dropped) and response vector built from
# the transformed training split.
X <- model.matrix(orderNorm_Electricity_KW_HR ~ ., train1)[, -1]
head(X)
# The response in train1 is named orderNorm_Electricity_KW_HR. The original
# train1$Electricity_KW_HR matches no column of train1 and yields NULL.
y <- train1$orderNorm_Electricity_KW_HR
y

In [None]:
# let's apply Lasso Regression
