In [None]:
library(ggplot2)
library(dplyr)

# Day csv

In [None]:
# Read the daily records and keep an unmodified copy in `myday`
# (`day` itself is altered in later cells).
day <- read.csv("Bike-Sharing-Dataset//day.csv", header = TRUE)
myday <- day
head(myday)

In [None]:
# Parse the 'dteday' strings (year-month-day) into Date values.
day$dteday <- as.Date(day$dteday, format = "%Y-%m-%d")

In [None]:
# Preview the first ten rows, then count the missing values in each column.
head(day, 10)
colSums(is.na(day))

In [None]:
# Show every column's type; 'dteday' should now be reported as a Date.
str(day)

The `dteday` column has been explicitly converted to Date format; it was read in as character ('chr') by default.

In [None]:
# Number of rows and columns in the daily data.
dim(day)

In [None]:
# Turn the normalized weather readings back into raw values by multiplying
# each with a fixed factor (temp: 41, atemp: 50, hum: 100, windspeed: 67),
# and print the first values of every new column as a quick check.
day$rawtemp <- 41 * day$temp
head(day$rawtemp)

day$rawatemp <- 50 * day$atemp
head(day$rawatemp)

day$rawhum <- 100 * day$hum
head(day$rawhum)

day$rawwindspeed <- 67 * day$windspeed
head(day$rawwindspeed)

In [None]:
# Normal Q-Q plot of the normalized temperature with a red reference line.
qqnorm(day[["temp"]])
qqline(day[["temp"]], lwd = 3, col = "red")

In [None]:
# Normal Q-Q plot of the normalized humidity with a red reference line.
qqnorm(day[["hum"]])
qqline(day[["hum"]], lwd = 2, col = "red")

In [None]:
# Normal Q-Q plot of the normalized wind speed (enlarged points) with a red
# reference line.
qqnorm(day[["windspeed"]], cex = 1.5)
qqline(day[["windspeed"]], lwd = 2, col = "red")

In [None]:
# Set the plot size options used for the figures in this notebook.
options(repr.plot.width = 15, repr.plot.height = 10)

# Scatter of normalized temperature against the `instant` column.
ggplot(data = day, mapping = aes(x = instant, y = temp)) +
  geom_point(size = 5)

In [None]:
# Work on a copy so the year recoding below leaves `day` untouched.
mydata <- day

# Recode yr from 0/1 to 2011/2012 and make it a factor so ggplot uses a
# discrete colour for each year.
mydata$yr <- factor(ifelse(mydata$yr == 0, 2011, 2012))

options(repr.plot.width = 15, repr.plot.height = 10)

# Daily temperature by calendar month, coloured by year.
# The rescaled `rawtemp` column is plotted (not the normalized `temp`) so the
# data agree with the "Raw temperature" title and the Celsius axis label.
ggplot(mydata, aes(x = mnth, y = rawtemp, color = yr)) +
  geom_point(size = 4, alpha = 0.7) +
  scale_x_continuous(breaks = seq(1, 12, by = 1)) +
  labs(
    title = "Raw temperature by month",
    x = "Month",
    y = "Temperature (in C)",
    color = "Year"
  ) +
  theme(
    axis.title.x = element_text(size = 20),
    axis.title.y = element_text(size = 20),
    plot.title = element_text(size = 30, hjust = 0.5),
    legend.position = "top", legend.justification = "right",
    legend.text = element_text(size = 14)
  )

In [None]:
# Tag every day with its "YYYY-MM" label and keep only that label together
# with the raw temperature.
df <- transmute(day, month_year = format(dteday, "%Y-%m"), rawtemp)

# Mean raw temperature for each year-month label.
df_avg <- summarise(group_by(df, month_year), avg_rawtemp = mean(rawtemp))

options(repr.plot.width = 15, repr.plot.height = 10)

# One red point per year-month showing its average temperature.
ggplot(data = df_avg, mapping = aes(x = month_year, y = avg_rawtemp)) +
  geom_point(color = "red", size = 5) +
  labs(
    title = "Average Temperature for each month in 2011 & 2012",
    x = "Month-Year",
    y = "Average Temperature (Celsius)"
  ) +
  theme(
    axis.title.x = element_text(size = 25),
    axis.title.y = element_text(size = 25),
    plot.title = element_text(size = 30, hjust = 0.5),
    axis.text.y = element_text(size = 14)
  )

# Hour csv read

In [None]:
# Read the hourly records and keep an unmodified copy in `myhour`
# (`hour` itself is altered in later cells).
hour <- read.csv("Bike-Sharing-Dataset//hour.csv", header = TRUE)
myhour <- hour
head(myhour)

In [None]:
# Count the missing values in each column of the hourly data.
colSums(is.na(hour))

In [None]:
# Parse the 'dteday' strings (year-month-day) into Date values, then preview
# the result.
hour$dteday <- as.Date(hour$dteday, format = "%Y-%m-%d")
head(hour)

In [None]:
# Show every column's type; 'dteday' should now be reported as a Date.
str(hour)

In [None]:
# Number of rows and columns in the hourly data.
dim(hour)

In [None]:
# Turn the normalized weather readings back into raw values, using the same
# factors as for the daily data.
hour$rawtemp <- 41 * hour$temp            # temp: normalized -> raw
hour$rawatemp <- 50 * hour$atemp          # atemp: normalized -> raw
hour$rawhum <- 100 * hour$hum             # hum: normalized -> raw
hour$rawwindspeed <- 67 * hour$windspeed  # windspeed: normalized -> raw

# Monthly averages for both years (using day csv)

In [None]:
# Monthly mean rental count for 2011 (rows with yr == 0), tagged with a
# factor year column.
year_2011 <- subset(day, yr == 0)
avg_counts_2011 <- aggregate(cnt ~ mnth, data = year_2011, FUN = mean)
avg_counts_2011$yr <- factor(rep(2011, nrow(avg_counts_2011)))

# Same summary for 2012 (rows with yr == 1).
year_2012 <- subset(day, yr == 1)
avg_counts_2012 <- aggregate(cnt ~ mnth, data = year_2012, FUN = mean)
avg_counts_2012$yr <- factor(rep(2012, nrow(avg_counts_2012)))

# Stack the monthly averages of both years into one data frame.
myavg <- rbind(avg_counts_2011, avg_counts_2012)

In [None]:
# Distribution of the daily rental counts in 2011.
boxplot(
  year_2011[["cnt"]],
  main = "Count variation in 2011",
  ylab = "Count",
  col = "red"
)

In [None]:
# Distribution of the daily rental counts in 2012.
boxplot(
  year_2012[["cnt"]],
  main = "Count variation in 2012",
  ylab = "Count",
  col = "red"
)

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)

# Average rentals per month, drawn as connected points with one colour per
# year; the axes show every month and counts from 0 to 8000.
ggplot(data = myavg, mapping = aes(x = mnth, y = cnt, color = yr)) +
  geom_point(size = 5) +
  geom_line() +
  scale_x_continuous(breaks = seq(1, 12, by = 1)) +
  scale_y_continuous(limits = c(0, 8000), breaks = seq(0, 8000, 2000)) +
  labs(
    title = "Average Bike Rental Counts per month for 2011 & 2012",
    x = "Month",
    y = "Average Rental Count",
    color = "Year"
  ) +
  theme(
    axis.title.x = element_text(size = 20),
    axis.title.y = element_text(size = 20),
    plot.title = element_text(size = 30, hjust = 0.5),
    legend.position = "top", legend.justification = "right",
    legend.text = element_text(size = 14),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14)
  )

In [None]:
# Preview the daily data again, now including the rescaled raw* columns.
head(day)

In [None]:
# Largest value in the rescaled humidity column.
max(day$rawhum)

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)

# Daily rental count against raw temperature with a linear trend line.
# The colour gradient is mapped on the point layer only: `registered` is a
# continuous column, so the smoother cannot use it, and keeping it out of the
# global mapping stops the aesthetic from being dropped inside geom_smooth().
# NOTE(review): the legend title assumes `registered` holds the number of
# registered-user rentals per day - confirm against the dataset description.
ggplot(day, aes(x = rawtemp, y = cnt)) +
  geom_point(aes(color = registered), size = 5) +
  # formula is spelled out so the fitted model is explicit
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(
    title = "Bike Rental Counts vs Temperature",
    x = "Temperature (in C)",
    y = "Rental Count",
    color = "Registered users"
  ) +
  theme(
    axis.title.x = element_text(size = 20),
    axis.title.y = element_text(size = 20),
    plot.title = element_text(size = 30, hjust = 0.5),
    legend.position = "top", legend.justification = "right",
    legend.text = element_text(size = 9),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14)
  ) +
  scale_x_continuous(limits = c(0, 40), breaks = seq(3, 36, by = 3)) +
  scale_y_continuous(limits = c(0, 9000), breaks = seq(0, 9000, 1000))

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)

# Daily rental count against raw humidity, coloured by the continuous
# `registered` column. The title now names humidity (it previously repeated
# the temperature plot's title).
# NOTE(review): the legend title assumes `registered` holds the number of
# registered-user rentals per day - confirm against the dataset description.
ggplot(day, aes(x = rawhum, y = cnt, color = registered)) +
  geom_point(size = 5) +
  labs(
    title = "Bike Rental Counts vs Humidity",
    x = "Humidity",
    y = "Rental Count",
    color = "Registered users"
  ) +
  theme(
    axis.title.x = element_text(size = 20),
    axis.title.y = element_text(size = 20),
    plot.title = element_text(size = 30, hjust = 0.5),
    legend.position = "top", legend.justification = "right",
    legend.text = element_text(size = 9),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14)
  ) +
  scale_x_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 5)) +
  scale_y_continuous(limits = c(0, 9000), breaks = seq(0, 9000, 1000))

In [None]:
# Attach ggplot2 (this cell itself draws no plot).
library(ggplot2)

# Read the hourly records into `bike`.
bike <- read.csv("Bike-Sharing-Dataset//hour.csv", header = TRUE)

# Simple linear regression of the rental count on the normalized temperature,
# followed by the model summary.
model <- lm(formula = cnt ~ temp, data = bike)
summary(model)