In [None]:
library(tidyverse)

In [None]:
## Start of my work
# Import packages.
library(tidyverse)
library(readr)
library(skimr)
library(plyr)
library(lubridate)
library(devtools)
# Load the hourly bike records from disk and preview the column types.
bike_df <- read_csv('hour.csv')
glimpse(bike_df)
## End of my work

In [None]:
## Start of my work
# Sort the rows by `instant`, first ascending and then descending.
row_bike_df_1 <- bike_df %>% arrange(instant)
row_bike_df_2 <- bike_df %>% arrange(desc(instant))
row_bike_df_1
row_bike_df_2
# Per-group row counts and per-group top rows.
# tally() counts the rows in each `mnth` group.
row_bike_df_3 <- tally(group_by(bike_df, mnth))
# top_n(1) is given no weight column, so it ranks by the last column of the
# data frame and reports which one it picked.
row_bike_df_4 <- top_n(group_by(bike_df, mnth), 1)
# Same as above, but with the rows pre-sorted by descending month.
row_bike_df_5 <- top_n(
  group_by(arrange(bike_df, desc(mnth)), mnth),
  1
)
# Group by each distinct wind speed instead of by month.
row_bike_df_6 <- top_n(
  group_by(arrange(bike_df, desc(windspeed)), windspeed),
  1
)
row_bike_df_3
row_bike_df_4
row_bike_df_5
row_bike_df_6
# Keep only the rows with yr == 1, mnth == 12 and a non-zero holiday flag.
row_bike_df_7 <- filter(bike_df, yr == 1, mnth == 12, holiday != 0)
# The same three conditions wrapped in a function, so the filter can be
# applied to any data frame `n` that has `yr`, `mnth` and `holiday` columns.
myfunction <- function(n) {
  filter(n, yr == 1, mnth == 12, holiday != 0)
}
row_bike_df_8 <- myfunction(bike_df)
row_bike_df_7
row_bike_df_8
## End of my work

In [None]:
## Start of my work
# Select columns: `instant`, the range `season` to `hr`, and `temp` to `atemp`.
column_bike_df_1 <- bike_df %>%
  select(instant, season:hr, temp:atemp)
column_bike_df_1
# Drop columns: remove every column from `season` through `workingday`.
# Negating the whole range is the unambiguous form of the selection.
column_bike_df_2 <- bike_df %>%
  select(-(season:workingday))
column_bike_df_2
# Rename columns.
names(bike_df)
# plyr is attached after dplyr, so a bare rename() resolves to plyr::rename()
# (old = new). Calling dplyr::rename() explicitly (new = old) gives the same
# result regardless of the order in which the packages were loaded.
column_bike_df_3 <- bike_df %>%
  dplyr::rename(year = yr, month = mnth, hour = hr)
column_bike_df_3
# Reorder columns: move the first column so that it follows the ninth.
column_bike_df_4 <- bike_df[c(2:9, 1, 10:17)]
column_bike_df_4
# Reverse the column order without hard-coding the number of columns.
column_bike_df_5 <- bike_df[rev(seq_along(bike_df))]
column_bike_df_5
## End of my work

In [None]:
## Start of my work
# Produce a new variable: `min` is `hr` multiplied by 60.
whole_bike_df_1 <- bike_df %>%
  mutate(min = hr * 60)
# mutate() appends `min` as column 18; move it right after column 6 (`hr`).
# The frame that was just built must be the one indexed here: the grouped
# tally `group_bike_df_1` is a different table and is not defined yet.
whole_bike_df_1 <- whole_bike_df_1[c(1:6, 18, 7:17)]
whole_bike_df_1
# Modify an existing variable.
whole_bike_df_2 <- bike_df %>%
  mutate(min = hr * 60)
# Take 'hr' out and put 'min' in: column 6 is skipped and column 18 takes
# its place.
whole_bike_df_2 <- whole_bike_df_2[c(1:5, 18, 7:17)]
whole_bike_df_2
# Operate on the data frame by groups.
# Row counts per season.
group_bike_df_1 <- tally(group_by(bike_df, season))
# Row counts per (season, month) combination.
group_bike_df_2 <- tally(group_by(bike_df, season, mnth))
# Overall mean and median of `temp` (no grouping).
group_bike_df_3 <- summarise(
  bike_df,
  mean_temp = mean(temp),
  median_temp = median(temp)
)
group_bike_df_1
group_bike_df_2
group_bike_df_3
## End of my work

In [None]:
## Start of my work
# Produce a tidy version.
# Experiment: treat the data frame as untidy and try the reshaping functions.
# After dropping incomplete rows, only `instant` and `cnt` are kept, and each
# `instant` value becomes its own column holding the matching `cnt`.
tidy_bike_df_0 <- spread(
  select(na.omit(bike_df), instant, cnt),
  key = instant,
  value = cnt
)
# gather() is not a good fit here because the data frame contains no symbol
# or string columns to collect.
# The data frame may already be tidy, or nearly so: in tidy data "each
# variable forms a column; each observation forms a row; each type of
# observational unit forms a table." (Wickham, H. (2014). Tidy data. Journal
# of Statistical Software, 59(10), 4.)
# The data frame satisfies the first two criteria, so the focus is on the
# third one.
sapply(bike_df, class)
# Turn the single "Date" column (column 2) into a number by removing the
# dashes from its text form and converting the remaining digits.
tidy_bike_df_1 <- bike_df
tidy_bike_df_1[[2]] <- as.numeric(gsub('-', '', bike_df[[2]]))
sapply(tidy_bike_df_1, class)
sapply(tidy_bike_df_1, typeof)
# The two listings above should now show "numeric" as the class and "double"
# as the type of every column, all within a single table.
# With every column stored as a double, the table can be passed directly to
# statistical and mathematical modelling functions.
## End of my work

In [None]:
## Start of my work
# Hour of day against rental counts.
plot_bike_df_1 <- ggplot(bike_df, aes(x = hr, y = cnt)) +
  geom_point() +
  labs(
    x = 'Hours',
    y = 'Rental Bike Counts',
    title = 'Relationship between Hours in a Day and Bike Rentals'
  )
plot_bike_df_1
# Month against rental counts; the points are jittered to reduce overplotting
# and the x axis is labelled with the twelve month numbers.
plot_bike_df_2 <- ggplot(bike_df, aes(x = mnth, y = cnt)) +
  geom_jitter() +
  scale_x_discrete(name = 'Months', limits = as.character(1:12)) +
  labs(
    y = 'Rental Bike Counts',
    title = 'Relationship between Months and Bike Rentals'
  )
plot_bike_df_2
# Humidity against rental counts, with the count also mapped to colour.
plot_bike_df_3 <- ggplot(bike_df, aes(x = hum, y = cnt, colour = cnt)) +
  geom_point() +
  labs(
    x = 'Normalized Humidity',
    y = 'Rental Bike Counts',
    title = 'Relationship between Humidity and Bike Rentals',
    colour = 'Rental Bike Counts'
  )
plot_bike_df_3
# Extra: wind speed against humidity.
# Despite its name, `plot_bike_df_4` holds the filtered data (rows where both
# readings are non-zero); the hexbin plot itself is printed, not stored.
plot_bike_df_4 <- filter(bike_df, windspeed != 0, hum != 0)
ggplot(plot_bike_df_4, aes(x = windspeed, y = hum)) +
  geom_hex() +
  labs(
    x = 'Normalized Wind Speed',
    y = 'Normalized Humidity',
    title = 'Relationship between Wind Speed and Humidity'
  )
# There is a weakly negative correlation between humidity and wind speed.
## End of my work

In [None]:
## Start of my work
# Show the working directory, which is where the relative paths below resolve.
getwd()
# Write the all-numeric table to CSV and the humidity plot to PNG.
write_csv(tidy_bike_df_1, 'tidy_hour.csv')
ggsave(
  'plot_bike_humidity.png',
  plot_bike_df_3,
  dpi = 300,
  scale = 1,
  limitsize = TRUE
)
## End of my work

In [None]:
## Start of my work
# Task (1): months (seasons) and bike counts.
# Jittered scatter of rental counts per month, x axis labelled 1 to 12.
challenge_bike_df_1 <- ggplot(bike_df, aes(x = mnth, y = cnt)) +
  geom_jitter() +
  scale_x_discrete(name = 'Months', limits = as.character(1:12)) +
  labs(
    y = 'Rental Bike Counts',
    title = 'Relationship between Months and Bike Rentals'
  )
challenge_bike_df_1
# May, June, July, August, September and October are classified as "Peak
# Season". Be prepared and provide enough bikes in these months.
# Task (2): correlation between the registered bike rentals between 6 and
# 10 am and the weather between 5 and 10 pm.
# First look: registered rentals per weather code over the whole data set.
# The four codes on the x axis are labelled with the weather names.
weather_labels <- c('Clear', 'Cloudy', 'Light PREC.', 'Heavy PREC.')
challenge_bike_df_2 <- ggplot(
  bike_df,
  aes(x = weathersit, y = registered, colour = registered)
) +
  geom_point() +
  scale_x_discrete(name = 'Weather', limits = weather_labels) +
  labs(
    y = 'Registered Rental Bike Counts',
    title = 'Relationship between Weather and Registered Bike Rentals',
    colour = 'Registered Rental Bike Counts'
  )
challenge_bike_df_2
# Compare the registered users with the casual users.
# `casual` is recomputed as the total count minus the registered count.
challenge_bike_df_3 <- mutate(bike_df, casual = cnt - registered)
ggplot(
  challenge_bike_df_3,
  aes(x = weathersit, y = casual, colour = casual)
) +
  geom_point() +
  scale_x_discrete(name = 'Weather', limits = weather_labels) +
  labs(
    y = 'Casual Rental Bike Counts',
    title = 'Relationship between Weather and Casual Bike Rentals',
    colour = 'Casual Rental Bike Counts'
  )
# Yes, both registered and casual users are sensitive to weather conditions.
# Hour of day against rental counts (jittered).
challenge_bike_df_4 <- ggplot(bike_df, aes(x = hr, y = cnt)) +
  geom_jitter() +
  labs(
    x = 'Hours',
    y = 'Rental Bike Counts',
    title = 'Relationship between Hours in a Day and Bike Rentals'
  )
challenge_bike_df_4
# Hour of day against weather code, with the y axis labelled by weather name.
challenge_bike_df_5 <- ggplot(bike_df, aes(x = hr, y = weathersit)) +
  geom_jitter() +
  scale_y_discrete(
    name = 'Weather',
    limits = c('Clear', 'Cloudy', 'Light PREC.', 'Heavy PREC.')
  ) +
  labs(
    x = 'Hours',
    title = 'Relationship between Hours in a Day and Weather'
  )
challenge_bike_df_5
# No useful information can be extracted from this attempt.
# Build two tables covering the evening (hr 17 to 22) and the morning
# (hr 6 to 10) respectively.
challenge_bike_df_6 <- filter(bike_df, hr < 23, hr > 16)
challenge_bike_df_7 <- filter(bike_df, hr < 11, hr > 5)
# Plot the two periods and look for similarities.
# Evening: weather code per hour.
ggplot(challenge_bike_df_6, aes(x = hr, y = weathersit)) +
  geom_jitter() +
  labs(
    x = 'Hours',
    y = 'Weather',
    title = 'Weather between 5 and 10 pm',
    colour = 'Weather'
  )
# Morning: registered rentals per hour.
ggplot(challenge_bike_df_7, aes(x = hr, y = registered)) +
  geom_jitter() +
  labs(
    x = 'Hours',
    y = 'Registered Rental Bike Counts',
    title = 'Registered Bike Rentals between 6 and 10 am',
    colour = 'Registered Rental Bike Counts'
  )
# I don't think I can test the hypothesis with data wrangling alone.
# However, data mining may help!
# Try a single-variable linear regression model.
# To keep it simple, pair the weather at 8pm on one day with the registered
# bike rentals at 9am on the following day.
challenge_bike_df_8 <- challenge_bike_df_6 %>%
  filter(hr == 20)
challenge_bike_df_9 <- challenge_bike_df_7 %>%
  filter(hr == 9)
# Pair the two series by calendar date rather than by row position. Slicing
# rows 1:726 against rows 2:727 only lines up when every day has both an 8pm
# and a 9am record; a single missing hour would silently shift every later
# pair onto the wrong day.
# Column 2 is the date column; adding 1 moves each 8pm reading to the day
# whose 9am rentals it should be compared with.
# NOTE(review): assumes column 2 was parsed as a Date by read_csv() - confirm.
evening_weather <- tibble(
  day = challenge_bike_df_8[[2]] + 1,
  x_weather = challenge_bike_df_8$weathersit
)
morning_registered <- tibble(
  day = challenge_bike_df_9[[2]],
  y_registered = challenge_bike_df_9$registered
)
# Days that lack either record are dropped by the inner join.
paired_days <- inner_join(evening_weather, morning_registered, by = 'day')
x_weather <- paired_days$x_weather
y_registered <- paired_days$y_registered
x_weather
y_registered
# Each pair is the previous evening's weather and that morning's registered
# rentals.
# Apply the regression. The weather code enters as a plain number, so the
# model assumes an equal step between consecutive weather categories.
lm.fit <- lm(y_registered ~ x_weather)
summary(lm.fit)
# We get the linear relationship between the two variables.
# By the way, using the mean over each period instead of the single 8pm and
# 9am readings should improve the model.
## End of my work