<center>
<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-RP0321EN-SkillsNetwork/labs/module_1/images/SN_web_lightmode.png" width="300"> 
</center>

# Assignment: Exploratory Data Analysis with ggplot2
Estimated time needed: 60 minutes

# Introduction
Using this R notebook you will perform exploratory data analysis on the Seoul Bike Sharing dataset using Tidyverse and ggplot2.


### Setup libraries and load dataset
We will use local datasets from your project directories.

In [None]:
# Attach the packages the notebook relies on without printing startup banners
suppressPackageStartupMessages({
  library(tidyverse)
  library(lubridate)
  library(ggplot2)
})

# Absolute Windows location of the cleaned Seoul CSV inside the project
seoul_path <- "C:/Users/Diaa/data.science.with.r/project5-capstone/output/seoul_bike_cleaned.csv"

# Load the cleaned data and upper-case every column name in the same step,
# so the names line up with the ones used in the SQL notebook
seoul_bike_sharing <- readr::read_csv(seoul_path, show_col_types = FALSE) %>%
  setNames(toupper(names(.)))

# Preview the leading rows
head(seoul_bike_sharing)

### Task 1 — Recast DATE and HOUR
Recast `DATE` to Date (DD/MM/YYYY) and `HOUR` to an ordered factor.

In [None]:
# Convert DATE (stored as DD/MM/YYYY) into Date values and turn HOUR into an
# ordered factor whose levels run from 0 to 23
seoul_bike_sharing <- mutate(
  seoul_bike_sharing,
  DATE = lubridate::dmy(DATE),
  HOUR = ordered(HOUR, levels = 0:23)
)

# Show the resulting structure and column types
str(seoul_bike_sharing)

### Task 2 — Descriptive statistics
Summarize key variables and check for missing values.

In [None]:
# Summary statistics for the rental count and the weather measurements
summary(
  select(
    seoul_bike_sharing,
    RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBILITY,
    DEW_POINT_TEMPERATURE, SOLAR_RADIATION, RAINFALL, SNOWFALL
  )
)

# Number of NA entries in each column
colSums(is.na(seoul_bike_sharing))

# Total rows, rows whose HOLIDAY value is "Holiday", and the share of such
# rows as a percentage rounded to two decimals
holiday_stats <- summarise(
  seoul_bike_sharing,
  total_rows = n(),
  holiday_rows = sum(HOLIDAY %in% "Holiday"),
  holiday_pct = round(100 * holiday_rows / total_rows, 2)
)
holiday_stats

# Number of rows whose FUNCTIONING_DAY value is "Yes"
functioning_count <- summarise(
  seoul_bike_sharing,
  rows_yes = sum(FUNCTIONING_DAY %in% "Yes")
)
functioning_count

### Task 3 — Weather by Seasons
Compute average rainfall, snowfall, and rented bike count grouped by `SEASONS`, ordered from the highest to the lowest average bike count.

In [None]:
# Per-season means of rainfall, snowfall and rented bikes (NA values ignored)
season_weather <- summarise(
  group_by(seoul_bike_sharing, SEASONS),
  avg_rainfall = mean(RAINFALL, na.rm = TRUE),
  avg_snowfall = mean(SNOWFALL, na.rm = TRUE),
  avg_bike = mean(RENTED_BIKE_COUNT, na.rm = TRUE)
)

# Put the seasons with the highest average rentals first
season_weather <- arrange(season_weather, desc(avg_bike))
season_weather

### Task 4 — Scatter: Rentals over Time
Plot `RENTED_BIKE_COUNT` vs `DATE`.

In [None]:
# One semi-transparent point per record: rental count against date
ggplot(seoul_bike_sharing) +
  geom_point(
    aes(x = DATE, y = RENTED_BIKE_COUNT),
    color = "steelblue",
    alpha = 0.2
  ) +
  ggtitle("Rented Bike Count over Time") +
  xlab("Date") +
  ylab("Rented Bike Count") +
  theme_minimal()

### Task 5 — Scatter: Rentals over Time colored by Hour
Color by `HOUR` to see hourly patterns.

In [None]:
# Same date-vs-rentals scatter, with each point coloured by its HOUR level
ggplot(seoul_bike_sharing) +
  geom_point(
    aes(x = DATE, y = RENTED_BIKE_COUNT, color = HOUR),
    alpha = 0.4
  ) +
  ggtitle("Rented Bike Count over Time by Hour") +
  xlab("Date") +
  ylab("Rented Bike Count") +
  theme_minimal() +
  guides(color = guide_legend(title = "Hour"))

### Task 6 — Histogram + Density: Rentals
Visualize the distribution of `RENTED_BIKE_COUNT`.

In [None]:
# Histogram of rental counts drawn on the density scale, with a kernel density
# curve overlaid so both layers share the same y axis.
# `after_stat(density)` replaces the `..density..` notation, which ggplot2
# deprecated in 3.4.0; this cell already needs 3.4.0+ for `linewidth`.
ggplot(seoul_bike_sharing, aes(x = RENTED_BIKE_COUNT)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    fill = "gray85",
    color = "gray50"
  ) +
  geom_density(color = "firebrick", linewidth = 1) +
  labs(
    title = "Distribution of Rented Bike Count",
    x = "Rented Bike Count",
    y = "Density"
  ) +
  theme_minimal()

### Task 7 — Scatter: Rentals vs Temperature by Season
Explore how temperature relates to rentals across seasons.

In [None]:
# Rental count against temperature, drawn as one panel per SEASONS value
ggplot(seoul_bike_sharing) +
  geom_point(
    aes(x = TEMPERATURE, y = RENTED_BIKE_COUNT),
    color = "steelblue",
    alpha = 0.25
  ) +
  facet_wrap(~ SEASONS) +
  ggtitle("Rented Bike Count vs Temperature by Season") +
  xlab("Temperature (°C)") +
  ylab("Rented Bike Count") +
  theme_minimal()

### Task 8 — Boxplot: Rentals by Hour and Season
Compare rental count distributions across hours per season.

In [None]:
# Box plots of rental counts for every hour, filled by season; outlier points
# are faded and the legend sits underneath the panel
ggplot(seoul_bike_sharing) +
  geom_boxplot(
    aes(x = HOUR, y = RENTED_BIKE_COUNT, fill = SEASONS),
    outlier.alpha = 0.2
  ) +
  ggtitle("Rented Bike Count by Hour across Seasons") +
  xlab("Hour") +
  ylab("Rented Bike Count") +
  theme_minimal() +
  theme(legend.position = "bottom")

### Task 9 — Daily Rainfall and Snowfall
Aggregate rainfall and snowfall to daily totals, visualize them over time, and count the days with snowfall.

In [None]:
# Collapse the records to one row per DATE holding the summed rainfall and
# snowfall (NA values ignored)
daily <- summarise(
  group_by(seoul_bike_sharing, DATE),
  daily_rainfall = sum(RAINFALL, na.rm = TRUE),
  daily_snowfall = sum(SNOWFALL, na.rm = TRUE)
)

# Line chart of the daily rainfall totals
ggplot(daily) +
  geom_line(aes(x = DATE, y = daily_rainfall), color = "dodgerblue") +
  ggtitle("Daily Rainfall in Seoul") +
  xlab("Date") +
  ylab("Rainfall (mm)") +
  theme_minimal()

# Line chart of the daily snowfall totals
ggplot(daily) +
  geom_line(aes(x = DATE, y = daily_snowfall), color = "darkslategray") +
  ggtitle("Daily Snowfall in Seoul") +
  xlab("Date") +
  ylab("Snowfall (cm)") +
  theme_minimal()

# Count the dates whose snowfall total is strictly positive
days_with_snow <- daily %>%
  filter(daily_snowfall > 0) %>%
  nrow()
days_with_snow

## Author(s)

<h4> Jeff Grossman </h4>
<h4> Lakshmi Holla </h4>

## Other Contributor(s)

<h4>  Malika Singla </h4>

## <h3 align="center"> © IBM Corporation 2022. All rights reserved. </h3>
