In [None]:
import pandas as pd
import seaborn as sns

## 1 Load data

In [None]:
# Column names assigned to the columns of the csv files
col_instant = 'instant'
col_datetime = 'datetime'
col_season = 'season'
col_year = 'year'
col_month = 'month'
col_hour = 'hour'
col_holiday = 'holiday'
col_weekday = 'weekday'
col_workingday = 'workingday'
col_weather_situation = 'weather_situation'
col_temperature = 'temperature'
col_apparent_temperature = 'apparent_temperature'
col_humidity = 'humidity'
col_windspeed = 'windspeed'

# Target columns
col_casual = 'casual'
col_registered = 'registered'
col_cnt = 'cnt'

# Columns calculated later in this notebook
col_temperature_raw = 'temperature_raw'
col_temperature_raw_rounded = 'temperature_raw_rounded'

# Column order applied to the daily csv file
attribute_names_day = [
    col_instant,
    col_datetime,
    col_season,
    col_year,
    col_month,
    col_holiday,
    col_weekday,
    col_workingday,
    col_weather_situation,
    col_temperature,
    col_apparent_temperature,
    col_humidity,
    col_windspeed,
    col_casual,
    col_registered,
    col_cnt,
]

# Column order applied to the hourly csv file (same as daily, plus the hour after the month)
attribute_names_hour = [
    col_instant,
    col_datetime,
    col_season,
    col_year,
    col_month,
    col_hour,
    col_holiday,
    col_weekday,
    col_workingday,
    col_weather_situation,
    col_temperature,
    col_apparent_temperature,
    col_humidity,
    col_windspeed,
    col_casual,
    col_registered,
    col_cnt,
]

# Read csv files: the first line of each file is skipped and the names above are used instead
data_bike_day = pd.read_csv(
    "../data/bike-sharing-dataset/day.csv",
    skiprows=1,
    names=attribute_names_day,
)
data_bike_hour = pd.read_csv(
    "../data/bike-sharing-dataset/hour.csv",
    skiprows=1,
    names=attribute_names_hour,
)

## 2 Understand data

### 2.1 Show basic facts

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the daily data frame
data_bike_day.info()

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the hourly data frame
data_bike_hour.info()

### 2.2 Visualize raw data

In [None]:
# Bounds used below to scale the normalized columns back to raw values

# Temperature extremes (min, max)
temperature_min, temperature_max = -8, 39
# Apparent temperature extremes (min, max)
apparent_temperature_min, apparent_temperature_max = -16, 50
# Humidity maximum
humidity_max = 100
# Wind speed maximum
windspeed_max = 67

In [None]:
def restore_raw_values(data_bike):
    """Return a copy of ``data_bike`` with de-normalized weather columns appended.

    The normalized temperature and apparent temperature columns are scaled
    back with ``value * (max - min) + min``; humidity and wind speed are
    multiplied by their respective maximum.

    Parameters
    ----------
    data_bike : pandas.DataFrame
        Frame containing the normalized temperature, apparent temperature,
        humidity and wind speed columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the additional columns ``temperature_raw``,
        ``apparent_temperature_raw``, ``humidity_raw`` and ``windspeed_raw``.
        The input frame is not modified.
    """
    temperature_span = temperature_max - temperature_min
    apparent_temperature_span = apparent_temperature_max - apparent_temperature_min
    return data_bike.assign(
        temperature_raw=data_bike[col_temperature] * temperature_span + temperature_min,
        apparent_temperature_raw=data_bike[col_apparent_temperature] * apparent_temperature_span + apparent_temperature_min,
        humidity_raw=data_bike[col_humidity] * humidity_max,
        windspeed_raw=data_bike[col_windspeed] * windspeed_max,
    )


# NOTE(review): the same bounds are applied to the daily and the hourly frame -
# confirm against the dataset documentation that both files are normalized identically.

# Restore raw values for day data frame
data_bike_day = restore_raw_values(data_bike_day)

# Round raw temperature to the nearest multiple of 5 for better visualization
data_bike_day = data_bike_day.assign(temperature_raw_rounded=round(data_bike_day[col_temperature_raw] / 5, 0) * 5)

# Restore raw values for hour data frame
data_bike_hour = restore_raw_values(data_bike_hour)

#### 2.2.1 Show data

In [None]:
# Show the first rows of the daily data frame, including the restored raw columns
data_bike_day.head()

In [None]:
# Show the first rows of the hourly data frame, including the restored raw columns
data_bike_hour.head()

#### 2.2.2 Plot relation between month and number of rented bikes

* Make sure data is plausible
* Expected plot contains a few peaks (most popular biking months)

In [None]:
# Plot trend: rentals per month as a point plot, one line per year.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts them positionally.
# ci=None disables the error bars (this option is spelled errorbar=None in seaborn >= 0.12).
sns.catplot(x=col_month, y=col_cnt, hue=col_year, data=data_bike_day, ci=None, kind='point', palette='rainbow')

#### 2.2.3 Plot relation between temperature and number of rented bikes

* Make sure data is plausible
* Expected plot contains one peak (optimal biking temperature)

In [None]:
# Plot trend: rentals per rounded raw temperature as a point plot, one line per year.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts them positionally.
# ci=None disables the error bars (this option is spelled errorbar=None in seaborn >= 0.12).
sns.catplot(x=col_temperature_raw_rounded, y=col_cnt, hue=col_year, data=data_bike_day, ci=None, kind='point', palette='rainbow')

#### 2.2.4 Plot relation between weather situation and number of rented bikes

* Make sure data is plausible
* Expected plot indicates that there are significantly more rentals on days with good weather (clear or misty)
  * 1: Clear, Few clouds, Partly cloudy
  * 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
  * 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
  * 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

In [None]:
# Plot trend: one strip of daily rental counts per weather situation, colored by year.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts them positionally.
sns.catplot(x=col_weather_situation, y=col_cnt, hue=col_year, data=data_bike_day, ci=None, kind='strip', palette='rainbow')