In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import max_error 
%matplotlib inline

## 1 Load data

In [None]:
# Names given to the columns of the day-level CSV (the file's own first row is skipped below)
col_instant = 'instant'
col_datetime = 'datetime'
col_season = 'season'
col_year = 'year'
col_month = 'month'
col_hour = 'hour'
col_holiday = 'holiday'
col_weekday = 'weekday'
col_workingday = 'workingday'
col_weather_situation = 'weather_situation'
col_temperature = 'temperature'
col_apparent_temperature = 'apparent_temperature'
col_humidity = 'humidity'
col_windspeed = 'windspeed'

# Target columns (rental counts)
col_casual = 'casual'
col_registered = 'registered'
col_count = 'count'

# Columns derived later in the notebook
col_temperature_raw = 'temperature_raw'
col_apparent_temperature_raw = 'apparent_temperature_raw'
col_humidity_raw = 'humidity_raw'
col_windspeed_raw = 'windspeed_raw'
col_temperature_raw_rounded = 'temperature_raw_rounded'

col_days_since_start = 'days_since_start'

# Column order expected in day.csv; col_hour is intentionally not part of the day-level file layout
attribute_names_day = [
    col_instant,
    col_datetime,
    col_season,
    col_year,
    col_month,
    col_holiday,
    col_weekday,
    col_workingday,
    col_weather_situation,
    col_temperature,
    col_apparent_temperature,
    col_humidity,
    col_windspeed,
    col_casual,
    col_registered,
    col_count,
]

# Read the csv file: skip its first row (presumably the original header) and apply our own names
data_bike_day = pd.read_csv(
    "../data/bike-sharing-dataset/day.csv",
    skiprows=1,
    names=attribute_names_day,
)

## 2 Understand data

### 2.1 Show basic facts

* Show data types of features
* Make sure there are no null values

In [None]:
# Print the per-column summary of the frame
data_bike_day.info()
# Count missing values per column; as the last expression of the cell this is what gets displayed
data_bike_day.isna().sum()
# data_bike_day.describe()

### 2.2 Visualize raw data

In [None]:
# Bounds used further down to turn the normalized features back into raw values.
# (min, max) of the temperature -- presumably degrees Celsius, confirm against the dataset description
temperature_min, temperature_max = -8, 39
# (min, max) of the apparent ("feels like") temperature
apparent_temperature_min, apparent_temperature_max = -16, 50
# Largest humidity value
humidity_max = 100
# Largest wind speed value
windspeed_max = 67

In [None]:
# Columns holding de-normalized ("raw") values. They only support the plots in this
# section and are listed here so they can be removed again before modelling.
columns_raw_values = [col_temperature_raw, col_apparent_temperature_raw, col_humidity_raw, col_windspeed_raw, col_temperature_raw_rounded]

# Restore raw values for the day data frame by inverting the scaling:
#   temperatures: normalized * (max - min) + min
#   humidity / wind speed: normalized * max
# The new column names come from the col_* constants (instead of hard-coded keyword
# names) so they cannot drift apart from columns_raw_values.
data_bike_day = data_bike_day.assign(**{
    col_temperature_raw: data_bike_day[col_temperature] * (temperature_max - temperature_min) + temperature_min,
    col_apparent_temperature_raw: data_bike_day[col_apparent_temperature] * (apparent_temperature_max - apparent_temperature_min) + apparent_temperature_min,
    col_humidity_raw: data_bike_day[col_humidity] * humidity_max,
    col_windspeed_raw: data_bike_day[col_windspeed] * windspeed_max,
})

# Round the raw temperature to the nearest multiple of 5 so it can serve as a
# coarse categorical axis in the plots
data_bike_day = data_bike_day.assign(**{
    col_temperature_raw_rounded: (data_bike_day[col_temperature_raw] / 5).round(0) * 5,
})

#### 2.2.1 Show data

In [None]:
# Show first few lines, now including the restored raw-value columns
data_bike_day.head()

#### 2.2.2 Plot relation between month and number of rented bikes

* Make sure data is plausible
* Expected plot contains a few peaks (the most popular biking months)

In [None]:
# Plot trend: rentals per month as a point plot, one line per year.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments,
# while keyword arguments work on older releases as well.
# ci=None turns off the confidence-interval bars (newer seaborn spells this errorbar=None).
# The trailing semicolon suppresses the FacetGrid repr in the cell output.
sns.catplot(x=col_month, y=col_count, hue=col_year, data=data_bike_day, ci=None, kind='point', palette='rainbow');

#### 2.2.3 Plot relation between temperature and number of rented bikes

* Make sure data is plausible
* Expected plot contains one peak (optimal biking temperature)

In [None]:
# Plot trend: rentals per rounded raw temperature as a point plot, one line per year.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments,
# while keyword arguments work on older releases as well.
# ci=None turns off the confidence-interval bars (newer seaborn spells this errorbar=None).
# The trailing semicolon suppresses the FacetGrid repr in the cell output.
sns.catplot(x=col_temperature_raw_rounded, y=col_count, hue=col_year, data=data_bike_day, ci=None, kind='point', palette='rainbow');

#### 2.2.4 Plot relation between weather situation and number of rented bikes

* Make sure data is plausible
* Expected plot indicates that there are significantly more rentals on days with good weather (clear or misty)
  * 1: Clear, Few clouds, Partly cloudy
  * 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
  * 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
  * 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

In [None]:
# Plot trend: one marker per day, grouped by weather situation and coloured by year.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments,
# while keyword arguments work on older releases as well.
# The trailing semicolon suppresses the FacetGrid repr in the cell output.
sns.catplot(x=col_weather_situation, y=col_count, hue=col_year, data=data_bike_day, ci=None, kind='strip', palette='rainbow');

#### 2.2.5 Plot correlation matrix of some attributes

* Expected matrix contains the following correlations
  * strong correlation between _temperature_ and _apparent temperature_

In [None]:
# Pairwise correlations between the weather features and the two rental sub-counts
correlationMatrix = data_bike_day[[col_temperature, col_apparent_temperature, col_humidity, col_windspeed, col_casual, col_registered]].corr()

# Boolean mask hiding the strict upper triangle (True = cell is not drawn), so every
# pair is shown once and the diagonal stays visible.
# The mask is derived from the matrix shape, not from the correlation values: a
# value-based mask relies on float truthiness and would leave an upper-triangle cell
# visible whenever a correlation happens to be exactly 0.
mask = np.triu(np.ones(correlationMatrix.shape, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(correlationMatrix, mask=mask, vmax=0.8, square=True, annot=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## 3 Preprocess data

### 3.1 One-hot-encode categorical data

* The feature _weekday_ is currently label-encoded which implies an order
* The feature _weather_situation_ is currently label-encoded which implies an order

In [None]:
# Categorical columns that get replaced by indicator (one-hot) columns
columns_one_hot_encoded = [col_season, col_year, col_month, col_weekday, col_weather_situation]

# Build one indicator frame per categorical column (prefixed with the column name)
# and append all of them to the right of the existing columns in a single concat.
# The source columns stay in place here; they are removed in the drop step later on.
dummy_frames = [
    pd.get_dummies(data_bike_day[name], prefix=name)
    for name in columns_one_hot_encoded
]
data_bike_day = pd.concat([data_bike_day, *dummy_frames], axis=1)

data_bike_day.head()

### 3.2 Enhance data

#### 3.2.1 Calculate days since start

In [None]:
# Calculate start date
# start_date = data_bike_day[col_datetime].min()
# data_bike_day[col_days_since_start] = data_bike_day.apply(lambda row: (datetime.datetime.strptime(row[col_datetime], '%Y-%m-%d')-datetime.datetime.strptime(start_date, '%Y-%m-%d')).days, axis = 1) 

# data_bike_day.head()

### 3.3 Drop columns

* The following columns need to be dropped
  * Indices
    * _instant_ since it does not contain any information besides the order
  * Columns which cannot be used directly
    * _datetime_ which is formatted ```yyyy-mm-dd```
  * Columns which are one-hot encoded
  * Columns containing raw values

In [None]:
# Everything that must not reach the model, removed in one step:
#   - the row index and the date string, which cannot be used directly
#   - the label-encoded originals of the one-hot encoded columns
#   - the raw-value helper columns that were only needed for plotting
columns_to_drop = [
    col_instant,
    col_datetime,
    *columns_one_hot_encoded,
    *columns_raw_values,
]
data_bike_day = data_bike_day.drop(columns=columns_to_drop)

data_bike_day.head()

### 3.4 Normalize data

_tbd_

## 4 Split data into _training_ and _test_

In [None]:
# Features: every remaining column except the three rental-count columns
rental_count_columns = [col_count, col_casual, col_registered]
data_bike_day_x = data_bike_day.drop(columns=rental_count_columns)
# Target: the total count, kept as a one-column frame
data_bike_day_y = data_bike_day[[col_count]]

# Hold out 30 % of the rows for testing; the fixed seed keeps the split reproducible
test_size = 0.3
random_state = 42
x_train, x_test, y_train, y_test = train_test_split(
    data_bike_day_x,
    data_bike_day_y,
    test_size=test_size,
    random_state=random_state,
)

In [None]:
# Show the shapes of the split data (features/target, train/test)
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

In [None]:
# Peek at the first rows of the training features
x_train.head()

In [None]:
# Peek at the first rows of the training target (the total rental count)
y_train.head()

## 5 Initialize model

In [None]:
# Linear regression model created with scikit-learn's default settings
linear_regression = linear_model.LinearRegression()

## 6 Train model

In [None]:
# Fit the model on the training split only; the test split is kept for evaluation
linear_regression.fit(x_train, y_train)

## 7 Evaluate model

In [None]:
# Predict on the held-out test split and score the predictions against the true counts
linear_regression_prediction = linear_regression.predict(x_test)
linear_regression_mean_squared_error = mean_squared_error(y_test, linear_regression_prediction)
linear_regression_max_error = max_error(y_test, linear_regression_prediction)

# The second label is padded with dashes so both metric names line up in the output
for label, value in (
    ("linear regression mean_squared_error", linear_regression_mean_squared_error),
    ("linear regression -------- max_error", linear_regression_max_error),
):
    print(label, value)