# Applied Programming Coding Challenge #1

![title](../img/photo-1533788179956-82e8a027c962.jpg)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [2]:
%matplotlib inline

In [3]:
import warnings
from sklearn.exceptions import DataConversionWarning

# Suppress scikit-learn's DataConversionWarning for the rest of the session
warnings.filterwarnings("ignore", category=DataConversionWarning)

## 1 Load data

In [4]:
# --- Names of the columns as they appear in the source data ---
col_instant = 'instant'
col_datetime = 'datetime'
col_season = 'season'
col_year = 'year'
col_month = 'month'
col_hour = 'hour'
col_holiday = 'holiday'
col_weekday = 'weekday'
col_workingday = 'workingday'
col_weather_situation = 'weather_situation'
col_temperature = 'temperature'
col_apparent_temperature = 'apparent_temperature'
col_humidity = 'humidity'
col_windspeed = 'windspeed'

# --- Names of the target columns ---
col_casual = 'casual'
col_registered = 'registered'
col_count = 'count'

# --- Names of columns that are derived later in the notebook ---
col_temperature_raw = 'temperature_raw'
col_apparent_temperature_raw = 'apparent_temperature_raw'
col_humidity_raw = 'humidity_raw'
col_windspeed_raw = 'windspeed_raw'
col_temperature_raw_rounded = 'temperature_raw_rounded'

col_days_since_start = 'days_since_start'

# Column names assigned to the daily data set, in file order
attribute_names_day = [
    col_instant,
    col_datetime,
    col_season,
    col_year,
    col_month,
    col_holiday,
    col_weekday,
    col_workingday,
    col_weather_situation,
    col_temperature,
    col_apparent_temperature,
    col_humidity,
    col_windspeed,
    col_casual,
    col_registered,
    col_count,
]

# Read the daily csv file; the first row is skipped and replaced by the names above
data_bike_day = pd.read_csv(
    "../data/bike-sharing-dataset/day.csv",
    skiprows=1,
    names=attribute_names_day,
)

## 2 Understand data

### 2.1 Show basic facts

* Show data types of features
* Make sure there are no null values

In [5]:
# Print column dtypes and non-null counts
data_bike_day.info()
# Count missing values per column (displayed as the cell result)
data_bike_day.isna().sum()
# data_bike_day.describe()

### 2.2 Visualize raw data

In [6]:
# Bounds used below to map the normalized columns back to raw values.

# Temperature: lower and upper bound
temperature_min = -8
temperature_max = 39

# Apparent temperature: lower and upper bound
apparent_temperature_min = -16
apparent_temperature_max = 50

# Humidity: upper bound
humidity_max = 100

# Wind speed: upper bound
windspeed_max = 67

In [7]:
# Helper columns holding de-normalized values; they are dropped again before modelling
columns_raw_values = [
    col_temperature_raw,
    col_apparent_temperature_raw,
    col_humidity_raw,
    col_windspeed_raw,
    col_temperature_raw_rounded,
]

# Restore raw values for the day data frame by undoing the scaling with the bounds defined above
data_bike_day = data_bike_day.assign(**{
    col_temperature_raw: data_bike_day[col_temperature] * (temperature_max - temperature_min) + temperature_min,
    col_apparent_temperature_raw: (
        data_bike_day[col_apparent_temperature] * (apparent_temperature_max - apparent_temperature_min)
        + apparent_temperature_min
    ),
    col_humidity_raw: data_bike_day[col_humidity] * humidity_max,
    col_windspeed_raw: data_bike_day[col_windspeed] * windspeed_max,
})

# Round the raw temperature to multiples of 5 for better visualization
data_bike_day = data_bike_day.assign(**{
    col_temperature_raw_rounded: (data_bike_day[col_temperature_raw] / 5).round(0) * 5,
})

#### 2.2.1 Show data

In [8]:
# Preview the first five rows, including the derived raw-value columns
data_bike_day.head(n=5)

#### 2.2.2 Plot relation between month and number of rented bikes

* Make sure data is plausible
* Expected plot contains a few peaks (most popular biking months)

In [9]:
# Point plot of rental count per month, coloured by year.
# x and y are passed by keyword: seaborn deprecated positional x/y in v0.11 and
# made them keyword-only in v0.12, where the positional form fails.
sns.catplot(x=col_month, y=col_count, hue=col_year, data=data_bike_day, ci=None, kind='point', palette='rainbow')

#### 2.2.3 Plot relation between temperature and number of rented bikes

* Make sure data is plausible
* Expected plot contains one peak (optimal biking temperature)

In [10]:
# Point plot of rental count per rounded raw temperature, coloured by year.
# x and y are passed by keyword: seaborn deprecated positional x/y in v0.11 and
# made them keyword-only in v0.12, where the positional form fails.
sns.catplot(x=col_temperature_raw_rounded, y=col_count, hue=col_year, data=data_bike_day, ci=None, kind='point', palette='rainbow')

#### 2.2.4 Plot relation between weather situation and number of rented bikes

* Make sure data is plausible
* Expected plot indicates that there are significantly more rentals on days with good weather (clear or misty)
  * 1: Clear, Few clouds, Partly cloudy
  * 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
  * 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
  * 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

In [11]:
# Strip plot of rental count per weather situation, coloured by year.
# x and y are passed by keyword: seaborn deprecated positional x/y in v0.11 and
# made them keyword-only in v0.12, where the positional form fails.
sns.catplot(x=col_weather_situation, y=col_count, hue=col_year, data=data_bike_day, ci=None, kind='strip', palette='rainbow')

#### 2.2.5 Plot correlation matrix of some attributes

* Expected matrix contains the following correlations
  * strong correlation between _temperature_ and _apparent temperature_

In [12]:
# Pairwise correlations between the normalized weather features and the two rental sub-counts
correlationMatrix = data_bike_day[[
    col_temperature,
    col_apparent_temperature,
    col_humidity,
    col_windspeed,
    col_casual,
    col_registered,
]].corr()

# Explicit boolean mask: True hides a cell. Only the strict upper triangle (k=1) is
# hidden, so the lower triangle and the diagonal stay visible. The previous mask reused
# the float correlation values as truth values, so a coefficient of exactly 0 in the
# upper triangle would not have been hidden.
mask = np.triu(np.ones(correlationMatrix.shape, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(correlationMatrix, mask=mask, vmax=0.8, square=True, annot=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## 3 Preprocess data

### 3.1 One-hot-encode categorical data

* The feature _weekday_ is currently label-encoded which implies an order
* The feature _weather_situation_ is currently label-encoded which implies an order

In [13]:
# Categorical columns that get replaced by indicator columns
columns_one_hot_encoded = [
    col_season,
    col_year,
    col_month,
    col_weekday,
    col_weather_situation,
]

# Append the indicator columns of every categorical column (prefixed with the
# column name) to the right of the existing frame, in the order listed above
data_bike_day = pd.concat(
    [
        data_bike_day,
        *(pd.get_dummies(data_bike_day[column], prefix=column) for column in columns_one_hot_encoded),
    ],
    axis=1,
)

### 3.2 Enhance data

#### 3.2.1 Calculate days since start

In [14]:
# NOTE: this cell is disabled on purpose - every statement below is commented out,
# so no days-since-start column is created.
# Re-enabling it would need `import datetime` (not imported in the visible cells) and
# must run before the datetime column is dropped further down.
# Calculate start date
# start_date = data_bike_day[col_datetime].min()
# data_bike_day[col_days_since_start] = data_bike_day.apply(lambda row: (datetime.datetime.strptime(row[col_datetime], '%Y-%m-%d')-datetime.datetime.strptime(start_date, '%Y-%m-%d')).days, axis = 1) 
# data_bike_day.head()

### 3.3 Drop columns

* The following columns need to be dropped
  * Indices
    * _instant_ since it does not contain any information besides the order
  * Columns which cannot be used directly
    * _datetime_ which is formatted ```yyyy-mm-dd```
  * Columns which are one-hot encoded
    * _season_
    * _year_
    * _month_
    * _weekday_
    * _weather situation_
  * Raw-value columns which were only added for visualization

In [15]:
# Remove, in a single call:
#   - the index column and the unusable datetime column
#   - the categorical columns that have been one-hot encoded
#   - the raw-value helper columns
data_bike_day.drop(
    columns=[
        col_instant,
        col_datetime,
        *columns_one_hot_encoded,
        *columns_raw_values,
    ],
    inplace=True,
)

### 3.4 Normalize data

_tbd_

## 4 Split data into _training_ and _test_

In [16]:
# Target: total rental count, kept as a single-column frame
data_bike_day_y = data_bike_day[[col_count]]
# Features: every remaining column except the three rental-count columns
data_bike_day_x = data_bike_day.drop(columns=[col_count, col_casual, col_registered])

# Split parameters: share of rows held out for testing, and the shuffle seed
test_size = 0.3
random_state = 42

x_train, x_test, y_train, y_test = train_test_split(
    data_bike_day_x,
    data_bike_day_y,
    test_size=test_size,
    random_state=random_state,
)

In [17]:
# Show the shapes of the four split parts
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

## 5 Initialize model

In [None]:
# Candidate regressors, keyed by the display name used when printing the evaluation.
# Every estimator that accepts a seed is given one (the same value, 42, that the
# train/test split uses) so that repeated runs produce the same scores; the decision
# tree and the random forest were previously unseeded.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(min_samples_split=2, max_leaf_nodes=10, random_state=42),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=200, random_state=42),
    "Lasso": Lasso(alpha=0.2),
    "Elastic Net": ElasticNet(alpha=0.2, random_state=42),
    "Ridge": Ridge(alpha=1.0),
    "Support Vector Regression": SVR(gamma='scale', C=1.0, epsilon=0.2)
}

## 6 Train model

In [None]:
# Train each model in the dictionary on the training split.
# y_train is a single-column DataFrame (it was selected with [[col_count]]); flatten it
# to a 1-D array once so estimators receive the target shape they expect, instead of a
# column vector that triggers scikit-learn's DataConversionWarning.
y_train_flat = np.ravel(y_train)
for model_name, model in models.items():
    model.fit(x_train, y_train_flat)

## 7 Evaluate model

In [26]:
def evaluateModel(model_name, predicted_values, expected_values):
    """Print a fixed set of regression metrics for one model.

    Parameters
    ----------
    model_name
        Label printed as the heading of the report.
    predicted_values
        Values predicted by the model.
    expected_values
        Ground-truth values the predictions are compared against.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    print(model_name)
    print("explained variance score", explained_variance_score(expected_values, predicted_values))
    print("·············· max error", max_error(expected_values, predicted_values))
    print("···· mean absolute error", mean_absolute_error(expected_values, predicted_values))
    print("····· mean squared error", mean_squared_error(expected_values, predicted_values))
    # The mean squared log error is undefined for negative values and scikit-learn raises
    # ValueError in that case. The metrics above have already validated the inputs, so a
    # ValueError here means negative values; report NaN rather than abort the evaluation.
    try:
        msle = mean_squared_log_error(expected_values, predicted_values)
    except ValueError:
        msle = float("nan")
    print("· mean squared log error", msle)
    print("·· median absolute error", median_absolute_error(expected_values, predicted_values))
    print("··············· r2 score", r2_score(expected_values, predicted_values))
    print("")

In [27]:
# Evaluate each trained model in the dictionary on the test split
for model_name, model in models.items():
    evaluateModel(model_name, model.predict(x_test), y_test)