# Applied Programming Coding Challenge #1

![title](../img/photo-1533788179956-82e8a027c962.jpg)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor 

In [None]:
%matplotlib inline

In [None]:
import warnings
from sklearn.exceptions import DataConversionWarning
from sklearn.exceptions import ConvergenceWarning
# Silence the scikit-learn warning categories that would otherwise clutter the notebook output
for ignored_category in (DataConversionWarning, ConvergenceWarning):
    warnings.filterwarnings(action='ignore', category=ignored_category)

## 1 Load data

In [None]:
# Names given to the columns of the loaded CSV file
col_instant = 'instant'
col_datetime = 'datetime'
col_season = 'season'
col_year = 'year'
col_month = 'month'
col_hour = 'hour'
col_holiday = 'holiday'
col_weekday = 'weekday'
col_workingday = 'workingday'
col_weather_situation = 'weather_situation'
col_temperature = 'temperature'
col_apparent_temperature = 'apparent_temperature'
col_humidity = 'humidity'
col_windspeed = 'windspeed'

# Names of the target columns
col_casual = 'casual'
col_registered = 'registered'
col_count = 'count'

# Names of columns that are calculated in this notebook
col_days_since_start = 'days_since_start'

# Names of columns that hold raw (de-normalized) values
col_temperature_raw = 'temperature_raw'
col_apparent_temperature_raw = 'apparent_temperature_raw'
col_humidity_raw = 'humidity_raw'
col_windspeed_raw = 'windspeed_raw'
col_temperature_raw_rounded = 'temperature_raw_rounded'
col_days_since_start_raw = 'days_since_start_raw'

# Keys identifying the feature-set variants that are compared
variant_original = "original"
variant_market = "market"

# Column names applied to the day-level data, in file order
attribute_names_day = [
    col_instant,
    col_datetime,
    col_season,
    col_year,
    col_month,
    col_holiday,
    col_weekday,
    col_workingday,
    col_weather_situation,
    col_temperature,
    col_apparent_temperature,
    col_humidity,
    col_windspeed,
    col_casual,
    col_registered,
    col_count,
]

# Load the csv file; the first line is skipped and replaced by the names above
data_bike_day = pd.read_csv(
    "../data/bike-sharing-dataset/day.csv",
    skiprows=1,
    names=attribute_names_day,
)

## 2 Understand data

### 2.1 Show basic facts

* Show data types of features
* Make sure there are no null values

In [None]:
# Print column dtypes and non-null counts
data_bike_day.info()
# Display the number of missing values per column (expected to be zero everywhere)
data_bike_day.isna().sum()

In [None]:
# Parse the date column into real timestamps
data_bike_day[col_datetime] = pd.to_datetime(data_bike_day[col_datetime])

# Columns holding categorical codes rather than quantities
columns_categorical = [
    col_season,
    col_year,
    col_month,
    col_holiday,
    col_weekday,
    col_workingday,
    col_weather_situation,
]

# Cast all categorical columns in one go
data_bike_day = data_bike_day.astype({column: 'category' for column in columns_categorical})

### 2.2 Visualize raw data

In [None]:
# Bounds used to map the normalized columns back to raw values

# Temperature bounds
temperature_min = -8
temperature_max = 39

# Apparent temperature bounds
apparent_temperature_min = -16
apparent_temperature_max = 50

# Upper bound of humidity
humidity_max = 100

# Upper bound of wind speed
windspeed_max = 67

In [None]:
# Columns that hold restored raw values (they are dropped again before training)
columns_raw_values = [
    col_temperature_raw,
    col_apparent_temperature_raw,
    col_humidity_raw,
    col_windspeed_raw,
    col_temperature_raw_rounded,
]

# Undo the min/max scaling of both temperature columns
data_bike_day[col_temperature_raw] = (
    data_bike_day[col_temperature] * (temperature_max - temperature_min) + temperature_min
)
data_bike_day[col_apparent_temperature_raw] = (
    data_bike_day[col_apparent_temperature] * (apparent_temperature_max - apparent_temperature_min)
    + apparent_temperature_min
)

# Humidity and wind speed are only scaled by their maximum
data_bike_day[col_humidity_raw] = data_bike_day[col_humidity] * humidity_max
data_bike_day[col_windspeed_raw] = data_bike_day[col_windspeed] * windspeed_max

# Bucket the raw temperature into steps of 5 so the plots stay readable
data_bike_day[col_temperature_raw_rounded] = (data_bike_day[col_temperature_raw] / 5).round(0) * 5

#### 2.2.1 Show data

In [None]:
# Display the first five rows as a quick sanity check
data_bike_day.head(n=5)

#### 2.2.2 Plot relation between month and number of rented bikes

* Make sure data is plausible
* Expected plot contains few peaks (most popular biking months)

In [None]:
# Plot rentals per month, one line per year.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# while the keyword form also works on older releases.
sns.catplot(
    x=col_month,
    y=col_count,
    hue=col_year,
    data=data_bike_day,
    ci=None,
    kind='point',
    palette='rainbow',
)

#### 2.2.3 Plot relation between temperature and number of rented bikes

* Make sure data is plausible
* Expected plot contains one peak (optimal biking temperature)

In [None]:
# Plot rentals per rounded raw temperature, one line per year.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# while the keyword form also works on older releases.
sns.catplot(
    x=col_temperature_raw_rounded,
    y=col_count,
    hue=col_year,
    data=data_bike_day,
    ci=None,
    kind='point',
    palette='rainbow',
)

#### 2.2.4 Plot relation between weather situation and number of rented bikes

* Make sure data is plausible
* Expected plot indicates that there are significantly more rentals on days with good weather (clear or misty)
  * 1: Clear, Few clouds, Partly cloudy
  * 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
  * 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
  * 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

In [None]:
# Plot rentals per weather situation as a strip plot, colored by year.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# while the keyword form also works on older releases.
sns.catplot(
    x=col_weather_situation,
    y=col_count,
    hue=col_year,
    data=data_bike_day,
    ci=None,
    kind='strip',
    palette='rainbow',
)

#### 2.2.5 Plot correlation matrix of some attributes

* Expected matrix contains the following correlations
  * strong correlation between _temperature_ and _apparent temperature_

In [None]:
# Plot correlation matrix
correlation_columns = [
    col_temperature,
    col_apparent_temperature,
    col_humidity,
    col_windspeed,
    col_casual,
    col_registered,
]
correlationMatrix = data_bike_day[correlation_columns].corr()

# Boolean mask that hides the strict upper triangle (the matrix is symmetric).
# An explicit boolean mask is used instead of a copy of the correlation values,
# so a correlation of exactly 0 in the upper triangle is still hidden.
mask = np.triu(np.ones(correlationMatrix.shape, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(correlationMatrix, mask=mask, vmax=0.8, square=True, annot=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## 3 Preprocess data

### 3.0 Initialize variants

In [None]:
class Variant:
    """Contains data, model and predictions of a variant.

    Every instance owns its own containers. They are created in ``__init__``
    rather than as class attributes, because class-level dictionaries are
    shared by all instances: models or predictions stored for one variant
    would otherwise show up in (and overwrite those of) every other variant.

    Attributes:
        color: Color used when plotting results of this variant.
        data_frame: Data of this variant (empty placeholder until assigned).
        splitted_data: Train/test split of this variant (empty placeholder until assigned).
        models: Models of this variant, keyed by model name.
        predictions: Predictions of this variant, keyed by model name.
    """
    # Default color; every instance overrides it in __init__
    color = "#000"

    def __init__(self, color):
        self.color = color
        self.data_frame = {}
        self.splitted_data = {}
        self.models = {}
        self.predictions = {}

# Create one Variant per feature set, each with its own plot color
variants = {
    name: Variant(color=color)
    for name, color in (
        (variant_original, "#3fe0e0"),
        (variant_market, "#d4dd80"),
    )
}

### 3.1 One-hot-encode categorical data

In [None]:
# Build the indicator columns of every categorical column (prefixed with the column name) ...
dummy_frames = [
    pd.get_dummies(data_bike_day[column], prefix=column)
    for column in columns_categorical
]
# ... and append all of them to the data frame at once
data_bike_day = pd.concat([data_bike_day] + dummy_frames, axis=1)

### 3.2 Enhance data

* The following variants shall be compared
  * _original_ original features
  * _market_ original features + time since start of bike sharing campaign

In [None]:
# Give every variant its own copy of the prepared data frame,
# so that variant-specific columns do not leak into other variants
for variant in variants.values():
    variant.data_frame = data_bike_day.copy()

#### 3.2.1 Calculate days since start

In [None]:
# Only the market variant gets the additional feature
market_data_frame = variants[variant_market].data_frame

# Calculate days since start (vectorized datetime arithmetic instead of a row-wise apply)
start_date = market_data_frame[col_datetime].min()
days_since_start_raw = (market_data_frame[col_datetime] - start_date).dt.days

# Normalize days since start by the largest value.
# The raw day counts are kept in a local series only, so no temporary column has to be dropped.
max_days_since_start = days_since_start_raw.max()
market_data_frame[col_days_since_start] = days_since_start_raw / max_days_since_start

In [None]:
# Plot correlation matrix of the market variant (includes days since start and the target)
correlation_columns = [
    col_temperature,
    col_apparent_temperature,
    col_humidity,
    col_windspeed,
    col_days_since_start,
    col_count,
]
correlationMatrix = variants[variant_market].data_frame[correlation_columns].corr()

# Boolean mask that hides the strict upper triangle (the matrix is symmetric).
# An explicit boolean mask is used instead of a copy of the correlation values,
# so a correlation of exactly 0 in the upper triangle is still hidden.
mask = np.triu(np.ones(correlationMatrix.shape, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(correlationMatrix, mask=mask, vmax=0.8, square=True, annot=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

### 3.3 Normalize data
    

### 3.4 Drop columns

* The following columns need to be dropped
  * Indices
    * _instant_ since it does not contain any information besides the order
  * Columns which cannot be used directly
    * _datetime_ which is formatted ```yyyy-mm-dd```
  * Columns which are one-hot encoded
  * Columns which contain raw values
  * Unused target columns

In [None]:
# Collect everything that must not reach the models
columns_to_drop = (
    # Index and columns that cannot be used directly
    # NOTE(review): temperature is dropped as well, presumably because of its strong
    # correlation with apparent temperature - confirm
    [col_instant, col_datetime, col_temperature]
    # Categorical columns that were replaced by their one-hot encoding
    + columns_categorical
    # Restored raw values
    + columns_raw_values
    # Target columns that are not predicted
    + [col_casual, col_registered]
)

# Remove the columns from the data frame of every variant
for variant in variants.values():
    variant.data_frame.drop(columns_to_drop, axis=1, inplace=True)

## 4 Split data into _training_ and _test_

In [None]:
class DataSplit:
    """Holds the four parts produced by a train/test split.

    The arguments are stored unchanged under attributes of the same name.
    """

    def __init__(self, x_train, x_test, y_train, y_test):
        # Feature partitions
        self.x_train, self.x_test = x_train, x_test
        # Target partitions
        self.y_train, self.y_test = y_train, y_test

# Split parameters used for every variant (fixed seed keeps the split reproducible)
test_size = 0.3
random_state = 0

for variant in variants.values():

    # Separate the features from the target column
    data_bike_day_x = variant.data_frame.drop(columns=[col_count])
    data_bike_day_y = variant.data_frame[[col_count]]

    # Create the training and test partitions
    x_train, x_test, y_train, y_test = train_test_split(
        data_bike_day_x,
        data_bike_day_y,
        test_size=test_size,
        random_state=random_state,
    )

    # Remember the partitions on the variant
    variant.splitted_data = DataSplit(x_train, x_test, y_train, y_test)

    # Print the shapes of all four partitions
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

## 5 Initialize model

In [None]:
# Put models into dictionary (model name -> unfitted estimator).
# The tree-based estimators get a fixed random_state as well, so that repeated runs
# produce the same scores - consistent with the seeded train/test split.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(min_samples_split=10, max_leaf_nodes=100, random_state=0),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=200, random_state=0),
    "Lasso": Lasso(alpha=0.2),
    "Elastic Net": ElasticNet(alpha=0.2, random_state=0),
    "Ridge": Ridge(alpha=1.0),
    "Support Vector Regression (linear)": SVR(kernel='linear', gamma='scale', C=1.0, epsilon=0.2),
    "Support Vector Regression (rbf)": SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.2),
}

In [None]:
from copy import deepcopy

# Iterate over variants
for variant in variants.values():

    # Iterate over models
    for model_name, model in models.items():

        # Attach an independent copy of the (still unfitted) model to the variant.
        # Attaching the very same estimator object to every variant would mean that
        # fitting it for one variant replaces the fit of the previous variant.
        variant.models[model_name] = deepcopy(model)

## 6 Train model

In [None]:
# Train every model of every variant and keep its predictions for the test data
for variant in variants.values():
    split = variant.splitted_data

    for model_name, model in variant.models.items():
        # Fit on the training partition ...
        model.fit(split.x_train, split.y_train)
        # ... and store the predictions for the test partition under the model name
        variant.predictions[model_name] = model.predict(split.x_test)

## 7 Evaluate model

In [None]:
def evaluateModel(predicted_values, expected_values):
    """Evaluates regression model by using multiple metrics.

    Prints one line per metric (label followed by the value rounded to two
    decimals) and an empty line at the end.

    Args:
        predicted_values: Values predicted by the model.
        expected_values: True values the predictions are compared with.

    Note:
        The metric functions take the true values first, so the arguments are
        passed in swapped order. If the mean squared log error raises a
        ValueError (scikit-learn rejects negative values for it), "n/a" is
        printed for that metric instead of aborting the whole report. A
        ValueError raised by any other metric is propagated.
    """
    metrics = [
        ("explained variance score", explained_variance_score),
        ("               max error", max_error),
        ("     mean absolute error", mean_absolute_error),
        ("      mean squared error", mean_squared_error),
        ("  mean squared log error", mean_squared_log_error),
        ("   median absolute error", median_absolute_error),
        ("                r2 score", r2_score),
    ]

    for label, metric in metrics:
        try:
            score = metric(expected_values, predicted_values)
        except ValueError:
            # Only the log-based metric is allowed to be undefined for the given values
            if metric is not mean_squared_log_error:
                raise
            print(label, "n/a")
            continue
        print(label, round(score, 2))
    print("")

### 7.1 Display metrics

In [None]:
# Print the metrics of every model, grouped by variant
for variant_name, variant in variants.items():
    # All models of a variant are compared against the same test targets
    expected_values = variant.splitted_data.y_test

    for model_name in variant.models:
        # Headline identifying variant and model
        print(" / ".join((variant_name, model_name)))

        # Evaluate the stored predictions of the model
        evaluateModel(variant.predictions[model_name], expected_values)

### 7.2 Visualize R2 score

In [None]:
# Model names in a fixed order; a real list is used for the tick labels
model_names = list(models)
y_pos = np.arange(len(model_names))

# Iterate over variants; each variant gets its own bar per model, shifted by half a unit
for variant_index, (variant_name, variant) in enumerate(variants.items()):
    expected_values = variant.splitted_data.y_test

    # R2 score of every model of this variant, in the order of model_names
    models_r2_score = [
        r2_score(expected_values, variant.predictions[model_name])
        for model_name in model_names
    ]

    plt.barh(
        y_pos + (variant_index * 0.5),
        models_r2_score,
        0.4,
        align='center',
        alpha=0.5,
        color=variant.color,
        label=variant_name,
    )

plt.xlabel('R2 Score')
plt.yticks(y_pos, model_names)
plt.title('Model performance')
# The legend maps bar colors to variants; without it the bars cannot be told apart
plt.legend()
plt.show()