# Applied Programming Coding Challenge #1

![title](../img/photo-1533788179956-82e8a027c962.jpg)

In [None]:
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor 

In [None]:
import warnings
from sklearn.exceptions import DataConversionWarning
from sklearn.exceptions import ConvergenceWarning
# Silence the two scikit-learn warning categories for the rest of the notebook
for _ignored_category in (DataConversionWarning, ConvergenceWarning):
    warnings.filterwarnings(action='ignore', category=_ignored_category)

## 1 Load data

```HINT``` Put column names into variables in order to benefit from code completion

In [None]:
# Columns as they are named when the CSV files are read
col_instant = 'instant'
col_datetime = 'datetime'
col_season = 'season'
col_year = 'year'
col_month = 'month'
col_hour = 'hour'
col_holiday = 'holiday'
col_weekday = 'weekday'
col_workingday = 'workingday'
col_weather_situation = 'weather_situation'
col_temperature = 'temperature'
col_apparent_temperature = 'apparent_temperature'
col_humidity = 'humidity'
col_windspeed = 'windspeed'

# Columns holding rental counts (prediction targets)
col_casual = 'casual'
col_registered = 'registered'
col_count = 'count'

# Columns derived during preprocessing
col_days_since_start = 'days_since_start'

# Columns holding raw (not normalized) values
col_temperature_raw = 'temperature_raw'
col_apparent_temperature_raw = 'apparent_temperature_raw'
col_humidity_raw = 'humidity_raw'
col_windspeed_raw = 'windspeed_raw'
col_temperature_raw_rounded = 'temperature_raw_rounded'
col_days_since_start_raw = 'days_since_start_raw'

In [None]:
# Keys identifying the data sets
data_set_day = "day"
data_set_hour = "hour"


class DataSet:
    """Contains data, model and predictions of a data set.

    Attributes:
        file_name: Path of the CSV file the data set is read from.
        attribute_names: Column names applied to the CSV file when it is read.
        columns_categorical: Names of the columns treated as categorical.
        data_frame: The loaded data; an empty placeholder until the CSV file is read.
        variants: Variants of the data set, keyed by variant name.
    """

    def __init__(self, file_name):
        self.file_name = file_name
        # Mutable containers are created per instance. As class attributes they
        # would be one shared object, so an in-place change made through one
        # data set would silently show up in every other data set.
        self.attribute_names = []
        self.columns_categorical = []
        self.data_frame = {}
        self.variants = {}


# Define data sets
data_sets = {
    data_set_day: DataSet("../data/bike-sharing-dataset/day.csv"),
    data_set_hour: DataSet("../data/bike-sharing-dataset/hour.csv"),
}

In [None]:
# Define attribute names
# Both files use the same column list; the hourly file additionally has the
# hour column directly after the month column
_columns_up_to_month = [col_instant, col_datetime, col_season, col_year, col_month]
_columns_from_holiday = [
    col_holiday, col_weekday, col_workingday, col_weather_situation,
    col_temperature, col_apparent_temperature, col_humidity, col_windspeed,
    col_casual, col_registered, col_count,
]
data_sets[data_set_day].attribute_names = _columns_up_to_month + _columns_from_holiday
data_sets[data_set_hour].attribute_names = _columns_up_to_month + [col_hour] + _columns_from_holiday

In [None]:
# Iterate over data sets
for data_set_name, data_set in data_sets.items():

    # Skip the first line of the file and apply our own column names instead
    _csv_options = {"skiprows": 1, "names": data_set.attribute_names}
    data_set.data_frame = pd.read_csv(data_set.file_name, **_csv_options)

## 2 Understand data

### 2.1 Show basic facts

* Show data types of features
* Make sure there are no null values

In [None]:
# Iterate over data sets
for data_set_name, data_set in data_sets.items():

    # Column names, dtypes and non-null counts (info() prints by itself)
    data_set.data_frame.info()
    # Null values per column; a bare expression is not rendered inside a loop,
    # so the result has to be printed explicitly to be visible
    print(data_set.data_frame.isnull().sum())
    print("")

### 2.2 Set data types

In [None]:
# Define categorical attributes
# The hourly data set additionally treats the hour as a category
_categorical_before_hour = [col_season, col_year, col_month]
_categorical_after_hour = [col_holiday, col_weekday, col_workingday, col_weather_situation]
data_sets[data_set_day].columns_categorical = _categorical_before_hour + _categorical_after_hour
data_sets[data_set_hour].columns_categorical = _categorical_before_hour + [col_hour] + _categorical_after_hour

# Iterate over data sets
for data_set_name, data_set in data_sets.items():
    frame = data_set.data_frame

    # Parse the date column into datetime values
    frame[col_datetime] = pd.to_datetime(frame[col_datetime])

    # Switch every categorical column to the pandas category dtype
    for column in data_set.columns_categorical:
        frame[column] = frame[column].astype('category')

### 2.3 Visualize raw data

* This step is to make sure data is plausible
* Unfortunately many features in the data sets are already normalized
* For better visualization they need to be de-normalized first (using extreme values from data set's readme file)

In [None]:
# Extreme values used to undo the normalization of the weather features

# Temperature
temperature_min = -8
temperature_max = 39

# Apparent temperature
apparent_temperature_min = -16
apparent_temperature_max = 50

# Humidity
humidity_max = 100

# Wind speed
windspeed_max = 67

In [None]:
# Columns holding raw values; they are added for visualization only and are
# dropped again before training
columns_raw_values = [
    col_temperature_raw,
    col_apparent_temperature_raw,
    col_humidity_raw,
    col_windspeed_raw,
    col_temperature_raw_rounded,
]

In [None]:
# Iterate over data sets
for data_set_name, data_set in data_sets.items():
    normalized = data_set.data_frame

    # Undo the normalization of the weather features (applies to every data set)
    restored = normalized.assign(**{
        col_temperature_raw:
            normalized[col_temperature] * (temperature_max - temperature_min) + temperature_min,
        col_apparent_temperature_raw:
            normalized[col_apparent_temperature] * (apparent_temperature_max - apparent_temperature_min) + apparent_temperature_min,
        col_humidity_raw: normalized[col_humidity] * humidity_max,
        col_windspeed_raw: normalized[col_windspeed] * windspeed_max,
    })

    # Round the raw temperature to multiples of 5 for better visualization
    data_set.data_frame = restored.assign(
        **{col_temperature_raw_rounded: round(restored[col_temperature_raw] / 5, 0) * 5}
    )

#### 2.3.1 Show data

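```WARNING``` The result of `DataFrame.head()` is only rendered when it is the last expression of a cell, so calling it inside a loop shows nothing (pass it to `display()` or `print()` there)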

In [None]:
# Iterate over data sets
for data_set_name, data_set in data_sets.items():

    # Show first few lines
    # NOTE: the frame returned by head() is discarded here, so this cell
    # produces no output; it only illustrates the warning above
    data_set.data_frame.head()

In [None]:
data_sets[data_set_day].data_frame.head()

In [None]:
data_sets[data_set_hour].data_frame.head()

#### 2.3.2 Plot relation between month and number of rented bikes

* Make sure data is plausible
* Expected plot contains few peaks (most popular biking months)

In [None]:
# Plot trend
# x and y are passed by keyword: recent seaborn releases accept only `data`
# as a positional argument, so positional column names would be misread
sns.catplot(x=col_month, y=col_count, hue=col_year, data=data_sets[data_set_day].data_frame, ci=None, kind='point', palette='rainbow')

#### 2.3.3 Plot relation between temperature and number of rented bikes

* Make sure data is plausible
* Expected plot contains one peak (optimal biking temperature)

In [None]:
# Plot trend
# x and y are passed by keyword: recent seaborn releases accept only `data`
# as a positional argument, so positional column names would be misread
sns.catplot(x=col_temperature_raw_rounded, y=col_count, hue=col_year, data=data_sets[data_set_day].data_frame, ci=None, kind='point', palette='rainbow')

#### 2.3.4 Plot relation between weather situation and number of rented bikes

* Make sure data is plausible
* Expected plot indicates that there are significantly more rentals on days with good weather (clear or misty)
  * 1: Clear, Few clouds, Partly cloudy
  * 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
  * 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
  * 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

In [None]:
# Iterate over data sets
for data_set_name, data_set in data_sets.items():

    # Plot trend
    # x and y are passed by keyword: recent seaborn releases accept only `data`
    # as a positional argument, so positional column names would be misread
    sns.catplot(x=col_weather_situation, y=col_count, hue=col_year, data=data_set.data_frame, ci=None, kind='strip', palette='rainbow')

#### 2.3.5 Plot correlation matrix of some attributes

* Expected matrix contains the following correlations
  * strong correlation between _temperature_ and _apparent temperature_

In [None]:
# Iterate over data sets
for data_set in data_sets.values():

    # Pairwise correlations of the numeric features and the two partial counts
    correlation_matrix = data_set.data_frame[[col_temperature, col_apparent_temperature, col_humidity,
                                              col_windspeed, col_casual, col_registered]].corr()

    # Hide the cells strictly above the diagonal. The mask is an explicit
    # boolean array: deriving it from the correlation values themselves would
    # leave a cell visible whenever its correlation happens to be exactly 0.
    mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)

    fig, ax = plt.subplots(figsize=(15, 8))
    sns.heatmap(correlation_matrix, mask=mask, vmax=0.8, square=True, annot=True, ax=ax)
    ax.set_title('Correlation Matrix')
    plt.show()

## 3 Preprocess data

### 3.0 Initialize variants

In [None]:
# Keys identifying the variants
variant_original = "original"
variant_market = "market"


class Variant:
    """Contains data, model and predictions of a variant.

    Attributes:
        color: Colour used for this variant's bars in the result chart.
        data_frame: The variant's data; an empty placeholder until it is assigned.
        splitted_data: Train/test split of the variant; an empty placeholder
            until it is assigned.
        models: Models attached to the variant, keyed by model name.
        predictions: Container for predictions of the variant.
    """

    def __init__(self, color):
        self.color = color
        # Mutable containers are created per instance. As class attributes they
        # would be one shared object, so adding a model to one variant would
        # add it to every variant of every data set.
        self.data_frame = {}
        self.splitted_data = {}
        self.models = {}
        self.predictions = {}

In [None]:
# Bar colour of each variant
_variant_colors = {
    variant_original: "#3fe0e0",
    variant_market: "#d4dd80",
}

# Iterate over data sets
for data_set_name, data_set in data_sets.items():

    # Every data set gets its own, freshly created set of variants
    data_set.variants = {
        name: Variant(color=color) for name, color in _variant_colors.items()
    }

### 3.1 One-hot-encode categorical data

In [None]:
# Iterate over data sets
for data_set_name, data_set in data_sets.items():

    # Build the indicator columns of every categorical column, prefixed with
    # the column name, and append them to the data frame in one go
    indicator_frames = [
        pd.get_dummies(data_set.data_frame[column], prefix=column)
        for column in data_set.columns_categorical
    ]
    data_set.data_frame = pd.concat([data_set.data_frame] + indicator_frames, axis=1)

### 3.2 Enhance data

* The following variants shall be compared
  * _original_: original features
  * _market_: original features + time since start of bike sharing campaign

In [None]:
# Iterate over data sets
for data_set_name, data_set in data_sets.items():

    # Each variant works on its own copy of the data set's frame
    for variant in data_set.variants.values():
        variant.data_frame = data_set.data_frame.copy()

#### 3.2.1 Calculate days since start

In [None]:
# Iterate over data sets
for data_set in data_sets.values():
    market_frame = data_set.variants[variant_market].data_frame

    # Whole days elapsed since the earliest date in the data set, computed on
    # the complete column at once instead of row by row
    days_since_start = (market_frame[col_datetime] - market_frame[col_datetime].min()).dt.days

    # Normalize by the largest value; only the normalized column is stored,
    # so no temporary raw column has to be added and dropped again
    market_frame[col_days_since_start] = days_since_start / days_since_start.max()

In [None]:
# Iterate over data sets
for data_set in data_sets.values():

    # Pairwise correlations of the numeric features, the new time feature and the target
    correlation_matrix = data_set.variants[variant_market].data_frame[[col_temperature, col_apparent_temperature,
                                                                       col_humidity, col_windspeed,
                                                                       col_days_since_start, col_count]].corr()

    # Hide the cells strictly above the diagonal. The mask is an explicit
    # boolean array: deriving it from the correlation values themselves would
    # leave a cell visible whenever its correlation happens to be exactly 0.
    mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)

    fig, ax = plt.subplots(figsize=(15, 8))
    sns.heatmap(correlation_matrix, mask=mask, vmax=0.8, square=True, annot=True, ax=ax)
    ax.set_title('Correlation Matrix')
    plt.show()

### 3.3 Normalize data
    

### 3.4 Drop columns

* The following columns need to be dropped
  * Indices
    * _instant_ since it does not contain any information besides the order
  * Columns which cannot be used directly
    * _datetime_ which is formatted ```yyyy-mm-dd```
  * Columns which are one-hot encoded
  * Columns which contain raw values
  * Unused target columns

In [None]:
# Iterate over data sets
for data_set_name, data_set in data_sets.items():

    # Collect everything that must not reach the models:
    # index and unusable columns, the categorical columns that were one-hot
    # encoded, the raw-value columns and the unused target columns.
    # NOTE(review): temperature is dropped together with the index columns -
    # presumably because of its strong correlation with apparent temperature; confirm.
    columns_to_drop = (
        [col_instant, col_datetime, col_temperature]
        + list(data_set.columns_categorical)
        + list(columns_raw_values)
        + [col_casual, col_registered]
    )

    # Iterate over variants
    for variant_name, variant in data_set.variants.items():
        variant.data_frame.drop(columns=columns_to_drop, inplace=True)

## 4 Split data into _training_ and _test_

In [None]:
class DataSplit:
    """Holds the four parts produced by a train/test data split"""

    def __init__(self, x_train, x_test, y_train, y_test):
        # Feature parts
        self.x_train, self.x_test = x_train, x_test
        # Target parts
        self.y_train, self.y_test = y_train, y_test
        
# Share of the rows held back for testing, and the seed that makes the split repeatable
test_size = 0.3
random_state = 0

# Iterate over data sets
for data_set_name, data_set in data_sets.items():

    # Iterate over variants
    for variant_name, variant in data_set.variants.items():

        # Separate the features from the target column
        features = variant.data_frame.drop(columns=[col_count])
        target = variant.data_frame[[col_count]]

        # Split into training and test data; the result is ordered
        # x_train, x_test, y_train, y_test
        split_parts = train_test_split(features, target, test_size=test_size, random_state=random_state)

        # Add splitted data
        variant.splitted_data = DataSplit(*split_parts)

        # Show the shape of every part
        print(*(part.shape for part in split_parts))

## 5 Initialize model

In [None]:
# Put models into dictionary
# The tree-based regressors are seeded so that their scores are repeatable
# between runs and comparable between variants, in line with the fixed seeds
# used elsewhere in this notebook
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(min_samples_split=10, max_leaf_nodes=100, random_state=0),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=200, random_state=0),
    "Lasso": Lasso(alpha=0.2),
    "Elastic Net": ElasticNet(alpha=0.2, random_state=0),
    "Ridge": Ridge(alpha=1.0),
    "Support Vector Regression (linear)": SVR(kernel='linear', gamma='scale', C=1.0, epsilon=0.2),
    "Support Vector Regression (rbf)": SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.2),
}

In [None]:
from sklearn.base import clone

# Iterate over data sets
for data_set in data_sets.values():

    # Iterate over variants
    for variant in data_set.variants.values():

        # Give the variant its own dictionary holding unfitted copies of the
        # models. Attaching the very same estimator objects everywhere would
        # make each fit overwrite the state learned for the previous variant.
        variant.models = {model_name: clone(model) for model_name, model in models.items()}

## 6 Train and evaluate model

In [None]:
def trainAndEvaluateDataSet(data_set_name, data_set):
    """Train and evaluate every model of every variant of a data set.

    Each model attached to a variant is fitted on the variant's training data
    and scored on its test data. The R2 score of every model is printed, and
    all scores are drawn as one horizontal bar chart in which each variant has
    its own bar colour and legend entry.

    Args:
        data_set_name: Label of the data set; only used in the printed output.
        data_set: Object whose ``variants`` mapping provides, per variant, the
            ``models``, ``splitted_data`` and ``color`` to use.

    The bar positions and tick labels are taken from the module-level
    ``models`` dictionary.
    """
    bar_height = 0.4
    # Vertical shift between the bars of consecutive variants
    variant_offset = 0.5
    y_pos = np.arange(len(models))

    # Iterate over variants
    for variant_index, (variant_name, variant) in enumerate(data_set.variants.items()):
        split = variant.splitted_data

        # Flatten the single-column target to 1-D, the form the estimators
        # expect; a 2-D column makes several of them emit a DataConversionWarning
        y_train = np.ravel(split.y_train)
        expected_values = np.ravel(split.y_test)

        model_r2_scores = []

        # Iterate over models in variant
        for model_name, model in variant.models.items():

            # Fit model and create prediction
            model.fit(split.x_train, y_train)
            predicted_values = model.predict(split.x_test)

            # Only the R2 score is reported and charted
            score_r2_score = r2_score(expected_values, predicted_values)

            print(data_set_name + " / " + variant_name + " / " + model_name + " > " + str(round(score_r2_score, 2)))

            model_r2_scores.append(score_r2_score)

        # The label makes the variant identifiable through the legend
        plt.barh(y_pos + variant_index * variant_offset, model_r2_scores, bar_height, align='center', alpha=0.5,
                 color=variant.color, label=variant_name)

    plt.xlabel('R2 Score')
    plt.yticks(y_pos, list(models))
    plt.title('Model performance')
    plt.legend()
    plt.show()
    

In [None]:
# Train and evaluate data set 'day'
trainAndEvaluateDataSet(data_set_day, data_sets[data_set_day])   

In [None]:
# Train and evaluate data set 'hour'
trainAndEvaluateDataSet(data_set_hour, data_sets[data_set_hour])   
    