In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC

from sklearn.metrics import mean_squared_error, r2_score

%matplotlib inline

### Load and prepare data

In [2]:
# Load the preprocessed dataset from the notebook's working directory.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index that was written into the CSV — consider index_col=0; confirm first.
df = pd.read_csv('preprocessed_data.csv')
df.head()

Unnamed: 0,rain_1h,snow_1h,clouds_all,temp_c,Clouds,Clear,Rain,Drizzle,Mist,Haze,...,Snow,Squall,Smoke,hour,is_day_off,day_of_week,holiday_bool,month,date_time,traffic_volume
0,0.0,0.0,40,15.13,1,0,0,0,0,0,...,0,0,0,9,0,1,0,10,2012-10-02 09:00:00,5545
1,0.0,0.0,75,16.21,1,0,0,0,0,0,...,0,0,0,10,0,1,0,10,2012-10-02 10:00:00,4516
2,0.0,0.0,90,16.43,1,0,0,0,0,0,...,0,0,0,11,0,1,0,10,2012-10-02 11:00:00,4767
3,0.0,0.0,90,16.98,1,0,0,0,0,0,...,0,0,0,12,0,1,0,10,2012-10-02 12:00:00,5026
4,0.0,0.0,75,17.99,1,0,0,0,0,0,...,0,0,0,13,0,1,0,10,2012-10-02 13:00:00,4918


In [3]:
# Feature frame: every column except the target, the raw timestamp and two
# calendar columns.
X = df.drop(columns=['date_time', 'traffic_volume', 'day_of_week', 'holiday_bool'])
# Target series.
y = df['traffic_volume']

In [4]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Custom transformer used by the data-preparation pipelines
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks the chosen columns out of a DataFrame.

    Taken from the book "Hands-On Machine Learning" by Aurélien Géron.

    Parameters
    ----------
    attribute_names : list of str
        Column labels to extract when ``transform`` is called.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as its underlying ``.values`` array."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values

In [6]:
# Columns that are standardized before modelling.
numerical_attributs = [
    'rain_1h', 'snow_1h', 'clouds_all', 'temp_c', 'hour', 'month',
]
# Weather indicator columns; these are selected but not scaled.
categorical_attributs = [
    'Clouds', 'Clear', 'Rain', 'Drizzle', 'Mist', 'Haze',
    'Fog', 'Thunderstorm', 'Snow', 'Squall', 'Smoke',
]
# NOTE(review): 'is_day_off' stays in X but is in neither list, so it never
# reaches the models — confirm this is intentional.

# Numerical branch: pick the columns, then scale them.
numerical_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(numerical_attributs)),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: pick the columns and pass them through as they are.
categorical_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(categorical_attributs)),
])

# Concatenate the outputs of both branches side by side.
full_pipeline = FeatureUnion(
    transformer_list=[
        ('num_pipeline', numerical_pipeline),
        ('cat_pipeline', categorical_pipeline),
    ],
)

In [7]:
# Fit the preprocessing pipeline on the training split only, then apply the
# already-fitted transformers to the test split (standardizes the numerical columns).
X_train_std = full_pipeline.fit_transform(X_train)
X_test_std = full_pipeline.transform(X_test)

### Test various models 

In [8]:
# Candidate regressors, keyed by the label used in the printed report.
models = {
    'linear_regression': LinearRegression(),
    'random_forest_regressor depth=5': RandomForestRegressor(max_depth=5, random_state=0, n_estimators=10),
}

# Fit each model on the training split and report R^2 and RMSE on both splits.
for model_name, model in models.items():
    model.fit(X_train_std, y_train)
    y_train_pred = model.predict(X_train_std)
    y_test_pred = model.predict(X_test_std)

    print(f"r2 score on training data using {model_name}: {r2_score(y_train, y_train_pred)}")
    print(f"r2 score on test data using {model_name}: {r2_score(y_test, y_test_pred)}\n")

    # The square root of the MSE is the *root* mean squared error, which is in
    # the same units as the target; label it as such instead of "mean squared error".
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    print(f"root mean squared error on training data using {model_name}: {rmse_train}")
    print(f"root mean squared error on test data using {model_name}: {rmse_test}\n")

r2 score on training data using linear_regression: 0.15124887305808843
r2 score on test data using linear_regression: 0.1356364250394908

mean squared error score on training data using linear_regression: 1828.1601902870916
mean squared error score on test data using linear_regression: 1845.8695949567602

r2 score on training data using random_forest_regressor depth=5: 0.7828900313428654
r2 score on test data using random_forest_regressor depth=5: 0.7790643614945432

mean squared error score on training data using random_forest_regressor depth=5: 924.6214152233316
mean squared error score on test data using random_forest_regressor depth=5: 933.2235201624292



In [10]:
# NOTE(review): disabled learning-curve experiment, kept for reference only.
# If it is re-enabled, `random_forest_regressor` is not defined anywhere in the
# visible notebook (the fitted forest lives in models['random_forest_regressor depth=5']),
# and the execution counter jumps from 8 to 10, so a cell may have been removed.
# def plot_learning_curves(model, X, y):
#     X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
#     train_errors, val_errors = [], []
#     for m in range(1, len(X_train)):
#         model.fit(X_train[:m], y_train[:m])
#         y_train_predict = model.predict(X_train[:m])
#         y_val_predict = model.predict(X_val)
#         train_errors.append(mean_squared_error(y_train_predict, y_train[:m]))
#         val_errors.append(mean_squared_error(y_val_predict, y_val))
#     plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
#     plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")

# plot_learning_curves(random_forest_regressor, X_train_std[:100], y_train[:100])