# Polynomial Regression

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import nextbike.preprocessing.Preprocessing as prep

from scipy.stats import zscore
import numpy as np
from vincenty import vincenty 
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score
from sklearn.linear_model import Lasso,Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
from sklearn.svm import SVR
import time

In [2]:
df = pd.read_csv('../data/processed/dortmund_trips.csv', index_col=0)

In [3]:
X = df[["month","weekday","day","day_of_year", "hour","minute","latitude_start","longitude_start","area_start",
        "temperature °C","precipitation","distanceToUniversity","distanceToCentralStation"]]
y = df["trip_duration"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [4]:
name, poly_degree, r2, rmse, mae, exetime, desc = [],[],[],[],[],[],[] # arrays for results

In [5]:
def polynomial_reg(modelname, estimator, degree): 
    start = time.time()
        
    poly_reg = PolynomialFeatures(degree = degree)
    x_poly = poly_reg.fit_transform(X_train)

    model = estimator
    model.fit(x_poly, y_train)

    # evaluate the model on the second set of data
    y_pred = model.predict(poly_reg.transform(X_test))
    y_pred_train = model.predict(x_poly)
    end = time.time()

    print("w/o cross-validation:")
    print("R^2-Score is: {}".format(r2_score(y_train, y_pred_train)))
    print("RMSE: {}".format(np.sqrt(mean_squared_error(y_train,y_pred_train))))
    print("MAE: {}".format(mean_absolute_error(y_train,y_pred_train)))
    print("")
    print("w/ cross-validation:")
    print ('The R^2 is: ',r2_score(y_test, y_pred))
    print("RMSE: {}".format(np.sqrt(mean_squared_error(y_test,y_pred))))
    print("MAE: {}".format(mean_absolute_error(y_test,y_pred)))  
    
    name.append(modelname)
    r2.append(r2_score(y_test, y_pred))
    rmse.append(np.sqrt(mean_squared_error(y_test,y_pred)))
    mae.append(mean_absolute_error(y_test,y_pred))
    exetime.append((end-start)/60)
    desc.append(estimator)
    poly_degree.append(degree)

In [6]:
polynomial_reg("LinearRegression", LinearRegression(), 3)

w/o cross-validation:
R^2-Score is: 0.09863347080824947
RMSE: 79.30940698885384
MAE: 37.94287238934919

w/ cross-validation:
The R^2 is:  0.08792723768339195
RMSE: 79.3889247739953
MAE: 38.31425362922783


In [7]:
polynomial_reg("Linear Regression", LinearRegression(), 4)

w/o cross-validation:
R^2-Score is: 0.1379965783694329
RMSE: 77.55833618716143
MAE: 37.665970106329034

w/ cross-validation:
The R^2 is:  0.10175403992243914
RMSE: 78.78486825482919
MAE: 38.54711356428935


In [8]:
polynomial_reg("Ridge", Ridge(), 3)

w/o cross-validation:
R^2-Score is: 0.09595897945030096
RMSE: 79.42698136473796
MAE: 37.95368980433868

w/ cross-validation:
The R^2 is:  0.0857564503279592
RMSE: 79.48334380516425
MAE: 38.294657432120054


In [9]:
polynomial_reg("Ridge", Ridge(), degree=4)

w/o cross-validation:
R^2-Score is: 0.14004079508598188
RMSE: 77.46631790054701
MAE: 37.10508858535518

w/ cross-validation:
The R^2 is:  0.1101364297740931
RMSE: 78.41639822597739
MAE: 37.87658796820659


In [10]:
# hyperparameters found with RandomizedSearch 
polynomial_reg("Ridge", Ridge(solver="cholesky", max_iter=40, fit_intercept=True, copy_X=False, alpha=50), degree=4)

w/o cross-validation:
R^2-Score is: -1832712967.2220135
RMSE: 3576195.1960099083
MAE: 3576195.1951687094

w/ cross-validation:
The R^2 is:  0.10981951893216191
RMSE: 78.430360363228
MAE: 37.859364546529946


In [11]:
polynomial_reg("Lasso", Lasso(), 4)

w/o cross-validation:
R^2-Score is: 0.10232069152286005
RMSE: 79.147025230541
MAE: 37.69723938493105

w/ cross-validation:
The R^2 is:  0.08897553320083451
RMSE: 79.34328862281411
MAE: 38.124820170860055


In [12]:
polynomial_reg("Lasso",Lasso(max_iter=40,fit_intercept=True, copy_X=False, alpha=50), 4)

w/o cross-validation:
R^2-Score is: 0.07921017190506452
RMSE: 80.1593608145324
MAE: 38.20685455231052

w/ cross-validation:
The R^2 is:  0.07197270068594519
RMSE: 80.08027442145028
MAE: 38.48503231469628


In [13]:
pd.set_option('display.max_colwidth', -1)

results = {
    "Polynomial Regression with": name, 
    "Degree":poly_degree, 
    "R2": r2, 
    "RMSE":rmse, 
    "MAE": mae, 
    "Execution time (min)":exetime,
    "Description":desc
}

df_result = pd.DataFrame(results)
df_result

Unnamed: 0,Polynomial Regression with,Degree,R2,RMSE,MAE,Execution time (min),Description
0,LinearRegression,3,0.087927,79.388925,38.314254,0.122343,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
1,Linear Regression,4,0.101754,78.784868,38.547114,1.50727,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
2,Ridge,3,0.085756,79.483344,38.294657,0.053588,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)"
3,Ridge,4,0.110136,78.416398,37.876588,2.770713,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)"
4,Ridge,4,0.10982,78.43036,37.859365,2.76263,"Ridge(alpha=50, copy_X=False, fit_intercept=True, max_iter=40, normalize=False,\n random_state=None, solver='cholesky', tol=0.001)"
5,Lasso,4,0.088976,79.343289,38.12482,6.450144,"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n normalize=False, positive=False, precompute=False, random_state=None,\n selection='cyclic', tol=0.0001, warm_start=False)"
6,Lasso,4,0.071973,80.080274,38.485032,1.089877,"Lasso(alpha=50, copy_X=False, fit_intercept=True, max_iter=40, normalize=False,\n positive=False, precompute=False, random_state=None, selection='cyclic',\n tol=0.0001, warm_start=False)"
