# Polynomial Regression

In [32]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import nextbike.preprocessing.Preprocessing as prep

from scipy.stats import zscore
import numpy as np
from vincenty import vincenty 
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score
from sklearn.linear_model import Lasso,Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
from sklearn.svm import SVR
import time

In [33]:
df = pd.read_csv('../data/processed/dortmund_trips.csv', index_col=0)

Unnamed: 0,datetime_start,b_number,latitude_start,p_name_start,longitude_start,datetime_end,latitude_end,p_name_end,longitude_end,trip_duration,...,weekday,weekend,day,month,hour,minute,day_of_year,temperature °C,precipitation in mm,precipitation
0,2019-01-20 16:22:00,50641,51.506312,Hainallee / Südbad,7.470531,2019-01-20 17:00:00,51.493966,TU Dortmund Emil-Figge-Straße 50,7.418008,38,...,6,1,20,1,16,22,20,0.5,0.0,0
1,2019-01-20 16:42:00,53940,51.507457,Möllerbrücke,7.451364,2019-01-20 16:44:00,51.507457,Möllerbrücke,7.451364,2,...,6,1,20,1,16,42,20,0.5,0.0,0
2,2019-01-20 16:53:00,50061,51.503293,Vinckeplatz,7.455822,2019-01-20 17:13:00,51.519332,Cinestar,7.460124,20,...,6,1,20,1,16,53,20,0.5,0.0,0
3,2019-01-20 16:35:00,51138,51.499039,Steigenberger Hotel / Berswordtstr.,7.451472,2019-01-20 16:37:00,51.499039,Steigenberger Hotel / Berswordtstr.,7.451472,2,...,6,1,20,1,16,35,20,0.5,0.0,0
4,2019-01-20 16:43:00,53120,51.507457,Möllerbrücke,7.451364,2019-01-20 17:02:00,51.512836,Am Kaiserbrunnen,7.482258,19,...,6,1,20,1,16,43,20,0.5,0.0,0


In [28]:
X = df[["month","weekday","day","day_of_year", "hour","minute","latitude_start","longitude_start","area_start",
        "temperature °C","precipitation","distanceToUniversity","distaneToCentralStation"]]
y = df["trip_duration"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [5]:
name, poly_degree, r2, rmse, mae, exetime, desc = [],[],[],[],[],[],[]

In [29]:
def polynomial_reg(modelname, estimator, degree): 
    start = time.time()
        
    poly_reg = PolynomialFeatures(degree = degree)
    x_poly = poly_reg.fit_transform(X_train)

    model = estimator
    model.fit(x_poly, y_train)

    # evaluate the model on the second set of data
    y_pred = model.predict(poly_reg.transform(X_test))
    y_pred_train = model.predict(x_poly)
    end = time.time()

    print("w/o cross-validation:")
    print("R^2-Score is: {}".format(r2_score(y_train, y_pred_train)))
    print("RMSE: {}".format(np.sqrt(mean_squared_error(y_train,y_pred_train))))
    print("MAE: {}".format(mean_absolute_error(y_train,y_pred_train)))
    print("")
    print("w/ cross-validation:")
    print ('The R^2 is: ',r2_score(y_test, y_pred))
    print("RMSE: {}".format(np.sqrt(mean_squared_error(y_test,y_pred))))
    print("MAE: {}".format(mean_absolute_error(y_test,y_pred)))  
    
    name.append(modelname)
    r2.append(r2_score(y_test, y_pred))
    rmse.append(np.sqrt(mean_squared_error(y_test,y_pred)))
    mae.append(mean_absolute_error(y_test,y_pred))
    exetime.append((end-start)/60)
    desc.append(estimator)
    poly_degree.append(degree)

In [30]:
polynomial_reg("LinearRegression", LinearRegression(), 3)

w/o cross-validation:
R^2-Score is: 0.09835551121387343
RMSE: 79.32163459583923
MAE: 38.032720029810875

w/ cross-validation:
The R^2 is:  0.08659253425274016
RMSE: 79.44699137676392
MAE: 38.43744729750855


In [8]:
polynomial_reg("Linear Regression", LinearRegression(), 4)

w/o cross-validation:
R^2-Score is: 0.12309696562700068
RMSE: 78.22575715692106
MAE: 37.69146121759359

w/ cross-validation:
The R^2 is:  0.09749382423901798
RMSE: 78.97147833539839
MAE: 38.40895084243636


In [19]:
polynomial_reg("Ridge", Ridge(), 3)

w/o cross-validation:
R^2-Score is: 0.08499550483787077
RMSE: 79.90714293177551
MAE: 38.211166983445956

w/ cross-validation:
The R^2 is:  0.07701274280084802
RMSE: 79.86252358604241
MAE: 38.533652105893175


In [10]:
polynomial_reg("Ridge", Ridge(), degree=4)

w/o cross-validation:
R^2-Score is: 0.11945832236303788
RMSE: 78.38788509791337
MAE: 37.68849258877771

w/ cross-validation:
The R^2 is:  0.09784736503780567
RMSE: 78.95600898179521
MAE: 38.28547002842608


In [11]:
# hyperparameters found with RandomizedSearch 
polynomial_reg("Ridge", Ridge(solver="cholesky", max_iter=40, fit_intercept=True, copy_X=False, alpha=50), degree=4)

w/o cross-validation:
R^2-Score is: -2499437305.065168
RMSE: 4176332.9326353404
MAE: 4176332.9318983196

w/ cross-validation:
The R^2 is:  0.09697179615327667
RMSE: 78.994314395195
MAE: 38.27848342046496


In [12]:
polynomial_reg("Lasso", Lasso(), 4)

w/o cross-validation:
R^2-Score is: 0.10021257824447527
RMSE: 79.23990531021094
MAE: 37.71591936796588

w/ cross-validation:
The R^2 is:  0.0874045622790568
RMSE: 79.41166895461856
MAE: 38.1402407131487


In [13]:
polynomial_reg("Lasso",Lasso(max_iter=40,fit_intercept=True, copy_X=False, alpha=50), 4)

w/o cross-validation:
R^2-Score is: 0.07559283386290228
RMSE: 80.31666016389617
MAE: 38.31306546747794

w/ cross-validation:
The R^2 is:  0.06881700410425473
RMSE: 80.21631275930133
MAE: 38.56919831882716


In [31]:
pd.set_option('display.max_colwidth', -1)

results = {
    "Polynomial Regression with": name, 
    "Degree":poly_degree, 
    "R2": r2, 
    "RMSE":rmse, 
    "MAE": mae, 
    "Execution time (min)":exetime,
    "Description":desc
}

df_result = pd.DataFrame(results)
df_result

Unnamed: 0,Polynomial Regression with,Degree,R2,RMSE,MAE,Execution time (min),Description
0,LinearRegression,3,0.078448,79.800397,38.606739,0.094502,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
1,Linear Regression,4,0.097494,78.971478,38.408951,0.88228,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
2,Ridge,3,0.077013,79.862524,38.533652,0.043932,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)"
3,Ridge,4,0.097847,78.956009,38.28547,1.774598,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)"
4,Ridge,4,0.096972,78.994314,38.278483,0.201598,"Ridge(alpha=50, copy_X=False, fit_intercept=True, max_iter=40, normalize=False,\n random_state=None, solver='cholesky', tol=0.001)"
5,Lasso,4,0.087405,79.411669,38.140241,4.49695,"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n normalize=False, positive=False, precompute=False, random_state=None,\n selection='cyclic', tol=0.0001, warm_start=False)"
6,Lasso,4,0.068817,80.216313,38.569198,0.439745,"Lasso(alpha=50, copy_X=False, fit_intercept=True, max_iter=40, normalize=False,\n positive=False, precompute=False, random_state=None, selection='cyclic',\n tol=0.0001, warm_start=False)"
7,LinearRegression,3,0.078448,79.800397,38.606739,0.093012,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
8,Ridge,3,0.077013,79.862524,38.533652,0.039224,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)"
9,LinearRegression,3,0.078448,79.800397,38.606739,0.095075,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
