#### Modeling

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
# Prepare a format string: time_format
time_format = '%Y-%m-%d %H:%M:%S'

# Convert to datetime object
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format=time_format) 
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'], format=time_format)

# Convert to Float datatype
df['trip_duration'] = df['trip_duration'].astype(float)
df['vendor_id'] = df['vendor_id'].astype(float)
df['passenger_count'] = df['passenger_count'].astype(float)

# Create Float columns to split datetime objects
df['pickup_year'] = df['pickup_datetime'].dt.year.astype(float)
df['pickup_month'] = df['pickup_datetime'].dt.month.astype(float)
df['pickup_day'] = df['pickup_datetime'].dt.day.astype(float)
df['pickup_hour'] = df['pickup_datetime'].dt.hour.astype(float)
df['pickup_minute'] = df['pickup_datetime'].dt.minute.astype(float)
df['pickup_second'] = df['pickup_datetime'].dt.second.astype(float)

df['dropoff_year'] = df['dropoff_datetime'].dt.year.astype(float)
df['dropoff_month'] = df['dropoff_datetime'].dt.month.astype(float)
df['dropoff_day'] = df['dropoff_datetime'].dt.day.astype(float)
df['dropoff_hour'] = df['dropoff_datetime'].dt.hour.astype(float)
df['dropoff_minute'] = df['dropoff_datetime'].dt.minute.astype(float)
df['dropoff_second'] = df['dropoff_datetime'].dt.second.astype(float)

# Convert to Category datatype
df['store_and_fwd_flag'] = df['store_and_fwd_flag'].astype('category')


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 23 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null float64
pickup_datetime       1458644 non-null datetime64[ns]
dropoff_datetime      1458644 non-null datetime64[ns]
passenger_count       1458644 non-null float64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null category
trip_duration         1458644 non-null float64
pickup_year           1458644 non-null float64
pickup_month          1458644 non-null float64
pickup_day            1458644 non-null float64
pickup_hour           1458644 non-null float64
pickup_minute         1458644 non-null float64
pickup_second         1458644 non-null float64
dropoff_year          1458644 non-null float64
dropoff_month         14586

In [4]:
# Prepare a format string: time_format
time_format = '%Y-%m-%d %H:%M:%S'

# Convert to datetime object (dropoff_datetime is missing)
df_test['pickup_datetime'] = pd.to_datetime(df_test['pickup_datetime'], format=time_format) 

# Convert to Float datatype
df_test['passenger_count'] = df_test['passenger_count'].astype(float)

# Create Float columns to split datetime objects
df_test['pickup_year'] = df_test['pickup_datetime'].dt.year.astype(float)
df_test['pickup_month'] = df_test['pickup_datetime'].dt.month.astype(float)
df_test['pickup_day'] = df_test['pickup_datetime'].dt.day.astype(float)
df_test['pickup_hour'] = df_test['pickup_datetime'].dt.hour.astype(float)
df_test['pickup_minute'] = df_test['pickup_datetime'].dt.minute.astype(float)
df_test['pickup_second'] = df_test['pickup_datetime'].dt.second.astype(float)

# Convert to Category datatype
df_test['store_and_fwd_flag'] = df_test['store_and_fwd_flag'].astype('category')

df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 15 columns):
id                    625134 non-null object
vendor_id             625134 non-null int64
pickup_datetime       625134 non-null datetime64[ns]
passenger_count       625134 non-null float64
pickup_longitude      625134 non-null float64
pickup_latitude       625134 non-null float64
dropoff_longitude     625134 non-null float64
dropoff_latitude      625134 non-null float64
store_and_fwd_flag    625134 non-null category
pickup_year           625134 non-null float64
pickup_month          625134 non-null float64
pickup_day            625134 non-null float64
pickup_hour           625134 non-null float64
pickup_minute         625134 non-null float64
pickup_second         625134 non-null float64
dtypes: category(1), datetime64[ns](1), float64(11), int64(1), object(1)
memory usage: 67.4+ MB


#### Split Data

In [5]:
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error

X = df.drop(['trip_duration','id','store_and_fwd_flag','dropoff_datetime','pickup_datetime'], axis=1).values
y = df['trip_duration'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

#### Linear Regression

In [6]:
from sklearn.linear_model import LinearRegression

In [7]:
steps = [('lr', LinearRegression())]

pipeline = Pipeline(steps)

params = {'lr__fit_intercept':[True,False], 'lr__normalize':[True,False], 'lr__copy_X':[True, False]}

linreg_cv = GridSearchCV(pipeline,params, cv=5)

linreg_cv.fit(X_train,y_train)

y_pred = linreg_cv.predict(X_test)

# Compute and print R^2 and RMSE
print("R^2: {}".format(linreg_cv.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error: {}".format(rmse))


print("Best Parameters: {}".format(linreg_cv.best_params_)) 
print("Best score is {}".format(linreg_cv.best_score_))

R^2: 0.8954384072188829
Root Mean Squared Error: 1033.1670044712448
Best Parameters: {'lr__copy_X': True, 'lr__fit_intercept': True, 'lr__normalize': True}
Best score is 0.9443147193143128


#### Ridge Regression

In [8]:
from sklearn.linear_model import RidgeCV

In [9]:
modelCV = RidgeCV(alphas = [0.1, 0.01, 0.001,0.0001],cv=5)
modelCV.fit(X_train,y_train)

RidgeCV(alphas=[0.1, 0.01, 0.001, 0.0001], cv=5, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [10]:
y_pred = modelCV.predict(X_test)

In [11]:
print("Best Alpha: {}".format(modelCV.alpha_))

# Compute and print R^2 and RMSE
print("R^2: {}".format(modelCV.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error: {}".format(rmse))

Best Alpha: 0.0001
R^2: -38718456.72970435
Root Mean Squared Error: 19881239.529856503


#### SVM

Not good for large datasets --> optional: linear kernels

#### Linear SVR

In [12]:
from sklearn.svm import LinearSVR, SVR
from sklearn.preprocessing import StandardScaler

In [13]:
steps = [('scaler', StandardScaler()),
         ('svr', LinearSVR())]

pipeline = Pipeline(steps)

params = {'svr__C':[0.1,1,10]} # higher C value, longer it takes

svr_cv = GridSearchCV(pipeline,params,cv=5,verbose=3)

In [14]:
svr_cv.fit(X_train,y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] svr__C=0.1 ......................................................
[CV] .......... svr__C=0.1, score=-0.005762317765246605, total=   1.9s
[CV] svr__C=0.1 ......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s


[CV] ........... svr__C=0.1, score=0.000569114028224682, total=   1.6s
[CV] svr__C=0.1 ......................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.8s remaining:    0.0s


[CV] ........... svr__C=0.1, score=-0.00269738323559654, total=   1.8s
[CV] svr__C=0.1 ......................................................
[CV] .......... svr__C=0.1, score=-0.008372429092449618, total=   1.8s
[CV] svr__C=0.1 ......................................................
[CV] ........... svr__C=0.1, score=0.001974491727495642, total=   1.8s
[CV] svr__C=1 ........................................................
[CV] ............ svr__C=1, score=-0.006185910255976035, total=   2.9s
[CV] svr__C=1 ........................................................
[CV] ............ svr__C=1, score=0.0006219375538523141, total=   2.6s
[CV] svr__C=1 ........................................................
[CV] ........... svr__C=1, score=-0.0029362958090561886, total=   2.8s
[CV] svr__C=1 ........................................................
[CV] ............ svr__C=1, score=-0.008840938059473835, total=   3.0s
[CV] svr__C=1 ........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  1.3min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svr', LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svr__C': [0.1, 1, 10]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=3)

In [15]:
y_pred = svr_cv.predict(X_test)

In [16]:
# Compute and print R^2 and RMSE
print("R^2: {}".format(svr_cv.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error: {}".format(rmse))

print("Best Parameters: {}".format(svr_cv.best_params_)) 
print("Best score is {}".format(svr_cv.best_score_))

R^2: -248039192.01585808
Root Mean Squared Error: 50320458.28536123
Best Parameters: {'svr__C': 0.1}
Best score is -0.0028577048675144877


#### SVR

Reduced the shape of dataframe to use SVR

In [17]:
df_Y = df[df['store_and_fwd_flag']=='Y'] 

df_Y.shape

(8045, 23)

In [18]:
X = df_Y.drop(['trip_duration','id','store_and_fwd_flag','dropoff_datetime','pickup_datetime'], axis=1).values
y = df_Y['trip_duration'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [19]:
steps = [('scaler', StandardScaler()),
         ('svr', SVR())]

pipeline = Pipeline(steps)

param_grid = {'svr__C': [0.1,1,10],'svr__gamma':[1,0.1,0.01]}

sv_cv = GridSearchCV(pipeline,param_grid,cv=5,verbose=3)

In [20]:
sv_cv.fit(X_train,y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] svr__C=0.1, svr__gamma=1 ........................................
[CV]  svr__C=0.1, svr__gamma=1, score=-0.0890075310862537, total=   1.1s
[CV] svr__C=0.1, svr__gamma=1 ........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s


[CV]  svr__C=0.1, svr__gamma=1, score=-0.08607996626587887, total=   1.0s
[CV] svr__C=0.1, svr__gamma=1 ........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.3s remaining:    0.0s


[CV]  svr__C=0.1, svr__gamma=1, score=-0.09765520052707499, total=   1.0s
[CV] svr__C=0.1, svr__gamma=1 ........................................
[CV]  svr__C=0.1, svr__gamma=1, score=-0.09372722942263657, total=   1.0s
[CV] svr__C=0.1, svr__gamma=1 ........................................
[CV]  svr__C=0.1, svr__gamma=1, score=-0.08590104630458972, total=   1.0s
[CV] svr__C=0.1, svr__gamma=0.1 ......................................
[CV]  svr__C=0.1, svr__gamma=0.1, score=-0.08339597482539851, total=   0.9s
[CV] svr__C=0.1, svr__gamma=0.1 ......................................
[CV]  svr__C=0.1, svr__gamma=0.1, score=-0.07954099004007653, total=   1.1s
[CV] svr__C=0.1, svr__gamma=0.1 ......................................
[CV]  svr__C=0.1, svr__gamma=0.1, score=-0.09051885705073093, total=   1.0s
[CV] svr__C=0.1, svr__gamma=0.1 ......................................
[CV]  svr__C=0.1, svr__gamma=0.1, score=-0.08576651395536761, total=   1.0s
[CV] svr__C=0.1, svr__gamma=0.1 ................

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  1.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svr__C': [0.1, 1, 10], 'svr__gamma': [1, 0.1, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [21]:
y_pred = sv_cv.predict(X_test)

In [22]:
# Compute and print R^2 and RMSE
print("R^2: {}".format(sv_cv.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error: {}".format(rmse))

print("Best Parameters: {}".format(sv_cv.best_params_)) 
print("Best score is {}".format(sv_cv.best_score_))

R^2: 0.2723144074095586
Root Mean Squared Error: 831.5939406947047
Best Parameters: {'svr__C': 10, 'svr__gamma': 0.1}
Best score is 0.24313717775684524
