In [35]:
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

print('Modules imported.')

Modules imported.


# Import and parse the dataset

In [2]:
train = pd.read_csv('./dataset/train.csv')

### Split pickup and dropoff columns to corresponding date and time columns

In [3]:
train['pickup_date'], train['pickup_time'] = zip(*train['pickup_datetime'].apply(lambda x: x.split(' ')))
train['dropoff_date'], train['dropoff_time'] = zip(*train['dropoff_datetime'].apply(lambda x: x.split(' ')))

In [4]:
pickup_date_year, pickup_date_month, pickup_date_day = zip(*train['pickup_date'].apply(lambda x: x.split('-')))
dropoff_date_year, dropoff_date_month, dropoff_date_day = zip(*train['dropoff_date'].apply(lambda x: x.split('-')))

In [5]:
pickup_date_month = pd.to_numeric(pickup_date_month)
pickup_date_day = pd.to_numeric(pickup_date_day)

dropoff_date_month = pd.to_numeric(dropoff_date_month)
dropoff_date_day = pd.to_numeric(dropoff_date_day)

In [6]:
pickup_days = (pickup_date_month * 30) + pickup_date_day
dropoff_days = (dropoff_date_month * 30) + dropoff_date_day

train['pickup_days'] = pickup_days
train['dropoff_days'] = dropoff_days

In [7]:
pickup_hours = pd.to_numeric(train.pickup_time.str.slice(0, 2))
dropoff_hours = pd.to_numeric(train.dropoff_time.str.slice(0, 2))

train['pickup_hours'] = pickup_hours
train['dropoff_hours'] = dropoff_hours

In [8]:
fwd_flag = pd.get_dummies(train.store_and_fwd_flag).iloc[:,1]
train['fwd_flag'] = fwd_flag

In [9]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_date,pickup_time,dropoff_date,dropoff_time,pickup_days,dropoff_days,pickup_hours,dropoff_hours,fwd_flag
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,2016-03-14,17:24:55,2016-03-14,17:32:30,104,104,17,17,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,2016-06-12,00:43:35,2016-06-12,00:54:38,192,192,0,0,0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,2016-01-19,11:35:24,2016-01-19,12:10:48,49,49,11,12,0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,2016-04-06,19:32:31,2016-04-06,19:39:40,126,126,19,19,0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,2016-03-26,13:30:55,2016-03-26,13:38:10,116,116,13,13,0


In [31]:
X = pd.DataFrame(data=train, columns=['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'pickup_days', 'dropoff_days', 'pickup_hours', 'dropoff_hours', 'fwd_flag'])
y = pd.DataFrame(data=train, columns=['trip_duration'])

In [38]:
X.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_days,dropoff_days,pickup_hours,dropoff_hours,fwd_flag
0,2,1,-73.982155,40.767937,-73.96463,40.765602,104,104,17,17,0
1,1,1,-73.980415,40.738564,-73.999481,40.731152,192,192,0,0,0
2,2,1,-73.979027,40.763939,-74.005333,40.710087,49,49,11,12,0
3,2,1,-74.01004,40.719971,-74.012268,40.706718,126,126,19,19,0
4,2,1,-73.973053,40.793209,-73.972923,40.78252,116,116,13,13,0


# Splitting the dataset into train and test

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [37]:
X_train.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_days,dropoff_days,pickup_hours,dropoff_hours,fwd_flag
879655,1,1,-73.955551,40.773346,-73.97364,40.7635,38,39,23,0,0
646838,2,1,-73.962181,40.763599,-73.980377,40.764919,95,95,9,10,0
1138713,1,1,-73.977486,40.751842,-74.011688,40.718925,129,129,16,16,0
864716,1,1,-73.970001,40.762363,-73.963264,40.774666,36,36,11,11,0
434927,1,1,-73.950348,40.771561,-73.968178,40.762409,206,206,9,9,0


# Feature scaling

### Standardization

In [27]:
scaler = preprocessing.StandardScaler().fit(train_features)

In [30]:
scaler.transform(train_features)

array([[ 0.9323801 , -0.50563722, -0.12226117, ...,  0.5302625 ,
         0.52456292, -0.07447137],
       [-1.07252397, -0.50563722, -0.09772722, ..., -2.12611574,
        -2.09740503, -0.07447137],
       [ 0.9323801 , -0.50563722, -0.07814311, ..., -0.40728276,
        -0.24660413, -0.07447137],
       ..., 
       [ 0.9323801 , -0.50563722,  0.20249083, ..., -1.18857048,
        -1.01777117, -0.07447137],
       [-1.07252397, -0.50563722, -0.12118512, ...,  0.21774741,
         0.37032951, -0.07447137],
       [-1.07252397, -0.50563722, -0.08535264, ...,  0.06148987,
         0.06186269, -0.07447137]])