In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

In [28]:
data = pd.read_csv(r'E:\CampusX\DSML_2\ny-taxi-project\data\raw\train.csv')

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


In [24]:
data.sample(10)

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
381187,2,2016-01-30 09:34:02,2016-01-30 09:40:49,2,-73.984383,40.745609,-73.990067,40.75658,N,407
1291600,2,2016-03-20 12:11:18,2016-03-20 12:21:17,5,-73.97287,40.753571,-73.953079,40.782661,N,599
1114284,1,2016-04-09 10:15:23,2016-04-09 10:21:41,1,-73.947334,40.796684,-73.951523,40.810368,N,378
1312321,1,2016-01-11 09:20:13,2016-01-11 09:43:05,1,-73.958031,40.776051,-73.979111,40.755836,N,1372
1031109,2,2016-06-16 08:44:19,2016-06-16 08:50:44,1,-73.987823,40.755154,-73.992317,40.738605,N,385
441911,2,2016-03-11 18:01:10,2016-03-11 19:03:04,5,-73.959114,40.760967,-73.789009,40.642029,N,3714
576701,2,2016-04-25 00:59:36,2016-04-25 01:00:41,6,-73.944946,40.834221,-73.948158,40.829731,N,65
163081,2,2016-06-02 22:18:54,2016-06-02 22:34:41,3,-73.983566,40.742283,-74.011871,40.721092,N,947
415687,2,2016-02-08 08:48:38,2016-02-08 09:10:03,1,-74.007729,40.704529,-73.973305,40.762459,N,1285
434567,2,2016-04-07 10:31:09,2016-04-07 10:55:23,1,-73.984131,40.761551,-74.015549,40.711491,N,1454


In [12]:
data.shape

(1458644, 11)

In [15]:
data.duplicated().sum()

0

In [14]:
data.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0


In [18]:
data.isnull().sum()

vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [30]:
data=data.drop(columns='id',axis=1)

In [19]:
data['vendor_id'].value_counts()

vendor_id
2    780302
1    678342
Name: count, dtype: int64

In [20]:
data['passenger_count'].value_counts()

passenger_count
1    1033540
2     210318
5      78088
3      59896
6      48333
4      28404
0         60
7          3
9          1
8          1
Name: count, dtype: int64

In [21]:
data['store_and_fwd_flag'].value_counts()

store_and_fwd_flag
N    1450599
Y       8045
Name: count, dtype: int64

In [31]:
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])
data['dropoff_datetime'] = pd.to_datetime(data['dropoff_datetime'])

In [32]:
data["pickup_hour"] = data['pickup_datetime'].dt.hour
data["pickup_minute"] = data['pickup_datetime'].dt.minute
data["pickup_second"] = data['pickup_datetime'].dt.second
data["pickup_day_week"] = data['pickup_datetime'].dt.dayofweek
data["pickup_month"] = data['pickup_datetime'].dt.month
data["pickup_minute_of_the_day"] = data["pickup_hour"]*60+data["pickup_minute"]

In [35]:
data["dropoff_hour"] = data['dropoff_datetime'].dt.hour
data["dropoff_minute"] = data['dropoff_datetime'].dt.minute
data["dropoff_second"] = data['dropoff_datetime'].dt.second
data["dropoff_day_week"] = data['dropoff_datetime'].dt.dayofweek
data["dropoff_month"] = data['dropoff_datetime'].dt.month
data["dropoff_minute_of_the_day"] = data["dropoff_hour"]*60+data["dropoff_minute"]

In [36]:
data.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,...,pickup_second,pickup_day_week,pickup_month,pickup_minute_of_the_day,dropoff_hour,dropoff_minute,dropoff_second,dropoff_day_week,dropoff_month,dropoff_minute_of_the_day
0,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,...,55,0,3,1044,17,32,30,0,3,1052
1,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,...,35,6,6,43,0,54,38,6,6,54
2,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,...,24,1,1,695,12,10,48,1,1,730
3,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,...,31,2,4,1172,19,39,40,2,4,1179
4,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,...,55,5,3,810,13,38,10,5,3,818


In [37]:
data.drop(columns=["pickup_datetime", "dropoff_datetime"], axis=1, inplace=True)

In [38]:
data["trip_duration"].value_counts()

trip_duration
368      1624
408      1584
348      1582
367      1581
358      1577
         ... 
7378        1
83250       1
6615        1
34014       1
6124        1
Name: count, Length: 7417, dtype: int64

In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 20 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   vendor_id                  1458644 non-null  int64  
 1   passenger_count            1458644 non-null  int64  
 2   pickup_longitude           1458644 non-null  float64
 3   pickup_latitude            1458644 non-null  float64
 4   dropoff_longitude          1458644 non-null  float64
 5   dropoff_latitude           1458644 non-null  float64
 6   store_and_fwd_flag         1458644 non-null  object 
 7   trip_duration              1458644 non-null  int64  
 8   pickup_hour                1458644 non-null  int32  
 9   pickup_minute              1458644 non-null  int32  
 10  pickup_second              1458644 non-null  int32  
 11  pickup_day_week            1458644 non-null  int32  
 12  pickup_month               1458644 non-null  int32  
 13  pickup_minut

In [42]:
cat_columns = data.columns[data.dtypes=='object']
num_columns = data.columns[(data.dtypes=='float') | (data.dtypes=='int')]

In [43]:
num_columns

Index(['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'pickup_hour', 'pickup_minute', 'pickup_second',
       'pickup_day_week', 'pickup_month', 'pickup_minute_of_the_day',
       'dropoff_hour', 'dropoff_minute', 'dropoff_second', 'dropoff_day_week',
       'dropoff_month', 'dropoff_minute_of_the_day'],
      dtype='object')

In [45]:
X= data.drop(columns='trip_duration',axis=1)
y = data.trip_duration

In [46]:
X.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_hour,pickup_minute,pickup_second,pickup_day_week,pickup_month,pickup_minute_of_the_day,dropoff_hour,dropoff_minute,dropoff_second,dropoff_day_week,dropoff_month,dropoff_minute_of_the_day
0,2,1,-73.982155,40.767937,-73.96463,40.765602,N,17,24,55,0,3,1044,17,32,30,0,3,1052
1,1,1,-73.980415,40.738564,-73.999481,40.731152,N,0,43,35,6,6,43,0,54,38,6,6,54
2,2,1,-73.979027,40.763939,-74.005333,40.710087,N,11,35,24,1,1,695,12,10,48,1,1,730
3,2,1,-74.01004,40.719971,-74.012268,40.706718,N,19,32,31,2,4,1172,19,39,40,2,4,1179
4,2,1,-73.973053,40.793209,-73.972923,40.78252,N,13,30,55,5,3,810,13,38,10,5,3,818


In [47]:
y.head()

0     455
1     663
2    2124
3     429
4     435
Name: trip_duration, dtype: int64

In [49]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1166915, 19), (291729, 19), (1166915,), (291729,))

In [52]:
preprocessor = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(drop='first',dtype=np.int32),cat_columns),
    ('ss',StandardScaler(),num_columns)
], remainder='passthrough'
)

In [53]:
X_train= preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [55]:
preprocessor.get_feature_names_out()

array(['ohe__store_and_fwd_flag_Y', 'ss__pickup_longitude',
       'ss__pickup_latitude', 'ss__dropoff_longitude',
       'ss__dropoff_latitude', 'ss__pickup_hour', 'ss__pickup_minute',
       'ss__pickup_second', 'ss__pickup_day_week', 'ss__pickup_month',
       'ss__pickup_minute_of_the_day', 'ss__dropoff_hour',
       'ss__dropoff_minute', 'ss__dropoff_second', 'ss__dropoff_day_week',
       'ss__dropoff_month', 'ss__dropoff_minute_of_the_day',
       'remainder__vendor_id', 'remainder__passenger_count'], dtype=object)

In [56]:
def evaluate_model(true, predict):
    r2score = r2_score(true,predict)
    mse = mean_squared_error(true, predict)
    mae = mean_absolute_error(true, predict)
    rmse = np.sqrt(mse)

    return r2score, mse, mae, rmse