## Steps to be followed
1. Import libraries
2. Read data
3. Remove null values
4. Remove outliers
5. Extract hour, day of week, month from date time
6. Calculate distance - geodesic and haversine formula
7. Remove unnecessary columns
8. Separate catrgotical and numerical features
9. Make pipeline for data transformation
10. Try predicting using multiple models - simple, multiple, polynomial, ridge, lasso, elastic net

## Import libraries

In [44]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
import math
import warnings
warnings.dilterwarnings = "ignore"

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error

# Read data and basic statistics of data

In [2]:
raw_data = pd.read_csv("../data/raw.csv")
raw_data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [3]:
raw_data.describe()

Unnamed: 0.1,Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,27712500.0,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,16013820.0,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,13825350.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,27745500.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,41555300.0,12.5,-73.967154,40.767158,-73.963658,40.768001,2.0
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [4]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


# Remove null values

In [5]:
raw_data.isnull().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [6]:
# Since tehre are very few null values, we shall drop
raw_data = raw_data.dropna()
raw_data.isnull().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [7]:
# Extract temporal features and remove unnecessary features
raw_data['datetime'] = pd.to_datetime(raw_data['pickup_datetime'])
raw_data['month'] = raw_data['datetime'].dt.month
raw_data['hour'] = raw_data['datetime'].dt.hour
raw_data['day_of_week'] = raw_data['datetime'].dt.weekday
raw_data.drop(['Unnamed: 0', 'key', 'datetime', 'pickup_datetime'], axis=1, inplace=True)
raw_data.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,month,hour,day_of_week
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,5,19,3
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,7,20,4
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,8,21,0
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,6,8,4
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,8,17,3


# Remove outliers

In [8]:
Q1 = raw_data.quantile(0.25)
Q3 = raw_data.quantile(0.75)
IQR = Q3 - Q1
raw_data = raw_data[~((raw_data < (Q1 - 1.5 * IQR)) | (raw_data > (Q3 + 1.5 * IQR))).any(axis=1)]

In [9]:
raw_data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,month,hour,day_of_week
count,151022.0,151022.0,151022.0,151022.0,151022.0,151022.0,151022.0,151022.0,151022.0
mean,8.720266,-73.981784,40.752556,-73.980575,40.752778,1.260757,6.251904,13.602462,3.023672
std,3.942257,0.016173,0.021598,0.017229,0.023158,0.549094,3.446346,6.394607,1.936912
min,-3.5,-74.0294,40.686277,-74.032612,40.682587,0.0,1.0,0.0,0.0
25%,5.7,-73.992798,40.737846,-73.992159,40.737774,1.0,3.0,9.0,1.0
50%,7.7,-73.982838,40.753335,-73.981828,40.754058,1.0,6.0,14.0,3.0
75%,10.9,-73.971335,40.766672,-73.969875,40.767542,1.0,9.0,19.0,5.0
max,22.2,-73.929794,40.815686,-73.922062,40.819247,3.0,12.0,23.0,6.0


# Calculate distance using geodesic formula

In [10]:
raw_data.reset_index(inplace=True)

In [11]:
def calc_dist(row):
    point1 = (row['pickup_longitude'], row['pickup_latitude'])
    point2 = (row['dropoff_longitude'], row['dropoff_latitude'])
    return geodesic(point1, point2).meters

raw_data['distance_geodesic'] = raw_data.apply(calc_dist, axis=1)

In [12]:
raw_data.head()

Unnamed: 0,index,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,month,hour,day_of_week,distance_geodesic
0,0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,5,19,3,467.148242
1,1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,7,20,4,681.598098
2,2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,8,21,0,4841.545635
3,3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,6,8,4,1266.384983
4,5,4.9,-73.969019,40.75591,-73.969019,40.75591,1,2,2,5,0.0


In [13]:
def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.asin(math.sqrt(a))
    
    R = 6371.0
    distance = R * c
    
    return distance

distances = []

for i in range(len(raw_data)):
    lat1 = raw_data['pickup_latitude'][i]
    lat2 = raw_data['dropoff_latitude'][i]
    lon1 = raw_data['pickup_longitude'][i]
    lon2 = raw_data['dropoff_longitude'][i]
    dist = haversine(lat1, lon1, lat2, lon2)
    distances.append(dist * 1000)

raw_data['distance_haversine'] = distances

In [14]:
raw_data.head()

Unnamed: 0,index,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,month,hour,day_of_week,distance_geodesic,distance_haversine
0,0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,5,19,3,467.148242,1683.322752
1,1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,7,20,4,681.598098,2457.589884
2,2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,8,21,0,4841.545635,5036.37719
3,3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,6,8,4,1266.384983,1661.683458
4,5,4.9,-73.969019,40.75591,-73.969019,40.75591,1,2,2,5,0.0,0.0


In [15]:
raw_data.columns

Index(['index', 'fare_amount', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'month',
       'hour', 'day_of_week', 'distance_geodesic', 'distance_haversine'],
      dtype='object')

In [17]:
# cant understand why geodesic does not give the correct distance value
raw_data.drop(['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 
               'dropoff_latitude', 'distance_geodesic'], axis=1, inplace=True)

In [18]:
raw_data

Unnamed: 0,index,fare_amount,passenger_count,month,hour,day_of_week,distance_haversine
0,0,7.5,1,5,19,3,1683.322752
1,1,7.7,1,7,20,4,2457.589884
2,2,12.9,1,8,21,0,5036.377190
3,3,5.3,3,6,8,4,1661.683458
4,5,4.9,1,2,2,5,0.000000
...,...,...,...,...,...,...,...
151017,199994,12.0,1,1,14,4,1122.877892
151018,199995,3.0,1,10,10,6,112.210245
151019,199996,7.5,1,3,1,4,1875.050167
151020,199998,14.5,1,5,14,2,3539.715452


In [19]:
raw_data.drop('index', axis=1, inplace=True)

In [20]:
raw_data

Unnamed: 0,fare_amount,passenger_count,month,hour,day_of_week,distance_haversine
0,7.5,1,5,19,3,1683.322752
1,7.7,1,7,20,4,2457.589884
2,12.9,1,8,21,0,5036.377190
3,5.3,3,6,8,4,1661.683458
4,4.9,1,2,2,5,0.000000
...,...,...,...,...,...,...
151017,12.0,1,1,14,4,1122.877892
151018,3.0,1,10,10,6,112.210245
151019,7.5,1,3,1,4,1875.050167
151020,14.5,1,5,14,2,3539.715452


# Getting data ready for training

In [21]:
y = raw_data['fare_amount']
X = raw_data.drop('fare_amount', axis=1)
X.shape, y.shape

((151022, 5), (151022,))

In [22]:
X

Unnamed: 0,passenger_count,month,hour,day_of_week,distance_haversine
0,1,5,19,3,1683.322752
1,1,7,20,4,2457.589884
2,1,8,21,0,5036.377190
3,3,6,8,4,1661.683458
4,1,2,2,5,0.000000
...,...,...,...,...,...
151017,1,1,14,4,1122.877892
151018,1,10,10,6,112.210245
151019,1,3,1,4,1875.050167
151020,1,5,14,2,3539.715452


## Splitting into train and test

In [24]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120817, 5), (30205, 5), (120817,), (30205,))

In [28]:
categorical_feats = ['passenger_count', 'month', 'hour', 'day_of_week']
numerical_feats = ['distance_haversine']

In [29]:
ct = make_column_transformer(
    (StandardScaler(), numerical_feats),
    (OneHotEncoder(), categorical_feats)
)

In [33]:
X_train_fin = ct.fit_transform(X_train).toarray()
X_test_fin = ct.fit_transform(X_test).toarray()

In [34]:
one_hot_cols = ct.named_transformers_['onehotencoder'].get_feature_names_out(categorical_feats).tolist()
column_names = numerical_feats + one_hot_cols

In [35]:
X_train_trans = pd.DataFrame(X_train_fin, columns=column_names)
X_test_trans = pd.DataFrame(X_test_fin, columns=column_names)

X_train_trans

In [37]:
X_test_trans

Unnamed: 0,distance_haversine,passenger_count_0,passenger_count_1,passenger_count_2,passenger_count_3,month_1,month_2,month_3,month_4,month_5,...,hour_21,hour_22,hour_23,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,-1.396278,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-0.391360,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.278118,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-0.633672,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.543534,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30200,4.303397,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
30201,-0.368907,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
30202,-0.757250,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
30203,-0.460946,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Model training

### Multivariable linear

In [41]:
model1 = LinearRegression()
model1.fit(X_train_trans, y_train)
y_pred1 = model1.predict(X_test_trans)
rmse1 = np.sqrt(mean_squared_error(y_test, y_pred1))
rmse1

2.245384227633756

### Polynomial

In [54]:
degree = 2 # same result as 3
model2 = make_pipeline(PolynomialFeatures(), LinearRegression())
model2.fit(X_train_trans, y_train)
y_pred2 = model2.predict(X_test_trans)
rmse2 = np.sqrt(mean_squared_error(y_test, y_pred2))
rmse2

2.203617410162649

### Ridge

In [58]:
param_grid = {'alpha': [0.001, 0.1, 0.5, 1]}

grid_search = GridSearchCV(Ridge(), param_grid, cv=5)
grid_search.fit(X_train_trans, y_train)

f"best param: {grid_search.best_params_}, best score: {grid_search.best_score_}"

"best param: {'alpha': 1}, best score: 0.6730120628614713"

In [59]:
alpha = 1
model3 = Ridge(alpha=alpha)
model3.fit(X_train_trans, y_train)
y_pred3 = model3.predict(X_test_trans)
rmse3 = np.sqrt(mean_squared_error(y_test, y_pred3))
rmse3

2.2450455255509802

### Lasso

In [60]:
param_grid = {'alpha': [0.001, 0.1, 0.5, 1]}

grid_search = GridSearchCV(Lasso(), param_grid, cv=5)
grid_search.fit(X_train_trans, y_train)

f"best param: {grid_search.best_params_}, best score: {grid_search.best_score_}"

"best param: {'alpha': 0.001}, best score: 0.6729285435164248"

In [61]:
alpha = 0.001
model4 = Lasso(alpha=alpha)
model4.fit(X_train_trans, y_train)
y_pred4 = model4.predict(X_test_trans)
rmse4 = np.sqrt(mean_squared_error(y_test, y_pred4))
rmse4

2.245521078118362

### Elastic Net

In [63]:
param_grid = {
    'alpha': [0.001, 0.1, 0.5, 1, 10],
    'l1_ratio': [0.1, 0.5, 0.9]
}

grid_search = GridSearchCV(
    ElasticNet(),
    param_grid,
    n_jobs=-1,
    cv=5,
    scoring='neg_mean_squared_error'
)

grid_search.fit(X_train_trans, y_train)
f"best param: {grid_search.best_params_}, best score: {grid_search.best_score_}"

"best param: {'alpha': 0.001, 'l1_ratio': 0.1}, best score: -5.084593155237241"

In [64]:
alpha = 0.001
l1_ratio = 0.1
model5 = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
model5.fit(X_train_trans, y_train)
y_pred5 = model5.predict(X_test_trans)
rmse5 = np.sqrt(mean_squared_error(y_test, y_pred5))
rmse5

2.2453214656630953

## The polynomial linear regression model gave the best results 