# Data Load

In [26]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [62]:
import pandas as pd

file_path = "/content/drive/MyDrive/ny_taxi/"
file_name = "Distilled_2023_Yellow_Taxi_Trip_Data.txt"

data = pd.read_csv(
    file_path + file_name
)

data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,07/29/2023 09:36:48 AM,07/29/2023 09:38:56 AM,2.0,0.6,1.0,N,48.0,142.0,1.0,5.1,2.5,0.5,1.8,0.0,1.0,10.9,2.5,0.0
1,1,01/10/2023 10:13:12 AM,01/10/2023 10:27:46 AM,0.0,1.9,1.0,N,234.0,211.0,1.0,12.1,2.5,0.5,4.0,0.0,1.0,20.1,2.5,0.0
2,1,05/11/2023 02:07:59 PM,05/11/2023 02:21:58 PM,1.0,1.7,1.0,N,68.0,161.0,2.0,13.5,2.5,0.5,0.0,0.0,1.0,17.5,2.5,0.0
3,2,05/10/2023 12:42:14 PM,05/10/2023 12:57:27 PM,1.0,1.34,1.0,N,163.0,237.0,1.0,14.2,0.0,0.5,1.5,0.0,1.0,19.7,2.5,0.0
4,2,01/03/2023 09:37:59 AM,01/03/2023 09:45:31 AM,1.0,1.22,1.0,N,234.0,211.0,1.0,9.3,0.0,0.5,3.33,0.0,1.0,16.63,2.5,0.0


## Validation

In [67]:
data.isna().sum()

Unnamed: 0,0
VendorID,0
tpep_pickup_datetime,0
tpep_dropoff_datetime,0
passenger_count,0
trip_distance,0
RatecodeID,0
PULocationID,0
DOLocationID,0
payment_type,0
fare_amount,0


In [68]:
len(data)

1233923

## Handling missing values

In [65]:
# handling missing values
data["airport_fee"] = data["airport_fee"].fillna(0)
data["congestion_surcharge"] = data["congestion_surcharge"].fillna(0)
data["passenger_count"] = data["passenger_count"].fillna(1)
data["RatecodeID"] = data["RatecodeID"].fillna(99)
data = data.drop(columns = ["store_and_fwd_flag"])

In [66]:
data = data.dropna()

## Data splitting

In [69]:
# shuffle and split data
data = data.sample(frac=1, random_state=432).reset_index(drop=True)

targets = data["tip_amount"]
features = data.drop(columns = ["tip_amount","tpep_pickup_datetime","tpep_dropoff_datetime"])

x_train, x_test = features.iloc[:-800000], features.iloc[-800000:]
y_train, y_test = targets.iloc[:-800000], targets.iloc[-800000:]

len(x_test), len(y_test), len(x_train), len(y_train)

(800000, 800000, 433923, 433923)

## Additional adjustments

In [76]:
data.select_dtypes(include="number").agg(["min","max","mean"]).T

Unnamed: 0,min,max,mean
VendorID,1.0,6.0,1.747552
passenger_count,0.0,9.0,1.365559
trip_distance,0.51,94.02,3.666041
RatecodeID,1.0,99.0,4.273419
PULocationID,1.0,265.0,164.950471
DOLocationID,1.0,265.0,163.493712
payment_type,0.0,4.0,1.15801
fare_amount,0.0,633.0,20.321872
extra,0.0,13.75,1.604287
mta_tax,0.0,4.0,0.496943


In [73]:
data = data[data["trip_distance"]>0.5]
data = data[data["trip_distance"]<100]

data = data[~data.select_dtypes("number").lt(0).any(axis=1)]

In [74]:
data = data[data["total_amount"]>3.7]
data = data[data["total_amount"]<1000]

In [None]:
!pip install --upgrade xgboost==1.7.6 cupy-cuda11x



## Model training and evaluation

In [77]:
# define and train the model
from xgboost import XGBRegressor

model = XGBRegressor()

model.fit(x_train, y_train)
predictions = model.predict(x_test)
predictions

array([ 1.3911767 ,  2.1413689 ,  2.5912926 , ...,  3.9655797 ,
       -0.03262945,  1.9143916 ], dtype=float32)

In [80]:
# model evaluation
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, predictions)
mae

0.23221938694728644