# Taxi Trip Data Analysis
This notebook analyzes the 2017 Yellow Taxi Trip data to understand the relationships between trip features and fare amounts.

## Sections
1. Data Loading
2. Data Cleaning
3. Feature Engineering
4. Regression Modeling
5. Model Evaluation
6. Insights and Conclusions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm


In [2]:
# Read the 2017 Yellow Taxi trip records from the CSV and preview the first rows.
csv_path = '2017_Yellow_Taxi_Trip_Data.csv'
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,24870114,2,03/25/2017 8:55:43 AM,03/25/2017 9:09:47 AM,6,3.34,1,N,100,231,1,13.0,0.0,0.5,2.76,0.0,0.3,16.56
1,35634249,1,04/11/2017 2:53:28 PM,04/11/2017 3:19:58 PM,1,1.8,1,N,186,43,1,16.0,0.0,0.5,4.0,0.0,0.3,20.8
2,106203690,1,12/15/2017 7:26:56 AM,12/15/2017 7:34:08 AM,1,1.0,1,N,262,236,1,6.5,0.0,0.5,1.45,0.0,0.3,8.75
3,38942136,2,05/07/2017 1:17:59 PM,05/07/2017 1:48:14 PM,1,3.7,1,N,188,97,1,20.5,0.0,0.5,6.39,0.0,0.3,27.69
4,30841670,2,04/15/2017 11:32:20 PM,04/15/2017 11:49:03 PM,1,4.37,1,N,4,112,2,16.5,0.5,0.5,0.0,0.0,0.3,17.8


In [3]:
# Remove fully duplicated rows, then rows missing any column the model needs.
# `df` is rebound to a new frame instead of using `inplace=True`, so the frame
# produced by the loading cell is not mutated behind the reader's back and the
# two steps read as a single chain.
required_columns = ['fare_amount', 'trip_distance', 'passenger_count', 'payment_type']
df = df.drop_duplicates().dropna(subset=required_columns)


In [4]:
# Keep only trips with a positive fare, a positive distance and a positive passenger count.
positive_fare = df['fare_amount'] > 0
positive_distance = df['trip_distance'] > 0
positive_passengers = df['passenger_count'] > 0
df = df[positive_fare & positive_distance & positive_passengers]


In [5]:
# Response column and predictor columns for the fare model.
target = 'fare_amount'
features = ['trip_distance', 'passenger_count', 'payment_type']

y = df[target]
X = df[features]


In [6]:
# One-hot encode payment_type; drop_first=True drops the first level so it acts
# as the baseline category.
# The encoding starts from df[features] rather than from X so the cell can be
# re-run safely: after one run X no longer contains a 'payment_type' column,
# and encoding X a second time would raise a KeyError.
X = pd.get_dummies(df[features], columns=['payment_type'], drop_first=True)


In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [8]:
# Fit the linear model on the training split. `fit` returns the estimator itself,
# so ending the cell with `lr` displays the same fitted object as before.
lr = LinearRegression().fit(X_train, y_train)
lr


In [9]:
# Score the fitted model on the held-out split.
y_pred = lr.predict(X_test)

test_r2 = r2_score(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("R^2 Score:", test_r2)
print("RMSE:", test_rmse)


R^2 Score: 0.890751559683934
RMSE: 3.4758232314006308


In [10]:
# Tabulate the fitted coefficient of each feature, labelled by the encoded column name.
coefficients = pd.DataFrame({'Coefficient': lr.coef_}, index=X.columns)
print(coefficients)


                 Coefficient
trip_distance       2.777663
passenger_count    -0.017591
payment_type_2     -0.116220
payment_type_3     -0.061695
payment_type_4     -0.328597


In [12]:
# Show the dtype of each column in the training feature matrix.
X_train.dtypes


trip_distance      float64
passenger_count      int64
payment_type_2        bool
payment_type_3        bool
payment_type_4        bool
dtype: object

In [13]:
# Convert each training feature column with pd.to_numeric; values that cannot be
# parsed become NaN because of errors='coerce'.
X_train = X_train.apply(lambda column: pd.to_numeric(column, errors='coerce'))


In [14]:
# Count missing values: one count per feature column, then a single count for the target.
missing_per_feature = X_train.isna().sum()
missing_in_target = y_train.isna().sum()

print(missing_per_feature)
print(missing_in_target)


trip_distance      0
passenger_count    0
payment_type_2     0
payment_type_3     0
payment_type_4     0
dtype: int64
0


In [15]:
# Drop every training row with a missing feature value, then keep the target
# values whose index labels survived so X_train and y_train stay paired.
X_train = X_train.dropna(axis=0, how='any')
kept_labels = X_train.index
y_train = y_train.loc[kept_labels]


In [16]:
# Renumber both training objects from 0, discarding the old index labels.
X_train, y_train = (
    part.reset_index(drop=True) for part in (X_train, y_train)
)


In [18]:
# Report the dtypes of the training features and of the training target.
feature_dtypes = X_train.dtypes
target_dtype = y_train.dtypes

print("X_train types:\n", feature_dtypes)
print("y_train type:\n", target_dtype)


X_train types:
 trip_distance      float64
passenger_count      int64
payment_type_2        bool
payment_type_3        bool
payment_type_4        bool
dtype: object
y_train type:
 float64


In [19]:
# Run pd.to_numeric over every feature column and over the target; with
# errors='coerce' any value that cannot be parsed becomes NaN.
coerce_options = {'errors': 'coerce'}
X_train = X_train.apply(pd.to_numeric, **coerce_options)
y_train = pd.to_numeric(y_train, **coerce_options)


In [20]:
# A row is valid when none of its features is NaN and its target is not NaN.
features_present = X_train.notna().all(axis=1)
target_present = y_train.notna()
valid_rows = features_present & target_present

# Apply the same mask to both objects so they stay row-aligned.
X_train_clean = X_train[valid_rows]
y_train_clean = y_train[valid_rows]


In [28]:
# Renumber the cleaned training objects from 0, discarding the old index labels.
fresh_index = {'drop': True}
X_train_clean = X_train_clean.reset_index(**fresh_index)
y_train_clean = y_train_clean.reset_index(**fresh_index)


In [29]:
# Cast the boolean dummy columns to integers (presumably so the statsmodels fit
# further down receives a purely numeric design matrix -- confirm).
# The columns are selected by dtype rather than by hard-coded name, so the cast
# keeps working when the data contains a different set of payment types and
# therefore different dummy column names.
dummy_columns = X_train.select_dtypes(include='bool').columns
X_train = X_train.astype({column: int for column in dummy_columns})


In [31]:
import statsmodels.api as sm

# Fit an OLS model with an intercept on the cleaned training rows built above
# (X_train_clean / y_train_clean), which were otherwise computed and never used.
# Casting to float here turns the boolean dummy columns into 0.0/1.0, so this
# cell no longer depends on an earlier cell having cast specific columns.
X_sm = sm.add_constant(X_train_clean.astype(float))
model_sm = sm.OLS(y_train_clean, X_sm).fit()
print(model_sm.summary())


                            OLS Regression Results                            
Dep. Variable:            fare_amount   R-squared:                       0.599
Model:                            OLS   Adj. R-squared:                  0.599
Method:                 Least Squares   F-statistic:                     5385.
Date:                Sun, 04 May 2025   Prob (F-statistic):               0.00
Time:                        14:29:37   Log-Likelihood:                -63698.
No. Observations:               18002   AIC:                         1.274e+05
Df Residuals:                   17996   BIC:                         1.275e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               4.9234      0.120     