# NYC Taxi Fare Prediction - Automatidata
This project analyses New York City taxi trip data to develop machine learning models that predict fare prices (and related amounts such as tips) based on trip features.

In [1]:
# Data Analysis
import pandas as pd
import numpy as np
from scipy import stats

# Data Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Machine Learning
from sklearn.model_selection import train_test_split    
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score    
# Ignore warnings
# NOTE(review): called with only "ignore", this filter suppresses every warning
# category for the rest of the kernel session (including deprecation notices
# from pandas / scikit-learn) — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")




## Dataset loading
This dataset is also available on Kaggle: https://www.kaggle.com/datasets/yakhyojon/automatidata

In [None]:
# Read the trip CSV from the working directory; the file's first column
# is used as the row index instead of being kept as a regular column.
taxi_data = pd.read_csv(
    filepath_or_buffer='2017_Yellow_Taxi_Trip_Data.csv',
    index_col=0,
)

## Initial data inspection

In [5]:
# Display the first three rows of the dataset.
# Left as the cell's last expression (rather than wrapped in print()) so the
# notebook renders it with the DataFrame's rich table display.
taxi_data.head(3)

           VendorID   tpep_pickup_datetime  tpep_dropoff_datetime  \
24870114          2  03/25/2017 8:55:43 AM  03/25/2017 9:09:47 AM   
35634249          1  04/11/2017 2:53:28 PM  04/11/2017 3:19:58 PM   
106203690         1  12/15/2017 7:26:56 AM  12/15/2017 7:34:08 AM   

           passenger_count  trip_distance  RatecodeID store_and_fwd_flag  \
24870114                 6           3.34           1                  N   
35634249                 1           1.80           1                  N   
106203690                1           1.00           1                  N   

           PULocationID  DOLocationID  payment_type  fare_amount  extra  \
24870114            100           231             1         13.0    0.0   
35634249            186            43             1         16.0    0.0   
106203690           262           236             1          6.5    0.0   

           mta_tax  tip_amount  tolls_amount  improvement_surcharge  \
24870114       0.5        2.76           0.0  

In [3]:
# Print a structural summary of the frame: index range, column names,
# non-null counts and dtypes.
taxi_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22699 entries, 24870114 to 17208911
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   VendorID               22699 non-null  int64  
 1   tpep_pickup_datetime   22699 non-null  object 
 2   tpep_dropoff_datetime  22699 non-null  object 
 3   passenger_count        22699 non-null  int64  
 4   trip_distance          22699 non-null  float64
 5   RatecodeID             22699 non-null  int64  
 6   store_and_fwd_flag     22699 non-null  object 
 7   PULocationID           22699 non-null  int64  
 8   DOLocationID           22699 non-null  int64  
 9   payment_type           22699 non-null  int64  
 10  fare_amount            22699 non-null  float64
 11  extra                  22699 non-null  float64
 12  mta_tax                22699 non-null  float64
 13  tip_amount             22699 non-null  float64
 14  tolls_amount           22699 non-null  float64
 1

In [6]:
# (number of rows, number of columns) of the loaded frame
taxi_data.shape

(22699, 17)

## Data quality checks

In [8]:
# Count the missing values in each column
taxi_data.isna().sum()

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
dtype: int64

In [10]:
# Count rows whose column values exactly repeat an earlier row.
# NOTE(review): DataFrame.duplicated() compares column values only, so the
# index (the CSV's first column, set via index_col=0) is not part of the check.
taxi_data.duplicated().sum()

np.int64(0)

## Next Steps
We will now proceed to cleaning and exploring the data further in `01_data_cleaning_eda.ipynb`.
