# Taxi Trip Exploratory Data Analysis

**Name(s)**: Drake Graham

**Website Link**: https://dgraham6.github.io/Taxi-EDA/

In [4]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

import requests
from tqdm import tqdm
import folium
from folium.plugins import MarkerCluster
import googlemaps
import time
import math
from sklearn.model_selection import train_test_split
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.compose import make_column_transformer

ModuleNotFoundError: No module named 'plotly'

## Step 1: Introduction

Here are some initial questions about the dataset:  
- On what days do taxi drivers generate the most revenue?  
- What is the relationship between trip characteristics, such as distance and duration, and revenue?  
- Are there significant differences in performance between taxi companies?  

After consideration, I decided to focus on what I think is the most important question: predicting how long a trip will last. This has practical applications in improving route efficiency, setting accurate expectations for customers, and optimizing fleet management for taxi companies.

## Step 2: Data Cleaning and Exploratory Data Analysis

In [3]:
# Load the trip table from curve.csv. low_memory=False makes pandas parse the
# file in one pass rather than in chunks, which avoids mixed-dtype columns.
taxi = pd.read_csv('curve.csv',low_memory=False)

NameError: name 'pd' is not defined

In [None]:
# Previously saved table read from durations.csv.
durations = pd.read_csv('durations.csv', low_memory=False)

# Read each monthly file exactly once and concatenate in a single call.
# The earlier loops read October twice (once up front, once as "1{0}") and
# stopped at August, so September was never loaded.
# NOTE(review): assumes taxi_2024_09.csv exists alongside the other months;
# narrow the range if that file is genuinely unavailable.
monthly_frames = [
    pd.read_csv(f"OpenDataDC_Taxi_2024/taxi_2024_{month:02d}.csv")
    for month in range(1, 12)
]
# ignore_index=True gives the combined frame a unique RangeIndex; without it
# every monthly file contributes its own 0..n labels, and later label-based
# operations (e.g. DataFrame.drop) would hit rows from several months at once.
taxi = pd.concat(monthly_frames, axis=0, ignore_index=True)

In [None]:
# Base URL of the driving-route endpoint queried by get_route_duration_distance,
# which appends "/<coords>?overview=false" to it. Points at a server on localhost.
osrm_url = "http://localhost:8080/route/v1/driving"


def batch_coordinates(df, batch_size):
    """Yield consecutive positional slices of ``df``.

    Each slice holds at most ``batch_size`` rows; the final slice may be
    shorter. Nothing is yielded for an empty frame.
    """
    slice_starts = range(0, len(df), batch_size)
    yield from (df.iloc[start:start + batch_size] for start in slice_starts)

def format_coordinates(row):
    """Build the ``"lon,lat;lon,lat"`` coordinate string for one trip.

    The origin pair comes first and the destination pair second; within each
    pair longitude precedes latitude. ``row`` is anything indexable by the
    four block-coordinate column names (a DataFrame row, a dict, ...).
    """
    origin = f"{row['ORIGIN_BLOCK_LONGITUDE']},{row['ORIGIN_BLOCK_LATITUDE']}"
    destination = f"{row['DESTINATION_BLOCK_LONG']},{row['DESTINATION_BLOCK_LAT']}"
    return ";".join((origin, destination))


def get_route_duration_distance(coords, timeout=10):
    """Query the routing server for one trip and return (duration, distance).

    Parameters
    ----------
    coords : str
        Coordinate string appended to ``osrm_url`` (see format_coordinates).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single stalled request would block the whole lookup loop indefinitely.

    Returns
    -------
    tuple
        ``(duration, distance)`` taken from the first route in the response.
        ``(None, None)`` is returned when the request fails, the status code
        is not 200, the body is not valid JSON, or no route is present.
    """
    try:
        response = requests.get(
            f"{osrm_url}/{coords}?overview=false", timeout=timeout
        )
        if response.status_code != 200:
            return None, None
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network/timeout problems and undecodable bodies are reported and
        # treated as "no result" so one bad trip does not abort the run.
        print(f"Error: {e}")
        return None, None

    routes = data.get('routes') if isinstance(data, dict) else None
    if not routes:
        return None, None

    route = routes[0]
    return route.get('duration', None), route.get('distance', None)


# Look up a route for every trip and collect the results in row order.
# NOTE: this rebinds the name `durations` to a plain list.
durations = []
distances = []

trip_batches = batch_coordinates(taxi, batch_size=50)
for batch in tqdm(trip_batches, total=len(taxi) // 50 + 1):
    for _, trip in batch.iterrows():
        route_duration, route_distance = get_route_duration_distance(
            format_coordinates(trip)
        )
        durations.append(route_duration)
        distances.append(route_distance)

# One entry was appended per row, so the lists line up with taxi's rows.
taxi['duration'] = durations
taxi['distance'] = distances

In [None]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry

# HTTP session with an on-disk cache ('.cache', entries never expire) wrapped
# in a retrying session (5 retries, backoff factor 0.2).
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)


# Hourly snowfall and precipitation for one fixed location and date range.
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 38.9072,
    "longitude": -77.0369,
    "start_date": "2024-01-01",
    "end_date": "2024-11-01",
    "hourly": "snowfall,precipitation"
}


responses = openmeteo.weather_api(url, params=params)
response = responses[0]  # only one location was requested

hourly = response.Hourly()
# Variables are read by position: index 0 is treated as snowfall and index 1
# as precipitation, i.e. the same order as the "hourly" request string.
hourly_snowfall = hourly.Variables(0).ValuesAsNumpy()
hourly_precipitation = hourly.Variables(1).ValuesAsNumpy()


# Rebuild the timestamp of every sample from the reported start, end and
# interval; inclusive="left" leaves out the end timestamp itself.
weather_data = {
    "time": pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left"
    ),
    "snowfall": hourly_snowfall,
    "precipitation": hourly_precipitation
}
weather_df = pd.DataFrame(weather_data)

weather_df["time"] = pd.to_datetime(weather_df["time"], utc=True)
# The weather table has one row per hour, while trip timestamps carry minutes.
# Merging on the raw timestamp would only match trips that start exactly on
# the hour, so the merge key is the trip time floored to its hour.
# Requires taxi["Time"] to exist already.
# NOTE(review): trip times are interpreted as UTC here — confirm that matches
# the timezone of the weather timestamps.
taxi["time"] = pd.to_datetime(taxi["Time"], utc=True).dt.floor("h")


# Left merge keeps every trip; trips without a matching hour get NaN weather.
taxi = pd.merge(taxi, weather_df, on="time", how="left")

In [None]:
# Hour component of the trip start; needs ORIGINDATETIME_TR to be datetime-typed.
taxi['Hour of Day'] = taxi['ORIGINDATETIME_TR'].dt.hour

# One summary row per (day of week, hour of day) pair.
# NOTE(review): the 'temp' column is not created by any cell shown in this
# notebook excerpt — confirm it exists before running.
day_hour_keys = ['Day of the Week', 'Hour of Day']
summary_spec = {
    'total_mileage': ('MILEAGE', 'sum'),
    'avg_duration': ('DURATION', 'mean'),
    'avg_temp': ('temp', 'mean'),
}
grouped = taxi.groupby(day_hour_keys).agg(**summary_spec).reset_index()

# Wide layout: days as rows, hours as columns, one column group per metric.
pivot_table = grouped.pivot_table(
    index=['Day of the Week'],
    columns='Hour of Day',
    values=list(summary_spec),
    aggfunc='mean',
)

In [None]:
# Remove implausible trips with boolean masks rather than drop(index=...).
# drop() works on index *labels*: if the frame's index is not unique (as after
# concatenating monthly files without resetting it), every row sharing a label
# with a flagged row would be discarded as well. Masks act on rows directly.

# DURATION is divided by 60 to obtain minutes elsewhere in this notebook, so
# the bounds keep trips between 1 and 3600 of those units (up to 60 minutes).
bad_duration = (taxi['DURATION'] < 1) | (taxi['DURATION'] > 3600)
taxi = taxi.loc[~bad_duration].copy()

# A trip is discarded only when BOTH the routed distance and the recorded
# mileage are tiny. Rows with missing values fail the comparison and are kept,
# which matches the previous behaviour.
negligible_trip = (taxi['distance'] < 10) & (taxi['MILEAGE'] < 0.1)
taxi = taxi.loc[~negligible_trip].copy()

In [None]:
# Scatter of hourly snowfall against trip duration in minutes.
# The labels mapping must be keyed by the plotted column names; the previous
# keys ('severerisk', 'DURATION') matched neither axis and were ignored, and
# the title described a heatmap although the figure is a scatter plot.
fig = px.scatter(
    taxi,
    x='snowfall',
    y='Duration(m)',
    title='Snowfall vs Trip Duration',
    labels={'snowfall': 'Snowfall', 'Duration(m)': 'Trip Duration (minutes)'},
)
# Restrict the y axis to the 30-50 band and set its title explicitly.
fig.update_yaxes(title_text='Trip Duration(m)', range=[30, 50])
fig.show()

In [None]:
# Parse the trip start time and derive the duration in minutes.
taxi['Time'] = pd.to_datetime(taxi['ORIGINDATETIME_TR'], format='%m/%d/%Y %H:%M')
taxi['Duration(m)'] = taxi['DURATION'] / 60

day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_names = taxi['Time'].dt.day_name()
taxi['Day of the Week'] = day_names

# Trips per weekday, ordered Monday..Sunday. This has to run BEFORE the
# one-hot encoding below: get_dummies replaces the 'Day of the Week' column,
# so grouping on it afterwards raises a KeyError.
week = taxi.groupby('Day of the Week', as_index=False)['Duration(m)'].count()
week['Day of the Week'] = pd.Categorical(week['Day of the Week'], categories=day_order, ordered=True)
week = week.sort_values('Day of the Week')

# One indicator column per weekday ('Day of the Week_Monday', ...).
taxi = pd.get_dummies(taxi, columns=['Day of the Week'])

# Restore the per-trip weekday as an ordered categorical. The previous code
# assigned the 7-row `week` column here, which pandas aligns on index labels
# and therefore does not give each trip its own weekday. pd.Categorical is a
# plain array, so it is assigned by position; get_dummies keeps the row order.
taxi['Day of the Week'] = pd.Categorical(day_names, categories=day_order, ordered=True)

In [None]:
def compute_center(X):
    """Return a copy of ``X`` with the origin/destination midpoint added.

    Adds ``CenterLat`` and ``CenterLong``, each the arithmetic mean of the
    corresponding origin and destination block coordinate. ``X`` itself is
    not modified.
    """
    mid_lat = (X["ORIGIN_BLOCK_LATITUDE"] + X["DESTINATION_BLOCK_LAT"]) / 2
    mid_long = (X["ORIGIN_BLOCK_LONGITUDE"] + X["DESTINATION_BLOCK_LONG"]) / 2
    return X.assign(CenterLat=mid_lat, CenterLong=mid_long)
def compute_direction(X):
    """Return a copy of ``X`` with a ``Direction`` quadrant label per trip.

    The label compares destination to origin block coordinates:

    * ``'NorthEast'`` - destination latitude greater, longitude greater or equal
    * ``'NorthWest'`` - destination latitude greater, longitude smaller
    * ``'SouthEast'`` - destination latitude smaller or equal, longitude
      greater or equal
    * ``'SouthWest'`` - everything else, including rows where a comparison
      cannot be made because a coordinate is missing

    A trip heading due north (same longitude) used to fall through every
    branch and was labelled ``'SouthWest'``; ties on longitude now count as
    east for northbound trips, the same way they already did for southbound
    ones. The labels are computed on whole columns instead of row by row, so
    an empty frame is handled as well.
    """
    orig_lat = X['ORIGIN_BLOCK_LATITUDE'].to_numpy()
    orig_long = X['ORIGIN_BLOCK_LONGITUDE'].to_numpy()
    dest_lat = X['DESTINATION_BLOCK_LAT'].to_numpy()
    dest_long = X['DESTINATION_BLOCK_LONG'].to_numpy()

    northbound = dest_lat > orig_lat
    conditions = [
        northbound & (dest_long >= orig_long),
        northbound & (dest_long < orig_long),
        # Spelled out explicitly (not "~northbound") so that rows with a
        # missing coordinate fail the test and end up in the default label.
        (dest_lat <= orig_lat) & (dest_long >= orig_long),
    ]
    labels = ['NorthEast', 'NorthWest', 'SouthEast']
    # astype(object) keeps the column an object-dtype string column.
    direction = np.select(conditions, labels, default='SouthWest').astype(object)
    return X.assign(Direction=direction)
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points given in degrees.

    Accepts scalars or array-likes (anything NumPy can broadcast) and returns
    the haversine distance scaled by a sphere radius of 3959.
    """
    # Presumably Earth's mean radius in miles, making the result miles - confirm.
    sphere_radius = 3959

    phi1, lam1, phi2, lam2 = (
        np.radians(angle) for angle in (lat1, lon1, lat2, lon2)
    )
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    lat_term = np.sin(delta_phi / 2.0) ** 2
    lon_term = np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam / 2.0) ** 2
    a = lat_term + lon_term

    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return sphere_radius * central_angle
    
def compute_distance(X):
    """Return a copy of ``X`` with a ``Distance`` column.

    ``Distance`` is the haversine distance between each trip's origin block
    and destination block coordinates. ``X`` itself is not modified.
    """
    straight_line = haversine(
        X["ORIGIN_BLOCK_LATITUDE"],
        X["ORIGIN_BLOCK_LONGITUDE"],
        X["DESTINATION_BLOCK_LAT"],
        X["DESTINATION_BLOCK_LONG"],
    )
    return X.assign(Distance=straight_line)
    
# Road names treated as congested; empty at this point in the notebook.
busy_roads = []


def check_roads(x):
    """Return 1 if any element of ``x`` is in ``busy_roads``, otherwise 0.

    ``x`` is any iterable; an empty iterable yields 0.
    """
    return int(any(road in busy_roads for road in x))

def compute_traffic(X):
    """Return a copy of ``X`` with a per-trip ``Traffic`` flag (0 or 1).

    Each row's ``roads`` entry is passed to ``check_roads`` on its own.
    Previously the whole ``roads`` column was passed in a single call, which
    produced one scalar that was then broadcast to every row.

    NOTE(review): assumes each ``roads`` entry is an iterable of road names
    for that trip; a bare string is treated as a single road name - confirm
    against how the ``roads`` column is built.
    """
    def _trip_flag(roads):
        # Wrap a lone string so it is matched as one name, not char by char.
        return check_roads([roads] if isinstance(roads, str) else roads)

    return X.assign(Traffic=X['roads'].apply(_trip_flag))

def final_model(X_train, y_train):
    """Fit the feature-engineering + linear-regression pipeline with grid search.

    The pipeline first appends the engineered columns (Traffic, Distance,
    CenterLat/CenterLong, Direction), then preprocesses numeric and object
    columns separately, and finally fits a LinearRegression. It is wrapped in
    a 5-fold GridSearchCV scored with ``rmsle`` (lower is better).

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain the columns the feature functions read
        (block coordinates and ``roads``).
    y_train : array-like
        Training target.

    Returns
    -------
    GridSearchCV
        The fitted searcher; ``predict`` uses the best pipeline found.
    """
    # Local import: make_column_selector is not in the notebook's import cell.
    from sklearn.compose import make_column_selector

    # greater_is_better=False makes the searcher minimise the error.
    rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

    add_traffic = FunctionTransformer(compute_traffic)
    add_center = FunctionTransformer(compute_center)
    add_direction = FunctionTransformer(compute_direction)
    add_distance = FunctionTransformer(compute_distance)

    # Columns are chosen by dtype when the ColumnTransformer is fitted, i.e.
    # on the frame that already contains the engineered columns. Taking the
    # column lists from the raw X_train (as before) meant the new columns
    # were in neither list and were discarded by remainder="drop".
    num_features = make_column_selector(dtype_include=np.number)
    cat_features = make_column_selector(dtype_include=object)

    num_preprocessor = Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("poly_features", PolynomialFeatures(degree=2, include_bias=False)),
        ("scaler", StandardScaler())
    ])
    cat_preprocessor = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
        # Categories unseen during fit are encoded as all zeros, not an error.
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    column_transformer = ColumnTransformer(
        transformers=[
            ("num", num_preprocessor, num_features),
            ("cat", cat_preprocessor, cat_features)
        ],
        remainder="drop"
    )

    pipeline = Pipeline([
        ("add_traffic", add_traffic),
        ("add_distance", add_distance),
        ("add_center", add_center),
        ("add_direction", add_direction),
        ("preprocessor", column_transformer),
        ("regressor", LinearRegression())
    ])

    hyperparams = {
        "preprocessor__num__poly_features__degree": [2]  # Tune degree for polynomial features
    }

    searcher = GridSearchCV(
        pipeline,
        param_grid=hyperparams,
        cv=5,
        scoring=rmsle_scorer,
    )

    searcher.fit(X_train, y_train)
    return searcher
    
# Fit the grid-searched pipeline on the training split.
pipe_final = final_model(X_train, y_train)
# Predict on the held-out split and score the predictions with rmsle.
y_test_pred = pipe_final.predict(X_test)
rmsle_score = rmsle(y_test, y_test_pred)