# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1VERPjEZcC1XSs4-02aM-DbkNr_yaJVbFjLJxaYQswqA/edit#)_

_This scaffolding notebook may be used to help set up your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_Anything in italics (prose) or comments (in code) is meant to provide you with guidance. **Remove the italic lines and provided comments** before submitting the project, if you choose to use this scaffolding. We don't need the guidance when grading._

_**All code below should be considered "pseudo-code" - not functional by itself, and only a suggestion at the approach.**_

## Project Setup

In [None]:
# all import statements needed for the project

import math
import os
import re
import sqlite3
import warnings
from typing import List

import bs4
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import pyarrow.parquet as pq
import requests
import sqlalchemy as db

In [None]:
# Page that is downloaded and scanned for links to the trip-record parquet files.
TLC_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# Connection string handed to sqlalchemy's create_engine.
DATABASE_URL = "sqlite:///project.db"
# File the table schemas are written to.
DATABASE_SCHEMA_FILE = "schema.sql"
# Directory created at startup; presumably where query files are saved — confirm
# once write_query_to_file is implemented.
QUERY_DIRECTORY = "queries"

In [None]:
# Make sure the QUERY_DIRECTORY exists.
# exist_ok=True makes an already-present directory a no-op, while every other
# failure (permissions, a regular file in the way, ...) still raises OSError.
os.makedirs(QUERY_DIRECTORY, exist_ok=True)

## Part 1: Data Preprocessing

### Load Taxi Zones

In [None]:
def get_html() -> bytes:
    """
    Downloads the TLC trip record data page.

    Returns:
        bytes: the raw, undecoded body of the HTTP response for TLC_URL.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or a timeout.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(TLC_URL, timeout=30)
    # Fail loudly instead of handing an error page to the HTML parser.
    response.raise_for_status()
    # `content` is bytes, not str; the annotation now says so.
    return response.content

In [None]:
def find_HVFHV_parquet_links() -> List[str]:
    """
    Finds and returns a list of URLs of High Volume FHV parquet files.

    The TLC page is fetched with `get_html`, and every anchor whose `title`
    attribute contains "High Volume For-Hire Vehicle Trip Records" contributes
    its `href`. Anchors without a `title` or without an `href` are skipped.

    Returns:
        List[str]: a list of strings representing URLs of High Volume FHV parquet files,
        in the order they appear on the page.
    """
    wanted_title = "High Volume For-Hire Vehicle Trip Records"
    soup = bs4.BeautifulSoup(get_html(), "html.parser")

    parquet_links = []
    for link in soup.find_all("a"):
        title = link.get('title')
        href = link.get('href')
        # An anchor with no href would otherwise put None into the result
        # and break the string handling done by the callers.
        if title is not None and href is not None and wanted_title in title:
            parquet_links.append(href)
    return parquet_links

In [None]:
def download_HVFHV_parquet_files() -> None:
    """
    Downloads High Volume FHV parquet files from URLs found by 'find_HVFHV_parquet_links'.

    Each file is saved in the current working directory under the part of its
    URL that follows 'trip-data/'. Files that already exist are not downloaded
    again.

    Returns:
        None

    Raises:
        IndexError: if a URL does not contain 'trip-data/'.
        requests.HTTPError: if a download answers with a 4xx/5xx status.
    """
    for file_url in find_HVFHV_parquet_links():
        file_url = file_url.replace(' ', '')
        name = file_url.split('trip-data/')[1]
        # Skip files that were fully downloaded on a previous run.
        if os.path.exists(name):
            continue

        # Stream into a temporary name first: an interrupted download must not
        # leave a truncated file that the existence check would later trust.
        partial_name = name + ".part"
        with requests.get(file_url, stream=True, timeout=60) as response:
            # Do not save an HTML error page as if it were a parquet file.
            response.raise_for_status()
            with open(partial_name, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_name, name)

In [None]:
# Fetch every High Volume FHV parquet file linked from the TLC page;
# files that already exist locally are skipped.
download_HVFHV_parquet_files()

In [None]:
def load_taxi_zones(shapefile: str) -> gpd.GeoDataFrame:
    """
    Reads the taxi zone file into a GeoDataFrame.

    Args:
        shapefile (str): path passed unchanged to `gpd.read_file`.

    Returns:
        gpd.GeoDataFrame: whatever `gpd.read_file` loads from that path.
    """
    return gpd.read_file(shapefile)

In [None]:
def lookup_coords_for_taxi_zone_id(zone_loc_id, loaded_taxi_zones):
    """
    Placeholder: not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

### Calculate Sample Size

In [None]:
def calculate_sample_size(population=None, p=0.5, e=0.05, z=1.96):
    """
    Computes a sample size with Cochran's formula.

    The base size is z^2 * p * (1 - p) / e^2. When `population` is given, the
    finite population correction n0 / (1 + (n0 - 1) / population) is applied.
    The result is rounded up to a whole number of observations.

    Args:
        population: size of the population being sampled, or None to treat it
            as unbounded (no correction).
        p: estimated proportion of the attribute in the population.
        e: desired margin of error, as a fraction.
        z: z-score of the desired confidence level (1.96 is about 95%).

    Returns:
        int: the number of observations to sample.

    Raises:
        ValueError: if `e` is not positive or `population` is not positive.
    """
    if e <= 0:
        raise ValueError("e (margin of error) must be positive")

    sample_size = (z ** 2) * p * (1 - p) / (e ** 2)

    if population is not None:
        if population <= 0:
            raise ValueError("population must be positive")
        # Finite population correction: a small population needs fewer samples.
        sample_size = sample_size / (1 + (sample_size - 1) / population)

    return math.ceil(sample_size)

### Common Functions

In [None]:
def get_all_urls_from_tlc_page(taxi_page):
    """
    Placeholder: not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

In [None]:
def filter_parquet_urls(all_urls):
    """
    Placeholder: not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

### Process Taxi Data

In [None]:
def get_and_clean_taxi_month(url):
    """
    Placeholder: not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

In [None]:
def get_and_clean_taxi_data(parquet_urls):
    """
    Cleans every monthly taxi file and stacks the results into one dataframe.

    Args:
        parquet_urls: iterable of URLs, each handed to `get_and_clean_taxi_month`.

    Returns:
        pd.DataFrame: the monthly dataframes concatenated in input order, or an
        empty dataframe when `parquet_urls` is empty.
    """
    all_taxi_dataframes = [
        get_and_clean_taxi_month(parquet_url) for parquet_url in parquet_urls
    ]

    # pd.concat raises ValueError on an empty list, so handle that case here.
    if not all_taxi_dataframes:
        return pd.DataFrame()

    # create one gigantic dataframe with data from every month needed
    return pd.concat(all_taxi_dataframes)

In [None]:
def get_taxi_data():
    """
    Builds the combined taxi dataframe from the TLC page.

    Collects all URLs from TLC_URL, narrows them with `filter_parquet_urls`,
    then cleans and concatenates them with `get_and_clean_taxi_data`.

    Returns:
        The dataframe returned by `get_and_clean_taxi_data`.
    """
    all_urls = get_all_urls_from_tlc_page(TLC_URL)
    all_parquet_urls = filter_parquet_urls(all_urls)
    return get_and_clean_taxi_data(all_parquet_urls)

In [None]:
# Build the taxi dataframe by calling get_taxi_data().
taxi_data = get_taxi_data()

In [None]:
# Preview the first rows.
taxi_data.head()

In [None]:
# Column dtypes and non-null counts.
taxi_data.info()

In [None]:
# Summary statistics.
taxi_data.describe()

### Processing Uber Data

In [None]:
def get_and_clean_uber_month(url: str) -> pd.DataFrame:
    """
    Loads, samples, and cleans Uber trip data for a given month.

    The parquet file is read from the local working directory, under the part
    of `url` that follows 'trip-data/'; it is expected to be there already.
    Rows are filtered to the Uber license, sampled, given pickup and dropoff
    coordinates, reduced to the columns of interest, type-normalized, renamed,
    and stripped of invalid or out-of-bounds trips.

    Relies on the module-level `loaded_taxi_zones`, which is not defined in the
    visible part of this notebook and must exist before this is called.

    Args:
        url (str): The URL of the Uber trip data file.

    Returns:
        pd.DataFrame: A cleaned DataFrame containing filtered and normalized Uber trip data.

    Raises:
        IndexError: if `url` does not contain 'trip-data/'.
    """
    coord_columns = [
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]

    file_url = url.replace(' ', '')
    name = file_url.split('trip-data/')[1]
    data = pq.read_table(name).to_pandas()

    # filter for Uber license
    data = data[data['hvfhs_license_num'] == "HV0003"]

    # sample the data; DataFrame.sample raises if asked for more rows than
    # exist, so cap the request at the number of rows available
    sample = int(calculate_sample_size(p=0.5, e=0.05, z=1.96))
    data = data.sample(n=min(sample, len(data)), random_state=1)

    # keep only trips whose pickup and dropoff zones are known.
    # `value in series` tests the index labels, not the values, so the
    # membership test has to go through isin()
    zone_ids = loaded_taxi_zones['LocationID']
    data = data[data['PULocationID'].isin(zone_ids) & data['DOLocationID'].isin(zone_ids)]

    # add geographic coordinates for pickup and dropoff locations.
    # assumes lookup_coords_for_taxi_zone_id returns a (longitude, latitude)
    # pair, as the column names imply - TODO confirm once it is implemented
    coord_rows = [
        tuple(lookup_coords_for_taxi_zone_id(pickup_id, loaded_taxi_zones))
        + tuple(lookup_coords_for_taxi_zone_id(dropoff_id, loaded_taxi_zones))
        for pickup_id, dropoff_id in zip(data['PULocationID'], data['DOLocationID'])
    ]
    coords = pd.DataFrame(coord_rows, columns=coord_columns, index=data.index)
    data = pd.concat([data, coords], axis=1)
    data = data.dropna(subset=coord_columns)

    # select columns; copy() so the assignments below write to an independent
    # frame instead of a slice of the previous one
    data = data[[
        'request_datetime',
        'pickup_datetime',
        'dropoff_datetime',
        'trip_miles',
        'trip_time',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'base_passenger_fare',
        'tolls',
        'bcf',
        'sales_tax',
        'congestion_surcharge',
        'airport_fee',
        'tips',
    ]].copy()

    # normalize appropriate column types - time
    for column in ('request_datetime', 'pickup_datetime', 'dropoff_datetime'):
        data[column] = pd.to_datetime(data[column])

    # normalize appropriate column types - number
    numeric_columns = (
        'trip_miles',
        'trip_time',
        'base_passenger_fare',
        'tolls',
        'bcf',
        'sales_tax',
        'congestion_surcharge',
        'airport_fee',
        'tips',
    )
    for column in numeric_columns:
        data[column] = pd.to_numeric(data[column])

    # compute total surcharge
    data['all_surcharge'] = data['congestion_surcharge'] + data['airport_fee'] + data['bcf'] + data['tips']

    # remove NaN
    data = data.dropna()

    # normalize column names; renaming by name (not by position) stays correct
    # even if the column selection above is reordered
    data = data.rename(columns={
        'trip_time': 'trip_time_seconds',
        'bcf': 'black_car_fund',
    })

    # remove invalid time
    data = data[data['pickup_datetime'] < data['dropoff_datetime']]
    # remove 0 distance
    data = data[data['trip_miles'] > 0]
    # remove 0 time
    data = data[data['trip_time_seconds'] > 0]

    # removing trips that start and/or end outside (40.560445, -74.242330) and (40.908524, -73.717047)
    data = data[(data['pickup_latitude'] >= 40.560445) & (data['pickup_latitude'] <= 40.908524) &
                (data['pickup_longitude'] >= -74.242330) & (data['pickup_longitude'] <= -73.717047) &
                (data['dropoff_latitude'] >= 40.560445) & (data['dropoff_latitude'] <= 40.908524) &
                (data['dropoff_longitude'] >= -74.242330) & (data['dropoff_longitude'] <= -73.717047)]

    return data

In [None]:
def get_and_clean_uber_data(parquet_urls: List[str]) -> pd.DataFrame:
    """
    Processes and cleans Uber trip data from a list of Parquet file URLs.

    Only URLs containing 'fhvhv' (High Volume FHV files) are processed. Each
    one is handed to `get_and_clean_uber_month`, and the monthly results are
    combined into a single DataFrame.

    Args:
        parquet_urls (List[str]): A list of URLs pointing to Parquet files containing Uber data.

    Returns:
        pd.DataFrame: A cleaned and combined DataFrame containing Uber trip data
        from all specified Parquet files, or an empty DataFrame when no URL
        contains 'fhvhv'.
    """
    all_uber_dataframes = [
        get_and_clean_uber_month(parquet_url)
        for parquet_url in parquet_urls
        if 'fhvhv' in parquet_url
    ]

    # pd.concat raises ValueError on an empty list, so handle that case here.
    if not all_uber_dataframes:
        return pd.DataFrame()

    return pd.concat(all_uber_dataframes)

In [None]:
def get_uber_data() -> pd.DataFrame:
    """
    Fetches and processes Uber trip data from TLC's website.

    This function retrieves all URLs from the TLC page, filters for Parquet file URLs
    with `filter_parquet_urls`, processes the Uber trip data from the relevant Parquet
    files, and returns a cleaned and combined DataFrame.

    Args:
        None

    Returns:
        pd.DataFrame: A cleaned DataFrame containing Uber trip data from all relevant Parquet files.
    """
    all_urls = get_all_urls_from_tlc_page(TLC_URL)
    all_parquet_urls = filter_parquet_urls(all_urls)
    uber_data = get_and_clean_uber_data(all_parquet_urls)
    return uber_data

In [None]:
# NOTE(review): this silences every warning for the rest of the session,
# not only those raised by the call below.
warnings.filterwarnings('ignore')
# Build the Uber dataframe by calling get_uber_data().
uber_data = get_uber_data()

In [None]:
# Preview the first rows.
uber_data.head()

In [None]:
# Column dtypes and non-null counts.
uber_data.info()

In [None]:
# Summary statistics.
uber_data.describe()

### Processing Weather Data

In [None]:
def get_all_weather_csvs(directory):
    """
    Placeholder: not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

In [None]:
def clean_month_weather_data_hourly(csv_file):
    """
    Placeholder: not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

In [None]:
def clean_month_weather_data_daily(csv_file):
    """
    Placeholder: not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

In [None]:
def load_and_clean_weather_data():
    """
    Cleans every weather CSV at hourly and daily granularity.

    Each file returned by `get_all_weather_csvs(WEATHER_CSV_DIR)` is passed to
    both the hourly and the daily cleaner; the per-file results are then
    concatenated.

    NOTE(review): WEATHER_CSV_DIR is not defined in the visible part of this
    notebook - confirm it exists before calling.

    Returns:
        tuple: (hourly_data, daily_data), each the `pd.concat` of the per-file frames.
    """
    hourly_frames = []
    daily_frames = []

    for csv_path in get_all_weather_csvs(WEATHER_CSV_DIR):
        hourly_frames.append(clean_month_weather_data_hourly(csv_path))
        daily_frames.append(clean_month_weather_data_daily(csv_path))

    # one frame per granularity, covering every file
    return pd.concat(hourly_frames), pd.concat(daily_frames)

In [None]:
# Build the hourly and daily weather dataframes.
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
# Preview the first rows of the hourly data.
hourly_weather_data.head()

In [None]:
# Column dtypes and non-null counts of the hourly data.
hourly_weather_data.info()

In [None]:
# Summary statistics of the hourly data.
hourly_weather_data.describe()

In [None]:
# Preview the first rows of the daily data.
daily_weather_data.head()

In [None]:
# Column dtypes and non-null counts of the daily data.
daily_weather_data.info()

In [None]:
# Summary statistics of the daily data.
daily_weather_data.describe()

## Part 2: Storing Cleaned Data

In [None]:
# SQLAlchemy engine for the database named by DATABASE_URL.
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
# NOTE: all four schema strings below are still TODO placeholders.
HOURLY_WEATHER_SCHEMA = """
TODO
"""

DAILY_WEATHER_SCHEMA = """
TODO
"""

TAXI_TRIPS_SCHEMA = """
TODO
"""

UBER_TRIPS_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file
# The four schema strings are written back to back, in this order;
# "w" mode overwrites any existing file.
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)

In [None]:
# create the tables with the schema files
# NOTE: the body is still `pass` - a connection is opened and closed,
# but no table is created yet.
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    """
    Placeholder: not implemented yet.

    Args:
        table_to_df_dict: mapping of table name to the dataframe to store.

    Raises:
        NotImplementedError: always.
    """
    # `NotImplemented` is a comparison sentinel, not an exception; calling it
    # raised TypeError instead of signalling a missing implementation.
    raise NotImplementedError()

In [None]:
# Table name -> dataframe to store in it. The weather entries use the names
# bound at module level (hourly_weather_data / daily_weather_data);
# hourly_data / daily_data exist only inside load_and_clean_weather_data.
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_weather_data,
    "daily_weather": daily_weather_data,
}

In [None]:
# Hand the table-name -> dataframe mapping to the writer.
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

In [None]:
def write_query_to_file(query, outfile):
    """
    Helper function to write the queries to file.

    Placeholder: not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

### Query 1

In [None]:
# NOTE: both the filename and the query text are still placeholders.
QUERY_1_FILENAME = ""

QUERY_1 = """
TODO
"""

In [None]:
# execute query either via sqlalchemy
with engine.connect() as con:
    results = con.execute(db.text(QUERY_1)).fetchall()
results

# or via pandas
pd.read_sql(QUERY_1, con=engine)

In [None]:
# Save the query text under QUERY_1_FILENAME.
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    """
    Scaffold for the first visualization.

    Creates a 20x10 figure, plots placeholder values on its axes, sets a
    placeholder title, and shows the figure. `dataframe` is not used yet.
    """
    _, ax = plt.subplots(figsize=(20, 10))

    # placeholder - pull the values to plot out of the dataframe instead
    placeholder_values = "..."

    # axes.plot is only one of many matplotlib plot types that could be used here
    ax.plot(placeholder_values, "...")
    # axis labels and other styling can be added alongside the title
    ax.set_title("Some Descriptive Title")

    plt.show()

In [None]:
def get_data_for_visual_1():
    """
    Placeholder: not implemented yet.

    Meant to query the SQL database for the data the first visualization
    needs, optionally returning it as a pandas dataframe.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

In [None]:
# Fetch the data for the first visualization, then plot it.
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)