## Project Setup

In [1]:
import os
import re
import bs4
import math
import requests
import warnings
import numpy as np
import pandas as pd
import sqlalchemy as db
import geopandas as gpd
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

In [9]:
# Project-wide constants: data sources, geographic bounds and output locations.

# web page whose links point at the taxi trip parquet files
TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
UBER_CSV = "uber_rides_sample.csv"
# one weather CSV per year, 2009 through 2015
WEATHER_CSV = [f"{year}_weather.csv" for year in range(2009, 2016)]

# ((min latitude, min longitude), (max latitude, max longitude))
NEW_YORK_BOX_COORDS = (
    (40.560445, -74.242330),
    (40.908524, -73.717047),
)

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

## Part 1: Data Preprocessing

### Calculating distance

The most important thing in a trip is the distance, so we need to calculate the distance from the start to the end of the trip.

In [1]:
def calculate_distance(from_coord, to_coord):
    """
    Calculate the great-circle (haversine) distance between two sets of coordinates.
    args:
        from_coord: dataframe of columns 'pickup_latitude' and 'pickup_longitude' (degrees)
        to_coord: dataframe of columns 'dropoff_latitude' and 'dropoff_longitude' (degrees)
    returns:
        a pandas Series of float32 distances in kilometers, one per row
    """
    R = 6378.137 # Radius of earth in KM

    # cast to float64 first so float32 inputs do not lose precision in the trig functions
    pickup_lat = np.radians(from_coord['pickup_latitude'].astype('float64'))
    pickup_lon = np.radians(from_coord['pickup_longitude'].astype('float64'))
    dropoff_lat = np.radians(to_coord['dropoff_latitude'].astype('float64'))
    dropoff_lon = np.radians(to_coord['dropoff_longitude'].astype('float64'))

    delta_lat = pickup_lat - dropoff_lat
    delta_lon = pickup_lon - dropoff_lon
    haversine = (
        np.sin(delta_lat / 2) ** 2
        + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(delta_lon / 2) ** 2
    )
    # floating point rounding can push the term marginally outside [0, 1],
    # which would make sqrt/arcsin undefined; NaN inputs stay NaN
    haversine = haversine.clip(lower=0, upper=1)
    distance = 2 * R * np.arcsin(np.sqrt(haversine))

    return distance.astype('float32')

In [2]:
def add_distance_column(dataframe, max_distance_km=100):
    '''
    Adds a "distance" column (kilometers between pickup and dropoff) and drops overly long trips.
    args:
        dataframe: a pandas dataframe with columns "pickup_latitude" , "pickup_longitude",  "dropoff_latitude" and "dropoff_longitude".
            The "distance" column is written onto this dataframe itself (in place).
        max_distance_km: rows whose distance is not strictly below this value are dropped
            from the returned dataframe (default 100)
    returns:
        a new dataframe holding only the rows with distance < max_distance_km;
        use this return value to get the filtered data, the input keeps all of its rows
    '''

    from_coord = dataframe[['pickup_latitude', 'pickup_longitude']]
    to_coord = dataframe[['dropoff_latitude', 'dropoff_longitude']]
    dataframe['distance'] = calculate_distance(from_coord, to_coord)

    # remove any rows where the distance is too big (rows with a NaN distance are dropped as well);
    # .copy() makes the result independent of the input so later edits do not warn or alias
    within_limit = dataframe['distance'] < max_distance_km
    return dataframe[within_limit].copy()

### Processing Taxi Data

We will load and clean the yellow taxi data from 2009-01 through 2015-06.

In [7]:
def find_taxi_parquet_urls():
    '''
    Finds the urls of the parquet files for the taxi data.
    returns:
        a list of unique urls, in the order they appear on the page
    raises:
        requests.HTTPError if the page at TAXI_URL cannot be fetched successfully
    '''
    response = requests.get(TAXI_URL, timeout=60)
    # fail loudly instead of silently scraping an error page and returning no urls
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    # use regular expression to find the links that match the pattern 2009-01~2015-06
    wanted = re.compile(r"yellow_tripdata_2009|yellow_tripdata_201[0-4]|yellow_tripdata_2015-0[1-6]")

    urls = []
    for link in soup.find_all("a", href=True):
        # hrefs may carry stray whitespace, which would corrupt the download url and file name
        href = link["href"].strip()
        # skip repeated links so no month is downloaded and sampled twice
        if wanted.search(href) and href not in urls:
            urls.append(href)
    return urls


In [None]:
# Taxi-zone lookup table: one row per LocationID with the zone centroid in degrees.
geo = gpd.read_file(filename='taxi_zones.zip', engine='fiona')
# Compute the centroids in the file's own CRS and only then convert them to lon/lat:
# centroids taken directly on EPSG:4326 geometries are flagged by GeoPandas as unreliable.
# NOTE(review): presumes the shapefile ships in a projected CRS; if it is already
# geographic this gives the same result as before.
_zone_centroids = geo.geometry.centroid.to_crs(4326)
geo = geo.to_crs(4326)
# get the coordinates of the pickup and dropoff locations
geo['longitude'] = _zone_centroids.x
geo['latitude'] = _zone_centroids.y
# some location IDs are duplicated so only the first row of each ID is kept
geo = geo.drop_duplicates('LocationID').set_index('LocationID')

In [None]:
def get_and_clean_month_taxi_data(url, sample_size=2564, random_state=None):
    '''
    Downloads the taxi data for a given month and cleans it.
    args:
        url: the url of the parquet file
        sample_size: maximum number of rows kept for the month (default 2564)
        random_state: optional seed forwarded to DataFrame.sample for reproducible sampling
    returns:
        a pandas dataframe with the cleaned data and the columns
        PULocationID, DOLocationID, pickup_datetime, pickup_longitude, pickup_latitude,
        dropoff_longitude, dropoff_latitude, tip_amount
    raises:
        requests.HTTPError if the parquet file cannot be downloaded
    '''
    file_name = url.split('/')[-1]

    # download the file if it doesn't exist
    if not os.path.exists(file_name):
        print("downloading", file_name)
        response = requests.get(url)
        # do not cache an HTTP error page under the parquet file name
        response.raise_for_status()
        with open(file_name , "wb") as f:
            f.write(response.content)

    df = pd.read_parquet(file_name)
    print("cleaning", file_name)

    # looking up the latitude and longitude for some months where only location IDs are given for pickups and dropoffs
    if 'PULocationID' in df.columns:
        df['pickup_latitude'] = df['PULocationID'].map(geo['latitude'])
        df['pickup_longitude'] = df['PULocationID'].map(geo['longitude'])
        df['dropoff_latitude'] = df['DOLocationID'].map(geo['latitude'])
        df['dropoff_longitude'] = df['DOLocationID'].map(geo['longitude'])

    # normalizing column names
    df = df.rename(columns={'tpep_pickup_datetime':'pickup_datetime','Trip_Pickup_DateTime':'pickup_datetime','Trip_Dropoff_DateTime':'dropoff_datetime','Start_Lon':'pickup_longitude', 'Start_Lat':'pickup_latitude', 'End_Lon':'dropoff_longitude', 'End_Lat':'dropoff_latitude','Tip_Amt':'tip_amount'})

    # months that ship raw coordinates have no location IDs; add them as missing values
    # so every month comes back with the same columns instead of raising a KeyError
    for id_column in ('PULocationID', 'DOLocationID'):
        if id_column not in df.columns:
            df[id_column] = pd.array([pd.NA] * len(df), dtype='Int16')

    # some location IDs are invalid (no matching zone) so those specific trips are removed
    coordinate_columns = ['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude']
    df = df.dropna(subset=coordinate_columns)

    # removing unnecessary columns
    df = df[['PULocationID','DOLocationID','pickup_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','tip_amount']]

    # removing invalid data points
    df = df[df['tip_amount'] >= 0]

    # removing trips that start and/or end outside of the NEW_YORK_BOX_COORDS
    (lat_min, lon_min), (lat_max, lon_max) = NEW_YORK_BOX_COORDS
    inside_box = (
        df['pickup_latitude'].between(lat_min, lat_max)
        & df['pickup_longitude'].between(lon_min, lon_max)
        & df['dropoff_latitude'].between(lat_min, lat_max)
        & df['dropoff_longitude'].between(lon_min, lon_max)
    )
    df = df[inside_box]

    # sample the data to make it roughly equal to the size of the Uber dataset
    # ~200000 is the number of rows in the Uber dataset
    # so every month we sample 200000 / 78 = 2564 rows
    # (capped at the rows available, since sample() raises when asked for more than exist)
    df = df.sample(n=min(sample_size, len(df)), random_state=random_state)

    # normalizing and using appropriate column types for the respective data
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    # nullable Int16 for the IDs: int8 tops out at 127 and would overflow on larger location IDs,
    # and the nullable type can hold the missing IDs of coordinate-only months
    df = df.astype({'PULocationID':'Int16','DOLocationID':'Int16','pickup_longitude':'float32','pickup_latitude':'float32','dropoff_longitude':'float32','dropoff_latitude':'float32','tip_amount':'float32'})

    return df
    

In [None]:
def get_and_clean_taxi_data():
    '''
    Downloads the taxi data and cleans it.
    returns:
        a pandas dataframe with the cleaned taxi data of every month, including a "distance" column
    '''
    all_taxi_dataframes = []

    all_parquet_urls = find_taxi_parquet_urls()
    for parquet_url in all_parquet_urls:
        dataframe = get_and_clean_month_taxi_data(parquet_url)
        # keep the returned frame: it is the one with the overly long trips removed
        dataframe = add_distance_column(dataframe)

        all_taxi_dataframes.append(dataframe)

    # create one gigantic dataframe with data from every month needed;
    # ignore_index avoids repeated row labels coming from the individual monthly files
    taxi_data = pd.concat(all_taxi_dataframes, ignore_index=True)
    return taxi_data

### Processing Uber Data

Now we should process the Uber data. We should make sure that the data is in the same format as the taxi data.

In [10]:
def load_and_clean_uber_data(csv_file):
    '''
    Loads the Uber data and cleans it.
    args:
        csv_file: the csv file with the Uber data
    returns:
        a pandas dataframe with the cleaned Uber data and the columns
        pickup_datetime, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude
    '''
    df = pd.read_csv(csv_file)
    print("cleaning", csv_file)

    # removing unnecessary columns
    df = df[['pickup_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']]

    # removing trips that start and/or end outside of the NEW_YORK_BOX_COORDS
    # (rows with a missing coordinate fail the comparison and are removed as well)
    (lat_min, lon_min), (lat_max, lon_max) = NEW_YORK_BOX_COORDS
    inside_box = (
        df['pickup_latitude'].between(lat_min, lat_max)
        & df['pickup_longitude'].between(lon_min, lon_max)
        & df['dropoff_latitude'].between(lat_min, lat_max)
        & df['dropoff_longitude'].between(lon_min, lon_max)
    )
    # copy so the column assignments below act on an independent frame
    df = df[inside_box].copy()

    # normalizing and using appropriate column types for the respective data
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    df = df.astype({'pickup_longitude':'float32','pickup_latitude':'float32','dropoff_longitude':'float32','dropoff_latitude':'float32'})

    return df

In [None]:
def get_uber_data():
    '''
    Load Uber data and cleans it.
    If a previously cleaned csv file ("<name>_cleaned.csv") exists it is read instead of
    cleaning the raw file again.
    returns:
        a pandas dataframe with the cleaned Uber data
    '''
    cleaned_csv = UBER_CSV.replace('.csv','_cleaned.csv')
    if os.path.exists(cleaned_csv):
        print("reading", cleaned_csv)
        cached = pd.read_csv(cleaned_csv)
        # csv files do not keep dtypes: restore the datetime column so both code paths agree
        if 'pickup_datetime' in cached.columns:
            cached['pickup_datetime'] = pd.to_datetime(cached['pickup_datetime'])
        return cached

    uber_dataframe = load_and_clean_uber_data(UBER_CSV)
    # keep the returned frame: it is the one with the overly long trips removed
    uber_dataframe = add_distance_column(uber_dataframe)

    return uber_dataframe

### Processing Weather Data

We will load and clean the weather data from 2009 through 2015.

In [11]:
def clean_month_weather_data_hourly(csv_file):
    '''
    Loads the weather data and cleans it.
    args:
        csv_file: the csv file with the weather data
    returns:
        a pandas dataframe with the columns DATE (datetime), HourlyWindSpeed (int8)
        and HourlyPrecipitation (float64), without missing values
    '''
    df = pd.read_csv(csv_file)

    # removing unnecessary columns (copy so the assignments below act on an independent frame)
    df = df[['DATE','HourlyWindSpeed','HourlyPrecipitation']].copy()

    # normalizing and using appropriate column types for the respective data;
    # entries that are not plain numbers become NaN and are filled below
    df['DATE'] = pd.to_datetime(df['DATE'])
    df['HourlyWindSpeed'] = pd.to_numeric(df['HourlyWindSpeed'], errors='coerce')
    df['HourlyPrecipitation'] = pd.to_numeric(df['HourlyPrecipitation'], errors='coerce')

    # fill in missing values: wind speed with the (truncated) mean, precipitation with 0.
    # The results are assigned back because in-place fillna on a selected column
    # is not guaranteed to update the dataframe.
    mean_wind_speed = df['HourlyWindSpeed'].mean()
    # the mean is NaN when no wind speed is available at all; fall back to 0 in that case
    wind_fill = 0 if pd.isna(mean_wind_speed) else int(mean_wind_speed)
    df['HourlyWindSpeed'] = df['HourlyWindSpeed'].fillna(wind_fill)
    df['HourlyPrecipitation'] = df['HourlyPrecipitation'].fillna(0)

    df = df.astype({'HourlyWindSpeed': 'int8', 'HourlyPrecipitation': 'float64'})

    return df

In [None]:
def clean_month_weather_data_daily(csv_file):
    '''
    Loads the weather data and groups it by day.
    args:
        csv_file: the csv file with the weather data
    returns:
        a pandas dataframe with one row per day and the columns DATE (datetime),
        DailyWindSpeed (int8) and DailyPrecipitation (float64)
    '''
    df = clean_month_weather_data_hourly(csv_file)

    # group by day and take the mean of the windspeed and sum of the precipitation
    # (string aggregation names: passing np.mean / np.sum to agg is deprecated in pandas)
    df['DATE'] = df['DATE'].dt.date
    df = df.groupby('DATE').agg({'HourlyWindSpeed': 'mean', 'HourlyPrecipitation': 'sum'})

    # normalizing and using appropriate column types for the respective data
    df = df.reset_index()
    df['DATE'] = pd.to_datetime(df['DATE'])
    df = df.rename(columns={'HourlyWindSpeed': 'DailyWindSpeed', 'HourlyPrecipitation': 'DailyPrecipitation'})
    # the int8 cast truncates the daily mean wind speed to a whole number
    df = df.astype({'DailyWindSpeed':'int8','DailyPrecipitation':'float64'})

    return df

In [None]:
def load_and_clean_weather_data():
    '''
    Cleans every weather file listed in WEATHER_CSV at hourly and at daily resolution.
    returns:
        a tuple (hourly_data, daily_data) of pandas dataframes, each the
        concatenation of the cleaned data of all weather files
    '''
    hourly_parts, daily_parts = [], []

    for weather_file in WEATHER_CSV:
        print("cleaning", weather_file)
        hourly_parts.append(clean_month_weather_data_hourly(weather_file))
        daily_parts.append(clean_month_weather_data_daily(weather_file))

    # stitch the per-file results together, hourly first, then daily
    return pd.concat(hourly_parts), pd.concat(daily_parts)

### Process All Data

Now we can process all the data.

In [None]:
# Run the three cleaning pipelines defined above; each name below holds a cleaned pandas dataframe.
taxi_data = get_and_clean_taxi_data()
uber_data = get_uber_data()
# load_and_clean_weather_data returns the hourly frame first, then the daily frame
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

## Part 2: Storing Cleaned Data

_Write some prose that tells the reader what you're about to do here._

In [14]:
# SQLAlchemy engine for the database named by DATABASE_URL; shared by the cells below
engine = db.create_engine(DATABASE_URL)

In [15]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
# NOTE: every schema below is still the literal placeholder text "TODO";
# these strings are what gets written to the schema file in the next cell.
HOURLY_WEATHER_SCHEMA = """
TODO
"""

DAILY_WEATHER_SCHEMA = """
TODO
"""

TAXI_TRIPS_SCHEMA = """
TODO
"""

UBER_TRIPS_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file by writing the four table schemas back to back
with open(DATABASE_SCHEMA_FILE, "w") as schema_file:
    for table_schema in (
        HOURLY_WEATHER_SCHEMA,
        DAILY_WEATHER_SCHEMA,
        TAXI_TRIPS_SCHEMA,
        UBER_TRIPS_SCHEMA,
    ):
        schema_file.write(table_schema)

In [16]:
# create the tables with the schema files
# NOTE: placeholder - the connection is opened and closed again without executing any statement
with engine.connect() as connection:
    pass

### Add Data to Database

_**TODO:** Write some prose that tells the reader what you're about to do here._

In [17]:
def write_dataframes_to_table(table_to_df_dict):
    '''
    Placeholder for writing dataframes to database tables; not implemented yet.
    args:
        table_to_df_dict: dictionary mapping a table name to the dataframe to store in it
    raises:
        NotImplementedError: always, until the function is implemented
    '''
    # NotImplemented is a comparison sentinel and not callable, so `raise NotImplemented()`
    # fails with a TypeError; NotImplementedError is the exception meant for stubs
    raise NotImplementedError("write_dataframes_to_table is not implemented yet")

In [None]:
# table name -> cleaned dataframe to store in it
# (the weather frames are named hourly_weather_data / daily_weather_data where they are created)
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_weather_data,
    "daily_weather": daily_weather_data,
}

In [None]:
# hand the table-name -> dataframe mapping to the writer defined above
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

_A checklist of requirements to keep you on track. Remove this whole cell before submitting the project. The order of these tasks isn't necessarily the order in which they need to be done. It's okay to do them in an order that makes sense to you._

* [ ] For 01-2009 through 06-2015, what hour of the day was the most popular to take a yellow taxi? The result should have 24 bins.
* [ ] For the same time frame, what day of the week was the most popular to take an uber? The result should have 7 bins.
* [ ] What is the 95th percentile of distance traveled for all hired trips during July 2013?
* [ ] What were the top 10 days with the highest number of hired rides for 2009, and what was the average distance for each day?
* [ ] Which 10 days in 2014 were the windiest, and how many hired trips were made on those days?
* [ ] During Hurricane Sandy in NYC (Oct 29-30, 2012) and the week leading up to it, how many trips were taken each hour, and for each hour, how much precipitation did NYC receive and what was the sustained wind speed?

In [18]:
def write_query_to_file(query, outfile):
    '''
    Placeholder for saving a query to a file; not implemented yet.
    args:
        query: the query text
        outfile: name of the file the query should be written to
    raises:
        NotImplementedError: always, until the function is implemented
    '''
    # NotImplemented is a comparison sentinel and not callable, so `raise NotImplemented()`
    # fails with a TypeError; NotImplementedError is the exception meant for stubs
    raise NotImplementedError("write_query_to_file is not implemented yet")

### Query N

_**TODO:** Write some prose that tells the reader what you're about to do here._

_Repeat for each query_

In [None]:
# NOTE: placeholder query text; it is executed and written to a file by the cells below
QUERY_N = """
TODO
"""

In [None]:
# Engine.execute() no longer exists in SQLAlchemy 2.0; run the query on an explicit
# connection and wrap the raw SQL string in text(), which also works on SQLAlchemy 1.4
with engine.connect() as connection:
    query_n_rows = connection.execute(db.text(QUERY_N)).fetchall()
query_n_rows

In [None]:
# store the query text under a descriptive file name
write_query_to_file(QUERY_N, "some_descriptive_name.sql")

## Part 4: Visualizing the Data

_A checklist of requirements to keep you on track. Remove this whole cell before submitting the project. The order of these tasks isn't necessarily the order in which they need to be done. It's okay to do them in an order that makes sense to you._

* [ ] Create an appropriate visualization for the first query/question in part 3
* [ ] Create a visualization that shows the average distance traveled per month (regardless of year - so group by each month). Include the 90% confidence interval around the mean in the visualization
* [ ] Define three lat/long coordinate boxes around the three major New York airports: LGA, JFK, and EWR (you can use bboxfinder to help). Create a visualization that compares what day of the week was most popular for drop offs for each airport.
* [ ] Create a heatmap of all hired trips over a map of the area. Consider using KeplerGL or another library that helps generate geospatial visualizations.
* [ ] Create a scatter plot that compares tip amount versus distance.
* [ ] Create another scatter plot that compares tip amount versus precipitation amount.

_Be sure these cells are executed so that the visualizations are rendered when the notebook is submitted._

### Visualization N

_**TODO:** Write some prose that tells the reader what you're about to do here._

_Repeat for each visualization._

_The example below makes use of the `matplotlib` library. There are other libraries, including `pandas` built-in plotting library, kepler for geospatial data representation, `seaborn`, and others._

In [19]:
# use a more descriptive name for your function
def plot_visual_n(dataframe):
    """
    Template for a plotting function: draws placeholder values on a 20x10 figure and shows it.
    args:
        dataframe: the data to visualize (not read yet by this template)
    """
    # use the dataframe to pull out values needed to plot
    values = "..."

    fig, ax = plt.subplots(figsize=(20, 10))

    # matplotlib is one option for the visualizations; plot types other
    # than a line plot are available as well
    ax.plot(values, "...")
    # further methods exist to label and style the axes
    ax.set_title("Some Descriptive Title")

    plt.show()

In [None]:
def get_data_for_visual_n():
    '''
    Placeholder for fetching the data of a visualization; not implemented yet.
    raises:
        NotImplementedError: always, until the function is implemented
    '''
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    # NotImplemented is a comparison sentinel and not callable, so `raise NotImplemented()`
    # fails with a TypeError; NotImplementedError is the exception meant for stubs
    raise NotImplementedError("get_data_for_visual_n is not implemented yet")

In [None]:
# fetch the data for the visualization, then plot it
some_dataframe = get_data_for_visual_n()
plot_visual_n(some_dataframe)