# Analyzing TLC Trip Record Data Using Snowflake

Project Goal: The goal of this project is to analyze New York Yellow Taxi data using Snowflake 

# Import Modules

In [1]:
import warnings
from credentials import *
from snowflake import snowpark

# Options and Settings

In [2]:
warnings.simplefilter("ignore")

# Create a Session

In [3]:
connection_parameters = {
                          "account": SF_ACCOUNT,
                          "user": SF_USERNAME,
                          "password": SF_PASSWORD,
                          "role": SF_ROLE, 
                          "warehouse": "TLC_WH", 
                          "database": "NYC_TLC", 
                          "schema": "YELLOW_TAXI"
                        }

sf_session = snowpark.Session.builder.configs(connection_parameters).create()

# Query Tables

Count Total Passengers By Mode of Payment

In [4]:
sf_session.sql(
                """
                CREATE OR REPLACE FUNCTION payment_count_by_passengers(payment_type TEXT)
                  RETURNS TABLE (payment_type_name CHAR, total_passengers INTEGER)
                AS
                '
                SELECT pt.payment_type_name, COUNT(pc.passenger_count) AS total_passengers
                FROM NYC_TLC.YELLOW_TAXI.fact_table ft
                LEFT JOIN NYC_TLC.YELLOW_TAXI.payment_type_dim pt USING(payment_type_id)
                LEFT JOIN NYC_TLC.YELLOW_TAXI.passenger_count_dim pc ON ft.passenger_count_id = pc.passenger_count_id
                WHERE payment_type_name = (payment_type)
                GROUP BY 1
                ORDER BY 2 DESC'

                """
              )

<snowflake.snowpark.dataframe.DataFrame at 0x1873d975b80>

In [5]:
sf_session.sql("""SELECT * FROM TABLE (payment_count_by_passengers('Credit card'))""").show()

--------------------------------------------
|"PAYMENT_TYPE_NAME"  |"TOTAL_PASSENGERS"  |
--------------------------------------------
|Credit card          |2592256             |
--------------------------------------------



In [6]:
sf_session.sql("""SELECT * FROM TABLE (payment_count_by_passengers('Cash'))""").show()

--------------------------------------------
|"PAYMENT_TYPE_NAME"  |"TOTAL_PASSENGERS"  |
--------------------------------------------
|Cash                 |552611              |
--------------------------------------------



In [7]:
sf_session.sql("""SELECT * FROM TABLE (payment_count_by_passengers('Dispute'))""").show()

--------------------------------------------
|"PAYMENT_TYPE_NAME"  |"TOTAL_PASSENGERS"  |
--------------------------------------------
|Dispute              |41185               |
--------------------------------------------



In [8]:
sf_session.sql("""SELECT * FROM TABLE (payment_count_by_passengers('No charge'))""").show()

--------------------------------------------
|"PAYMENT_TYPE_NAME"  |"TOTAL_PASSENGERS"  |
--------------------------------------------
|No charge            |21294               |
--------------------------------------------



Explore the date and time when the meter was engaged

In [9]:
sf_session.sql("""
               WITH 
                   daily_passenger_count as (
                                              SELECT 
                                                    DISTINCT 
                                                            TO_CHAR(dt.tpep_pickup_datetime::datetime, 'Dy') AS day_name,
                                                            pc.passenger_count,
                                                            dt.tpep_pickup_datetime
                                              FROM FACT_TABLE ft
                                              LEFT JOIN PASSENGER_COUNT_DIM pc USING (passenger_count_id)
                                              LEFT JOIN DATETIME_DIM dt ON ft.datetime_id = dt.datetime_id
                                              WHERE dt.tpep_pickup_datetime BETWEEN '2023-06-01' AND '2023-06-30'
                                            )
               SELECT
                     DISTINCT
                             day_name,
                             COUNT(passenger_count) AS total_passengers
               FROM daily_passenger_count 
               GROUP BY 1
               ORDER BY  total_passengers DESC   

               """
              ).show()

-----------------------------------
|"DAY_NAME"  |"TOTAL_PASSENGERS"  |
-----------------------------------
|Wed         |287340              |
|Tue         |281836              |
|Fri         |298584              |
|Thu         |368849              |
|Sat         |306689              |
|Sun         |274499              |
|Mon         |262245              |
-----------------------------------



In [10]:
sf_session.sql("""
               WITH 
                   hourly_passenger_count as (
                                              SELECT 
                                                    DISTINCT 
                                                            EXTRACT(HOUR FROM dt.tpep_pickup_datetime) as pickup_hour,
                                                            pc.passenger_count,
                                                            dt.tpep_pickup_datetime
                                              FROM FACT_TABLE ft
                                              LEFT JOIN PASSENGER_COUNT_DIM pc USING (passenger_count_id)
                                              LEFT JOIN DATETIME_DIM dt ON ft.datetime_id = dt.datetime_id
                                              WHERE dt.tpep_pickup_datetime BETWEEN '2023-06-01' AND '2023-06-30'
                                            )
               SELECT 
                     DISTINCT 
                             pickup_hour,
                             COUNT(passenger_count) AS total_passengers
               FROM hourly_passenger_count
               GROUP BY 1
               ORDER BY 2 DESC 

               """
              ).show()

--------------------------------------
|"PICKUP_HOUR"  |"TOTAL_PASSENGERS"  |
--------------------------------------
|18             |130580              |
|17             |125152              |
|19             |122026              |
|15             |120857              |
|16             |120238              |
|14             |119571              |
|10             |97843               |
|23             |96483               |
|9              |89603               |
|8              |79894               |
--------------------------------------



Calculate Daily Trip Distance By Borough

In [11]:
def calculate_trip_distance(borough: str, order: str = "DESC") -> snowpark.DataFrame:
  """
  Calculates daily trip distance by Borough

  Parameters
  ----------
  borough: str
      administrative division to use

  order: str, defaults to descending
      condition for sorting results

  Returns:
      Snowflake dataframe

  """
  connection_parameters = {
                             "account": SF_ACCOUNT,
                             "user": SF_USERNAME,
                             "password": SF_PASSWORD,
                             "role": SF_ROLE, 
                             "warehouse": "TLC_WH", 
                             "database": "NYC_TLC", 
                             "schema": "YELLOW_TAXI"
                           }
  
  session = snowpark.Session.builder.configs(connection_parameters).create()

  sql_query = f"""
               SELECT 
                     DISTINCT 
                             pl.borough,
                             EXTRACT(DAY from dt.tpep_pickup_datetime) as day,
                             ROUND(SUM(td.trip_distance)) as total_trip_distance              
               FROM fact_table ft
               LEFT JOIN trip_distance_dim td USING (trip_distance_id)
               LEFT JOIN pickup_location_dim pl ON ft.pickup_location_id = pl.pickup_location_id
               LEFT JOIN datetime_dim dt ON ft.datetime_id = dt.datetime_id
               WHERE pl.borough = '{borough}'
               GROUP BY 1, 2
               ORDER BY 3 {order}

              """
  try:
      session.sql(sql_query).show()
  
  except Exception as err:
     print(f"An error occurred: {err}")

  finally:
     session.close()

In [12]:
calculate_trip_distance(borough="Manhattan", order="DESC")

---------------------------------------------
|"BOROUGH"  |"DAY"  |"TOTAL_TRIP_DISTANCE"  |
---------------------------------------------
|Manhattan  |14     |335867.0               |
|Manhattan  |15     |329431.0               |
|Manhattan  |28     |315058.0               |
|Manhattan  |16     |313180.0               |
|Manhattan  |13     |313029.0               |
|Manhattan  |5      |311031.0               |
|Manhattan  |3      |308979.0               |
|Manhattan  |9      |307886.0               |
|Manhattan  |18     |248690.0               |
|Manhattan  |12     |240750.0               |
---------------------------------------------



In [13]:
calculate_trip_distance(borough="Queens", order="DESC")

---------------------------------------------
|"BOROUGH"  |"DAY"  |"TOTAL_TRIP_DISTANCE"  |
---------------------------------------------
|Queens     |7      |158746.0               |
|Queens     |22     |154868.0               |
|Queens     |15     |153800.0               |
|Queens     |9      |152421.0               |
|Queens     |20     |151767.0               |
|Queens     |27     |150225.0               |
|Queens     |16     |148926.0               |
|Queens     |14     |144536.0               |
|Queens     |23     |143533.0               |
|Queens     |26     |185308.0               |
---------------------------------------------



In [14]:
calculate_trip_distance(borough="Brooklyn", order="DESC")

---------------------------------------------
|"BOROUGH"  |"DAY"  |"TOTAL_TRIP_DISTANCE"  |
---------------------------------------------
|Brooklyn   |4      |5821.0                 |
|Brooklyn   |10     |5417.0                 |
|Brooklyn   |24     |5157.0                 |
|Brooklyn   |17     |5123.0                 |
|Brooklyn   |3      |5073.0                 |
|Brooklyn   |15     |4964.0                 |
|Brooklyn   |16     |4788.0                 |
|Brooklyn   |1      |3852.0                 |
|Brooklyn   |13     |3836.0                 |
|Brooklyn   |6      |3670.0                 |
---------------------------------------------



# Close Current Session

In [15]:
sf_session.close()