# Analyzing TLC Trip Record Data Using Snowflake

Project Goal: The goal of this project is to analyze New York Yellow Taxi data using Snowflake 

# Import Modules

In [1]:
import warnings
from credentials import *
from snowflake import snowpark

# Options and Settings

In [2]:
warnings.simplefilter("ignore")

# Create a Session

In [3]:
connection_parameters = {
                          "account": SF_ACCOUNT,
                          "user": SF_USERNAME,
                          "password": SF_PASSWORD,
                          "role": SF_ROLE, 
                          "warehouse": "TLC_WH", 
                          "database": "NYC_TLC", 
                          "schema": "YELLOW_TAXI"
                        }

sf_session = snowpark.Session.builder.configs(connection_parameters).create()

# Query Tables

Explore the date and time when the meter was engaged

In [4]:
sf_session.sql("""
               WITH 
                   daily_passenger_count as (
                                              SELECT 
                                                    DISTINCT 
                                                            TO_CHAR(dt.tpep_pickup_datetime::datetime, 'Dy') AS day_name,
                                                            pc.passenger_count,
                                                            dt.tpep_pickup_datetime
                                              FROM FACT_TABLE ft
                                              LEFT JOIN PASSENGER_COUNT_DIM pc USING (passenger_count_id)
                                              LEFT JOIN DATETIME_DIM dt ON ft.datetime_id = dt.datetime_id
                                              WHERE dt.tpep_pickup_datetime BETWEEN '2023-06-01' AND '2023-06-30'
                                            )
               SELECT
                     DISTINCT
                             day_name,
                             COUNT(passenger_count) AS total_passengers
               FROM daily_passenger_count 
               GROUP BY 1
               ORDER BY  total_passengers DESC   

               """
              ).show()

-----------------------------------
|"DAY_NAME"  |"TOTAL_PASSENGERS"  |
-----------------------------------
|Sat         |306689              |
|Fri         |298584              |
|Thu         |368849              |
|Wed         |287340              |
|Tue         |281836              |
|Sun         |274499              |
|Mon         |262245              |
-----------------------------------



In [5]:
sf_session.sql("""
               WITH 
                   hourly_passenger_count as (
                                              SELECT 
                                                    DISTINCT 
                                                            EXTRACT(HOUR FROM dt.tpep_pickup_datetime) as pickup_hour,
                                                            pc.passenger_count,
                                                            dt.tpep_pickup_datetime
                                              FROM FACT_TABLE ft
                                              LEFT JOIN PASSENGER_COUNT_DIM pc USING (passenger_count_id)
                                              LEFT JOIN DATETIME_DIM dt ON ft.datetime_id = dt.datetime_id
                                              WHERE dt.tpep_pickup_datetime BETWEEN '2023-06-01' AND '2023-06-30'
                                            )
               SELECT 
                     DISTINCT 
                             pickup_hour,
                             COUNT(passenger_count) AS total_passengers
               FROM hourly_passenger_count
               GROUP BY 1
               ORDER BY 2 DESC 

               """
              ).show()

--------------------------------------
|"PICKUP_HOUR"  |"TOTAL_PASSENGERS"  |
--------------------------------------
|10             |97843               |
|23             |96483               |
|9              |89603               |
|8              |79894               |
|0              |70818               |
|7              |60348               |
|18             |130580              |
|17             |125152              |
|19             |122026              |
|15             |120857              |
--------------------------------------



Calculate Daily Trip Distance By Borough

In [6]:
def calculate_trip_distance(borough: str, order: str = "DESC") -> snowpark.DataFrame:
  """
  Calculates daily trip distance by Borough

  Parameters
  ----------
  borough: str
      administrative division to use

  order: str, defaults to descending
      condition for sorting results

  Returns:
      Snowflake dataframe

  """
  connection_parameters = {
                             "account": SF_ACCOUNT,
                             "user": SF_USERNAME,
                             "password": SF_PASSWORD,
                             "role": SF_ROLE, 
                             "warehouse": "TLC_WH", 
                             "database": "NYC_TLC", 
                             "schema": "YELLOW_TAXI"
                           }
  
  session = snowpark.Session.builder.configs(connection_parameters).create()

  sql_query = f"""
               SELECT 
                     DISTINCT 
                             pl.borough,
                             EXTRACT(DAY from dt.tpep_pickup_datetime) as day,
                             ROUND(SUM(td.trip_distance)) as total_trip_distance              
               FROM fact_table ft
               LEFT JOIN trip_distance_dim td USING (trip_distance_id)
               LEFT JOIN pickup_location_dim pl ON ft.pickup_location_id = pl.pickup_location_id
               LEFT JOIN datetime_dim dt ON ft.datetime_id = dt.datetime_id
               WHERE pl.borough = '{borough}'
               GROUP BY 1, 2
               ORDER BY 3 {order}

              """
  try:
      session.sql(sql_query).show()
  
  except Exception as err:
     print(f"An error occurred: {err}")

  finally:
     session.close()

In [7]:
calculate_trip_distance(borough="Manhattan", order="DESC")

---------------------------------------------
|"BOROUGH"  |"DAY"  |"TOTAL_TRIP_DISTANCE"  |
---------------------------------------------
|Manhattan  |29     |698856.0               |
|Manhattan  |21     |513075.0               |
|Manhattan  |1      |419528.0               |
|Manhattan  |22     |402487.0               |
|Manhattan  |2      |389196.0               |
|Manhattan  |8      |370374.0               |
|Manhattan  |23     |363376.0               |
|Manhattan  |10     |342820.0               |
|Manhattan  |27     |302357.0               |
|Manhattan  |24     |299100.0               |
---------------------------------------------



In [8]:
calculate_trip_distance(borough="Queens", order="DESC")

---------------------------------------------
|"BOROUGH"  |"DAY"  |"TOTAL_TRIP_DISTANCE"  |
---------------------------------------------
|Queens     |8      |135213.0               |
|Queens     |30     |128518.0               |
|Queens     |24     |122335.0               |
|Queens     |3      |119970.0               |
|Queens     |10     |108927.0               |
|Queens     |17     |107865.0               |
|Queens     |31     |105.0                  |
|Queens     |26     |185308.0               |
|Queens     |11     |169154.0               |
|Queens     |19     |169044.0               |
---------------------------------------------



In [9]:
calculate_trip_distance(borough="Brooklyn", order="DESC")

---------------------------------------------
|"BOROUGH"  |"DAY"  |"TOTAL_TRIP_DISTANCE"  |
---------------------------------------------
|Brooklyn   |26     |54913.0                |
|Brooklyn   |12     |30642.0                |
|Brooklyn   |27     |29051.0                |
|Brooklyn   |19     |20642.0                |
|Brooklyn   |23     |13644.0                |
|Brooklyn   |11     |6659.0                 |
|Brooklyn   |25     |6311.0                 |
|Brooklyn   |18     |6234.0                 |
|Brooklyn   |4      |5821.0                 |
|Brooklyn   |10     |5417.0                 |
---------------------------------------------



# Close Current Session

In [10]:
sf_session.close()