In [14]:
import psycopg2
from psycopg2 import sql
from sqlalchemy import create_engine
import pandas as pd

In [15]:
db_connection_str = 'postgresql://root:root@localhost:5432/ny_taxi'
db_connection = create_engine(db_connection_str)

# Load the taxi zone lookup CSV file into a DataFrame
df_lookup = pd.read_csv('taxi+_zone_lookup.csv')

# Assuming the table does not exist, create it
# Define the SQL statement to create a table. Modify this as necessary for your schema
sql_create_table = """
CREATE TABLE IF NOT EXISTS taxi_lookup (
    LocationID int,
    Borough text,
    Zone text,
    service_zone text
);
"""
db_connection.execute(sql_create_table)

# Ingest the DataFrame into the taxi_lookup table
df_lookup.to_sql('taxi_lookup', con=db_connection, if_exists='replace', index=False)


265

In [5]:
conn = psycopg2.connect(
    dbname="ny_taxi",
    user="root",
    password="root",
    host="localhost",
    port="5432"
)

# Create a cursor object
cur = conn.cursor()

# Define the dates to check
dates_to_check = ['2019-09-18', '2019-09-16', '2019-09-26', '2019-09-21']

# Initialize a dictionary to store the maximum trip distance for each date
largest_trip_each_day = {}

# Iterate through each date and execute a query
for date in dates_to_check:
    query = """
        SELECT MAX(trip_distance) AS max_trip_distance
        FROM taxi_data
        WHERE DATE(lpep_pickup_datetime) = %s;
    """
    cur.execute(query, [date])
    max_trip_distance = cur.fetchone()[0]
    largest_trip_each_day[date] = max_trip_distance

# Display the results
print(largest_trip_each_day)


UndefinedColumn: column "lpep_pickup_datetime" does not exist
LINE 4:         WHERE DATE(lpep_pickup_datetime) = '2019-09-18';
                           ^
HINT:  Perhaps you meant to reference the column "taxi_data.tpep_pickup_datetime".


In [6]:
query = """
    SELECT COUNT(*)
    FROM taxi_data
    WHERE DATE(lpep_pickup_datetime) = '2019-09-18'
      AND DATE(lpep_dropoff_datetime) = '2019-09-18';
"""
cur.execute(query)
num_trips_on_sept_18 = cur.fetchone()[0]
print(f"Number of trips on September 18th, 2019: {num_trips_on_sept_18}")

InFailedSqlTransaction: current transaction is aborted, commands ignored until end of transaction block


In [8]:
conn = psycopg2.connect(
    dbname="ny_taxi",
    user="root",
    password="root",
    host="localhost",
    port="5432"
)

# Create a cursor object
cur = conn.cursor()
query = """
    SELECT t.DOLocationID, MAX(t.tip_amount) AS max_tip
    FROM taxi_data t
    JOIN lookup b ON t.PULocationID = b.LocationID
    WHERE DATE(t.lpep_pickup_datetime) BETWEEN '2019-09-01' AND '2019-09-30'
      AND b.Zone = 'Astoria'
    GROUP BY t.DOLocationID
    ORDER BY max_tip DESC
    LIMIT 1;
"""
cur.execute(query)
max_tip_info = cur.fetchone()
max_tip_dropoff_location_id = max_tip_info[0]
max_tip_amount = max_tip_info[1]

# Get the zone name for the drop-off location
query_zone = """
    SELECT Zone
    FROM lookup
    WHERE LocationID = %s;
"""
cur.execute(query_zone, [max_tip_dropoff_location_id])
max_tip_dropoff_zone = cur.fetchone()[0]
print(f"Largest tip drop-off zone: {max_tip_dropoff_zone}, Tip Amount: {max_tip_amount}")


UndefinedTable: relation "lookup" does not exist
LINE 4:     JOIN lookup b ON t.PULocationID = b.LocationID
                 ^


In [10]:
import psycopg2

# Establish the database connection
conn = psycopg2.connect(
    dbname="ny_taxi",
    user="root",
    password="root",
    host="localhost",
    port="5432"
)

# Create a cursor
cur = conn.cursor()

# List of queries to run
queries = [
    """
    SELECT COUNT(*)
    FROM taxi_table
    WHERE DATE(lpep_pickup_datetime) = '2019-09-18'
    AND DATE(lpep_dropoff_datetime) = '2019-09-18';
    """,
    # ... (Add other queries here)
]

# Run queries sequentially
for query in queries:
    try:
        cur.execute(query)
        result = cur.fetchone()[0]
        print(result)
        conn.commit()  # Commit the current transaction
    except Exception as e:
        print(f"An error occurred: {e}")
        conn.rollback()  # Rollback the transaction on error

# Close the cursor and connection
cur.close()
conn.close()


15612


In [17]:
import psycopg2
from psycopg2 import extras

# Establish the database connection
conn = psycopg2.connect(
    dbname="ny_taxi",
    user="root",
    password="root",
    host="localhost",
    port="5432"
)

# Create a cursor with a dictionary factory, so we can refer to the data by their column names
cur = conn.cursor(cursor_factory=extras.DictCursor)

# Define a function to execute and print the result of SQL queries
def execute_query(cur, query):
    try:
        cur.execute(query)
        result = cur.fetchone()
        conn.commit()  # Commit the current transaction
        return result
    except Exception as e:
        print(f"An error occurred: {e}")
        conn.rollback()  # Rollback the transaction on error
        return None

# Query 1: Count the number of trips that started and ended on September 18th, 2019
query1 = """
    SELECT COUNT(*) AS num_trips
    FROM taxi_table
    WHERE DATE(lpep_pickup_datetime) = '2019-09-18'
      AND DATE(lpep_dropoff_datetime) = '2019-09-18';
"""

# Query 2: Find the maximum trip distance for each date specified
dates_to_check = ['2019-09-18', '2019-09-16', '2019-09-26', '2019-09-21']
largest_trip_each_day = {}

for date in dates_to_check:
    query2 = f"""
        SELECT MAX(trip_distance) AS max_trip_distance
        FROM taxi_table
        WHERE DATE(lpep_pickup_datetime) = '{date}';
    """
    max_trip_distance = execute_query(cur, query2)
    largest_trip_each_day[date] = max_trip_distance['max_trip_distance'] if max_trip_distance else None

# Query 3: Find the top 3 boroughs with the highest total_amount on September 18th, 2019
query3 = """
    WITH trip_borough_data AS (
        SELECT 
            td.*, tl.Borough
        FROM 
            taxi_table td
        INNER JOIN 
            taxi_lookup tl ON td.PULocationID = tl.LocationID
        WHERE 
            DATE(td.lpep_pickup_datetime) = '2019-09-18'
    )
    SELECT 
        Borough, SUM(total_amount) AS total_amount
    FROM 
        trip_borough_data
    GROUP BY 
        Borough
    HAVING 
        SUM(total_amount) > 50000
    ORDER BY 
        total_amount DESC
    LIMIT 3;
"""

# Run the first query
num_trips_on_sept_18 = execute_query(cur, query1)
print(f"Number of trips on September 18th, 2019: {num_trips_on_sept_18['num_trips']}")

# Print the results of the second query
print("Maximum trip distance for each date:")
for date, max_distance in largest_trip_each_day.items():
    print(f"Date: {date}, Max Trip Distance: {max_distance}")

# Run the third query and print the results
top_3_boroughs = execute_query(cur, query3)
print("Top 3 boroughs with the highest total amount on September 18th, 2019:")
if top_3_boroughs:
    for borough in top_3_boroughs:
        print(f"Borough: {borough['borough']}, Total Amount: {borough['total_amount']}")

# Close the cursor and connection
cur.close()
conn.close()


Number of trips on September 18th, 2019: 15612
Maximum trip distance for each date:
Date: 2019-09-18, Max Trip Distance: 70.28
Date: 2019-09-16, Max Trip Distance: 114.3
Date: 2019-09-26, Max Trip Distance: 341.64
Date: 2019-09-21, Max Trip Distance: 135.53
An error occurred: column td.pulocationid does not exist
LINE 8:             taxi_lookup tl ON td.PULocationID = tl.LocationI...
                                      ^

Top 3 boroughs with the highest total amount on September 18th, 2019:
