![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)

In [None]:
from pyspark.sql import SparkSession

# Obtain a Spark session for this notebook, registered under the app name "Jupyter".
spark = (
    SparkSession.builder
    .appName("Jupyter")
    .getOrCreate()
)

# Bare expression so the notebook renders the session object as the cell output.
spark

## Load Two Months of NYC Taxi/Limousine Trip Data

This notebook uses the New York City Taxi and Limousine Commission Trip Record Data available on the AWS Open Data Registry. The dataset contains records of trips taken by taxis and for-hire vehicles in New York City. The data is stored in an Iceberg table called `taxis`.

To be able to rerun the notebook several times, let's drop the table and the views if they exist to start fresh.

In [None]:
%%sql

-- Create the nyc namespace that holds the taxis table and the views.
-- The namespace is nyc (taxis is the table created inside it later),
-- so the database name must not include the table name.
CREATE DATABASE IF NOT EXISTS nyc;

In [None]:
%%sql

-- Remove the taxis table if it is present so the notebook can be rerun from a clean state.
DROP TABLE IF EXISTS nyc.taxis


In [None]:
%%sql

-- Remove the long_distances view if it is present (it is recreated further down).
DROP VIEW IF EXISTS nyc.long_distances

In [None]:
%%sql

-- Remove the negative_amounts view if it is present (it is recreated further down).
DROP VIEW IF EXISTS nyc.negative_amounts

## Create the table

In [None]:
%%sql

-- Iceberg table for the trip records.
-- Partitioned with the days() transform on the pickup timestamp.
CREATE TABLE nyc.taxis (
    VendorID BIGINT,
    tpep_pickup_datetime TIMESTAMP,
    tpep_dropoff_datetime TIMESTAMP,
    passenger_count DOUBLE,
    trip_distance DOUBLE,
    RatecodeID DOUBLE,
    store_and_fwd_flag STRING,
    PULocationID BIGINT,
    DOLocationID BIGINT,
    payment_type BIGINT,
    fare_amount DOUBLE,
    extra DOUBLE,
    mta_tax DOUBLE,
    tip_amount DOUBLE,
    tolls_amount DOUBLE,
    improvement_surcharge DOUBLE,
    total_amount DOUBLE,
    congestion_surcharge DOUBLE,
    airport_fee DOUBLE
)
USING iceberg
PARTITIONED BY (days(tpep_pickup_datetime))

# Write a month of data

In [None]:
# Read the 2022-01 yellow-taxi Parquet file and append its rows to the nyc.taxis table.
df = spark.read.parquet(
    "/home/iceberg/data/yellow_tripdata_2022-01.parquet"
)
df.writeTo("nyc.taxis").append()

In [None]:
%%sql

-- Show the contents of the taxis table.
SELECT * FROM nyc.taxis

# Create a view

Let's create an Iceberg view to look at the longest distances travelled and the total amount of the trips.

In [None]:
%%sql

-- Define a view over nyc.taxis that exposes five columns under new names
-- (vendor_id, pickup_date, dropoff_date, distance, total), ordered by trip_distance.
-- Three of the columns carry a COMMENT.
CREATE VIEW nyc.long_distances (
    vendor_id COMMENT 'Vendor ID',
    pickup_date,
    dropoff_date,
    distance COMMENT 'Trip Distance',
    total COMMENT 'Total amount')
    AS SELECT VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, trip_distance, total_amount FROM nyc.taxis ORDER BY trip_distance

In [None]:
%%sql

-- Query the long_distances view.
SELECT * FROM nyc.long_distances

## Update View to order results differently

The output isn't as helpful as imagined, so let's update the view and change the order of columns and the ordering of the results.

In [None]:
%%sql

-- Replace the view definition: distance and total come first in the column list,
-- only trips with trip_distance > 35 are kept, and rows are ordered by
-- total_amount and then trip_distance.
CREATE OR REPLACE VIEW nyc.long_distances (
    distance COMMENT 'Trip Distance',
    total COMMENT 'Total amount',
    vendor_id COMMENT 'Vendor ID',
    pickup_date,
    dropoff_date)
    AS SELECT trip_distance, total_amount, VendorID, tpep_pickup_datetime, tpep_dropoff_datetime
    FROM nyc.taxis
    WHERE trip_distance > 35 ORDER BY total_amount, trip_distance

In [None]:
%%sql

-- Query the view again now that its definition has been replaced.
SELECT * FROM nyc.long_distances

In [None]:
%%sql

-- Count the rows returned by the view with one month of data loaded.
SELECT count(*) FROM nyc.long_distances

# Write another month of data

Let's write another month of data and see how the results of the view change.

In [None]:
# Read the 2022-02 yellow-taxi Parquet file and append its rows to the nyc.taxis table.
df = spark.read.parquet(
    "/home/iceberg/data/yellow_tripdata_2022-02.parquet"
)
df.writeTo("nyc.taxis").append()

In [None]:
%%sql

-- Query the view after the second append to nyc.taxis.
SELECT * FROM nyc.long_distances

In [None]:
%%sql

-- Count the rows returned by the view after the second append to nyc.taxis.
SELECT count(*) FROM nyc.long_distances

# Create another view
It appears that there are trips with negative total amounts. Let's display these results in a separate view.

In [None]:
%%sql

-- View over nyc.taxis restricted to rows with total_amount < 0, ordered by total_amount.
-- Uses the same renamed columns as long_distances, with total listed first.
CREATE OR REPLACE VIEW nyc.negative_amounts (
    total COMMENT 'Total amount',
    distance COMMENT 'Trip Distance',
    vendor_id COMMENT 'Vendor ID',
    pickup_date,
    dropoff_date)
    AS SELECT total_amount, trip_distance, VendorID, tpep_pickup_datetime, tpep_dropoff_datetime
    FROM nyc.taxis
    WHERE total_amount < 0 ORDER BY total_amount

In [None]:
%%sql

-- Query the negative_amounts view.
SELECT * FROM nyc.negative_amounts

# Listing and describing views

In [None]:
%%sql

-- List the views in the nyc namespace.
SHOW VIEWS in nyc

In [None]:
%%sql

-- List only the views in nyc whose name matches the pattern *neg*.
SHOW VIEWS in nyc LIKE '*neg*'

In [None]:
%%sql

-- Describe the long_distances view.
DESCRIBE nyc.long_distances

In [None]:
%%sql

-- Same as DESCRIBE, but with the EXTENDED keyword to request additional detail.
DESCRIBE EXTENDED nyc.long_distances

# Displaying the CREATE statement of a view

In [None]:
%%sql

-- SHOW CREATE TABLE is pointed at the view here to print its CREATE statement.
SHOW CREATE TABLE nyc.long_distances

# Altering and displaying properties of a view

This section adds two new properties to the view and also sets the view's comment.
The comment will be shown when describing the view.
At the end of this section, one of the properties is removed from the view again.

In [None]:
%%sql

-- Show the properties of the view before any are changed.
SHOW TBLPROPERTIES nyc.long_distances

In [None]:
%%sql

-- Set three properties on the view: key1, key2 and comment.
ALTER VIEW nyc.long_distances SET TBLPROPERTIES ('key1' = 'val1', 'key2' = 'val2', 'comment' = 'This is a view comment')

In [None]:
%%sql

-- Show the properties again after the SET TBLPROPERTIES statement.
SHOW TBLPROPERTIES nyc.long_distances

In [None]:
%%sql

-- Describe the view again after its properties and comment were set.
DESCRIBE EXTENDED nyc.long_distances

In [None]:
%%sql

-- Remove the key1 property from the view.
ALTER VIEW nyc.long_distances UNSET TBLPROPERTIES ('key1')

In [None]:
%%sql

-- Show the properties once more after the UNSET TBLPROPERTIES statement.
SHOW TBLPROPERTIES nyc.long_distances