# Spark SQL

In [None]:
# Limit Spark's log output to ERROR level so the cell outputs below stay readable.
spark.sparkContext.setLogLevel("ERROR")

### Check all the `databases` present

In [None]:
# List every database known to the session (these come from the AWS Glue Catalog).
(
    spark
    .sql("SHOW databases")
    .show()
)

### Check the currently selected `database`

In [None]:
spark.catalog.currentDatabase() # Returns the name of the currently selected `database` (displayed as the cell output)

### Check all the `tables` present

In [None]:
# List the tables of the selected database (the `default` database at this point).
(
    spark
    .sql("SHOW tables")
    .show()
)

### Select a particular `database`

In [None]:
# Switch the session to a different database.
spark.sql("USE db_youtube_raw")

In [None]:
# Confirm which database is selected after the USE statement above.
spark.catalog.currentDatabase()

### Show the `tables` within the `database`

In [None]:
# List all tables in the selected database; truncate=False prints full-width values.
(
    spark
    .sql("SHOW tables")
    .show(truncate=False)
)

### Create a database

In [None]:
# Create the working database; IF NOT EXISTS makes the cell safe to re-run.
spark.sql("CREATE DATABASE IF NOT EXISTS my_db_spark")

In [None]:
# The new database should now appear in the list.
(
    spark
    .sql("SHOW databases")
    .show()
)

In [None]:
# Select the newly created database.
spark.sql("USE my_db_spark")

In [None]:
# List the tables of the selected database.
(
    spark
    .sql("SHOW tables")
    .show()
)

In [None]:
# When the database list is long, filter it down to one exact name
# instead of scanning the full output by eye.
spark.sql("SHOW databases").filter("namespace = 'db_youtube_raw'").show()

In [None]:
# Same idea with a pattern: keep every database whose name starts with `db`.
spark.sql("SHOW databases").filter("namespace LIKE 'db%'").show()

### Create a Table

In [None]:
# Make `my_db_spark` the active database before creating a table in it.
spark.sql("USE my_db_spark")

# Echo the selected database as the cell output to confirm the switch.
spark.catalog.currentDatabase()

In [None]:
# List the tables of the selected database before creating `orders`.
(
    spark
    .sql("SHOW tables")
    .show(truncate=False)
)

In [None]:
# Create the persistent `orders` table in the currently selected database.
# IF NOT EXISTS keeps the cell re-runnable: without it, a second execution
# fails because the table is already registered in the catalog. This also
# matches the CREATE DATABASE IF NOT EXISTS statement used earlier.
spark.sql("""
    CREATE TABLE IF NOT EXISTS orders (
        order_id integer,
        order_date string,
        customer_id integer,
        order_status string
    )
""")

In [None]:
# The `orders` table should now be listed.
(
    spark
    .sql("SHOW tables")
    .show()
)

> Now you can open a new shell (a separate Spark session) and run the following; the persistent `orders` table is still listed:
```python
spark.sql('USE my_db_spark')
spark.sql('SHOW tables').show()
+-----------+---------+-----------+                                             
|  namespace|tableName|isTemporary|
+-----------+---------+-----------+
|my_db_spark|   orders|      false|
+-----------+---------+-----------+
```

### Create a `TempView` using the DataFrame

In [None]:
# First load the CSV into a DataFrame; the temp view is built on top of it.
dataset = "s3://fcc-spark-example/dataset/2023/orders.csv"
df = spark.read.csv(
    path=dataset,
    header=True,        # first line holds the column names
    inferSchema=True,   # let Spark derive the column types from the data
)

# Register the DataFrame under a name that SQL statements can reference.
df.createOrReplaceTempView("orders_view")

In [None]:
# Both the persistent table and the temp view should be listed in this session.
(
    spark
    .sql("SHOW tables")
    .show()
)

> Now you can open a new shell and run this again **(WE WON'T SEE THE TEMP VIEW, because a temp view exists only in the session that created it)**
```python
spark.sql('USE my_db_spark')
spark.sql('SHOW tables').show()
+-----------+---------+-----------+                                             
|  namespace|tableName|isTemporary|
+-----------+---------+-----------+
|my_db_spark|   orders|      false|
+-----------+---------+-----------+
```

### Insert data from the `TempView` into the new `Table` (the persistent table)

In [None]:
# Copy every row of the temp view into the persistent `orders` table.
# Adjacent string literals are joined into a single SQL statement.
spark.sql(
    "INSERT INTO orders "
    "            SELECT * "
    "            FROM orders_view"
)

In [None]:
# Preview the first 10 rows now stored in the persistent table.
(
    spark
    .sql("SELECT * FROM orders")
    .show(n=10)
)

### Describe `Table`

In [None]:
# Show the column names and types of the table.
(
    spark
    .sql("DESCRIBE TABLE orders")
    .show()
)

In [None]:
# Extended description of the persistent table.
# The output identifies it as a managed table.
(
    spark
    .sql("DESCRIBE EXTENDED orders")
    .show(truncate=False)
)

In [None]:
# Same extended description, this time for the temp view.
(
    spark
    .sql("DESCRIBE EXTENDED orders_view")
    .show()
)

### Check the underlying data in `HDFS` (Managed Table)

In [None]:
%%bash

# List the Spark warehouse directory.
# The path has no hdfs://<host>:<port> prefix, so it resolves against the
# cluster's default filesystem rather than one specific NameNode hostname,
# which changes from cluster to cluster.
# NOTE(review): assumes the default filesystem (fs.defaultFS) is the cluster's HDFS.
hadoop fs -ls /user/spark/warehouse

In [None]:
%%bash

# List the data files behind the managed `orders` table.
# The path has no hdfs://<host>:<port> prefix, so it resolves against the
# cluster's default filesystem rather than one specific NameNode hostname.
# NOTE(review): assumes the default filesystem (fs.defaultFS) is the cluster's HDFS.
hadoop fs -ls /user/spark/warehouse/my_db_spark.db/orders

In [None]:
%%bash

# Print the beginning of the table's data file(s).
# Part-file names contain an identifier generated at write time, so a fixed
# file name stops matching as soon as the INSERT is re-run. The glob matches
# whatever part-00000 file currently exists. It is quoted so that Hadoop,
# not the local shell, expands it.
# The path has no hdfs://<host>:<port> prefix, so it resolves against the
# cluster's default filesystem rather than one specific NameNode hostname.
# NOTE(review): assumes the default filesystem (fs.defaultFS) is the cluster's HDFS.
hadoop fs -head '/user/spark/warehouse/my_db_spark.db/orders/part-00000-*'

### Deleting the `table`

In [None]:
# Drop the managed table: this removes BOTH the metadata and the data.
# IF EXISTS keeps the cell re-runnable once the table is already gone.
spark.sql("DROP TABLE IF EXISTS orders")

In [None]:
# The table was dropped above, so describing it is expected to fail.
# Run the statement for real and display the error instead of leaving the
# demonstration commented out. Catching it lets "Run All" continue.
try:
    spark.sql("DESCRIBE TABLE orders").show()
except Exception as exc:  # broad on purpose: the error is the demonstration
    print(f"{type(exc).__name__}: {exc}")

In [None]:
%%bash

# `orders` was a MANAGED table, so DROP removed the data together with the
# metadata and this listing is expected to fail.
# `hadoop fs -ls` exits non-zero for a missing path, which would make the
# %%bash cell raise and stop "Run All". The `||` branch reports the failure
# and lets the cell finish normally.
# The path has no hdfs://<host>:<port> prefix, so it resolves against the
# cluster's default filesystem rather than one specific NameNode hostname.
# NOTE(review): assumes the default filesystem (fs.defaultFS) is the cluster's HDFS.
hadoop fs -ls /user/spark/warehouse/my_db_spark.db/orders \
    || echo "hadoop fs -ls failed (expected: the managed table's directory was deleted by DROP TABLE)"


# Clean Up 

In [None]:
# Remove the database created for this walkthrough.
# IF EXISTS keeps the clean-up re-runnable when the database is already gone.
spark.sql("DROP DATABASE IF EXISTS my_db_spark")

In [None]:
# Inspect what the session reports as its current database right after the DROP.
spark.catalog.currentDatabase()

In [None]:
# Switch the session back to the `default` database.
spark.sql("USE default")

In [None]:
# Confirm the current database after the USE statement above.
spark.catalog.currentDatabase()

In [None]:
# List the databases once more to check the result of the clean-up.
(
    spark
    .sql("SHOW databases")
    .show()
)

In [None]:
%%bash

# Check the database directory after the clean-up. Once the database has been
# dropped this listing is expected to fail.
# `hadoop fs -ls` exits non-zero for a missing path, which would make the
# %%bash cell raise and stop "Run All". The `||` branch reports the failure
# and lets the cell finish normally.
# The path has no hdfs://<host>:<port> prefix, so it resolves against the
# cluster's default filesystem rather than one specific NameNode hostname.
# NOTE(review): assumes the default filesystem (fs.defaultFS) is the cluster's HDFS.
hadoop fs -ls /user/spark/warehouse/my_db_spark.db \
    || echo "hadoop fs -ls failed (expected once the database has been dropped)"