# Spark SQL

In [1]:
spark.sparkContext.setLogLevel("ERROR")

### Check all the `databases` present

In [8]:
spark.sql('SHOW databases').show() # This is coming from AWS Glue Catalog

+--------------------+
|           namespace|
+--------------------+
|db_youtube_analytics|
| db_youtube_cleansed|
|      db_youtube_raw|
|             default|
|        dev_feedback|
+--------------------+



### Check all the `TABLES` present

In [9]:
spark.sql('SHOW tables').show() # In the default DB

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [10]:
spark.sql('USE dev_feedback')   # Select a differet database

DataFrame[]

In [11]:
spark.sql('SHOW tables').show()  # Show all the tables within the database 

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



### Create a database

In [12]:
spark.sql('CREATE DATABASE IF NOT EXISTS my_db_spark')

23/05/12 20:07:31 INFO FileUtils: Creating directory if it doesn't exist: hdfs://ip-172-31-2-35.us-east-2.compute.internal:8020/user/spark/warehouse/my_db_spark.db


DataFrame[]

In [13]:
spark.sql('USE my_db_spark') 

DataFrame[]

In [14]:
spark.sql('SHOW tables').show()  # Show all the tables within the database 

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [15]:
spark.sql('SHOW databases').show() # This is coming from AWS Glue Catalog

+--------------------+
|           namespace|
+--------------------+
|db_youtube_analytics|
| db_youtube_cleansed|
|      db_youtube_raw|
|             default|
|        dev_feedback|
|         my_db_spark|
+--------------------+



### Create a DataFrame

In [16]:
spark.catalog.currentDatabase()    # Check the present database (which is selected)

'my_db_spark'

In [17]:
spark.sql('CREATE TABLE my_db_spark.orders \
               (order_id integer, \
                order_date string, \
                customer_id integer, \
                order_status string)')

23/05/12 20:08:00 INFO SQLStdHiveAccessController: Created SQLStdHiveAccessController for session context : HiveAuthzSessionContext [sessionString=beb503e2-c9f4-4a44-947a-9a8027057309, clientType=HIVECLI]
23/05/12 20:08:00 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
23/05/12 20:08:00 INFO AWSCatalogMetastoreClient: Mestastore configuration hive.metastore.filter.hook changed from org.apache.hadoop.hive.metastore.DefaultMetaStoreFilterHookImpl to org.apache.hadoop.hive.ql.security.authorization.plugin.AuthorizationMetaStoreFilterHook
23/05/12 20:08:00 INFO AWSGlueClientFactory: Using region from ec2 metadata : us-east-2
23/05/12 20:08:01 INFO AWSGlueClientFactory: Using region from ec2 metadata : us-east-2
23/05/12 20:08:01 INFO FileUtils: Creating directory if it doesn't exist: hdfs://ip-172-31-2-35.us-east-2.compute.internal:8020/user/spark/warehouse/my_db_spark.db/orders


DataFrame[]

In [18]:
spark.catalog.currentDatabase()    # Check the present database (which is selected)

'my_db_spark'

In [19]:
spark.catalog.listTables()

                                                                                

[Table(name='orders', database='my_db_spark', description=None, tableType='MANAGED', isTemporary=False)]

In [20]:
spark.sql('SHOW tables').show()

+-----------+---------+-----------+
|  namespace|tableName|isTemporary|
+-----------+---------+-----------+
|my_db_spark|   orders|      false|
+-----------+---------+-----------+



In [21]:
data_set = 's3://fcc-spark-example/dataset/2023/orders.csv'
df = spark.read.csv('s3://fcc-spark-example/dataset/2023/orders.csv', header=True, inferSchema=True)

                                                                                

In [22]:
df.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

### Create a `TempView` using the DataFrame

In [23]:
df.createOrReplaceTempView('my_db_spark.orders_view')

In [24]:
spark.sql('SHOW tables').show()

+-----------+-----------+-----------+
|  namespace|  tableName|isTemporary|
+-----------+-----------+-----------+
|my_db_spark|     orders|      false|
|           |orders_view|       true|
+-----------+-----------+-----------+



### Insert data from the `TempView` into the new `Table`

In [25]:
spark.sql("INSERT INTO orders \
            SELECT * \
            FROM orders_view")

23/05/12 20:09:22 INFO log: Updating table stats fast for orders                
23/05/12 20:09:22 INFO log: Updated size of table orders to 2862089


DataFrame[]

In [26]:
spark.sql("SELECT * FROM orders").show()

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
|       6|2013-07-25 00:00:00|       7130|       COMPLETE|
|       7|2013-07-25 00:00:00|       4530|       COMPLETE|
|       8|2013-07-25 00:00:00|       2911|     PROCESSING|
|       9|2013-07-25 00:00:00|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|       1837|         CLOSED|
|      13|2013-07-25 00:00:00|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|       9842|     PROCESSIN

### Describe `Table`

In [27]:
spark.sql("DESCRIBE TABLE orders").show()

+------------+---------+-------+
|    col_name|data_type|comment|
+------------+---------+-------+
|    order_id|      int|   null|
|  order_date|   string|   null|
| customer_id|      int|   null|
|order_status|   string|   null|
+------------+---------+-------+



In [28]:
spark.sql("DESCRIBE EXTENDED orders").show() # Its a managed table

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|            order_id|                 int|   null|
|          order_date|              string|   null|
|         customer_id|                 int|   null|
|        order_status|              string|   null|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|            Database|         my_db_spark|       |
|               Table|              orders|       |
|               Owner|              hadoop|       |
|        Created Time|Fri May 12 20:08:...|       |
|         Last Access|             UNKNOWN|       |
|          Created By|  Spark 3.3.0-amzn-1|       |
|                Type|             MANAGED|       |
|            Provider|                hive|       |
|          Statistics|       2862089 bytes|       |
|            Location|hdfs://ip-172-31-...|       |
|       Serd

In [29]:
spark.sql("DESCRIBE EXTENDED orders").show(truncate=False) # Its a managed table

+----------------------------+------------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                       |comment|
+----------------------------+------------------------------------------------------------------------------------------------+-------+
|order_id                    |int                                                                                             |null   |
|order_date                  |string                                                                                          |null   |
|customer_id                 |int                                                                                             |null   |
|order_status                |string                                                                                          |null   |
|                            |                  

### Check the underline data in `HDFS` (Managed Table)

In [30]:
%%bash

hadoop fs -ls hdfs://ip-172-31-2-35.us-east-2.compute.internal:8020/user/spark/warehouse

Found 1 items
drwxrwxrwt   - hadoop spark          0 2023-05-12 20:08 hdfs://ip-172-31-2-35.us-east-2.compute.internal:8020/user/spark/warehouse/my_db_spark.db


In [31]:
%%bash

hadoop fs -ls hdfs://ip-172-31-2-35.us-east-2.compute.internal:8020/user/spark/warehouse/my_db_spark.db/orders

Found 1 items
-rwxrwxrwt   1 hadoop spark    2862089 2023-05-12 20:09 hdfs://ip-172-31-2-35.us-east-2.compute.internal:8020/user/spark/warehouse/my_db_spark.db/orders/part-00000-842663d4-ead0-4f34-81f7-aefb2747b906-c000


In [32]:
%%bash 

hadoop fs -head hdfs://ip-172-31-2-35.us-east-2.compute.internal:8020/user/spark/warehouse/my_db_spark.db/orders/part-00000-842663d4-ead0-4f34-81f7-aefb2747b906-c000

12013-07-25 00:00:0011599CLOSED
22013-07-25 00:00:00256PENDING_PAYMENT
32013-07-25 00:00:0012111COMPLETE
42013-07-25 00:00:008827CLOSED
52013-07-25 00:00:0011318COMPLETE
62013-07-25 00:00:007130COMPLETE
72013-07-25 00:00:004530COMPLETE
82013-07-25 00:00:002911PROCESSING
92013-07-25 00:00:005657PENDING_PAYMENT
102013-07-25 00:00:005648PENDING_PAYMENT
112013-07-25 00:00:00918PAYMENT_REVIEW
122013-07-25 00:00:001837CLOSED
132013-07-25 00:00:009149PENDING_PAYMENT
142013-07-25 00:00:009842PROCESSING
152013-07-25 00:00:002568COMPLETE
162013-07-25 00:00:007276PENDING_PAYMENT
172013-07-25 00:00:002667COMPLETE
182013-07-25 00:00:001205CLOSED
192013-07-25 00:00:009488PENDING_PAYMENT
202013-07-25 00:00:009198PROCESSING
212013-07-25 00:00:002711PENDING
222013-07-25 00:00:00333COMPLETE
232013-07-25 00:00:004367PENDING_PAYMENT
242013-07-25 00:00:0011441CLOSED
252013-07-25 00:00:009503CLOSED
262013-07-25 00:00:007562COMPLET

### Deleting the `table`

In [33]:
spark.sql("DROP TABLE orders")

23/05/12 20:10:38 INFO GlueMetastoreClientDelegate: Initiating drop table partitions


DataFrame[]

In [35]:
# spark.sql("DESCRIBE TABLE orders").show() # It will throw an ERROR 

In [36]:
## It should give an error (Here we used MANAGED Table, and hence the data and metadata both got deleted when we ran DROP)

# %%bash

# hadoop fs -ls hdfs://ip-172-31-2-35.us-east-2.compute.internal:8020/user/spark/warehouse/my_db_spark.db/orders

In [38]:
spark.sql("DROP DATABASE my_db_spark")

DataFrame[]

In [39]:
spark.catalog.currentDatabase()

'my_db_spark'

In [43]:
spark.sql('USE default')

DataFrame[]

In [47]:
spark.catalog.currentDatabase()

'default'