# Spark Save DataFrame to Hive Table

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
!tar xf spark-3.3.1-bin-hadoop3.tgz
!pip install -q findspark

In [2]:
import os
# Tell Java-dependent tooling where the JDK and the unpacked Spark distribution live.
# Assumes the Spark tarball was extracted under /content — TODO confirm for non-Colab runs.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.3.1-bin-hadoop3",
    }
)

In [3]:
import findspark
# Make pyspark importable from the Spark distribution. Pass the absolute
# SPARK_HOME configured earlier instead of a relative directory name:
# findspark.init() stores its argument back into SPARK_HOME, so a relative
# path would replace the absolute one and only resolve from one working directory.
findspark.init(os.environ["SPARK_HOME"])

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,ArrayType

In [5]:
# Create (or reuse) a session named "SparkTest" on the local[*] master,
# with Hive support enabled so tables can be saved through the catalog.
spark = (
    SparkSession.builder
    .master('local[*]')
    .enableHiveSupport()
    .appName("SparkTest")
    .getOrCreate()
)

In [6]:
# Sample sales records as (product, amount, country) tuples.
# The repeated ("Orange", 2000, "USA") row is part of the original data set.
data = [
    ("Banana", 1000, "USA"),
    ("Carrots", 1500, "USA"),
    ("Beans", 1600, "USA"),
    ("Orange", 2000, "USA"),
    ("Orange", 2000, "USA"),
    ("Banana", 400, "China"),
    ("Carrots", 1200, "China"),
    ("Beans", 1500, "China"),
    ("Orange", 4000, "China"),
    ("Banana", 2000, "Canada"),
    ("Carrots", 2000, "Canada"),
    ("Beans", 2000, "Mexico"),
]

# Explicit schema: every column is declared nullable.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("Product", StringType()),
            ("Amount", IntegerType()),
            ("Country", StringType()),
        )
    ]
)

# Build the DataFrame, then print its schema and its full, untruncated contents.
df = spark.createDataFrame(data, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- Product: string (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Country: string (nullable = true)

+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
|Banana |1000  |USA    |
|Carrots|1500  |USA    |
|Beans  |1600  |USA    |
|Orange |2000  |USA    |
|Orange |2000  |USA    |
|Banana |400   |China  |
|Carrots|1200  |China  |
|Beans  |1500  |China  |
|Orange |4000  |China  |
|Banana |2000  |Canada |
|Carrots|2000  |Canada |
|Beans  |2000  |Mexico |
+-------+------+-------+



In [7]:
# Create the target database; IF NOT EXISTS keeps the cell safe to re-run.
spark.sql("CREATE DATABASE IF NOT EXISTS test_db")

DataFrame[]

In [8]:
# List the databases known to the catalog (at most 10 rows printed).
spark.sql("SHOW DATABASES").show(n=10)

+---------+
|namespace|
+---------+
|  default|
|  test_db|
+---------+



In [9]:
# Save df as table test_db.sales_hive, partitioned by the Product column.
# Mode "overwrite" replaces whatever is already stored under that table name.
df.write.partitionBy("Product").mode("overwrite").saveAsTable("test_db.sales_hive")

In [10]:
# Check that the table is now registered in test_db (up to 100 rows, untruncated).
spark.sql("SHOW TABLES IN test_db").show(n=100, truncate=False)

+---------+----------+-----------+
|namespace|tableName |isTemporary|
+---------+----------+-----------+
|test_db  |sales_hive|false      |
+---------+----------+-----------+



In [11]:
# Print the table's column list and partition information, without truncating values.
spark.sql("DESCRIBE FORMATTED test_db.sales_hive").show(n=100, truncate=False)

+----------------------------+--------------------------------------------------------------+-------+
|col_name                    |data_type                                                     |comment|
+----------------------------+--------------------------------------------------------------+-------+
|Amount                      |int                                                           |null   |
|Country                     |string                                                        |null   |
|Product                     |string                                                        |null   |
|# Partition Information     |                                                              |       |
|# col_name                  |data_type                                                     |comment|
|Product                     |string                                                        |null   |
|                            |                                                    

In [12]:
# Read the saved table back through SQL and preview the first 10 rows.
# Note: "db" in the query is only an alias for the table, not a database name.
spark.sql("SELECT db.* FROM test_db.sales_hive as db").show(n=10)

+------+-------+-------+
|Amount|Country|Product|
+------+-------+-------+
|  2000|    USA| Orange|
|  2000|    USA| Orange|
|  2000| Canada| Banana|
|  1500|  China|  Beans|
|  2000| Mexico|  Beans|
|  1200|  China|Carrots|
|  2000| Canada|Carrots|
|  4000|  China| Orange|
|  1000|    USA| Banana|
|   400|  China| Banana|
+------+-------+-------+
only showing top 10 rows

