# NOTEBOOK 3.7 PySpark and Hive

## 1. Create SparkSession with Hive Enabled

In [None]:
!pip install numpy pyspark

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Build (or reuse) a SparkSession with Hive support switched on.
# The chain is wrapped in parentheses rather than relying on backslash
# line continuations.
spark = (
    SparkSession.builder
    .appName("SparkHiveDemo")
    # Warehouse directory on HDFS used for managed databases and tables.
    .config('spark.sql.warehouse.dir', 'hdfs:/user/hive/warehouse/')
    .config("spark.sql.catalogImplementation", "hive")
    .enableHiveSupport()
    .getOrCreate()
)

### 1.1 Check Spark Version

In [None]:
# Display the Spark version this session is running on.
spark.version

'3.5.1'

### 1.2 Get SparkContext

In [None]:
# Keep a handle on the SparkContext behind the session; the bare name on the
# last line makes the notebook display it.
sc = spark.sparkContext
sc

### 1.3 Get Spark Configuration

In [None]:
import pprint

# Fetch the configuration object from the SparkContext and pretty-print all of
# its settings. `conf` is looked up again later in the notebook to read the
# warehouse directory.
# NOTE(review): the printed settings may include environment-specific values;
# check the output before sharing the notebook.
conf = sc.getConf()
configurations = conf.getAll()
pprint.pprint(configurations)

## 2. Access Existing Databases in Hive Warehouse

### 2.1 List existing databases

In [None]:
# List the databases visible to this session.
spark.sql("SHOW DATABASES").show()

24/03/31 11:40:38 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.


+---------+
|namespace|
+---------+
|  default|
|     hrdb|
|  salesdb|
+---------+



### 2.2 List existing tables

(a) Show existing tables in the **default** database

In [None]:
# List the tables in the default database. The database is named explicitly so
# the cell gives the same answer even when another database is currently
# selected (e.g. after re-running the cell later in the session).
# `.show()` only prints and returns None, so bind the DataFrame first and
# display it in a separate step; otherwise `tables` would be None.
tables = spark.sql("SHOW TABLES IN default")
tables.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



(b) Show existing tables in the **salesdb** database

In [None]:
# Make salesdb the session's current database; following cells rely on this
# when they reference unqualified table names such as `sales`.
spark.sql('USE salesdb')
# `.show()` only prints and returns None, so bind the DataFrame first and
# display it in a separate step; otherwise `tables` would be None.
tables = spark.sql("SHOW TABLES")
tables.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  salesdb|  invites|      false|
|  salesdb|    pokes|      false|
|  salesdb|    sales|      false|
+---------+---------+-----------+



### 2.3 Run queries on tables and temporary views

In [None]:
# Query the sales table. The unqualified name resolves against the current
# database, which the previous cell set to salesdb.
spark.sql("SELECT * FROM sales").show()

24/03/31 11:40:42 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


+----+------------+----------+--------+
|  id| description|unit_price|quantity|
+----+------------+----------+--------+
|1005|         pen|       2.5|       4|
|1007|      pencil|       1.0|      10|
|1001|    notebook|       5.0|       2|
|1003|       ruler|       1.0|       1|
|1002|  calculator|      55.0|       1|
|2005|     A4paper|       7.8|       2|
|2007|      eraser|       2.0|       4|
|2001|watercolours|      12.5|       1|
|2003|  paintbrush|       3.0|       4|
+----+------------+----------+--------+



In [None]:
# Build a small employee DataFrame from in-memory rows.
# Only column names are supplied, so Spark infers the column types.
columns = ["id", "name", "age", "gender"]

data = [
    (1, "James", 30, "M"),
    (2, "Ann", 40, "F"),
    (3, "Jeff", 41, "M"),
    (4, "Jennifer", 20, "F"),
]

employeeDF = spark.createDataFrame(data, schema=columns)
employeeDF

DataFrame[id: bigint, name: string, age: bigint, gender: string]

In [None]:
# Print the inferred schema as a tree (column name, type, nullability).
employeeDF.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)



In [None]:
# Print the DataFrame's rows as a text table.
employeeDF.show()

+---+--------+---+------+
| id|    name|age|gender|
+---+--------+---+------+
|  1|   James| 30|     M|
|  2|     Ann| 40|     F|
|  3|    Jeff| 41|     M|
|  4|Jennifer| 20|     F|
+---+--------+---+------+



In [None]:
# Register the DataFrame as the temporary view emp_view (replacing any existing
# temporary view of that name) so it can be queried with SQL, then read it back
# to confirm.
employeeDF.createOrReplaceTempView("emp_view")
spark.sql("SELECT * FROM emp_view").show()

+---+--------+---+------+
| id|    name|age|gender|
+---+--------+---+------+
|  1|   James| 30|     M|
|  2|     Ann| 40|     F|
|  3|    Jeff| 41|     M|
|  4|Jennifer| 20|     F|
+---+--------+---+------+



## 3. Creating Databases and Tables

### 3.1 Creating a Database
Create a new database named **hrdb**

In [None]:
# Start from a clean slate so the notebook can be re-run: drop hrdb if it
# already exists, recreate it empty, and list the databases to confirm.
# WARNING: CASCADE also drops every table inside hrdb, so any existing
# hrdb data is deleted by this cell.
spark.sql("DROP DATABASE IF EXISTS hrdb CASCADE")
spark.sql("CREATE DATABASE IF NOT EXISTS hrdb")
spark.sql("SHOW DATABASES").show()

+---------+
|namespace|
+---------+
|  default|
|     hrdb|
|  salesdb|
+---------+



### 3.2 Creating a Table
Create a table named **emp_table** in the **hrdb** database

In [None]:
# Check the current database. Creating hrdb did not switch to it, so the
# session is still on salesdb from the earlier USE statement.
spark.catalog.currentDatabase()

'salesdb'

In [None]:
# Switch to the hrdb database and confirm that it is now the current database
spark.sql("USE hrdb")
spark.catalog.currentDatabase()

'hrdb'

In [None]:
# Check the tables visible from the current database (hrdb). hrdb has no tables
# yet; the temporary view emp_view is still listed, with an empty namespace and
# isTemporary = true.
spark.sql("SHOW TABLES").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         | emp_view|       true|
+---------+---------+-----------+



In [None]:

# Create emp_table in hrdb and list the tables to confirm.
# IF NOT EXISTS keeps the cell re-runnable: without it, executing the cell a
# second time fails because hrdb.emp_table already exists.
spark.sql("CREATE TABLE IF NOT EXISTS hrdb.emp_table (id INT, name STRING, age INT, gender STRING)")
spark.sql("SHOW TABLES").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|     hrdb|emp_table|      false|
|         | emp_view|       true|
+---------+---------+-----------+



24/03/31 11:40:45 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.


## 4. Inserting Data into Tables
Insert data from **emp_view** into **emp_table**

In [None]:
# Copy the rows of the temporary view into the Hive table.
# The columns are listed explicitly because INSERT ... SELECT matches columns
# by position; `SELECT *` would silently mis-map data if the view's column
# order ever differed from the table's.
# Note: INSERT INTO appends, so re-running this cell adds the rows again.
spark.sql("INSERT INTO TABLE hrdb.emp_table SELECT id, name, age, gender FROM emp_view")

DataFrame[]

In [None]:
# View data from emp_table to confirm the insert; the table name is fully
# qualified, so this works regardless of the current database.
spark.sql("SELECT * FROM hrdb.emp_table").show()

+---+--------+---+------+
| id|    name|age|gender|
+---+--------+---+------+
|  1|   James| 30|     M|
|  2|     Ann| 40|     F|
|  3|    Jeff| 41|     M|
|  4|Jennifer| 20|     F|
+---+--------+---+------+



## 5. Read a Hive Table Using `spark.read.table()`

In [None]:
# Switch back to salesdb and list its tables. The temporary view emp_view is
# listed as well because temporary views are not tied to a database.
spark.sql("USE salesdb")
spark.sql("SHOW TABLES").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  salesdb|  invites|      false|
|  salesdb|    pokes|      false|
|  salesdb|    sales|      false|
|         | emp_view|       true|
+---------+---------+-----------+



In [None]:
# Load the sales table as a DataFrame without writing SQL. The unqualified
# name resolves against the current database (salesdb, set in the previous cell).
df = spark.read.table("sales")
df.show()

+----+------------+----------+--------+
|  id| description|unit_price|quantity|
+----+------------+----------+--------+
|1005|         pen|       2.5|       4|
|1007|      pencil|       1.0|      10|
|1001|    notebook|       5.0|       2|
|1003|       ruler|       1.0|       1|
|1002|  calculator|      55.0|       1|
|2005|     A4paper|       7.8|       2|
|2007|      eraser|       2.0|       4|
|2001|watercolours|      12.5|       1|
|2003|  paintbrush|       3.0|       4|
+----+------------+----------+--------+



In [None]:
# Look up the warehouse directory in `conf`, the configuration object fetched
# from the SparkContext earlier in the notebook.
conf.get('spark.sql.warehouse.dir')

'hdfs:/user/hive/warehouse/'

## 6. Dropping a Database

In [None]:
# Drop a database (CASCADE also drops the tables it contains) and list the
# remaining databases.
# NOTE(review): de_company is not created anywhere in this notebook, so with
# IF EXISTS this statement is a no-op and the listing is unchanged (hrdb is
# still present). Confirm whether hrdb was the intended target.
spark.sql("DROP DATABASE IF EXISTS de_company CASCADE")
spark.sql("SHOW DATABASES").show()

+---------+
|namespace|
+---------+
|  default|
|     hrdb|
|  salesdb|
+---------+



In [None]:
# Shut down the Spark session now that the notebook is finished; cells that use
# `spark` will not work after this until the session is created again.
spark.stop()