## Create Hive Tables

In [1]:
import findspark
# Must run before the pyspark imports in the following cells; findspark.init()
# is expected to make the local Spark installation importable -- confirm
# against your environment.
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import broadcast

In [4]:
# Build the notebook-wide Spark session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Spark features")
    .getOrCreate()
)

### Remove existing directories if they exist

In [8]:
import os
from shutil import rmtree

# Warehouse directories backing the two tables written later in this notebook
emp_location = 'spark-warehouse/hive_emp'
dept_location = 'spark-warehouse/hive_dept'

# Delete whichever of those directories is left over from an earlier run
for stale_dir in (emp_location, dept_location):
    if os.path.exists(stale_dir):
        rmtree(stale_dir)


In [9]:
# Sample employee rows: (emp_id, name, dept_id, salary)
emp = [
    (1, "Smith", "fi", 1000),
    (2, "Rose", "ma", 2000),
    (3, "Williams", "ma", 1000),
    (4, "Jones", "sa", 2000),
    (5, "Brown", "sa", 1000),
    (6, "Katie", "fi", 2000),
    (7, "Linda", "it", 2000),
    (8, "Michael", "it", 1000),
    (9, "Johnson", "ma", 1000),
    (10, "Tom", "fi", 2000),
]

# Sample department rows: (name, dept_id)
dept = [
    ("Finance", "fi"),
    ("Marketing", "ma"),
    ("Sales", "sa"),
    ("Computer Science", "cs"),
    ("Info Tech", "it"),
]

# Build one DataFrame per dataset from the rows and their column names
emp_columns = ["emp_id", "name", "dept_id", "salary"]
dept_columns = ["name", "dept_id"]
df = spark.createDataFrame(emp, emp_columns)
deptdf = spark.createDataFrame(dept, dept_columns)

# Register both DataFrames as temporary views so they can be queried with SQL
df.createOrReplaceTempView("empdf")
deptdf.createOrReplaceTempView("deptdf")

# Drop tables left over from a previous run before writing new ones
for drop_stmt in ("DROP TABLE IF EXISTS hive_emp", "DROP TABLE IF EXISTS hive_dept"):
    spark.sql(drop_stmt)

# Write both DataFrames as tables, replacing any existing data
df.write.mode("overwrite").saveAsTable("hive_emp")
deptdf.write.mode("overwrite").saveAsTable("hive_dept")

## Broadcast Join
By default, Spark automatically broadcasts tables of up to 10 MB.
The threshold can be raised, up to a maximum of 8 GB.

In [10]:
# Check the size limit below which Spark broadcasts a table automatically
def _parse_byte_size(raw):
    """Convert a Spark byte-size setting to an integer number of bytes.

    Accepts a plain integer string ("10485760", "-1") or one carrying a unit
    suffix in any letter case ("10485760b", "50MB", "1g"). A value without a
    suffix is taken to be in bytes.

    Raises:
        ValueError: if the numeric part or the unit suffix is not recognised.
    """
    multipliers = {
        "": 1, "b": 1,
        "k": 1024, "kb": 1024,
        "m": 1024 ** 2, "mb": 1024 ** 2,
        "g": 1024 ** 3, "gb": 1024 ** 3,
        "t": 1024 ** 4, "tb": 1024 ** 4,
        "p": 1024 ** 5, "pb": 1024 ** 5,
    }
    text = str(raw).strip().lower()
    # Split the trailing letters (the unit) from the leading number
    number = text.rstrip("abcdefghijklmnopqrstuvwxyz")
    unit = text[len(number):]
    if unit not in multipliers:
        raise ValueError(f"Unrecognised size unit in {raw!r}")
    return int(number) * multipliers[unit]


threshold_str = spark.conf.get("spark.sql.autoBroadcastJoinThreshold")
threshold_value = _parse_byte_size(threshold_str)
size = threshold_value / (1024 * 1024)
if threshold_value < 0:
    # A negative threshold (conventionally -1) turns automatic broadcasting off,
    # so a size in MB would be meaningless here.
    print("Automatic broadcast joins are disabled.")
else:
    print(f"Default size of broadcast table is {size} MB.")

Default size of broadcast table is 10.0 MB.


In [11]:
# Raise the automatic broadcast-join threshold to 50 MB (the value is passed in bytes)
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 50 * 1024 * 1024)

In [12]:
# Join employees to departments on dept_id, explicitly marking the department
# DataFrame for broadcast.
join_df = df.join(
    broadcast(deptdf),
    on=df["dept_id"] == deptdf["dept_id"],
)

## Caching
Use the `cache()` / `persist()` functions to keep a DataFrame in memory.

In [13]:
# Mark the DataFrame for caching, then run an action so the cache is materialized
df.cache()
df.count()

# Report which storage tiers the cached DataFrame uses
cache_level = df.storageLevel
print("Memory used: {0}".format(cache_level.useMemory))
print("Disk used: {0}".format(cache_level.useDisk))

# No second cache() call is made in this cell, so the lines below report the
# storage level again under the "after re-cache" label.
print("Memory used after re-cache: {0}".format(df.storageLevel.useMemory))
print("Disk used after re-cache: {0}".format(df.storageLevel.useDisk))

Memory used: True
Disk used: True
Memory used after re-cache: True
Disk used after re-cache: True


When you use the `cache()` function, the storage level is `MEMORY_ONLY` (up to Spark 2.0.2) or `MEMORY_AND_DISK` (Spark 2.1.x and later).
With `persist()`, however, you can specify the storage level explicitly.

In [14]:
from pyspark.storagelevel import StorageLevel


In [17]:
# Persist deptdf with an explicit memory-only storage level, then run an
# action so the data is actually stored.
deptdf.persist(StorageLevel.MEMORY_ONLY)
deptdf.count()

# Report which storage tiers the persisted DataFrame uses
dept_level = deptdf.storageLevel
print("Memory used for deptdf: {0}".format(dept_level.useMemory))
print("Disk used for deptdf: {0}".format(dept_level.useDisk))

Memory used for deptdf: True
Disk used for deptdf: False


## Don't persist
Clear cached data when it is no longer needed!

In [18]:
# Release the cached employee DataFrame; the returned DataFrame is shown as the cell output
df.unpersist()

DataFrame[emp_id: bigint, name: string, dept_id: string, salary: bigint]

In [19]:
# Release the persisted department DataFrame; the returned DataFrame is shown as the cell output
deptdf.unpersist()

DataFrame[name: string, dept_id: string]

AttributeError: 'SparkSession' object has no attribute '_wrapped'

In [23]:
# Clear everything still cached in this Spark session, not just one DataFrame
spark.catalog.clearCache()