In [None]:
pip install delta-spark==2.4.0

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# warehouse_location is the default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

# Build the session configuration: Hive metastore access plus the Delta Lake
# SQL extension and catalog. The chain is wrapped in parentheses instead of
# backslash continuations, so it no longer depends on a blank line following
# the last call to terminate the statement.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Let the delta helper finish configuring the builder, then create the session
# (or reuse one that already exists).
# NOTE(review): configure_spark_with_delta_pip is documented to set
# spark.jars.packages itself, so the explicit setting above may be redundant -
# confirm before removing it.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# List the databases visible to this Spark session and print them as a table.
show_databases_sql = """
    SHOW DATABASES
    """
spark.sql(show_databases_sql).show()

+---------+
|namespace|
+---------+
|  default|
|   silver|
|  silver1|
+---------+



In [3]:
# Drop the `silver` database if it exists; CASCADE also drops the objects
# it contains.
drop_database_sql = """
    DROP DATABASE IF EXISTS silver CASCADE
    """
spark.sql(drop_database_sql)

DataFrame[]

In [4]:
# Any HDFS location works, but keep the layout organised: a data lake grows
# over time and degrades into a swamp without structure.
# The path is derived from warehouse_location so the database stays under the
# configured warehouse root instead of repeating the URI by hand.
spark.sql(
    f"""
    CREATE DATABASE silver LOCATION '{warehouse_location}/silver/'
    """
)

DataFrame[]

In [6]:
# Statement that removes any previous definition of the table.
drop_table_sql = """
    DROP TABLE IF EXISTS silver.MentalHealth_DeltaTable
    """

# Statement that declares the Delta table at an explicit HDFS location.
# Every column except Timestamp and Age is declared as STRING.
create_table_sql = """
    CREATE EXTERNAL TABLE silver.MentalHealth_DeltaTable (
        Timestamp Timestamp,
        Age INT,
        Gender STRING,
        Country STRING,
        state STRING,
        self_employed STRING,
        family_history STRING,
        treatment STRING,
        work_interfere STRING,
        Number_employees STRING,
        remote_worker STRING,
        tech_company STRING,
        benefits STRING,
        care_options STRING,
        Wellness_program STRING,
        Seek_help STRING,
        anonymity STRING,
        leave STRING,
        mental_health_consequence STRING,
        phys_health_consequence STRING,
        coworkers STRING,
        supervisor STRING,
        mental_health_interview STRING,
        phys_health_interview STRING,
        mental_vs_physical STRING,
        observed_consequence STRING,
        comments STRING
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPratico/silver/MentalHealth_DeltaTable'
    """

# Run the drop first, then the create; the create result is the cell output.
spark.sql(drop_table_sql)
spark.sql(create_table_sql)

DataFrame[]

In [7]:
# Preview the table instead of collecting all of it: toPandas() brings every
# returned row into driver memory, so the row count is capped first.
PREVIEW_ROWS = 20
spark.sql(
 "SELECT * FROM silver.MentalHealth_DeltaTable "
).limit(PREVIEW_ROWS).toPandas()



Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,Number_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,observed_consequence,comments


In [8]:
# Stop the Spark session now that the notebook's work is done.
spark.stop()