In [1]:
pip install delta-spark==2.4.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# Default location for managed databases and tables (passed to spark.sql.warehouse.dir).
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

# Session builder: Hive support plus the Delta Lake SQL extension and catalog.
# The chain is wrapped in parentheses so that no line depends on a trailing
# backslash; the previous version ended with a dangling continuation.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Let the delta helper finish configuring the builder, then create the session
# (or reuse an already running one). Single assignment: the original
# `spark = spark = ...` repeated the target by mistake.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
# List the databases currently registered in the catalog.
show_databases_sql = """
    SHOW DATABASES
    """
spark.sql(show_databases_sql).show()

+---------+
|namespace|
+---------+
|  default|
|     demo|
+---------+



In [4]:
# Remove the silver database if it already exists. CASCADE also drops the
# objects it contains, so this cell is destructive.
drop_silver_sql = """
    DROP DATABASE IF EXISTS silver CASCADE
    """
spark.sql(drop_silver_sql)

DataFrame[]

In [5]:
# Any HDFS location can be used here, but keep the layout organized:
# a data lake grows over time and, without structure, turns into a swamp.
create_silver_sql = """
    CREATE DATABASE silver LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPratico/silver/'
    """
spark.sql(create_silver_sql)

DataFrame[]

In [6]:
# List the databases again to check that silver is now registered.
databases_df = spark.sql(
    """
    SHOW DATABASES
    """
)
databases_df.show()

+---------+
|namespace|
+---------+
|  default|
|     demo|
|   silver|
+---------+



In [7]:
# Statement that removes any previous registration of the table.
# NOTE(review): for an external table this presumably leaves the files under
# LOCATION in place — confirm that stale data there is acceptable.
drop_table_sql = """
    DROP TABLE IF EXISTS silver.StudentsMentalHealth_DeltaTable
    """

# Statement that declares the Delta table with an explicit schema and HDFS location.
# NOTE(review): Do_you_havedepression and Do_you_have_depression look like the
# same survey question declared twice (the first is missing an underscore) —
# confirm against the source data and any writers before changing the schema.
create_table_sql = """
    CREATE EXTERNAL TABLE  silver.StudentsMentalHealth_DeltaTable (
        TimeStamp Timestamp,
        Choose_your_gender String,
        Age INT,
        What_is_your_course String,
        Your_current_year_of_study String,
        What_is_your_GPA String,
        Marital_status String,
        Do_you_havedepression String,
        Do_you_have_anxiety String,
        Do_you_have_depression String,
        Do_you_have_panic_attack String,
        Did_you_seek_any_specialist_for_treatment String

    )
    USING DELTA

    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPratico/silver/StudentsMentalHealth_DeltaTable'
    """

# Drop first, then create, so the cell can be re-run.
spark.sql(drop_table_sql)
spark.sql(create_table_sql)

DataFrame[]

In [8]:
# Preview the table to verify its columns.
# toPandas() brings every returned row to the driver, so the query is capped
# with limit() instead of collecting the whole table.
PREVIEW_ROWS = 20
spark.sql(
 "SELECT * FROM silver.StudentsMentalHealth_DeltaTable "
).limit(PREVIEW_ROWS).toPandas()

Unnamed: 0,TimeStamp,Choose_your_gender,Age,What_is_your_course,Your_current_year_of_study,What_is_your_GPA,Marital_status,Do_you_havedepression,Do_you_have_anxiety,Do_you_have_depression,Do_you_have_panic_attack,Did_you_seek_any_specialist_for_treatment


In [None]:
# Stop the Spark session created earlier in this notebook.
spark.stop()