In [None]:
pip install delta-spark==2.4.0

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# warehouse_location is passed to Spark as `spark.sql.warehouse.dir`, i.e. the
# default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

# Session builder: Hive metastore access plus the Delta Lake SQL extension and
# catalog. The chain is wrapped in parentheses instead of backslash
# continuations so that no trailing `\` is left dangling after the last call.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# `configure_spark_with_delta_pip` is expected to come from the `delta` package
# (wildcard-imported above); it receives the builder before the session is
# created or reused with getOrCreate().
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Create the `silver` database at its HDFS location.
# IF NOT EXISTS keeps this cell re-runnable: without it the statement raises
# once the database has been created, which breaks Restart & Run All.
spark.sql(
    """
    CREATE DATABASE IF NOT EXISTS silver LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPratico/silver/'
    """
)

In [2]:
# List the databases visible to this Spark session and print them as a table.
databases_df = spark.sql(
    """
    SHOW DATABASES
    """
)
databases_df.show()

+---------+
|namespace|
+---------+
|  default|
|   silver|
|  silver1|
+---------+



In [3]:
# (Re)create the external Delta table that holds the Gender Inequality Index.
# The table name is defined once so the DROP and CREATE statements cannot
# drift apart.
gii_table = "silver.GenderInequalityIndex_DeltaTable"
gii_location = "hdfs://hdfs-nn:9000/TrabalhoPratico/silver/GenderInequalityIndex_DeltaTable/"

# One `Index_<year> double` column per year, 1990 through 2021 inclusive,
# generated instead of hand-typed to avoid copy/paste mistakes.
gii_index_columns = ",\n        ".join(
    f"Index_{year} double" for year in range(1990, 2022)
)

# Drop any previous definition so the CREATE below always starts clean.
spark.sql(
    f"""
    DROP TABLE IF EXISTS {gii_table}
    """
)

# Descriptive columns and ranks first, followed by the yearly index columns.
# Kept as the cell's last expression so its result is still displayed.
spark.sql(
    f"""
    CREATE EXTERNAL TABLE {gii_table} (
        ISO3 VARCHAR(3),
        Pais VARCHAR(20),
        Continente VARCHAR(20),
        Hemisferio VARCHAR(20),
        Desenvolvimento_Humano VARCHAR(15),
        UNDP_regioes VARCHAR(3),
        HDI_rank int,
        GII_rank int,
        {gii_index_columns}
    )
    USING DELTA
    LOCATION '{gii_location}'
    """
)

DataFrame[]

In [4]:
# Query every column of the Gender Inequality Index table and print the rows
# returned by show().
gii_rows_df = spark.sql(
    """
    SELECT *
    FROM silver.GenderInequalityIndex_DeltaTable
    """
)
gii_rows_df.show()

+----+----+----------+----------+----------------------+------------+--------+--------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
|ISO3|Pais|Continente|Hemisferio|Desenvolvimento_Humano|UNDP_regioes|HDI_rank|GII_rank|Index_1990|Index_1991|Index_1992|Index_1993|Index_1994|Index_1995|Index_1996|Index_1997|Index_1998|Index_1999|Index_2000|Index_2001|Index_2002|Index_2003|Index_2004|Index_2005|Index_2006|Index_2007|Index_2008|Index_2009|Index_2010|Index_2011|Index_2012|Index_2013|Index_2014|Index_2015|Index_2016|Index_2017|Index_2018|Index_2019|Index_2020|Index_2021|
+----+----+----------+----------+----------------------+------------+--------+--------+----------+----------+----------+

In [5]:
# Fetch the table description (column names, data types, comments) and
# convert it to a pandas DataFrame so the notebook renders it as a rich table.
gii_description_df = spark.sql(
    """
    DESCRIBE FORMATTED silver.GenderInequalityIndex_DeltaTable
    """
)
gii_description_df.toPandas()

Unnamed: 0,col_name,data_type,comment
0,ISO3,string,
1,Pais,string,
2,Continente,string,
3,Hemisferio,string,
4,Desenvolvimento_Humano,string,
5,UNDP_regioes,string,
6,HDI_rank,int,
7,GII_rank,int,
8,Index_1990,double,
9,Index_1991,double,


In [6]:
# Stop the Spark session that was created in the setup cell.
spark.stop()