In [25]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# Root HDFS directory used as the Spark SQL warehouse, i.e. the default
# location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

# Session builder: Hive metastore as the catalog backend plus the Delta Lake
# SQL extension and catalog. The chain is wrapped in parentheses instead of
# backslash continuations so a trailing backslash or stray whitespace cannot
# break the statement.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    # The warehouse setting is `spark.sql.warehouse.dir`; a key such as
    # `spark.sql.TrabalhoPratico.dir` is not a documented Spark option and
    # would leave the warehouse location at its default.
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Let the delta helper finish configuring the builder, then start the session
# (or reuse one that is already running in this kernel).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [14]:
# Remove the `silver` database so that it can be recreated from scratch.
# - IF EXISTS: a first run on a clean metastore has no `silver` database yet,
#   and a bare DROP DATABASE would raise.
# - CASCADE: the default RESTRICT mode refuses to drop a database that still
#   contains tables, which is the case whenever this notebook is re-run after
#   silver.genderStatsData_DeltaTable has been created.
# NOTE(review): CASCADE also drops every table registered in `silver`; the
# table created by this notebook is declared EXTERNAL with an explicit
# LOCATION, so its files are presumably left in place. Confirm before running
# against a database that holds other tables.
spark.sql(
    """
    DROP DATABASE IF EXISTS silver CASCADE
    """
)

DataFrame[]

In [15]:
# Create the `silver` database, storing it under an explicit HDFS directory.
create_silver_db_sql = """
    CREATE DATABASE silver LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPratico/silver/'
    """
spark.sql(create_silver_db_sql)

DataFrame[]

In [26]:
# Print the databases (namespaces) currently visible to this session.
databases_df = spark.sql(
    """
    SHOW DATABASES
    """
)
databases_df.show()

+---------+
|namespace|
+---------+
|  default|
|     demo|
|   silver|
+---------+



In [27]:
# Print the tables registered in the `silver` database.
silver_tables_df = spark.sql(
    """
    SHOW TABLES FROM silver
    """
)
silver_tables_df.show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|   silver|genderstatsdata_d...|      false|
+---------+--------------------+-----------+



In [18]:
# Drop the table definition if it is already registered, so the CREATE TABLE
# statement that follows does not collide with an existing entry.
drop_gender_stats_sql = """
    DROP TABLE IF EXISTS silver.genderStatsData_DeltaTable
    """
spark.sql(drop_gender_stats_sql)


DataFrame[]

In [19]:
# The table has four descriptive columns followed by one VARCHAR(50) column
# per year, named y<year>. The year range is kept in two constants so it can
# be extended without editing dozens of near-identical DDL lines.
FIRST_YEAR = 1960
LAST_YEAR = 2022  # inclusive

# Produces "y1960 VARCHAR(50),\n        y1961 VARCHAR(50), ..." up to LAST_YEAR.
year_columns_ddl = ",\n        ".join(
    f"y{year} VARCHAR(50)" for year in range(FIRST_YEAR, LAST_YEAR + 1)
)

# Register the Delta table in the `silver` database, pointing at an explicit
# HDFS directory (EXTERNAL + LOCATION) rather than the database default path.
spark.sql(
    f"""
    CREATE EXTERNAL TABLE silver.genderStatsData_DeltaTable (
        countryName VARCHAR(50),
        countryCode CHAR(3),
        indicatorName VARCHAR(200),
        indicatorCode VARCHAR(50),
        {year_columns_ddl}
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPratico/silver/genderStatsData_DeltaTable/'
    """
)


DataFrame[]

In [20]:
# Print the tables in `silver` again to confirm the new table is registered.
tables_after_create_df = spark.sql(
    """
    SHOW TABLES FROM silver
    """
)
tables_after_create_df.show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|   silver|genderstatsdata_d...|      false|
+---------+--------------------+-----------+



In [28]:
# Preview the table contents; show() prints only the first rows of the result.
gender_stats_df = spark.sql(
    """
    SELECT *
    FROM silver.genderStatsData_DeltaTable
    """
)
gender_stats_df.show()

+--------------------+-----------+--------------------+-----------------+-----+------------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------+----------------+----------------+----------------+----------------+--------------+----------------+----------------+-----+
|         countryName|countryCode|       indicatorName|    indicatorCode|y2000|             y2001|            y2002|            y2003|           y2004|           y2005|           y2006|           y2007|           y2008|           y2009|           y2010|           y2011|           y2012|           y2013|          y2014|           y2015|           y2016|           y2017|           y2018|         y2019|           y2020|           y2021|y2022|
+--------------------+-----------+--------------------+-----------------+-----+------------------+--------------

In [29]:
# Fetch the table's column and metadata description and render it as a pandas
# DataFrame so the notebook displays it as a rich table.
table_description_df = spark.sql(
    """
    DESCRIBE FORMATTED silver.genderStatsData_DeltaTable
    """
)
table_description_df.toPandas()

Unnamed: 0,col_name,data_type,comment
0,countryName,string,
1,countryCode,string,
2,indicatorName,string,
3,indicatorCode,string,
4,y2000,string,
5,y2001,string,
6,y2002,string,
7,y2003,string,
8,y2004,string,
9,y2005,string,


In [23]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()