In [1]:
# Install dependencies
# pip is invoked through sys.executable (the interpreter running this kernel)
# so that the package is installed into the kernel's own environment.

import sys
!{sys.executable} -m pip install hdfs



In [2]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPL/gold'

# The builder chain is wrapped in parentheses rather than using backslash
# continuations: the original ended with a dangling "\" that was only valid
# because a blank line happened to follow it.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Register the Delta Lake SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip (below) appears to set
    # spark.jars.packages itself from the installed delta-spark version, which
    # would override this pin - confirm which value is actually in effect.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Create (or reuse) the session; the name is bound once instead of `spark = spark = ...`.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Recreate the gold database from scratch.
# The first statement drops the database (CASCADE also drops its tables);
# the second creates it again at an explicit HDFS location.
drop_gold_db_sql = """
    DROP DATABASE IF EXISTS Projeto_gold CASCADE
    """
create_gold_db_sql = """
    create database Projeto_gold location 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db'
    """

spark.sql(drop_gold_db_sql)
spark.sql(create_gold_db_sql)


DataFrame[]

In [2]:
from pyspark.sql.functions import substring, avg, sum

# Location of the air quality table in the silver layer.
hdfs_path = "hdfs://hdfs-nn:9000/TrabalhoPL/silver/Projeto.db/Tabela_Ar"

# No format is specified, so the reader uses the session's default data source.
# NOTE(review): if the silver table is stored as Delta, confirm that reading it
# through the default source is what is intended here.
air_quality = spark.read.load(hdfs_path)

# Preview the rows, then print the column names and types.
air_quality.show()
air_quality.printSchema()

+---------+------------+--------------------+--------------+------------+-------------+-----------+--------------------+-----------+----------+----------+----------+-------------+----+
|Unique_ID|Indicator_ID|                Name|       Measure|Measure_Info|Geo_Type_Name|Geo_Join_ID|      Geo_Place_Name|Time_Period|Start_Date|Data_Value|  End_Date|   Localidade| Ano|
+---------+------------+--------------------+--------------+------------+-------------+-----------+--------------------+-----------+----------+----------+----------+-------------+----+
|   179718|         642|Boiler Emissions-...|Number per km2|      number|        UHF42|        504|South Beach - Tot...|       2015|2015-01-01|       2.0|2015-12-31|Staten Island|2015|
|   179719|         642|Boiler Emissions-...|Number per km2|      number|        UHF42|        503|         Willowbrook|       2015|2015-01-01|       2.1|2015-12-31|Staten Island|2015|
|   179720|         642|Boiler Emissions-...|Number per km2|      number|  

In [3]:
from pyspark.sql.functions import avg
# Mean of Data_Value per (Localidade, Name) pair; the result column is named "Media".
mean_data_value = avg(air_quality.Data_Value).alias("Media")
gold_air_quality = air_quality.groupBy("Localidade", "Name").agg(mean_data_value)

# Collect to pandas so the notebook renders the aggregate as a table.
gold_air_quality.toPandas()


Unnamed: 0,Localidade,Name,Media
0,Manhattan,PM2.5-Attributable Cardiovascular Hospitalizat...,16.242764
1,Queens,Ozone (O3),32.009091
2,Bronx,PM2.5-Attributable Cardiovascular Hospitalizat...,22.291722
3,Queens,PM2.5-Attributable Cardiovascular Hospitalizat...,16.150512
4,Bronx,Nitrogen Dioxide (NO2),20.535799
...,...,...,...
109,New York City,Traffic Density- Annual Vehicle Miles Traveled...,22.150001
110,Bronx,Traffic Density- Annual Vehicle Miles Traveled,26.843590
111,Manhattan,Traffic Density- Annual Vehicle Miles Traveled,50.368889
112,New York City,Traffic Density- Annual Vehicle Miles Traveled...,1.250000


In [5]:
# (Re)create the air quality table definition in the gold database.
# The table is declared as a Delta table over a fixed HDFS location.
drop_gold_table_sql = """
    DROP TABLE IF EXISTS Projeto_gold.Tabela_Ar
    """
create_gold_table_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.Tabela_Ar (
        Localidade VARCHAR(250),
        Name VARCHAR(500), 
        Media Double
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Tabela_Ar/'
    """

spark.sql(drop_gold_table_sql)
spark.sql(create_gold_table_sql)
        

DataFrame[]

In [6]:
# Overwrite the Delta data at the gold table's location with the aggregated frame.
gold_table_path = "hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Tabela_Ar/"
gold_writer = gold_air_quality.write.format("delta").mode("overwrite")
gold_writer.save(gold_table_path)

In [7]:
# Read the table back by name and render it as a pandas frame to verify the write.
gold_table = spark.table("Projeto_gold.Tabela_Ar")
gold_table.toPandas()


Unnamed: 0,Localidade,Name,Media
0,Manhattan,PM2.5-Attributable Cardiovascular Hospitalizat...,16.242764
1,Queens,Ozone (O3),32.009091
2,Bronx,PM2.5-Attributable Cardiovascular Hospitalizat...,22.291722
3,Queens,PM2.5-Attributable Cardiovascular Hospitalizat...,16.150512
4,Bronx,Nitrogen Dioxide (NO2),20.535799
...,...,...,...
109,New York City,Traffic Density- Annual Vehicle Miles Traveled...,22.150001
110,Bronx,Traffic Density- Annual Vehicle Miles Traveled,26.843590
111,Manhattan,Traffic Density- Annual Vehicle Miles Traveled,50.368889
112,New York City,Traffic Density- Annual Vehicle Miles Traveled...,1.250000


In [8]:
# Generate a symlink-format manifest for the Delta table stored at this path.
generate_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Tabela_Ar/`
"""
spark.sql(generate_manifest_sql).show()

++
||
++
++



In [9]:
# (Re)create an external table whose location is the table's
# _symlink_format_manifest directory, read through SymlinkTextInputFormat
# with the Parquet SerDe.
drop_presto_table_sql = """
DROP TABLE IF EXISTS Projeto_gold.Tabela_Ar_Presto 
"""
create_presto_table_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.Tabela_Ar_Presto (
        Localidade VARCHAR(250),
        Name VARCHAR(500), 
        Media Double
        )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Tabela_Ar/_symlink_format_manifest/'
"""

spark.sql(drop_presto_table_sql).show()
spark.sql(create_presto_table_sql).show()

++
||
++
++

++
||
++
++

