In [None]:
# Install dependencies
# Installs the `hdfs` package using the pip module of the interpreter that runs
# this kernel (sys.executable), so the package lands in the kernel's environment.
# NOTE(review): no cell visible in this notebook imports `hdfs` — confirm it is still needed.

import sys
!{sys.executable} -m pip install hdfs

In [1]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [26]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPL/gold'

# Describe the session: the Hive metastore holds table metadata, and the Delta
# Lake SQL extension and catalog enable Delta tables. The chain is wrapped in
# parentheses so no line ends in a dangling "\" continuation.
# NOTE(review): configure_spark_with_delta_pip presumably sets
# "spark.jars.packages" from the installed delta-spark version, which would
# supersede the value given here — confirm against the installed release.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Build the session (or reuse the one already running in this kernel).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# (Re)create the gold database: drop it if it already exists (CASCADE), then
# create it again at its HDFS location.
drop_gold_db_sql = """
    DROP DATABASE IF EXISTS Projeto_gold CASCADE
    """
create_gold_db_sql = """
    create database Projeto_gold location 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db'
    """

spark.sql(drop_gold_db_sql)
spark.sql(create_gold_db_sql)


DataFrame[]

In [27]:
from pyspark.sql.functions import substring, avg, sum

# Load the recycling table from the silver layer.
# No format is passed to the reader, so Spark's default data source is used.
# NOTE(review): if the silver table is stored as Delta, confirm the default
# source reads it as intended.
hdfs_path = "hdfs://hdfs-nn:9000/TrabalhoPL/silver/Projeto.db/Tabela_Reciclagem"
recycling = spark.read.load(hdfs_path)

# Preview the first rows, then print the column names and types.
recycling.show()
recycling.printSchema()

+----------+--------+-------------------+----------+--------------------+------------------+----------------+------------------+-----------+
|      Zone|District|Fiscal_Month_Number|Month_Name|Diversion_Rate_Total|Capture_Rate_Paper|Capture_Rate_MGP|Capture_Rate_Total|Fiscal_Year|
+----------+--------+-------------------+----------+--------------------+------------------+----------------+------------------+-----------+
|Brooklyn  |   BKN01|                 10|     April|                13.7|              41.0|            44.3|              42.3|       2016|
|Brooklyn  |   BKN02|                 10|     April|                17.6|              33.2|            55.4|              39.8|       2016|
|Brooklyn  |   BKN03|                 10|     April|                10.9|              30.5|            39.3|              34.1|       2016|
|Brooklyn  |   BKN04|                 10|     April|                13.4|              30.3|            59.3|              42.1|       2016|
|Brooklyn  | 

In [28]:
from pyspark.sql.functions import count
# Average the total capture rate and the total diversion rate for every
# (Fiscal_Year, Month_Name) pair.
gold_recycling = (
    recycling
    .groupBy("Fiscal_Year", "Month_Name")
    .agg(
        avg(recycling["Capture_Rate_Total"]).alias("Media_Reciclado_Comum"),
        avg(recycling["Diversion_Rate_Total"]).alias("Media_Reciclagem"),
    )
)

# Display the aggregated result as a pandas frame.
gold_recycling.toPandas()


Unnamed: 0,Fiscal_Year,Month_Name,Media_Reciclado_Comum,Media_Reciclagem
0,2016,June,46.979661,16.615254
1,2016,May,46.811864,16.666102
2,2016,March,44.001695,17.037288
3,2016,December,46.661017,18.083051
4,2016,October,44.935593,16.355932
5,2016,July,42.022034,15.571186
6,2016,February,49.891525,19.115254
7,2016,September,45.427119,16.325424
8,2016,April,47.349153,16.771186
9,2016,January,41.327119,16.728814


In [29]:
# Register the recycling table in the gold database: drop any previous
# definition, then declare it as an external Delta table at its HDFS location.
drop_table_sql = """
    DROP TABLE IF EXISTS Projeto_gold.Tabela_Reciclagem_2
    """
create_table_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.Tabela_Reciclagem_2 (
        Fiscal_Year INT,
        Month_Name VARCHAR(50),
        Media_Reciclado_Comum DOUBLE,
        Media_Reciclagem DOUBLE
        
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Tabela_Reciclagem_2/'
    """

spark.sql(drop_table_sql)
spark.sql(create_table_sql)
       
 

DataFrame[]

In [30]:
# Write the aggregated frame in Delta format to the table's storage location,
# replacing whatever is currently stored there.
delta_writer = gold_recycling.write.format("delta").mode("overwrite")
delta_writer.save("hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Tabela_Reciclagem_2/")

In [31]:
# Read the table back by its catalog name and display it to verify the write.
gold_table = spark.table("Projeto_gold.Tabela_Reciclagem_2")
gold_table.toPandas()


Unnamed: 0,Fiscal_Year,Month_Name,Media_Reciclado_Comum,Media_Reciclagem
0,2016,June,46.979661,16.615254
1,2016,May,46.811864,16.666102
2,2016,March,44.001695,17.037288
3,2016,December,46.661017,18.083051
4,2016,October,44.935593,16.355932
5,2016,July,42.022034,15.571186
6,2016,February,49.891525,19.115254
7,2016,September,45.427119,16.325424
8,2016,April,47.349153,16.771186
9,2016,January,41.327119,16.728814


In [32]:
# Ask Delta to generate a symlink-format manifest for the table stored at this path.
manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Tabela_Reciclagem_2/`
"""
spark.sql(manifest_sql).show()

++
||
++
++



In [33]:
# Recreate the external table that points at the Delta table's
# _symlink_format_manifest directory (Parquet SerDe + SymlinkTextInputFormat).
# NOTE(review): the table name suggests it is meant to be queried from Presto — confirm.
drop_presto_table_sql = """
DROP TABLE IF EXISTS Projeto_gold.Tabela_Reciclagem_Presto_2 
"""
create_presto_table_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.Tabela_Reciclagem_Presto_2 (
        Fiscal_Year INT,
        Month_Name VARCHAR(50),
        Media_Reciclado_Comum DOUBLE,
        Media_Reciclagem DOUBLE
    )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Tabela_Reciclagem_2/_symlink_format_manifest/'
"""

spark.sql(drop_presto_table_sql).show()
spark.sql(create_presto_table_sql).show()

++
||
++
++

++
||
++
++

