In [1]:
# Install dependencies
# pip is invoked through sys.executable, i.e. the interpreter running this
# kernel, so the package is installed for that same interpreter.
# NOTE(review): the version is not pinned, and no cell visible in this part of
# the notebook imports `hdfs` — confirm the dependency is still needed.

import sys
!{sys.executable} -m pip install hdfs



In [2]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# Default location for managed databases and tables created by this session.
# NOTE(review): this path is under /Habitos_sustentaveis, while the table paths
# used in the cells below are under /TrabalhoPL — confirm this is intended.
warehouse_location = 'hdfs://hdfs-nn:9000/Habitos_sustentaveis/gold'

# The chain is wrapped in parentheses rather than using backslash
# continuations: the original ended with a dangling "\" after
# .enableHiveSupport(), which would have pulled any line placed directly
# beneath it into this expression.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip (below) is documented to set
    # spark.jars.packages from the installed delta-spark version — confirm the
    # installed version agrees with the artifact named here.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Single assignment (the original read `spark = spark = ...`).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
from pyspark.sql.functions import substring, avg, sum

# Load Tabela_Petroleo from the silver layer.
# NOTE(review): load() is called without an explicit format, so the session's
# default data source is used — confirm that matches how the silver table was
# written.
hdfs_path = "hdfs://hdfs-nn:9000/TrabalhoPL/silver/Projeto.db/Tabela_Petroleo"

tabela_petroleo = spark.read.load(hdfs_path)

# Preview the first rows, then print the column names and types.
tabela_petroleo.show()
tabela_petroleo.printSchema()

+------------+---------------------+--------------------+----------------+--------+---------+----------+-------------------+--------------------+--------------------+--------------------+------------------+--------+------------+---------+------------+-------------+----------+----+
|Spill_Number|Program_Facility_Name|              Street|        Locality|  Contry|SWIS_Code|DEC_Region|Contributing_Factor|           Waterbody|              Source|       Material_Name|   Material_Family|Quantity|       Units|Recovered|Data_Derrame|Data_Relatada|  Data_Fim| Ano|
+------------+---------------------+--------------------+----------------+--------+---------+----------+-------------------+--------------------+--------------------+--------------------+------------------+--------+------------+---------+------------+-------------+----------+----+
|    9506819 | 1 BEEKMAN PLACE  ...|     1 BEEKMAN PLACE|       MANHATTAN|New York|     3101|         2|            Unknown|             Nenhuma|Instituti

In [14]:
from pyspark.sql.functions import count, countDistinct
# Gold aggregate: for every (Contributing_Factor, Material_Family) pair,
# Desperdicio = total Quantity minus total Recovered.
# `sum` is the Spark column aggregate imported from pyspark.sql.functions in an
# earlier cell, not Python's builtin.
# NOTE(review): the source table also has a Units column and quantities are
# summed here without regard to it — confirm the units are comparable within
# each group.
#
# The chain is parenthesised; the original ended in a dangling "\" followed by
# a whitespace-only line, which only worked because nothing followed it.
gold_petroleo = (
    tabela_petroleo
    .groupBy("Contributing_Factor", "Material_Family")
    .agg(
        (sum(tabela_petroleo.Quantity) - sum(tabela_petroleo.Recovered)).alias("Desperdicio")
    )
)

# Collect the aggregate to the driver as a pandas DataFrame for display.
gold_petroleo.toPandas()

Unnamed: 0,Contributing_Factor,Material_Family,Desperdicio
0,Housekeeping,Other,765377.4
1,Equipment Failure,Petroleum,6386269.0
2,Equipment Failure,Hazardous Material,434517.9
3,Abandoned Drums,Hazardous Material,4680.5
4,Tank Test Failure,Hazardous Material,0.0
5,Missing Code in Old Data - Must be fixed,Other,2251.0
6,Equipment Failure,Other,234269700.0
7,Traffic Accident,Petroleum,642407.3
8,Deliberate,Petroleum,692016.9
9,Housekeeping,Petroleum,72737.9


In [17]:
# (Re)create the gold table Projeto_gold.Tabela_Petroleo_3.
# Step 1: remove any existing catalog entry for the table.
spark.sql("""
    DROP TABLE IF EXISTS Projeto_gold.Tabela_Petroleo_3
    """)

# Step 2: register an external Delta table at a fixed HDFS location, with the
# three columns produced by the aggregation.
spark.sql("""
    CREATE EXTERNAL TABLE Projeto_gold.Tabela_Petroleo_3 (
        Contributing_Factor VARCHAR(100),
        Material_Family VARCHAR(250),
        Desperdicio Double
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Tabela_Petroleo_3/'
    """)
        

DataFrame[]

In [18]:
# Persist the aggregate in Delta format at the table's HDFS location,
# replacing whatever data is already there.
(
    gold_petroleo.write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Tabela_Petroleo_3/")
)

In [19]:
# Read the table back by its catalog name and preview the rows.
(
    spark
    .table("Projeto_gold.Tabela_Petroleo_3")
    .show()
)


+--------------------+------------------+-------------------+
| Contributing_Factor|   Material_Family|        Desperdicio|
+--------------------+------------------+-------------------+
|        Housekeeping|             Other|  765377.4399999995|
|   Equipment Failure|         Petroleum|  6386269.100000223|
|   Equipment Failure|Hazardous Material|  434517.8699885942|
|     Abandoned Drums|Hazardous Material|             4680.5|
|   Tank Test Failure|Hazardous Material|                0.0|
|Missing Code in O...|             Other|             2251.0|
|   Equipment Failure|             Other|2.342696589800236E8|
|    Traffic Accident|         Petroleum|  642407.2799996063|
|          Deliberate|         Petroleum|  692016.9400000498|
|        Housekeeping|         Petroleum|  72737.89999998547|
|        Housekeeping|Hazardous Material|  85404.00999999978|
|    Traffic Accident|Hazardous Material| 117592.70000002533|
|       Tank Overfill|         Petroleum|           722329.0|
|   Equi

In [24]:
# Generate a symlink-format manifest for the Delta table stored at this path.
# The cell that follows defines a table located at the resulting
# _symlink_format_manifest/ directory.
spark.sql(
    """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Tabela_Petroleo_3/`
"""
).show()

++
||
++
++



In [25]:
# (Re)create Projeto_gold.Tabela_Petroleo3_Presto: an external table with the
# same three columns as the gold table, defined over the
# _symlink_format_manifest/ directory using the Parquet SerDe and
# SymlinkTextInputFormat.
# Step 1: remove any existing catalog entry.
spark.sql(
    """
DROP TABLE IF EXISTS Projeto_gold.Tabela_Petroleo3_Presto 
"""
).show()

# Step 2: define the table over the manifest directory.
spark.sql(
    """
    CREATE EXTERNAL TABLE Projeto_gold.Tabela_Petroleo3_Presto (
        Contributing_Factor VARCHAR(100),
        Material_Family VARCHAR(250),
        Desperdicio Double
    )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Tabela_Petroleo_3/_symlink_format_manifest/'
"""
).show()

++
||
++
++

++
||
++
++

