In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# Root HDFS directory used as the default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPL'

# The chain is wrapped in parentheses instead of using backslash continuations:
# the previous version ended with a dangling "\" after .enableHiveSupport() that
# only parsed because a blank line happened to follow it.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake SQL extension and catalog, needed for the USING DELTA and
    # GENERATE statements issued later in this notebook.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip presumably sets
    # spark.jars.packages itself, which would make this line redundant — confirm
    # against the installed delta version before removing it.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Single assignment (the original read "spark = spark = ...").
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Drop the Geral_3 table entry if it already exists, so the notebook can be
# re-run from the top.
drop_geral_stmt = """
    drop table if exists Projeto_gold.Geral_3
    """
spark.sql(drop_geral_stmt)

DataFrame[]

In [3]:
# Air-quality table: rename its "Media" column so it stays distinguishable
# after the join with the recycling table.
ar = spark.table("Projeto_Gold.Tabela_Ar").withColumnRenamed("Media", "Media_Ozono")

# Recycling table.
# NOTE(review): later cells show and persist a column called
# "Media_Reciclado_Comum", not "Media_Reciclagem", so this rename may be a
# no-op on the current schema — confirm against Tabela_Reciclagem_3.
reci = spark.table("Projeto_Gold.Tabela_Reciclagem_3").withColumnRenamed("Media", "Media_Reciclagem")

# Print both inputs, air quality first, then recycling.
for source_df in (ar, reci):
    source_df.show()

+-------------+--------------------+------------------+
|   Localidade|                Name|       Media_Ozono|
+-------------+--------------------+------------------+
|    Manhattan|PM2.5-Attributabl...|16.242763671008024|
|       Queens|          Ozone (O3)| 32.00909091005422|
|        Bronx|PM2.5-Attributabl...|22.291722238063812|
|       Queens|PM2.5-Attributabl...|16.150511958382346|
|        Bronx|Nitrogen Dioxide ...|20.535798615879482|
|        Bronx|Boiler Emissions-...|           1.28125|
|New York City|Boiler Emissions-...| 4.799999952316284|
|       Queens|Boiler Emissions-...|16.136363809758965|
|     Brooklyn|Sulfur Dioxide (SO2)|1.8124999977739118|
|    Manhattan|Fine Particulate ...|10.722705766497326|
|        Bronx|Boiler Emissions-...| 9.918750114738941|
|        Bronx|          Ozone (O3)|30.706354207462734|
|New York City|O3-Attributable C...| 4.850000023841858|
|        Bronx|O3-Attributable A...|125.53125062584877|
|New York City|O3-Attributable A...|14.000000178

In [4]:
# Inner join: pair each recycling row with the air-quality rows whose
# Localidade equals the recycling Zone.
zone_equals_localidade = reci["Zone"] == ar["Localidade"]
geral = reci.join(ar, on=zone_equals_localidade, how="inner")

In [6]:
# Zone and Localidade hold the same value after the equi-join, so drop the
# air-quality copy of the key.
geral = geral.drop(ar["Localidade"])

In [7]:
# Preview only: cap the number of rows pulled to the driver instead of
# collecting the whole join result into a pandas DataFrame.
geral.limit(20).toPandas()

Unnamed: 0,Zone,Media_Reciclado_Comum,Name,Media_Ozono
0,Manhattan,46.330382,PM2.5-Attributable Cardiovascular Hospitalizat...,16.242764
1,Bronx,40.196875,PM2.5-Attributable Cardiovascular Hospitalizat...,22.291722
2,Bronx,40.196875,Nitrogen Dioxide (NO2),20.535799
3,Bronx,40.196875,Boiler Emissions- Total PM2.5 Emissions,1.28125
4,Manhattan,46.330382,Fine Particulate Matter (PM2.5),10.722706
5,Bronx,40.196875,Boiler Emissions- Total SO2 Emissions,9.91875
6,Bronx,40.196875,Ozone (O3),30.706354
7,Bronx,40.196875,O3-Attributable Asthma Emergency Department Vi...,125.531251
8,Bronx,40.196875,O3-Attributable Cardiac and Respiratory Deaths,4.725
9,Bronx,40.196875,PM2.5-Attributable Asthma Emergency Department...,128.662832


In [8]:
# Remove any existing Geral_3 entry so the CREATE below (which has no
# IF NOT EXISTS) does not collide with it.
drop_geral_sql = """
DROP TABLE IF EXISTS Projeto_gold.Geral_3
"""
spark.sql(drop_geral_sql)

# Register Geral_3 as an external Delta table at a fixed HDFS location; the
# column list mirrors the columns of the joined `geral` frame.
create_geral_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.Geral_3 (
        Zone VARCHAR(50), 
        Media_Reciclado_Comum DOUBLE, 
        Name VARCHAR(250), 
        Media_Ozono DOUBLE
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Geral_3'
    """
spark.sql(create_geral_sql)

DataFrame[]

In [9]:
# Overwrite the Delta data at the Geral_3 table location with the joined frame.
geral_delta_path = "hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Geral_3"
(
    geral.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(geral_delta_path)
)

In [10]:
# Read the table back by name to check that the written rows are visible.
geral_written = spark.table("Projeto_gold.Geral_3")
geral_written.show()

+-------------+---------------------+--------------------+------------------+
|         Zone|Media_Reciclado_Comum|                Name|       Media_Ozono|
+-------------+---------------------+--------------------+------------------+
|    Manhattan|    46.33038197623359|PM2.5-Attributabl...|16.242763671008024|
|        Bronx|   40.196875029140045|PM2.5-Attributabl...|22.291722238063812|
|        Bronx|   40.196875029140045|Nitrogen Dioxide ...|20.535798615879482|
|        Bronx|   40.196875029140045|Boiler Emissions-...|           1.28125|
|    Manhattan|    46.33038197623359|Fine Particulate ...|10.722705766497326|
|        Bronx|   40.196875029140045|Boiler Emissions-...| 9.918750114738941|
|        Bronx|   40.196875029140045|          Ozone (O3)|30.706354207462734|
|        Bronx|   40.196875029140045|O3-Attributable A...|125.53125062584877|
|        Bronx|   40.196875029140045|O3-Attributable C...| 4.725000008940697|
|        Bronx|   40.196875029140045|PM2.5-Attributabl...|128.66

In [11]:
# Ask Delta to (re)generate the symlink-format manifest for the Geral_3 path;
# the Presto-facing table created afterwards points at that manifest directory.
generate_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Geral_3/`
"""
manifest_result = spark.sql(generate_manifest_sql)
manifest_result.show()

++
||
++
++



In [12]:
# Drop the previous Presto-facing table entry, if any, before recreating it.
drop_presto_sql = """
DROP TABLE IF EXISTS Projeto_gold.Geral_Presto_3
"""
spark.sql(drop_presto_sql).show()

# External table over the _symlink_format_manifest directory of Geral_3,
# declared with the Parquet SerDe and the symlink text input format.
create_presto_sql = """
    CREATE EXTERNAL TABLE IF NOT EXISTS Projeto_gold.Geral_Presto_3 (
        Zone VARCHAR(50), 
        Media_Reciclado_Comum DOUBLE, 
        Name VARCHAR(250), 
        Media_Ozono DOUBLE
    )
    ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Geral_3/_symlink_format_manifest/'
    """
spark.sql(create_presto_sql).show()

++
||
++
++

++
||
++
++



DataFrame[]