In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# Root HDFS directory used as the default location for managed databases
# and tables (passed to spark.sql.warehouse.dir below).
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPL'

# Session builder with Hive metastore support plus the Delta Lake SQL
# extension and catalog. The chain is parenthesized rather than joined with
# backslashes so that no dangling "\" can swallow the statement that follows.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip (below) is expected to set
    # spark.jars.packages itself -- confirm whether this explicit pin is still
    # needed or is being overridden.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# configure_spark_with_delta_pip is provided by `from delta import *`.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Drop the Geral_2 table from Projeto_gold if it already exists.
drop_geral_stmt = """
    drop table if exists Projeto_gold.Geral_2
    """
spark.sql(drop_geral_stmt)

DataFrame[]

In [17]:
# Load the air and water gold tables, renaming each table's "Media" column
# so the two averages carry distinct names.
ar = spark.table("Projeto_Gold.Tabela_Ar_2").withColumnRenamed("Media", "Media_Ar")
agua = spark.table("Projeto_Gold.Tabela_Agua_4").withColumnRenamed("Media", "Media_Agua")

# Preview both inputs, air first and then water.
for tabela in (ar, agua):
    tabela.show()

+----+--------------------+-------------------+
| Ano|                Name|           Media_Ar|
+----+--------------------+-------------------+
|2009|PM2.5-Attributabl...|  73.48229187726974|
|2015|PM2.5-Attributabl...|  54.18229158222675|
|2009|Fine Particulate ...| 10.531465733304938|
|2009|PM2.5-Attributabl...|  49.25833344459534|
|2015|PM2.5-Attributabl...|  38.55625029404958|
|2015|Boiler Emissions-...|  51.28333353996277|
|2009|O3-Attributable A...|  76.27187449733417|
|2015|O3-Attributable A...| 10.589583413675427|
|2009|Nitrogen Dioxide ...|  24.26985812807196|
|2015|Boiler Emissions-...|  7.752083425720532|
|2009|Sulfur Dioxide (SO2)| 3.7139007123649543|
|2015|Boiler Emissions-...|  1.087499991680185|
|2009|O3-Attributable A...| 14.052083270003399|
|2009|          Ozone (O3)| 24.762978689044925|
|2009|PM2.5-Attributabl...| 17.708333154519398|
|2015|Sulfur Dioxide (SO2)|0.26595744670282867|
|2015|          Ozone (O3)|  30.91090568921245|
|2009|O3-Attributable C...| 4.8729166487

In [18]:
# Inner equi-join on the year only: every water row is paired with every
# air row that shares the same Ano.
geral = agua.join(ar, on=(agua.Ano == ar.Ano), how="inner")

In [19]:
# The join condition kept both Ano columns; discard the one coming from `ar`.
geral = geral.drop(ar["Ano"])

In [20]:
# Convert the joined Spark DataFrame to pandas and display it as the cell's
# result. toPandas() collects every row to the driver, so this is only
# suitable while the joined result stays small.
geral.toPandas()

Unnamed: 0,Ano,Analyte,Media_Agua,Name,Media_Ar
0,2005,"Nickel, Total",2.380000,Air Toxics Concentrations- Average Benzene Con...,2.910417
1,2005,"Nickel, Total",2.380000,Air Toxics Concentrations- Average Formaldehyd...,3.195833
2,2005,"Nickel, Total",2.380000,O3-Attributable Cardiac and Respiratory Deaths,5.085417
3,2005,"Nickel, Total",2.380000,PM2.5-Attributable Asthma Emergency Department...,90.432292
4,2005,"Nickel, Total",2.380000,PM2.5-Attributable Deaths,66.908333
...,...,...,...,...,...
5208,2013,Temperature,9.932264,Boiler Emissions- Total NOx Emissions,56.300000
5209,2013,Temperature,9.932264,Boiler Emissions- Total SO2 Emissions,14.231250
5210,2013,Temperature,9.932264,Nitrogen Dioxide (NO2),22.237187
5211,2013,Temperature,9.932264,Boiler Emissions- Total PM2.5 Emissions,1.660417


In [22]:
# Remove any existing Geral_2 table definition before recreating it.
drop_geral_sql = """
DROP TABLE IF EXISTS Projeto_gold.Geral_2
"""
spark.sql(drop_geral_sql)

# Register Geral_2 as an external Delta table located in the gold HDFS
# directory, with the five columns produced by the join.
create_geral_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.Geral_2 (
        Ano INT, 
        Analyte VARCHAR(250), 
        Media_Agua DOUBLE, 
        Name VARCHAR(250), 
        Media_Ar DOUBLE
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Geral_2'
    """
spark.sql(create_geral_sql)

DataFrame[]

In [23]:
# Overwrite the Delta data at the Geral_2 location with the joined result;
# the mergeSchema option is enabled for the write.
geral_writer = (
    geral.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
geral_writer.save("hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Geral_2")

In [24]:
# Read Geral_2 back by its catalog name and print its first rows.
geral_salvo = spark.table("Projeto_gold.Geral_2")
geral_salvo.show()

+----+----------------+-------------------+--------------------+------------------+
| Ano|         Analyte|         Media_Agua|                Name|          Media_Ar|
+----+----------------+-------------------+--------------------+------------------+
|2005|   Nickel, Total|               2.38|Air Toxics Concen...|2.9104166701436043|
|2005|   Nickel, Total|               2.38|Air Toxics Concen...|3.1958333402872086|
|2005|   Nickel, Total|               2.38|O3-Attributable C...|  5.08541668454806|
|2005|   Nickel, Total|               2.38|PM2.5-Attributabl...| 90.43229215343793|
|2005|   Nickel, Total|               2.38|PM2.5-Attributabl...| 66.90833346048991|
|2005|   Nickel, Total|               2.38|Traffic Density- ...|28.979439164990577|
|2005|   Nickel, Total|               2.38|Traffic Density- ...|31.063725317225735|
|2005|   Nickel, Total|               2.38|PM2.5-Attributabl...| 20.72916669646899|
|2005|   Nickel, Total|               2.38|PM2.5-Attributabl...| 26.14999983

In [26]:
# Generate the symlink-format manifest for the Delta table stored at the
# Geral_2 path.
manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Geral_2/`
"""
spark.sql(manifest_sql).show()

++
||
++
++



In [27]:
# Remove any existing Geral_Presto_2 table definition.
drop_presto_sql = """
DROP TABLE IF EXISTS Projeto_gold.Geral_Presto_2
"""
spark.sql(drop_presto_sql).show()

# External table over the _symlink_format_manifest directory of Geral_2:
# Parquet SerDe combined with SymlinkTextInputFormat, same five columns as
# Geral_2. Presumably intended for querying from Presto, per the table name.
create_presto_sql = """
    CREATE EXTERNAL TABLE IF NOT EXISTS Projeto_gold.Geral_Presto_2 (
        Ano INT, 
        Analyte VARCHAR(250), 
        Media_Agua DOUBLE, 
        Name VARCHAR(250), 
        Media_Ar DOUBLE
    )
    ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Geral_2/_symlink_format_manifest/'
    """
spark.sql(create_presto_sql).show()

++
||
++
++

++
||
++
++



DataFrame[]