In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# Default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPL'

# Build a Hive-enabled session with the Delta Lake SQL extension and catalog.
# The chain is wrapped in parentheses so no trailing line-continuation
# backslash is needed after the last call.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip (below) is expected to set
    # spark.jars.packages itself from the installed delta package version, which
    # would supersede this explicit pin -- confirm which value wins.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Single assignment (the original assigned `spark` twice on the same line).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [36]:
# Drop the combined gold table if it is already registered, so that a later
# cell can create it again from scratch.
drop_geral_stmt = """
    drop table if exists Projeto_gold.Geral_1
    """
spark.sql(drop_geral_stmt)

DataFrame[]

In [50]:
# Load the two gold-layer inputs: per-year spill counts (Ano, Derrames_Count)
# and per-year analyte averages (Ano, Analyte, Media).
petroleo = spark.table("Projeto_Gold.Tabela_Petroleo")
agua = spark.table("Projeto_Gold.Tabela_Agua_4")

# Print a preview of each input, spill counts first.
for preview_df in (petroleo, agua):
    preview_df.show()

+----+--------------+
| Ano|Derrames_Count|
+----+--------------+
|1994|          5730|
|1996|          6673|
|1998|          6537|
|2009|          5195|
|1995|          6232|
|2000|          7434|
|2008|          5834|
|1999|          7668|
|1993|          5024|
|2003|          5749|
|2007|          5705|
|2006|          6092|
|1997|          5093|
|2004|          6169|
|1992|          4579|
|2005|          5793|
|1990|          4071|
|1991|          3998|
|2012|          5295|
|2001|          6128|
+----+--------------+
only showing top 20 rows

+----+--------------------+-------------------+
| Ano|             Analyte|              Media|
+----+--------------------+-------------------+
|1999|Phosphorus, Total...| 16.513007284079084|
|1999|Soluble Reactive ...| 16.395276497695853|
|1997|Copper, Total (Un...| 3.2553956834532376|
|1998|Mercury, Total (U...|                0.3|
|1999|     Coliform, Fecal| 171.11699164345404|
|1998|Nitrate/Nitrite (...|0.38505833333333256|
|1998|     Fie

In [51]:
# Inner-join the water measurements with the spill counts on the year.
# Joining on an expression (rather than a column name) keeps both `Ano`
# columns in the result.
geral = agua.join(
    petroleo,
    on=agua["Ano"] == petroleo["Ano"],
    how="inner",
)

In [52]:
# Remove the `Ano` column that came from the spill-count side of the join,
# leaving only the copy from the water-measurement side.
geral = geral.drop(petroleo["Ano"])

In [53]:
# Convert the joined Spark DataFrame to pandas so the notebook renders it as a
# rich table (Ano, Analyte, Media, Derrames_Count).
# NOTE(review): toPandas() collects the whole result on the driver; that is
# fine for the ~1.8k rows shown in the output, but consider limiting first if
# the inputs grow.
geral.toPandas()

Unnamed: 0,Ano,Analyte,Media,Derrames_Count
0,1999,"Phosphorus, Total Dissolved (as P)",16.513007,7668
1,1999,Soluble Reactive Phosphorus (as P),16.395276,7668
2,1997,"Copper, Total (Undigested)",3.255396,5093
3,1998,"Mercury, Total (Undigested)",0.300000,6537
4,1999,"Coliform, Fecal",171.116992,7668
...,...,...,...,...
1846,2013,Apparent Color,26.388889,3747
1847,2013,Bird 2 Gull,,3747
1848,2013,"Phosphorus, Total Dissolved (as P)",14.000883,3747
1849,2013,Chloride,28.524927,3747


In [41]:
# Recreate the combined gold table: first remove any existing catalog entry,
# then register a Delta table whose data lives at an explicit HDFS location.
drop_geral_sql = """
DROP TABLE IF EXISTS Projeto_gold.Geral_1
"""
create_geral_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.Geral_1 (
        Ano INT, 
        Analyte VARCHAR(250), 
        Media DOUBLE, 
        Derrames_Count LONG
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Geral_1'
    """

spark.sql(drop_geral_sql)
spark.sql(create_geral_sql)

DataFrame[]

In [42]:
# Write the joined result as Delta files to the same HDFS path the
# Projeto_gold.Geral_1 table was created with, replacing existing data.
# The mergeSchema option is enabled for this write.
geral_writer = (
    geral.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
geral_writer.save("hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Geral_1")

In [43]:
# Read the table back through the catalog and print a preview to confirm
# that the write landed where the table definition points.
geral_saved = spark.table("Projeto_gold.Geral_1")
geral_saved.show()

+----+--------------------+-------------------+--------------+
| Ano|             Analyte|              Media|Derrames_Count|
+----+--------------------+-------------------+--------------+
|1999|Phosphorus, Total...| 16.513007284079084|          7668|
|1999|Soluble Reactive ...| 16.395276497695853|          7668|
|1997|Copper, Total (Un...| 3.2553956834532376|          5093|
|1998|Mercury, Total (U...|                0.3|          6537|
|1999|     Coliform, Fecal| 171.11699164345404|          7668|
|1998|Nitrate/Nitrite (...|0.38505833333333256|          6537|
|1998|     Field Turbidity| 0.5583333333333335|          6537|
|1999|Lead, Total (Undi...|  4.691044776119404|          7668|
|1999|                 BOD|                2.5|          7668|
|1997|    Dissolved Oxygen| 10.635762224084955|          5093|
|1997|     Field Turbidity| 0.5865384615384616|          5093|
|1999|Mercury, Total (U...|               null|          7668|
|1998|Soluble Reactive ...|  18.07328072153326|        

In [46]:
# Generate the symlink-format manifest for the Delta table at this path; the
# manifest is written under <table path>/_symlink_format_manifest/.
# spark.sql() runs the command eagerly, so no .show() is needed -- on a command
# result it would only print an empty frame.
spark.sql("""
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Geral_1/`
""")

++
||
++
++



In [48]:
# Recreate the manifest-backed companion table for Projeto_gold.Geral_1.
# It points at the _symlink_format_manifest directory of the Delta table and
# reads it through SymlinkTextInputFormat with the Parquet SerDe.
# NOTE(review): judging by the name, this table is meant to be queried from
# Presto -- confirm against the consuming side.
#
# DDL statements run eagerly in spark.sql(); calling .show() on their result
# only prints an empty frame, so it is omitted here (consistent with the other
# DDL cells in this notebook).
spark.sql("""
DROP TABLE IF EXISTS Projeto_gold.Geral_Presto_1
""")

spark.sql("""
    CREATE EXTERNAL TABLE IF NOT EXISTS Projeto_gold.Geral_Presto_1 (
        Ano INT, 
        Analyte VARCHAR(250), 
        Media DOUBLE, 
        Derrames_Count LONG
    )
    ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/gold/Projeto_gold.db/Geral_1/_symlink_format_manifest/'
    """)

++
||
++
++

++
||
++
++



DataFrame[]