In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import *

In [2]:
# Base HDFS directory that Spark SQL uses as its warehouse dir.
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

# Session builder: the Hive metastore backs the catalog and the Delta Lake
# extension/catalog classes enable Delta tables. The chain is wrapped in
# parentheses instead of backslash continuations so that no line depends on
# whatever physical line happens to follow it.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# configure_spark_with_delta_pip wraps the builder before the session is
# created (or an already-running one is reused by getOrCreate).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
# Read the gold-layer Delta table stored under mentalHealth_TratamentoPerSex.
mental_health_reader = spark.read.format("delta")
gold_MentalHealth = mental_health_reader.load(
    "hdfs://hdfs-nn:9000/TrabalhoPratico/gold/mentalHealth_TratamentoPerSex"
)

In [4]:
# Read the gold-layer Delta table stored under
# who_suicide_statistics_SuicidesPerCountryAndGender.
who_suicide_reader = spark.read.format("delta")
gold_whosuicide = who_suicide_reader.load(
    "hdfs://hdfs-nn:9000/TrabalhoPratico/gold/who_suicide_statistics_SuicidesPerCountryAndGender"
)

In [6]:
# (Re)register gold.mentalHealth_WhoSuicide as an external Delta table.
#
# The table is external (it has an explicit LOCATION), so DROP TABLE removes
# only the metastore entry and leaves the Delta files in place. On a re-run
# the location therefore already holds a Delta log, and Delta refuses a
# CREATE TABLE whose explicit column list differs from the schema stored in
# that log. To keep the cell re-runnable, the column list is only supplied
# when the location is still empty; otherwise the table is registered on top
# of the existing data and takes its schema from the Delta log.
mentalhealth_whosuicide_path = 'hdfs://hdfs-nn:9000/TrabalhoPratico/gold/mentalHealth_WhoSuicide/'

spark.sql(
    """
    DROP TABLE IF EXISTS gold.mentalHealth_WhoSuicide
    """
)

# DeltaTable comes from `from delta import *` at the top of the notebook.
if DeltaTable.isDeltaTable(spark, mentalhealth_whosuicide_path):
    # Data from an earlier run exists: adopt its schema as-is.
    create_table_ddl = f"""
    CREATE EXTERNAL TABLE gold.mentalHealth_WhoSuicide
    USING DELTA
    LOCATION '{mentalhealth_whosuicide_path}'
    """
else:
    # First run: bootstrap an empty table with an explicit schema.
    # NOTE(review): the join preview prints Ano as a bare year (2014) and
    # PercentagemSuicidiosPorGenero as a fraction (0.0081), so the previous
    # DATE / LONG declarations could not hold that data. INT is assumed for
    # Ano -- confirm with printSchema() on the source frames. The two
    # Percentagem_De_*_Em_Tratamento columns only show nulls in the preview,
    # so their LONG type is left untouched -- TODO confirm.
    create_table_ddl = f"""
    CREATE EXTERNAL TABLE  gold.mentalHealth_WhoSuicide (
         Ano INT,
         Pais STRING,
         Homens_em_tratamento INT,
         Mulheres_em_tratamento INT,
         Percentagem_De_Homens_Em_Tratamento LONG,
         Percentagem_De_Mulheres_Em_Tratamento LONG,
         Genero STRING,
         PercentagemSuicidiosPorGenero DOUBLE
    )
    USING DELTA
    LOCATION '{mentalhealth_whosuicide_path}'
    """

spark.sql(create_table_ddl)

DataFrame[]

In [7]:
# Left join: every row of gold_MentalHealth is kept and matched to the
# gold_whosuicide rows that share the same Ano and Pais.
joined_data = gold_MentalHealth.join(
    gold_whosuicide,
    on=['Ano', 'Pais'],
    how='left',
)

# Preview the joined rows.
joined_data.show()

+----+--------------------+--------------------+----------------------+-----------------------------------+-------------------------------------+------+-----------------------------+
| Ano|                Pais|Homens_em_tratamento|Mulheres_em_tratamento|Percentagem_De_Homens_Em_Tratamento|Percentagem_De_Mulheres_Em_Tratamento|Genero|PercentagemSuicidiosPorGenero|
+----+--------------------+--------------------+----------------------+-----------------------------------+-------------------------------------+------+-----------------------------+
|2014|            Colombia|                   0|                     0|                               null|                                 null|  male|                       0.0081|
|2014|            Colombia|                   0|                     0|                               null|                                 null|female|                       0.0017|
|2014|             Croatia|                   2|                     0|              

In [8]:
# Columns persisted to the gold table, in output order.
output_columns = [
    "Ano",
    "Pais",
    "Homens_em_tratamento",
    "Mulheres_em_tratamento",
    "Percentagem_De_Homens_Em_Tratamento",
    "Percentagem_De_Mulheres_Em_Tratamento",
    "Genero",
    "PercentagemSuicidiosPorGenero",
]

# Overwrite the Delta data at the target location; overwriteSchema lets the
# DataFrame's schema replace the one currently stored there.
(
    joined_data
    .select(*output_columns)
    .write
    .mode("overwrite")
    .format("delta")
    .option("overwriteSchema", "true")
    .save("hdfs://hdfs-nn:9000/TrabalhoPratico/gold/mentalHealth_WhoSuicide")
)