In [None]:
pip install delta-spark==2.4.0

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

builder = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.TrabalhoPratico.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
spark.sql(
    """
    CREATE DATABASE IF NOT EXISTS gold LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPratico/gold/'
    """
)

DataFrame[]

In [3]:
spark.sql(
    """
    SHOW DATABASES
    """
).show()

+---------+
|namespace|
+---------+
|  default|
|     gold|
|   silver|
|  silver1|
+---------+



In [4]:
spark.sql(
    """
    DROP TABLE IF EXISTS gold.who_suicide_statistics_SuicidesPerSex
    """
)

DataFrame[]

In [None]:
spark.sql(
    """
    SHOW TABLES FROM gold
    """
).show()

In [5]:
# 1-> Criação de uma tabela de maneira a criar uma nova deltaTable , uma com uma coluna chamada PercentualSuicidiosPorSexo

spark.sql(
    """
    DROP TABLE IF EXISTS gold.who_suicide_statistics_SuicidesPerSex
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE  gold.who_suicide_statistics_SuicidesPerSex (
        country STRING,
        year INT,
        sex STRING,
        PercentageOfSuicidesByGender STRING
        
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPratico/gold/who_suicide_statistics_SuicidesPerSex/'
    """
)

DataFrame[]

In [8]:
# 1.1-> Caminho para a DeltaTable criada na layer de Silver

who_suicide_silver = spark.read.format("delta").load("hdfs://hdfs-nn:9000/TrabalhoPratico/silver/who_suicide_statistics_DeltaTable")

# Drop á coluna de age
mentalHealth_df = who_suicide_silver.drop("Idade")

# Agrupar as linhas por país, por Género e por Genero, somando o numero de suicidios e população
who_suicide_PerSex = mentalHealth_df.groupBy("Pais", "Ano", "Genero").agg(
    sum("Numero_Suicidios").alias("total_suicides"),
    sum("Populacao").alias("total_population")
)
# Organizar o resultado por ordem dos paises
who_suicide_PerSex = who_suicide_PerSex.orderBy("Pais", "Ano", "Genero")

#Criar coluna PercentualSuicidiosPorGenero que equivale ao número total de suicidios a dividir pelo número total da população
who_suicide_statistics_SuicidesPerSex = who_suicide_PerSex.withColumn(
    "PercentageOfSuicidesByGender",
    format_number(col("total_suicides") / col("total_population") * 100, 4)
)

who_suicide_statistics_SuicidesPerSex.show()




+-------+----+------+--------------+----------------+----------------------------+
|   Pais| Ano|Genero|total_suicides|total_population|PercentageOfSuicidesByGender|
+-------+----+------+--------------+----------------+----------------------------+
|Albania|1987|female|            25|         1316900|                      0.0019|
|Albania|1987|  male|            48|         1392700|                      0.0034|
|Albania|1988|female|            22|         1343600|                      0.0016|
|Albania|1988|  male|            41|         1420700|                      0.0029|
|Albania|1989|female|            15|         1363300|                      0.0011|
|Albania|1989|  male|            53|         1439800|                      0.0037|
|Albania|1992|female|            14|         1423200|                      0.0010|
|Albania|1992|  male|            33|         1399300|                      0.0024|
|Albania|1993|female|            27|         1427400|                      0.0019|
|Alb

In [10]:
# 1.2-> Guardar os dados na layer de Gold

who_suicide_statistics_SuicidesPerSex \
    .select("Pais","Ano", "Genero", "PercentageOfSuicidesByGender") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .option("overwriteSchema", "true") \
    .save("hdfs://hdfs-nn:9000/TrabalhoPratico/gold/who_suicide_statistics_SuicidesPerSex") 

In [11]:
# 1.3-> Output do resultado esperado

spark.sql(
    """
    Select *
    from gold.who_suicide_statistics_SuicidesPerSex
    """
).show()

+-------+----+------+----------------------------+
|   Pais| Ano|Genero|PercentageOfSuicidesByGender|
+-------+----+------+----------------------------+
|Albania|1987|female|                      0.0019|
|Albania|1987|  male|                      0.0034|
|Albania|1988|female|                      0.0016|
|Albania|1988|  male|                      0.0029|
|Albania|1989|female|                      0.0011|
|Albania|1989|  male|                      0.0037|
|Albania|1992|female|                      0.0010|
|Albania|1992|  male|                      0.0024|
|Albania|1993|female|                      0.0019|
|Albania|1993|  male|                      0.0033|
|Albania|1994|female|                      0.0010|
|Albania|1994|  male|                      0.0025|
|Albania|1995|female|                      0.0023|
|Albania|1995|  male|                      0.0038|
|Albania|1996|female|                      0.0026|
|Albania|1996|  male|                      0.0035|
|Albania|1997|female|          

In [12]:
# 2-> Criação da table de Suicidios por pais 
spark.sql(
    """
    DROP TABLE IF EXISTS gold.who_suicide_statistics_SuicidesPerCountry
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE  gold.who_suicide_statistics_SuicidesPerCountry (
        country STRING,
        year INT,
        PercentageOfSuicidesByCountry STRING
        
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPratico/gold/who_suicide_statistics_SuicidesPerCountry/'
    """
)

DataFrame[]

In [16]:
# 2.1 -> Caminho para a DeltaTable criada na layer de Silver
who_suicide_silver_Country = spark.read.format("delta").load("hdfs://hdfs-nn:9000/TrabalhoPratico/silver/who_suicide_statistics_DeltaTable")

# Drop á coluna de age
mentalHealth_df_Country = who_suicide_silver_Country.drop("Idade")

# Agrupar as linhas por país e por Género, somando o numero de suicidios e população
who_suicide_PerCountry = mentalHealth_df_Country.groupBy("Pais", "Ano").agg(
    sum("Numero_Suicidios").alias("total_suicides"),
    sum("Populacao").alias("total_population")
)

# Organizar o resultado por ordem dos paises e por ano
who_suicide_PerCountry = who_suicide_PerCountry.orderBy("Pais", "Ano")

#Criar coluna PercentualSuicidiosPorPais que equivale ao número total de suicidios a dividir pelo número total da população
who_suicide_statistics_SuicidesPerCountry = who_suicide_PerCountry.withColumn(
    "PercentageOfSuicidesByCountry",
    format_number(col("total_suicides") / col("total_population") * 100, 4)
)

who_suicide_statistics_SuicidesPerCountry.show()


+-------+----+--------------+----------------+-----------------------------+
|   Pais| Ano|total_suicides|total_population|PercentageOfSuicidesByCountry|
+-------+----+--------------+----------------+-----------------------------+
|Albania|1987|            73|         2709600|                       0.0027|
|Albania|1988|            63|         2764300|                       0.0023|
|Albania|1989|            68|         2803100|                       0.0024|
|Albania|1992|            47|         2822500|                       0.0017|
|Albania|1993|            73|         2807300|                       0.0026|
|Albania|1994|            50|         2849300|                       0.0018|
|Albania|1995|            88|         2903400|                       0.0030|
|Albania|1996|            89|         2940200|                       0.0030|
|Albania|1997|           170|         2977300|                       0.0057|
|Albania|1998|           154|         3012700|                       0.0051|

In [17]:
# 2.2 -> Guardar os dados na layer de Gold

who_suicide_statistics_SuicidesPerCountry \
    .select ("Pais","Ano","PercentageOfSuicidesByCountry") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .option("overwriteSchema","true") \
    .save("hdfs://hdfs-nn:9000/TrabalhoPratico/gold/who_suicide_statistics_SuicidesPerCountry")

In [18]:
# 2.3 -> Verificação
spark.sql(
    """
    SELECT *
    FROM gold.who_suicide_statistics_SuicidesPerCountry
    """
).show()

+-------+----+-----------------------------+
|   Pais| Ano|PercentageOfSuicidesByCountry|
+-------+----+-----------------------------+
|Albania|1987|                       0.0027|
|Albania|1988|                       0.0023|
|Albania|1989|                       0.0024|
|Albania|1992|                       0.0017|
|Albania|1993|                       0.0026|
|Albania|1994|                       0.0018|
|Albania|1995|                       0.0030|
|Albania|1996|                       0.0030|
|Albania|1997|                       0.0057|
|Albania|1998|                       0.0051|
|Albania|1999|                       0.0046|
|Albania|2000|                       0.0019|
|Albania|2001|                       0.0043|
|Albania|2002|                       0.0047|
|Albania|2003|                       0.0044|
|Albania|2004|                       0.0051|
|Albania|2005|                       0.0000|
|Albania|2006|                       0.0000|
|Albania|2007|                       0.0045|
|Albania|2

In [19]:
# 3-> Criação da table de Suicidios por pais e Genero 
spark.sql(
    """
    DROP TABLE IF EXISTS gold.who_suicide_statistics_SuicidesPerCountryAndGender
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE  gold.who_suicide_statistics_SuicidesPerCountryAndGender (
        Pais STRING,
        Genero STRING,
        PercentageOfSuicidesByGender STRING
        
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPratico/gold/who_suicide_statistics_SuicidesPerCountryAndGender/'
    """
)

DataFrame[]

In [20]:
# 3.1 -> Criação da table de Suicidios por pais e Genero 

#Caminho para a DeltaTable criada na layer de Silver
who_suicide_silver = spark.read.format("delta").load("hdfs://hdfs-nn:9000/TrabalhoPratico/silver/who_suicide_statistics_DeltaTable")

# Drop á coluna de age e year
mentalHealth_df = who_suicide_silver.drop("Idade")

# Agrupar as linhas por Género e por Genero, somando o numero de suicidios e população
who_suicide_PerSexAndCountry = mentalHealth_df.groupBy("Pais", "Ano", "Genero").agg(
    sum("Numero_Suicidios").alias("Total_Suicidios"),
    sum("Populacao").alias("Total_Populacao")
)
# Organizar o resultado por ordem dos paises e Genero
who_suicide_PerSexAndCountry = who_suicide_PerSexAndCountry.orderBy("Pais","Ano", "Genero")

#Criar coluna PercentualSuicidiosPorGenero_Pais que equivale ao número total de suicidios a dividir pelo número total da população
who_suicide_statistics_SuicidesPerSexAndCountry = who_suicide_PerSexAndCountry.withColumn(
    "PercentagemSuicidiosPorGenero",
    format_number(col("Total_Suicidios") / col("Total_Populacao") * 100, 4)
)

who_suicide_statistics_SuicidesPerSexAndCountry.show()

+-------+----+------+---------------+---------------+-----------------------------+
|   Pais| Ano|Genero|Total_Suicidios|Total_Populacao|PercentagemSuicidiosPorGenero|
+-------+----+------+---------------+---------------+-----------------------------+
|Albania|1987|female|             25|        1316900|                       0.0019|
|Albania|1987|  male|             48|        1392700|                       0.0034|
|Albania|1988|female|             22|        1343600|                       0.0016|
|Albania|1988|  male|             41|        1420700|                       0.0029|
|Albania|1989|female|             15|        1363300|                       0.0011|
|Albania|1989|  male|             53|        1439800|                       0.0037|
|Albania|1992|female|             14|        1423200|                       0.0010|
|Albania|1992|  male|             33|        1399300|                       0.0024|
|Albania|1993|female|             27|        1427400|                       

In [21]:
# 3.2 -> Guardar os dados na layer de Gold

who_suicide_statistics_SuicidesPerSexAndCountry \
    .select("Pais", "Ano", "Genero", "PercentagemSuicidiosPorGenero") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .option("overwriteSchema", "true") \
    .save("hdfs://hdfs-nn:9000/TrabalhoPratico/gold/who_suicide_statistics_SuicidesPerCountryAndGender") 

In [22]:
# 3.3 -> Verificação 
spark.sql(
    """
    Select *
    From gold.who_suicide_statistics_SuicidesPerCountryAndGender
    """
).show()

+-------+----+------+-----------------------------+
|   Pais| Ano|Genero|PercentagemSuicidiosPorGenero|
+-------+----+------+-----------------------------+
|Albania|1987|female|                       0.0019|
|Albania|1987|  male|                       0.0034|
|Albania|1988|female|                       0.0016|
|Albania|1988|  male|                       0.0029|
|Albania|1989|female|                       0.0011|
|Albania|1989|  male|                       0.0037|
|Albania|1992|female|                       0.0010|
|Albania|1992|  male|                       0.0024|
|Albania|1993|female|                       0.0019|
|Albania|1993|  male|                       0.0033|
|Albania|1994|female|                       0.0010|
|Albania|1994|  male|                       0.0025|
|Albania|1995|female|                       0.0023|
|Albania|1995|  male|                       0.0038|
|Albania|1996|female|                       0.0026|
|Albania|1996|  male|                       0.0035|
|Albania|199

In [None]:
# 4 -> Criar Tabela para a junção dos Indexs e Suicidios

spark.sql(
    """
    DROP TABLE IF EXISTS gold.indexs_Suicidios_Gold
    """
)

In [25]:
# 4 -> Criar Tabela para a junção dos Indexs e Suicidios

spark.sql(
    """
    DROP TABLE IF EXISTS gold.tratamento_Suicidios
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE  gold.tratamento_Suicidios (
        Pais STRING,
        Genero STRING,
        PercentagemSuicidiosPorGenero STRING,
        Sum_Treatments_Per_Country INT
        
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPratico/gold/indexs_Suicidios_Gold/'
    """
)

DataFrame[]

In [3]:
# Começar a junção de duas tabelas

Suicides = spark.read.format("delta").load("hdfs://hdfs-nn:9000/TrabalhoPratico/gold/who_suicide_statistics_SuicidesPerCountryAndGender")

Tratamento = spark.read.format("delta").load("hdfs://hdfs-nn:9000/TrabalhoPratico/gold/mentalHealth_Benefits_Care_TreatmentPerCountry")


Suicidio_Tratamento_Gold = Suicides.join(Tratamento, ["Ano", "Pais"])

Suicidio_Tratamento_Gold.show()

AnalysisException: [UNRESOLVED_USING_COLUMN_FOR_JOIN] USING column `Ano` cannot be resolved on the right side of the join. The right-side columns: [`Country`, `Gender`, `Sum_Benefits_Per_Country`, `Sum_Care_Options_Per_Country`, `Sum_Treatments_Per_Country`].

In [None]:
Indexs_Suicidios_Gold \
    .select("Ano", "Pais", "Genero", "PercentagemSuicidiosPorGenero", "Index_Desigualdade") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .option("overwriteSchema", "true") \
    .save("hdfs://hdfs-nn:9000/TrabalhoPratico/gold/indexs_Suicidios_Gold") 

In [None]:
spark.sql(
    """
    Select *
    From gold.indexs_Suicidios_Gold
    """
).show()

In [None]:
spark.stop()