In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import WindowSpec
from pyspark.sql.window import Window

# Root HDFS directory of the project. It is handed to Spark as the SQL warehouse
# directory, i.e. the default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

# The chain is wrapped in parentheses so that no line relies on a trailing
# backslash continuation (the previous version ended in a dangling "\").
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    # "spark.sql.warehouse.dir" is the setting Spark reads for the warehouse
    # location. The former key "spark.sql.TrabalhoPratico.dir" is not a Spark
    # setting, so warehouse_location was never applied.
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Let the delta helper finish configuring the builder, then create the session
# (or reuse one that is already running).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Load the Bronze-layer CSV from HDFS into a DataFrame.
hdfs_path = "hdfs://hdfs-nn:9000/TrabalhoPratico/bronze/who_suicide_statistics"

# Explicit schema for the file, built one field at a time as
# (column name, data type, nullable).
customSchema = (
    StructType()
    .add("country", StringType(), False)
    .add("year", IntegerType(), False)
    .add("sex", StringType(), False)
    .add("age", StringType(), False)
    .add("suicides_number", IntegerType(), False)
    .add("population", IntegerType(), False)
)

# Comma-delimited file whose first line is a header; the schema above is
# applied instead of letting Spark infer one.
suicides_dt = (
    spark.read
    .option("delimiter", ",")
    .option("header", "true")
    .schema(customSchema)
    .csv(hdfs_path)
)

# Quick look at the first rows to confirm the load worked.
suicides_dt.show()

In [3]:
# ----------------------- TRANSFORMATIONS REQUIRED FOR THE SILVER LAYER ------------------
# Imported under an explicit alias so it cannot be confused with Python's
# built-in round(), which the star import of pyspark.sql.functions shadows.
from pyspark.sql.functions import round as spark_round

# Drop every row that has a null in any column.
who_suicide_statistics = suicides_dt.dropna()

# Add PercentualSuicidiosPorIdade: the row's suicides_number expressed as a
# percentage of the row's population, rounded to 4 decimal places.
# The value is kept numeric on purpose. format_number() returns a *string*
# (with thousands separators), which would store the metric as text and make
# it unusable for sorting, arithmetic or aggregation downstream.
who_suicide_statistics_FinalMods = who_suicide_statistics.withColumn(
    "PercentualSuicidiosPorIdade",
    spark_round(col("suicides_number") / col("population") * 100, 4),
)

In [None]:
# Inspect the transformed DataFrame: first a plain-text preview of the top
# 20 rows (long values truncated), then the whole frame converted to pandas
# so the notebook renders it as a table.
who_suicide_statistics_FinalMods.show(n=20, truncate=True)
who_suicide_statistics_FinalMods.toPandas()

In [75]:
# Persist the transformed data to the Silver layer in Delta format.
# Existing data at the target path is overwritten, and the stored schema is
# allowed to be replaced by the schema of this DataFrame.
(
    who_suicide_statistics_FinalMods
    .select([
        "country",
        "year",
        "sex",
        "age",
        "suicides_number",
        "population",
        "PercentualSuicidiosPorIdade",
    ])
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("hdfs://hdfs-nn:9000/TrabalhoPratico/silver/who_suicide_statistics_DeltaTable")
)

In [None]:
# Read the Silver Delta data back to verify what was written.
# The data is addressed by its storage path (delta.`<path>`) rather than by
# the name silver.who_suicide_statistics_DeltaTable: this notebook writes with
# .save(<path>), which only writes files and does not register a table in the
# metastore, so a query by name would depend on a table created elsewhere.
spark.sql(
    """
    SELECT *
    FROM delta.`hdfs://hdfs-nn:9000/TrabalhoPratico/silver/who_suicide_statistics_DeltaTable`
    """
).toPandas()

In [5]:
# Stop the Spark session now that the notebook's work is done.
spark.stop()