In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType,FloatType

# Root directory for Spark-managed (warehouse) tables.
# Uses the same NameNode endpoint (port 9000) as every other hdfs:// path in
# this notebook. The previous value pointed at port 9870, which is Hadoop 3's
# default NameNode HTTP/web-UI port and does not serve the hdfs:// protocol.
# NOTE(review): confirm 9000 is the NameNode RPC port in this deployment.
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPL'

# Session builder: local mode with two worker threads, Hive metastore support,
# and the Delta Lake SQL extension + catalog enabled.
# The chain is wrapped in parentheses instead of backslash continuations; the
# original ended with a dangling "\" before a blank line, which only parsed by
# accident and would break as soon as a statement was placed right below it.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# configure_spark_with_delta_pip comes from the `delta` package imported above.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Location of the raw (bronze) CSV file on HDFS.
hdfs_path = "hdfs://hdfs-nn:9000/TrabalhoPL/bronze/Watershed_Water_Quality_-_Hydrology.csv"

# Explicit schema: every column is read as a nullable string, in file order.
customSchema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "Sample Id",
        "Sample Site",
        "Sample Date",
        "Sample Time",
        "Analyte",
        "Status",
        "Final Result",
        "Units",
        "Stream Group",
    )
])

# Comma-delimited file whose first line is a header row.
csv_reader = spark.read.options(delimiter=",", header="true").schema(customSchema)
projeto_water = csv_reader.csv(hdfs_path)
projeto_water.show()

+---------+-----------+-----------+-----------+---------------+------+------------+-----+------------+
|Sample Id|Sample Site|Sample Date|Sample Time|        Analyte|Status|Final Result|Units|Stream Group|
+---------+-----------+-----------+-----------+---------------+------+------------+-----+------------+
| C-185028|         S4| 01/07/1987|       null|    Temperature|  null|     1.11111|    C|   Schoharie|
| C-186342|        S6I| 01/07/1987|       null|    Temperature|  null|    0.555556|    C|   Schoharie|
| C-187744|         S8| 01/07/1987|       null|    Temperature|  null|     2.22222|    C|   Schoharie|
| C-185028|         S4| 01/07/1987|       null|Scent Character|  null|          1V| null|   Schoharie|
| C-186342|        S6I| 01/07/1987|       null|Scent Character|  null|          1V| null|   Schoharie|
| C-187744|         S8| 01/07/1987|       null|Scent Character|  null|          1V| null|   Schoharie|
| C-182701|         S1| 01/07/1987|       null|Scent Character|  null|   

In [3]:
# Default missing sample times to "00:00".
# coalesce() returns its first non-null argument, so it replaces the previous
# when(isNull() | (col == None)) construct. In Spark SQL a comparison against
# None/NULL evaluates to NULL (never True), so the `== None` half of that
# condition never matched anything and only obscured the intent.
replaced_projeto_water = projeto_water.withColumn(
    "Sample Time",
    coalesce(col("Sample Time"), lit("00:00")),
)

replaced_projeto_water.show()

+---------+-----------+-----------+-----------+---------------+------+------------+-----+------------+
|Sample Id|Sample Site|Sample Date|Sample Time|        Analyte|Status|Final Result|Units|Stream Group|
+---------+-----------+-----------+-----------+---------------+------+------------+-----+------------+
| C-185028|         S4| 01/07/1987|      00:00|    Temperature|  null|     1.11111|    C|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|    Temperature|  null|    0.555556|    C|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|    Temperature|  null|     2.22222|    C|   Schoharie|
| C-185028|         S4| 01/07/1987|      00:00|Scent Character|  null|          1V| null|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|Scent Character|  null|          1V| null|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|Scent Character|  null|          1V| null|   Schoharie|
| C-182701|         S1| 01/07/1987|      00:00|Scent Character|  null|   

In [4]:
# Rebuild "Sample Time" from its first two ':'-separated fields, joined by ':'
# (any further fields, e.g. a third ':'-separated part, are dropped).
sample_time_parts = split(replaced_projeto_water['Sample Time'], ':')
replaced_projeto_water = replaced_projeto_water.withColumn(
    'Sample Time',
    concat(
        sample_time_parts.getItem(0),
        lit(':'),
        sample_time_parts.getItem(1),
    ),
)
replaced_projeto_water.show()

+---------+-----------+-----------+-----------+---------------+------+------------+-----+------------+
|Sample Id|Sample Site|Sample Date|Sample Time|        Analyte|Status|Final Result|Units|Stream Group|
+---------+-----------+-----------+-----------+---------------+------+------------+-----+------------+
| C-185028|         S4| 01/07/1987|      00:00|    Temperature|  null|     1.11111|    C|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|    Temperature|  null|    0.555556|    C|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|    Temperature|  null|     2.22222|    C|   Schoharie|
| C-185028|         S4| 01/07/1987|      00:00|Scent Character|  null|          1V| null|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|Scent Character|  null|          1V| null|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|Scent Character|  null|          1V| null|   Schoharie|
| C-182701|         S1| 01/07/1987|      00:00|Scent Character|  null|   

In [5]:
# Replace missing "Status" values with the placeholder "Desconhecido".
# coalesce() picks the first non-null argument; the former `col == None` test
# was dropped because a comparison with NULL yields NULL in Spark SQL and
# therefore never matched.
replaced_projeto_water2 = replaced_projeto_water.withColumn(
    "Status",
    coalesce(col("Status"), lit("Desconhecido")),
)
replaced_projeto_water2.show()

+---------+-----------+-----------+-----------+---------------+------------+------------+-----+------------+
|Sample Id|Sample Site|Sample Date|Sample Time|        Analyte|      Status|Final Result|Units|Stream Group|
+---------+-----------+-----------+-----------+---------------+------------+------------+-----+------------+
| C-185028|         S4| 01/07/1987|      00:00|    Temperature|Desconhecido|     1.11111|    C|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|    Temperature|Desconhecido|    0.555556|    C|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|    Temperature|Desconhecido|     2.22222|    C|   Schoharie|
| C-185028|         S4| 01/07/1987|      00:00|Scent Character|Desconhecido|          1V| null|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|Scent Character|Desconhecido|          1V| null|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|Scent Character|Desconhecido|          1V| null|   Schoharie|
| C-182701|        

In [6]:
# Replace missing "Final Result" values with the placeholder "Desconhecido".
# coalesce() picks the first non-null argument; the former `col == None` test
# was dropped because a comparison with NULL yields NULL in Spark SQL and
# therefore never matched.
replaced_projeto_water3 = replaced_projeto_water2.withColumn(
    "Final Result",
    coalesce(col("Final Result"), lit("Desconhecido")),
)
replaced_projeto_water3.show()

+---------+-----------+-----------+-----------+---------------+------------+------------+-----+------------+
|Sample Id|Sample Site|Sample Date|Sample Time|        Analyte|      Status|Final Result|Units|Stream Group|
+---------+-----------+-----------+-----------+---------------+------------+------------+-----+------------+
| C-185028|         S4| 01/07/1987|      00:00|    Temperature|Desconhecido|     1.11111|    C|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|    Temperature|Desconhecido|    0.555556|    C|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|    Temperature|Desconhecido|     2.22222|    C|   Schoharie|
| C-185028|         S4| 01/07/1987|      00:00|Scent Character|Desconhecido|          1V| null|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|Scent Character|Desconhecido|          1V| null|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|Scent Character|Desconhecido|          1V| null|   Schoharie|
| C-182701|        

In [7]:
# Replace missing "Units" values with the placeholder "Desconhecida".
# coalesce() picks the first non-null argument; the former `col == None` test
# was dropped because a comparison with NULL yields NULL in Spark SQL and
# therefore never matched.
replaced_projeto_water4 = replaced_projeto_water3.withColumn(
    "Units",
    coalesce(col("Units"), lit("Desconhecida")),
)
replaced_projeto_water4.show()

+---------+-----------+-----------+-----------+---------------+------------+------------+------------+------------+
|Sample Id|Sample Site|Sample Date|Sample Time|        Analyte|      Status|Final Result|       Units|Stream Group|
+---------+-----------+-----------+-----------+---------------+------------+------------+------------+------------+
| C-185028|         S4| 01/07/1987|      00:00|    Temperature|Desconhecido|     1.11111|           C|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|    Temperature|Desconhecido|    0.555556|           C|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|    Temperature|Desconhecido|     2.22222|           C|   Schoharie|
| C-185028|         S4| 01/07/1987|      00:00|Scent Character|Desconhecido|          1V|Desconhecida|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|Scent Character|Desconhecido|          1V|Desconhecida|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|Scent Character|Desconhec

In [8]:
# Replace missing "Stream Group" values with the placeholder "Desconhecida".
# coalesce() picks the first non-null argument; the former `col == None` test
# was dropped because a comparison with NULL yields NULL in Spark SQL and
# therefore never matched.
replaced_projeto_water5 = replaced_projeto_water4.withColumn(
    "Stream Group",
    coalesce(col("Stream Group"), lit("Desconhecida")),
)
replaced_projeto_water5.show()

+---------+-----------+-----------+-----------+---------------+------------+------------+------------+------------+
|Sample Id|Sample Site|Sample Date|Sample Time|        Analyte|      Status|Final Result|       Units|Stream Group|
+---------+-----------+-----------+-----------+---------------+------------+------------+------------+------------+
| C-185028|         S4| 01/07/1987|      00:00|    Temperature|Desconhecido|     1.11111|           C|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|    Temperature|Desconhecido|    0.555556|           C|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|    Temperature|Desconhecido|     2.22222|           C|   Schoharie|
| C-185028|         S4| 01/07/1987|      00:00|Scent Character|Desconhecido|          1V|Desconhecida|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|Scent Character|Desconhecido|          1V|Desconhecida|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|Scent Character|Desconhec

In [9]:
# Replace missing "Analyte" values with the placeholder "Desconhecida".
# coalesce() picks the first non-null argument; the former `col == None` test
# was dropped because a comparison with NULL yields NULL in Spark SQL and
# therefore never matched.
# The result is bound to the same name (replaced_projeto_water5) because the
# next step reads from it; re-running this cell is harmless since the column
# no longer contains nulls after the first run.
replaced_projeto_water5 = replaced_projeto_water5.withColumn(
    "Analyte",
    coalesce(col("Analyte"), lit("Desconhecida")),
)
replaced_projeto_water5.show()

+---------+-----------+-----------+-----------+---------------+------------+------------+------------+------------+
|Sample Id|Sample Site|Sample Date|Sample Time|        Analyte|      Status|Final Result|       Units|Stream Group|
+---------+-----------+-----------+-----------+---------------+------------+------------+------------+------------+
| C-185028|         S4| 01/07/1987|      00:00|    Temperature|Desconhecido|     1.11111|           C|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|    Temperature|Desconhecido|    0.555556|           C|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|    Temperature|Desconhecido|     2.22222|           C|   Schoharie|
| C-185028|         S4| 01/07/1987|      00:00|Scent Character|Desconhecido|          1V|Desconhecida|   Schoharie|
| C-186342|        S6I| 01/07/1987|      00:00|Scent Character|Desconhecido|          1V|Desconhecida|   Schoharie|
| C-187744|         S8| 01/07/1987|      00:00|Scent Character|Desconhec

In [10]:
# Convert "Sample Date" from a month/day/year string into a DateType column
# and derive the integer partition column "Ano" (year) from it.
# - The pattern is spelled "MM/dd/yyyy"; the previous "MM/dd/yyy" was a typo
#   that only happened to parse the four-digit years present in the data.
# - year() reads the year straight from the date and returns an integer,
#   replacing the earlier round trip through an implicit date-to-string cast,
#   split('-') and cast(IntegerType()).
replaced_projeto_water6 = (
    replaced_projeto_water5
    .withColumn("Sample Date", to_date(col("Sample Date"), "MM/dd/yyyy"))
    .withColumn("Ano", year(col("Sample Date")))
)
replaced_projeto_water6.show()

+---------+-----------+-----------+-----------+---------------+------------+------------+------------+------------+----+
|Sample Id|Sample Site|Sample Date|Sample Time|        Analyte|      Status|Final Result|       Units|Stream Group| Ano|
+---------+-----------+-----------+-----------+---------------+------------+------------+------------+------------+----+
| C-185028|         S4| 1987-01-07|      00:00|    Temperature|Desconhecido|     1.11111|           C|   Schoharie|1987|
| C-186342|        S6I| 1987-01-07|      00:00|    Temperature|Desconhecido|    0.555556|           C|   Schoharie|1987|
| C-187744|         S8| 1987-01-07|      00:00|    Temperature|Desconhecido|     2.22222|           C|   Schoharie|1987|
| C-185028|         S4| 1987-01-07|      00:00|Scent Character|Desconhecido|          1V|Desconhecida|   Schoharie|1987|
| C-186342|        S6I| 1987-01-07|      00:00|Scent Character|Desconhecido|          1V|Desconhecida|   Schoharie|1987|
| C-187744|         S8| 1987-01-

In [11]:
# Print the column names and types to check the result of the previous steps
# ("Sample Date" should now be a date and "Ano" an integer).
replaced_projeto_water6.printSchema()

root
 |-- Sample Id: string (nullable = true)
 |-- Sample Site: string (nullable = true)
 |-- Sample Date: date (nullable = true)
 |-- Sample Time: string (nullable = true)
 |-- Analyte: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Final Result: string (nullable = true)
 |-- Units: string (nullable = true)
 |-- Stream Group: string (nullable = true)
 |-- Ano: integer (nullable = true)



In [12]:
# Swap the spaces in column names for underscores so the DataFrame columns
# match the names declared for Projeto.Tabela_Agua.
column_renames = {
    "Sample Id": "Sample_Id",
    "Sample Site": "Sample_Site",
    "Sample Date": "Sample_Date",
    "Sample Time": "Sample_Time",
    "Final Result": "Final_Result",
    "Stream Group": "Stream_Group",
}

replaced_projeto_water7 = replaced_projeto_water6
for old_name, new_name in column_renames.items():
    replaced_projeto_water7 = replaced_projeto_water7.withColumnRenamed(old_name, new_name)
replaced_projeto_water7.show()

+---------+-----------+-----------+-----------+---------------+------------+------------+------------+------------+----+
|Sample_Id|Sample_Site|Sample_Date|Sample_Time|        Analyte|      Status|Final_Result|       Units|Stream_Group| Ano|
+---------+-----------+-----------+-----------+---------------+------------+------------+------------+------------+----+
| C-185028|         S4| 1987-01-07|      00:00|    Temperature|Desconhecido|     1.11111|           C|   Schoharie|1987|
| C-186342|        S6I| 1987-01-07|      00:00|    Temperature|Desconhecido|    0.555556|           C|   Schoharie|1987|
| C-187744|         S8| 1987-01-07|      00:00|    Temperature|Desconhecido|     2.22222|           C|   Schoharie|1987|
| C-185028|         S4| 1987-01-07|      00:00|Scent Character|Desconhecido|          1V|Desconhecida|   Schoharie|1987|
| C-186342|        S6I| 1987-01-07|      00:00|Scent Character|Desconhecido|          1V|Desconhecida|   Schoharie|1987|
| C-187744|         S8| 1987-01-

In [13]:
# Remove any existing metastore definition of Projeto.Tabela_Agua so the
# CREATE statement below can be re-run.
# NOTE(review): the Projeto database is assumed to exist already; nothing in
# this notebook creates it.
spark.sql(
    """
    DROP TABLE IF EXISTS Projeto.Tabela_Agua
    """
)

# Register Projeto.Tabela_Agua as a Delta table partitioned by Ano, stored at
# an explicit HDFS location (the same path the write step saves to).
spark.sql(
    """
    CREATE EXTERNAL TABLE Projeto.Tabela_Agua (
        Sample_Id VARCHAR(50),
        Sample_Site VARCHAR(50), 
        Sample_Date date,
        Sample_Time VARCHAR(50),
        Analyte VARCHAR(500),
        Status VARCHAR(50),
        Final_Result VARCHAR(50),
        Units VARCHAR(50),
        Stream_Group VARCHAR(50)

    )
       USING DELTA
   
   PARTITIONED BY (
        Ano INT

    )
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/silver/Projeto.db/Tabela_Agua'
    """
)

DataFrame[]

In [14]:
# Persist the cleaned DataFrame as Delta files at the location backing
# Projeto.Tabela_Agua, replacing existing data and partitioning by Ano.
silver_columns = [
    "Sample_Id", "Sample_Site", "Sample_Date", "Sample_Time", "Analyte",
    "Status", "Final_Result", "Units", "Stream_Group", "Ano",
]

delta_writer = (
    replaced_projeto_water7
    .select(*silver_columns)
    .write
    .format("delta")
    .mode("overwrite")
    .partitionBy("Ano")
)
delta_writer.save("hdfs://hdfs-nn:9000/TrabalhoPL/silver/Projeto.db/Tabela_Agua")
from pyspark.sql.types import *

In [15]:
# Make Projeto the current database for this session, then list its tables.
spark.sql("USE Projeto")
tables_in_current_db = spark.sql("SHOW tables")
tables_in_current_db.show()

+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|  projeto|             agua|      false|
|  projeto|      tabela_agua|      false|
|  projeto|        tabela_ar|      false|
|  projeto|  tabela_petroleo|      false|
|  projeto|tabela_reciclagem|      false|
+---------+-----------------+-----------+



In [16]:
# Read the table back through the metastore to check that the written data
# is visible under its registered name.
tabela_agua = spark.table("Projeto.Tabela_Agua")
tabela_agua.show()

+---------+-----------+-----------+-----------+--------------------+------------+------------+------------+-------------+----+
|Sample_Id|Sample_Site|Sample_Date|Sample_Time|             Analyte|      Status|Final_Result|       Units| Stream_Group| Ano|
+---------+-----------+-----------+-----------+--------------------+------------+------------+------------+-------------+----+
| C-175259|       E16I| 1997-01-02|       8:32|Solids, Total Sus...|Desconhecido|           3|        mg/L|       Esopus|1997|
| C-175259|       E16I| 1997-01-02|       8:32|                  pH|Desconhecido|        6.73|Desconhecida|       Esopus|1997|
| C-175259|       E16I| 1997-01-02|       8:32|         Temperature|Desconhecido|           1|           C|       Esopus|1997|
| C-175259|       E16I| 1997-01-02|       8:32|      Apparent Color|Desconhecido|           6|          CU|       Esopus|1997|
| C-175259|       E16I| 1997-01-02|       8:32|     Scent Character|Desconhecido|           0|Desconhecida|    