In [5]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType,FloatType

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9870/TrabalhoPL'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [6]:
hdfs_path = "hdfs://hdfs-nn:9000/TrabalhoPL/bronze/Recycling_Diversion_and_Capture_Rates.csv"

customSchema = StructType([
    StructField("Zone", StringType(), True),        
    StructField("District", StringType(), True),
    StructField("Fiscal Month Number", IntegerType(), True),
    StructField("Fiscal Year", IntegerType(), True),
    StructField("Month Name", StringType(), True),
    StructField("Diversion Rate-Total (Total Recycling / Total Waste)", FloatType(), True),
    StructField("Capture Rate-Paper (Total Paper / Max Paper)", FloatType(), True),
    StructField("Capture Rate-MGP (Total MGP / Max MGP)", FloatType(), True), 
    StructField("Capture Rate-Total ((Total Recycling - Leaves (Recycling)) / (Max Paper + Max MGP))x100", FloatType(), True),  
])

projeto_reci = spark \
            .read\
            .option("delimiter",",")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path)
projeto_reci.toPandas()

Unnamed: 0,Zone,District,Fiscal Month Number,Fiscal Year,Month Name,Diversion Rate-Total (Total Recycling / Total Waste),Capture Rate-Paper (Total Paper / Max Paper),Capture Rate-MGP (Total MGP / Max MGP),Capture Rate-Total ((Total Recycling - Leaves (Recycling)) / (Max Paper + Max MGP))x100
0,Brooklyn North,BKN01,10,2019,April,14.700000,44.900002,43.000000,44.099998
1,Brooklyn North,BKN02,10,2019,April,20.000000,34.200001,57.900002,41.200001
2,Brooklyn North,BKN03,10,2019,April,12.200000,33.500000,44.900002,38.200001
3,Brooklyn North,BKN04,10,2019,April,15.500000,35.200001,68.500000,48.799999
4,Brooklyn North,BKN05,10,2019,April,10.100000,22.299999,45.099998,31.500000
...,...,...,...,...,...,...,...,...,...
2827,Queens West,QW06,3,2016,September,20.100000,30.400000,68.000000,39.000000
2828,Queens West,QW09,3,2016,September,17.400000,41.099998,79.699997,54.299999
2829,Staten Island,SI01,3,2016,September,18.700001,39.500000,71.699997,49.700001
2830,Staten Island,SI02,3,2016,September,19.000000,44.500000,75.000000,54.099998


In [7]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

projeto_reci4 = projeto_reci3.withColumn('Zone', 
            when(projeto_reci3.Zone.endswith('North'),regexp_replace(projeto_reci3.Zone,'North',' ')) \
            .when(projeto_reci3.Zone.endswith('South'),regexp_replace(projeto_reci3.Zone,'South',' ')) \
            .when(projeto_reci3.Zone.endswith('East'),regexp_replace(projeto_reci3.Zone,'East',' ')) \
            .when(projeto_reci3.Zone.endswith('West'),regexp_replace(projeto_reci3.Zone,'West',' ')) \
            .when (projeto_reci3.Zone.endswith('Manhattan'),regexp_replace(projeto_reci3.Zone,'Manhattan','Manhattan')) \
            .when (projeto_reci3.Zone.endswith('Staten Island'),regexp_replace(projeto_reci3.Zone,'Staten Island','Staten Island')) \
            .when (projeto_reci3.Zone.endswith('Bronx'),regexp_replace(projeto_reci3.Zone,'Bronx','Bronx')) \
                                                     
)

projeto_reci4.toPandas()

Unnamed: 0,Zone,District,Fiscal_Month_Number,Fiscal_Year,Month_Name,Diversion_Rate_Total,Capture_Rate_Paper,Capture_Rate_MGP,Capture_Rate_Total
0,Brooklyn North,BKN01,10,2019,April,14.700000,44.900002,43.000000,44.099998
1,Brooklyn North,BKN02,10,2019,April,20.000000,34.200001,57.900002,41.200001
2,Brooklyn North,BKN03,10,2019,April,12.200000,33.500000,44.900002,38.200001
3,Brooklyn North,BKN04,10,2019,April,15.500000,35.200001,68.500000,48.799999
4,Brooklyn North,BKN05,10,2019,April,10.100000,22.299999,45.099998,31.500000
...,...,...,...,...,...,...,...,...,...
2827,Queens West,QW06,3,2016,September,20.100000,30.400000,68.000000,39.000000
2828,Queens West,QW09,3,2016,September,17.400000,41.099998,79.699997,54.299999
2829,Staten Island,SI01,3,2016,September,18.700001,39.500000,71.699997,49.700001
2830,Staten Island,SI02,3,2016,September,19.000000,44.500000,75.000000,54.099998


In [11]:
projeto_reci3 = projeto_reci \
    .withColumnRenamed("Fiscal Month Number","Fiscal_Month_Number") \
    .withColumnRenamed("Fiscal Year","Fiscal_Year") \
    .withColumnRenamed("Month Name","Month_Name") \
    .withColumnRenamed("Diversion Rate-Total (Total Recycling / Total Waste)","Diversion_Rate_Total") \
    .withColumnRenamed("Capture Rate-Paper (Total Paper / Max Paper)","Capture_Rate_Paper")\
    .withColumnRenamed("Capture Rate-MGP (Total MGP / Max MGP)","Capture_Rate_MGP")\
    .withColumnRenamed("Capture Rate-Total ((Total Recycling - Leaves (Recycling)) / (Max Paper + Max MGP))x100","Capture_Rate_Total")
projeto_reci3.toPandas()

Unnamed: 0,Zone,District,Fiscal_Month_Number,Fiscal_Year,Month_Name,Diversion_Rate_Total,Capture_Rate_Paper,Capture_Rate_MGP,Capture_Rate_Total
0,Brooklyn,BKN01,10,2019,April,14.700000,44.900002,43.000000,44.099998
1,Brooklyn,BKN02,10,2019,April,20.000000,34.200001,57.900002,41.200001
2,Brooklyn,BKN03,10,2019,April,12.200000,33.500000,44.900002,38.200001
3,Brooklyn,BKN04,10,2019,April,15.500000,35.200001,68.500000,48.799999
4,Brooklyn,BKN05,10,2019,April,10.100000,22.299999,45.099998,31.500000
...,...,...,...,...,...,...,...,...,...
2827,Queens,QW06,3,2016,September,20.100000,30.400000,68.000000,39.000000
2828,Queens,QW09,3,2016,September,17.400000,41.099998,79.699997,54.299999
2829,Staten Island,SI01,3,2016,September,18.700001,39.500000,71.699997,49.700001
2830,Staten Island,SI02,3,2016,September,19.000000,44.500000,75.000000,54.099998


In [26]:
spark.sql(
    """
    DROP TABLE IF EXISTS Projeto.Tabela_Reciclagem
    """
)

DataFrame[]

In [27]:
spark.sql(
    """
    CREATE EXTERNAL TABLE Projeto.Tabela_Reciclagem (
        Zone VARCHAR(50),
        District VARCHAR(50),
        Fiscal_Month_Number INT,
        Month_Name VARCHAR(50),
        Diversion_Rate_Total FLOAT,
        Capture_Rate_Paper FLOAT,
        Capture_Rate_MGP FLOAT,
        Capture_Rate_Total FLOAT
       

    )
       USING DELTA
   
   PARTITIONED BY (
        Fiscal_Year INT

    )
    LOCATION 'hdfs://hdfs-nn:9000/TrabalhoPL/silver/Projeto.db/Tabela_Reciclagem'
    """
)

DataFrame[]

In [28]:
#write df to hive deltalake_table
projeto_reci4 \
    .select("Zone","District","Fiscal_Month_Number","Fiscal_Year","Month_Name","Diversion_Rate_Total",
            "Capture_Rate_Paper","Capture_Rate_MGP","Capture_Rate_Total") \
    .write \
    .mode("overwrite") \
    .partitionBy("Fiscal_Year") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/TrabalhoPL/silver/Projeto.db/Tabela_Reciclagem")
from pyspark.sql.types import *

In [29]:
spark.sql("USE Projeto")
spark.sql("SHOW tables").show()

+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|  projeto|             agua|      false|
|  projeto|          geral_1|      false|
|  projeto|      tabela_agua|      false|
|  projeto|        tabela_ar|      false|
|  projeto|  tabela_petroleo|      false|
|  projeto|tabela_reciclagem|      false|
+---------+-----------------+-----------+



In [30]:
spark.table("Projeto.Tabela_Reciclagem").toPandas()

Unnamed: 0,Zone,District,Fiscal_Month_Number,Month_Name,Diversion_Rate_Total,Capture_Rate_Paper,Capture_Rate_MGP,Capture_Rate_Total,Fiscal_Year
0,Brooklyn,BKN01,10,April,13.700000,41.000000,44.299999,42.299999,2016
1,Brooklyn,BKN02,10,April,17.600000,33.200001,55.400002,39.799999,2016
2,Brooklyn,BKN03,10,April,10.900000,30.500000,39.299999,34.099998,2016
3,Brooklyn,BKN04,10,April,13.400000,30.299999,59.299999,42.099998,2016
4,Brooklyn,BKN05,10,April,9.600000,22.400000,41.599998,30.200001,2016
...,...,...,...,...,...,...,...,...,...
2827,Queens,QW06,3,September,20.900000,29.500000,77.699997,40.500000,2019
2828,Queens,QW09,3,September,22.400000,40.099998,88.500000,56.599998,2019
2829,Staten Island,SI01,3,September,20.400000,38.900002,80.500000,52.000000,2019
2830,Staten Island,SI02,3,September,19.500000,42.299999,79.800003,54.099998,2019
