In [0]:
%python
%run "../utils/common_functions"

#### 1. Reading data from the silver tables

In [0]:
# Load the processed occurrences table from the silver schema as a DataFrame.
ocurrences_silver_df = spark.table(
    "anac_ocorrencias_dev.silver.processed_occurrences"
)

In [0]:
# Load the processed aircraft-type table from the silver schema as a DataFrame.
acft_type_silver_df = spark.table(
    "anac_ocorrencias_dev.silver.processed_aircraft_type"
)

In [0]:
# Load the processed aircraft special-type table from the silver schema as a DataFrame.
acft_special_type_silver_df = spark.table(
    "anac_ocorrencias_dev.silver.processed_aircraft_special_type"
)

#### 2. Joining all DataFrames to enrich the data

In [0]:
# Enrich the occurrences with the aircraft-type and aircraft special-type tables.
# Both joins are left joins, so occurrences without a match are kept.
# NOTE(review): the first join matches on ICAO type OR manufacturer code, so a
# single occurrence may pair with several aircraft-type rows; confirm that this
# fan-out is intended.
all_df_joined = (
    ocurrences_silver_df
    .join(
        acft_type_silver_df,
        on=(
            (ocurrences_silver_df.acft_icao_type == acft_type_silver_df.icao_aircraft_type)
            | (ocurrences_silver_df.acft_manufacturer == acft_type_silver_df.manufacturer_code)
        ),
        how="left",
    )
    .join(
        acft_special_type_silver_df,
        on=ocurrences_silver_df.acft_icao_type == acft_special_type_silver_df.acft_special_type,
        how="left",
    )
)

#### 3. Dropping duplicate rows

In [0]:
# De-duplicate the joined rows on occurrence number, description and aircraft model.
# NOTE(review): drop_duplicates is not defined in this notebook; presumably it is
# provided by ../utils/common_functions via %run. Confirm its exact semantics there.
all_df_joined_dropped_dedupe = drop_duplicates(
    all_df_joined,
    ["ocurrence_number", "occurrence_description", "acft_model"],
)

In [0]:
# Display the number of rows left after the joins and the de-duplication step.
all_df_joined_dropped_dedupe.count()

In [0]:
# Display the number of rows in the source occurrences table
# (presumably to compare against the joined/de-duplicated count).
ocurrences_silver_df.count()

#### 4. Concatenating occurrence descriptions so that rows differing only in description collapse into one row

In [0]:
from pyspark.sql import functions as F

# Group on every column except the description, so rows that differ only in
# "occurrence_description" collapse into a single row.
grouping_columns = [
    column_name
    for column_name in all_df_joined_dropped_dedupe.columns
    if column_name != "occurrence_description"
]

# collect_list gives no ordering guarantee after a shuffle, so the descriptions
# are sorted before being joined with " || ". This keeps the concatenated text
# identical across runs, so re-running the notebook does not produce spurious
# changes in the value that is merged downstream.
ocurrences_unified_desc_df = (
    all_df_joined_dropped_dedupe
    .groupBy(*grouping_columns)
    .agg(
        F.concat_ws(
            " || ",
            F.sort_array(F.collect_list("occurrence_description")),
        ).alias("occurrence_description")
    )
)

In [0]:
# Register the aggregated DataFrame as a temporary view so that the %sql cells
# can query it under the name "processed_occurrences_joined".
ocurrences_unified_desc_df.createOrReplaceTempView("processed_occurrences_joined")

In [0]:
%sql
-- List every occurrence number that still appears on more than one row of the view.
SELECT
  COUNT(*),
  ocurrence_number
FROM
  processed_occurrences_joined
GROUP BY
  ocurrence_number
HAVING
  COUNT(*) > 1;

In [0]:
%sql
-- Preview five rows of the joined and aggregated view.
SELECT
  *
FROM
  processed_occurrences_joined
LIMIT
  5;

#### 5. Performing the merge into the gold table

In [0]:
# Merge the unified occurrences into gold.transformed_occurrences, matching
# target (tgt) and source (src) rows on occurrence number and operator.
# NOTE(review): merge_delta_data is not defined in this notebook; the meaning of
# its last argument (looks like partition columns) should be confirmed in the helper.
# NOTE(review): if `operator` can be NULL, `tgt.operator == src.operator` will not
# match for those rows; confirm whether a null-safe comparison is needed.
merge_condition = "tgt.ocurrence_number == src.ocurrence_number AND tgt.operator == src.operator"
merge_delta_data(
    "anac_ocorrencias_dev",
    "gold",
    "transformed_occurrences",
    ocurrences_unified_desc_df,
    merge_condition,
    ["occurrence_year", "state", "occurrence_classification"],
)