In [0]:
%run "../utils/common_functions"

#### 1.Reading data from silver tables

In [0]:
# Occurrence records from the silver schema; this is the left-hand side of the joins further down.
occurrences_silver_df = spark.read.table(
    "anac_ocorrencias_dev.silver.processed_occurrences"
)

In [0]:
# Aircraft-type lookup from the silver schema; joined to occurrences on ICAO type or manufacturer code.
acft_type_silver_df = spark.read.table(
    "anac_ocorrencias_dev.silver.processed_aircraft_type"
)

In [0]:
# Special aircraft-type lookup from the silver schema; joined to occurrences on the ICAO type.
acft_special_type_silver_df = spark.read.table(
    "anac_ocorrencias_dev.silver.processed_aircraft_special_type"
)

#### 2.Joining all DataFrames to enrich data

In [0]:
# Enrich occurrences with both aircraft lookups. Both joins are left joins, so
# occurrences with no matching lookup row are kept (lookup columns become null).
# NOTE(review): the first join matches on ICAO type OR manufacturer code, so one
# occurrence may match several aircraft-type rows — confirm this fan-out is
# intended (duplicates are dropped in a later cell).
all_df_joined = (
    occurrences_silver_df
    .join(
        acft_type_silver_df,
        on=(
            (occurrences_silver_df["acft_icao_type"] == acft_type_silver_df["icao_aircraft_type"])
            | (occurrences_silver_df["acft_manufacturer"] == acft_type_silver_df["manufacturer_code"])
        ),
        how="left",
    )
    .join(
        acft_special_type_silver_df,
        on=occurrences_silver_df["acft_icao_type"] == acft_special_type_silver_df["acft_special_type"],
        how="left",
    )
)

#### 3.Dropping duplicate rows and columns

In [0]:
from pyspark.sql.functions import col, when;

In [0]:
# Fall back to the special-type model description whenever acft_description is null;
# rows that already have a description keep it unchanged.
description_merged_df = all_df_joined.withColumn(
    "acft_description",
    when(
        col("acft_description").isNull(),
        col("acft_special_model_description"),
    ).otherwise(col("acft_description")),
)

In [0]:
# Remove the lookup-side join keys, the special description (already folded into
# acft_description above) and acft_icao_model from the joined frame.
dedupe_column_df = description_merged_df.drop(
    "acft_icao_model",
    "icao_aircraft_type",
    "acft_special_type",
    "acft_special_model_description",
)

In [0]:
# Keep a single row per (occurrence_number, occurrence_description, acft_model) combination.
# NOTE(review): which of the duplicate rows survives is not controlled here — confirm
# the remaining columns are equivalent across duplicates.
all_df_joined_dropped_dedupe = dedupe_column_df.dropDuplicates(["occurrence_number", "occurrence_description", "acft_model"])

#### 4.Concatenating occurrence descriptions so each occurrence ends up in only one row

In [0]:
from pyspark.sql import functions as F

# Collapse rows that differ only in occurrence_description into a single row:
# group by every other column and join the descriptions with " || ".
#
# collect_list gives no ordering guarantee after a shuffle, so the collected
# descriptions are sorted before concatenation. This keeps the resulting string
# stable across runs, so the downstream merge does not see spurious changes.
ocurrences_unified_desc_df = (
    all_df_joined_dropped_dedupe
    .groupBy(
        # `column_name` (not `col`) so the imported pyspark `col` function is not shadowed for readers.
        *[
            column_name
            for column_name in all_df_joined_dropped_dedupe.columns
            if column_name != "occurrence_description"
        ]
    )
    .agg(
        F.concat_ws(
            " || ",
            F.sort_array(F.collect_list("occurrence_description")),
        ).alias("occurrence_description")
    )
)

In [0]:
# ocurrences_unified_desc_df.createOrReplaceTempView("processed_occurrences_joined")

In [0]:
# %sql
# SELECT COUNT(*), occurrence_number
# FROM processed_occurrences_joined
# GROUP BY occurrence_number
# HAVING COUNT(*) > 1;

#### 5.Enrich joined DF

In [0]:
# enrich_date_columns is not defined in this notebook — presumably it comes from the
# ../utils/common_functions notebook loaded by %run at the top; verify there what
# columns it adds. Input is the one-row-per-group frame built in the previous section.
date_enriched_df = enrich_date_columns(ocurrences_unified_desc_df)

In [0]:
# enrich_occurrence_columns is not defined in this notebook — presumably it comes from the
# ../utils/common_functions notebook loaded by %run at the top; verify its behavior there.
# The result is the frame handed to the merge step below.
final_df = enrich_occurrence_columns(date_enriched_df)

#### 6.Performing merge

In [0]:
# Condition relating target (tgt) and source (src) rows: same occurrence number and same operator.
# NOTE(review): a plain equality never matches when operator is null on either side —
# confirm operator is always populated, otherwise such rows will not match existing ones.
merge_condition = (
    "tgt.occurrence_number == src.occurrence_number"
    " AND tgt.operator == src.operator"
)

# merge_delta_data is not defined in this notebook (presumably from ../utils/common_functions).
# The positional arguments look like catalog, schema, table, source frame, merge condition
# and a column list (possibly partition columns) — TODO confirm against the helper's signature.
merge_delta_data(
    "anac_ocorrencias_dev",
    "gold",
    "transformed_occurrences",
    final_df,
    merge_condition,
    ["occurrence_year", "state", "occurrence_classification"],
)