In [0]:
%run "../utils/common_functions"

#### 1. Reading data from silver tables

In [0]:
# Silver-layer occurrences: the left-hand side of every join in this notebook.
occurrences_silver_df = spark.read.table("anac_ocorrencias_dev.silver.processed_occurrences")

In [0]:
# Silver-layer aircraft-type lookup, joined to the occurrences further down.
acft_type_silver_df = spark.read.table("anac_ocorrencias_dev.silver.processed_aircraft_type")

In [0]:
# Silver-layer special aircraft-type lookup, joined to the occurrences further down.
acft_special_type_silver_df = spark.read.table(
    "anac_ocorrencias_dev.silver.processed_aircraft_special_type"
)

#### 2. Joining all DataFrames to enrich the data

In [0]:
# Join predicates, named so the two left joins below read at a glance.
icao_type_matches = (
    occurrences_silver_df.acft_icao_type == acft_type_silver_df.icao_aircraft_type
)
manufacturer_matches = (
    occurrences_silver_df.acft_manufacturer == acft_type_silver_df.manufacturer_code
)
special_type_matches = (
    occurrences_silver_df.acft_icao_type == acft_special_type_silver_df.acft_special_type
)

# Both joins are left joins, so occurrences without a match are kept.
# NOTE(review): the first join matches on ICAO type OR manufacturer code, so a single
# occurrence can pair with every aircraft-type row that shares its manufacturer.
# Confirm this fan-out is intended; the later dropDuplicates step then has to pick
# one of those rows.
all_df_joined = (
    occurrences_silver_df
    .join(acft_type_silver_df, on=icao_type_matches | manufacturer_matches, how="left")
    .join(acft_special_type_silver_df, on=special_type_matches, how="left")
)

#### 3. Dropping duplicate rows and redundant columns

In [0]:
from pyspark.sql.functions import coalesce, col, when

In [0]:
# Keep acft_description where it is present; otherwise fall back to
# acft_special_model_description. coalesce returns its first non-null argument,
# which is exactly the when(isNull)/otherwise fallback it replaces.
description_merged_df = all_df_joined.withColumn(
    "acft_description",
    coalesce(col("acft_description"), col("acft_special_model_description")),
)

In [0]:
# Drop the right-hand join keys, the fallback description column (already merged
# into acft_description) and acft_icao_model from the enriched frame.
redundant_columns = (
    "acft_icao_model",
    "icao_aircraft_type",
    "acft_special_type",
    "acft_special_model_description",
    "manufacturer_code",
)
dedupe_column_df = description_merged_df.drop(*redundant_columns)

In [0]:
# Keep a single row per (occurrence_number, occurrence_description, acft_model).
# NOTE(review): when rows sharing this key differ in other columns, nothing here
# controls which one survives; confirm that is acceptable for the gold table.
dedupe_key = ["occurrence_number", "occurrence_description", "acft_model"]
all_df_joined_dropped_dedupe = dedupe_column_df.dropDuplicates(dedupe_key)

#### 4. Concatenating occurrence descriptions and operators into a single row

In [0]:
from pyspark.sql import functions as F

# Group on every column except the two that are being concatenated, so rows that
# differ only in occurrence_description / operator collapse into one.
# The loop variable is deliberately not called `col`, to avoid shadowing the
# pyspark `col` function imported earlier in the notebook.
group_columns = [
    name
    for name in all_df_joined_dropped_dedupe.columns
    if name not in ("occurrence_description", "operator")
]

# collect_list / collect_set return their elements in an order that depends on how
# rows arrive after the shuffle, so the arrays are sorted to make the concatenated
# strings reproducible from one run to the next.
grouped_df = (
    all_df_joined_dropped_dedupe
    .groupBy(*group_columns)
    .agg(
        F.concat_ws(
            " || ", F.sort_array(F.collect_list("occurrence_description"))
        ).alias("occurrence_description"),
        F.sort_array(F.collect_set("operator")).alias("operator_set"),
    )
)

# concat_ws over a one-element array yields that element with no separator, so a
# separate branch for the single-operator case is not needed.
final_df = grouped_df.withColumn(
    "operator", F.concat_ws(" || ", "operator_set")
).drop("operator_set")


#### 5. Parsing latitude and longitude to double

In [0]:
# change_data_in_column is called with "," and "."; presumably it swaps the decimal
# comma for a dot so the cast to double can parse the value (TODO confirm in the helper).
lat_formatted_df = change_data_in_column(final_df, "latitude", ",", ".").withColumn(
    "latitude", col("latitude").cast("double")
)


In [0]:
# Same treatment as latitude: helper call with "," and "." (presumably a decimal-comma
# replacement; TODO confirm), then a cast of the column to double.
long_formatted_df = change_data_in_column(
    lat_formatted_df, "longitude", ",", "."
).withColumn("longitude", col("longitude").cast("double"))

In [0]:
# Columns whose nulls are replaced with 0.
# NOTE(review): a null latitude/longitude becomes 0.0, which is a real coordinate;
# confirm downstream consumers treat (0, 0) as "unknown location".
zero_filled_columns = [
    "latitude",
    "longitude",
    "number_of_engines",
    "acft_seats",
    "acft_max_gross_weight",
]

# Columns whose nulls are replaced with the "Não informado" placeholder.
placeholder_defaults = dict.fromkeys(
    [
        "acft_classification",
        "engine_type",
        "wake_turbulence_group",
        "wake_turbulence_category",
        "acft_description",
    ],
    "Não informado",
)

nulls_handled_df = (
    long_formatted_df
    .na.fill(0, subset=zero_filled_columns)
    .na.fill(placeholder_defaults)
)

#### 6. Enriching the joined DataFrame

In [0]:
# enrich_date_columns is not defined in this notebook; presumably it comes from the
# ../utils/common_functions notebook loaded with %run. TODO confirm which columns it adds.
date_enriched_df = enrich_date_columns(nulls_handled_df)

In [0]:
# enrich_occurrence_columns is not defined in this notebook either (presumably also
# from ../utils/common_functions; TODO confirm).
# NOTE: this rebinds final_df, which the aggregation cell in section 4 already
# assigned; re-running the latitude cell after this one would start from the
# enriched frame instead of the aggregated one.
final_df = enrich_occurrence_columns(date_enriched_df)

#### 7. Writing to the gold table (overwrite; merge currently disabled)

In [0]:
# Replace the contents of the gold Delta table with this run's result.
(
    final_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("anac_ocorrencias_dev.gold.transformed_occurrences")
)

In [0]:
# Disabled alternative to the overwrite in the previous cell: a merge into the same
# gold table keyed on occurrence_number and operator (merge_delta_data is not defined
# in this notebook; presumably it comes from ../utils/common_functions).
# merge_condition = "tgt.occurrence_number == src.occurrence_number AND tgt.operator == src.operator"
# merge_delta_data("anac_ocorrencias_dev","gold","transformed_occurrences",final_df,merge_condition,['occurrence_year','state','occurrence_classification'])