In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, lit, sum, avg, count
from pyspark.sql.types import DoubleType, IntegerType, StringType

In [0]:
%pip uninstall -y databricks_helpers exercise_ev_databricks_unit_tests
%pip install git+https://github.com/data-derp/databricks_helpers#egg=databricks_helpers git+https://github.com/data-derp/exercise_ev_databricks_unit_tests#egg=exercise_ev_databricks_unit_tests

In [0]:
# Reuse the session that is already active, or build one if none exists;
# the app name is what identifies this gold-layer notebook to Spark.
spark = (
    SparkSession.builder
    .appName("hiv-project-gold")
    .getOrCreate()
)


In [0]:
from databricks_helpers.databricks_helpers import DataDerpDatabricksHelpers
# Exercise identifier handed to the helper below.
# NOTE(review): presumably used to scope exercise-specific paths — confirm in databricks_helpers.
exercise_name = "hiv-project"
# `dbutils` is not defined in any cell visible here; presumably it is injected
# by the Databricks runtime. `helpers` is not referenced by the later cells
# shown in this notebook.
helpers = DataDerpDatabricksHelpers(dbutils, exercise_name)



In [0]:
from utils import write_to_uc_table


In [0]:
def read_from_unity_catalog(table_name: str) -> DataFrame:
    """Load a table through the notebook-level Spark session.

    Args:
        table_name: Table identifier passed unchanged to the session's
            reader. Callers in this notebook use the three-part
            ``catalog.schema.table`` form.

    Returns:
        The DataFrame the reader produces for ``table_name``.
    """
    table_reader = spark.read
    return table_reader.table(table_name)

In [0]:
# Load the two silver-layer inputs that every gold aggregation below is built from.
df_hiv_silver = read_from_unity_catalog("catalog_de.silver.hiv_data_clean")
df_poverty_silver = read_from_unity_catalog("catalog_de.silver.poverty_measure")

In [0]:
# Sanity check: print the distinct years present in the HIV silver table, ascending.
(
    df_hiv_silver
    .select("year")
    .distinct()
    .orderBy("year")
    .show()
)


In [0]:
# Inspect the poverty table's column names before referencing them in the aggregations below.
print(df_poverty_silver.columns)



In [0]:
# Sanity check: print the distinct years present in the poverty silver table,
# ascending, for comparison with the HIV years before the two are joined on `year`.
(
    df_poverty_silver
    .select("year")
    .distinct()
    .orderBy("year")
    .show()
)


In [0]:
# Collapse the HIV silver rows to one row per (year, borough, gender, age, race).
# Count columns are summed; rate and percentage columns use a plain
# (unweighted) mean over the rows that fall into each group.
df_hiv_agg = (
    df_hiv_silver
    .groupBy("year", "borough", "gender", "age", "race")
    .agg(
        # HIV diagnoses
        F.sum("hiv_diagnoses").alias("hiv_diagnoses_total"),
        F.avg("hiv_diagnosis_rate").alias("hiv_diagnosis_rate_avg"),
        # AIDS diagnoses
        F.sum("aids_diagnoses").alias("aids_diagnoses_total"),
        F.avg("aids_diagnosis_rate").alias("aids_diagnosis_rate_avg"),
        # Care outcomes
        F.avg("perc_linked_to_care_within_3_months").alias("perc_linked_to_care_avg"),
        F.avg("perc_viral_suppression").alias("perc_viral_suppression_avg"),
        # Mortality
        F.sum("deaths").alias("deaths_total"),
        F.avg("death_rate").alias("death_rate_avg"),
        # Prevalence
        F.avg("plwdhi_prevalence").alias("plwdhi_prevalence_avg"),
    )
)


In [0]:
def _weighted_pct(condition):
    """Build the `pwgtp`-weighted percentage of a group's rows matching `condition`.

    Rows that do not match (or whose condition evaluates to NULL) contribute a
    weight of 0 to the numerator. Without that explicit zero, a group with no
    matching rows would sum only NULLs and yield NULL instead of 0%, which the
    later bucketing cells would label "Unknown".

    Args:
        condition: Boolean Column evaluated per input row.

    Returns:
        A Column expression: sum(pwgtp where condition) / sum(pwgtp) * 100.
    """
    matching_weight = F.sum(F.when(condition, F.col("pwgtp")).otherwise(F.lit(0)))
    return matching_weight / F.sum("pwgtp") * 100


# Collapse the poverty silver rows to one row per (year, borough, gender, age,
# race). `pwgtp` is used as the per-row weight: the group's total weight is its
# population, and income/shares are weighted by it.
df_poverty_agg = df_poverty_silver.groupBy("year", "borough", "gender", "age", "race").agg(
    F.sum("pwgtp").alias("weighted_population"),

    # Weighted mean income: sum(income * weight) / sum(weight).
    (F.sum(F.col("nycgov_income") * F.col("pwgtp")) / F.sum("pwgtp")).alias("avg_income"),

    _weighted_pct(F.col("nycgov_pov_stat") == "In poverty").alias("poverty_rate"),

    # Educational attainment shares.
    _weighted_pct(F.col("educattain") == "Less than high school").alias("pct_less_than_high_school"),
    _weighted_pct(F.col("educattain") == "High school diploma").alias("pct_high_school"),
    _weighted_pct(F.col("educattain") == "Some college").alias("pct_some_college"),
    _weighted_pct(
        F.col("educattain").isin("Bachelor's degree", "Graduate degree")
    ).alias("pct_bachelors_or_higher"),

    # Work experience shares.
    _weighted_pct(F.col("work_experience") == "Full-time, year-round").alias("pct_full_time"),
    _weighted_pct(
        F.col("work_experience").isin(
            "Part-time, year-round", "Full-time, part-year", "Part-time, part-year"
        )
    ).alias("pct_part_time_or_part_year"),

    _weighted_pct(F.col("disability_status") == "With disability").alias("pct_with_disability"),
)



In [0]:
# Combine HIV and poverty aggregates on their shared demographic keys.
# The inner join keeps only the key combinations present on both sides.
df_hiv_poverty = (
    df_hiv_agg
    .join(df_poverty_agg, on=["year", "borough", "gender", "age", "race"], how="inner")
)


In [0]:
# Report how many demographic cells survived the inner join, then preview a few rows.
# `display` is not defined in the visible cells; presumably the Databricks/IPython built-in.
print(f"Number of rows after joining: {df_hiv_poverty.count()}")
display(df_hiv_poverty.limit(10))

In [0]:
# Borough-level view per year. Totals are summed across the joined demographic
# rows; the HIV rate is recomputed from those totals per 100,000 people. Every
# `avg` is an unweighted mean over the rows in the group, not population-weighted.
df_borough_analysis = (
    df_hiv_poverty
    .groupBy("year", "borough")
    .agg(
        F.sum("weighted_population").alias("total_population"),
        # HIV indicators
        F.sum("hiv_diagnoses_total").alias("hiv_diagnoses"),
        (
            F.sum("hiv_diagnoses_total") / F.sum("weighted_population") * 100000
        ).alias("hiv_rate_per_100k"),
        F.avg("plwdhi_prevalence_avg").alias("hiv_prevalence"),
        F.avg("perc_viral_suppression_avg").alias("viral_suppression_rate"),
        # Socio-economic indicators
        F.avg("poverty_rate").alias("poverty_rate"),
        F.avg("avg_income").alias("avg_income"),
        F.avg("pct_bachelors_or_higher").alias("pct_bachelors_or_higher"),
    )
)


In [0]:
# Race-level view per year. Totals are summed across the joined demographic
# rows; the HIV rate is recomputed from those totals per 100,000 people. Every
# `avg` is an unweighted mean over the rows in the group, not population-weighted.
df_race_analysis = (
    df_hiv_poverty
    .groupBy("year", "race")
    .agg(
        F.sum("weighted_population").alias("total_population"),
        # HIV indicators
        F.sum("hiv_diagnoses_total").alias("hiv_diagnoses"),
        (
            F.sum("hiv_diagnoses_total") / F.sum("weighted_population") * 100000
        ).alias("hiv_rate_per_100k"),
        F.avg("plwdhi_prevalence_avg").alias("hiv_prevalence"),
        F.avg("perc_viral_suppression_avg").alias("viral_suppression_rate"),
        # Socio-economic indicators
        F.avg("poverty_rate").alias("poverty_rate"),
        F.avg("avg_income").alias("avg_income"),
        F.avg("pct_bachelors_or_higher").alias("pct_bachelors_or_higher"),
    )
)


In [0]:
# Year-level trend table across all joined demographic rows. Counts are
# summed; the HIV rate is recomputed from the summed totals per 100,000 people.
# Every `avg` is an unweighted mean over the rows of that year.
df_yearly_trends = (
    df_hiv_poverty
    .groupBy("year")
    .agg(
        F.sum("weighted_population").alias("total_population"),
        # HIV indicators
        F.sum("hiv_diagnoses_total").alias("hiv_diagnoses"),
        (
            F.sum("hiv_diagnoses_total") / F.sum("weighted_population") * 100000
        ).alias("hiv_rate_per_100k"),
        F.avg("plwdhi_prevalence_avg").alias("hiv_prevalence"),
        F.sum("aids_diagnoses_total").alias("aids_diagnoses"),
        F.sum("deaths_total").alias("hiv_deaths"),
        F.avg("perc_viral_suppression_avg").alias("viral_suppression_rate"),
        # Socio-economic indicators
        F.avg("poverty_rate").alias("poverty_rate"),
        F.avg("avg_income").alias("avg_income"),
        F.avg("pct_bachelors_or_higher").alias("pct_bachelors_or_higher"),
        F.avg("pct_full_time").alias("pct_full_time_employment"),
    )
)


In [0]:
# Borough x race view per year. Totals are summed across the joined rows; the
# HIV rate is recomputed from those totals per 100,000 people. Every `avg` is
# an unweighted mean over the rows in the group.
df_borough_race = (
    df_hiv_poverty
    .groupBy("year", "borough", "race")
    .agg(
        F.sum("weighted_population").alias("total_population"),
        # HIV indicators
        F.sum("hiv_diagnoses_total").alias("hiv_diagnoses"),
        (
            F.sum("hiv_diagnoses_total") / F.sum("weighted_population") * 100000
        ).alias("hiv_rate_per_100k"),
        F.avg("plwdhi_prevalence_avg").alias("hiv_prevalence"),
        # Socio-economic indicators
        F.avg("poverty_rate").alias("poverty_rate"),
        F.avg("avg_income").alias("avg_income"),
    )
)


In [0]:
# Bucket each row by its poverty rate. Branches are tested in order and
# `between` is inclusive at both ends, so exactly 20 lands in "Medium" and
# exactly 30 in "High". A NULL rate matches no branch and becomes "Unknown".
poverty_rate_col = F.col("poverty_rate")
df_hiv_poverty = df_hiv_poverty.withColumn(
    "poverty_level",
    F.when(poverty_rate_col < 10, "Low (<10%)")
    .when(poverty_rate_col.between(10, 20), "Medium (10-20%)")
    .when(poverty_rate_col.between(20, 30), "High (20-30%)")
    .when(poverty_rate_col > 30, "Very High (>30%)")
    .otherwise("Unknown"),
)

In [0]:
# HIV indicators per year and poverty bucket. Totals are summed; the HIV rate
# is recomputed from those totals per 100,000 people. Every `avg` is an
# unweighted mean over the rows in the bucket.
df_poverty_correlation = (
    df_hiv_poverty
    .groupBy("year", "poverty_level")
    .agg(
        F.sum("weighted_population").alias("total_population"),
        F.sum("hiv_diagnoses_total").alias("hiv_diagnoses"),
        (
            F.sum("hiv_diagnoses_total") / F.sum("weighted_population") * 100000
        ).alias("hiv_rate_per_100k"),
        F.avg("plwdhi_prevalence_avg").alias("hiv_prevalence"),
        F.avg("perc_viral_suppression_avg").alias("viral_suppression_rate"),
    )
)


In [0]:
# Bucket each row by its share of bachelor's-or-higher attainment. `between`
# is inclusive, so exactly 20 and exactly 40 both land in "Medium".
# A NULL share matches no branch and becomes "Unknown".
bachelors_share_col = F.col("pct_bachelors_or_higher")
df_hiv_poverty = df_hiv_poverty.withColumn(
    "education_level",
    F.when(bachelors_share_col > 40, "High Education (>40% college degree)")
    .when(bachelors_share_col.between(20, 40), "Medium Education (20-40%)")
    .when(bachelors_share_col < 20, "Low Education (<20%)")
    .otherwise("Unknown"),
)


In [0]:
# HIV indicators per year and education bucket. Totals are summed; the HIV
# rate is recomputed from those totals per 100,000 people. Every `avg` is an
# unweighted mean over the rows in the bucket.
df_education_correlation = (
    df_hiv_poverty
    .groupBy("year", "education_level")
    .agg(
        F.sum("weighted_population").alias("total_population"),
        F.sum("hiv_diagnoses_total").alias("hiv_diagnoses"),
        (
            F.sum("hiv_diagnoses_total") / F.sum("weighted_population") * 100000
        ).alias("hiv_rate_per_100k"),
        F.avg("plwdhi_prevalence_avg").alias("hiv_prevalence"),
        F.avg("perc_viral_suppression_avg").alias("viral_suppression_rate"),
    )
)


In [0]:
# Persist the row-level joined dataset. At this point `df_hiv_poverty` also
# carries the `poverty_level` and `education_level` columns added above.
# NOTE(review): `write_to_uc_table` comes from the local `utils` module; presumably
# it writes to catalog_de.gold.hiv_poverty_integrated and replaces existing
# data when mode="overwrite" — confirm against its implementation.
write_to_uc_table(
    input_df=df_hiv_poverty,
    table_name="hiv_poverty_integrated",
    mode="overwrite",
    catalog="catalog_de",
    schema="gold"
)


In [0]:
# Target table name -> aggregated DataFrame; consumed by the write loop in the next cell.
gold_tables = {
    "borough_analysis": df_borough_analysis,
    "race_analysis": df_race_analysis, 
    "yearly_trends": df_yearly_trends,
    "borough_race_analysis": df_borough_race,
    "poverty_correlation": df_poverty_correlation,
    "education_correlation": df_education_correlation
}

In [0]:
# Write every aggregated table to the gold schema. Each table is attempted
# even if an earlier one fails, but failures are collected and re-raised at
# the end so a notebook or job run is not reported as successful when some
# tables were never written.
failed_tables = {}
for table_name, dataframe in gold_tables.items():
    try:
        write_to_uc_table(
            input_df=dataframe,
            table_name=table_name,
            mode="overwrite",
            catalog="catalog_de",
            schema="gold"
        )
    except Exception as e:
        failed_tables[table_name] = e
        print(f"Error writing {table_name} to gold layer: {str(e)}")
    else:
        print(f"Successfully wrote {table_name} to gold layer")

if failed_tables:
    # Chain the first underlying exception so its traceback stays visible.
    raise RuntimeError(
        "Failed to write gold tables: " + ", ".join(failed_tables)
    ) from next(iter(failed_tables.values()))
