In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an existing one is reused by getOrCreate).
spark = (
    SparkSession.builder
    .appName("hiv-project")
    .getOrCreate()
)

In [0]:
%pip uninstall -y databricks_helpers exercise_ev_databricks_unit_tests
%pip install git+https://github.com/data-derp/databricks_helpers#egg=databricks_helpers git+https://github.com/data-derp/exercise_ev_databricks_unit_tests#egg=exercise_ev_databricks_unit_tests

In [0]:
from databricks_helpers.databricks_helpers import DataDerpDatabricksHelpers
exercise_name = "hiv-project"
helpers = DataDerpDatabricksHelpers(dbutils, exercise_name)

In [0]:
working_directory = helpers.working_directory()
print(working_directory)

# SILVER

## Read from Parquet

In [0]:

from pyspark.sql import DataFrame
def read_from_parquet(df_name: str) -> DataFrame:
    """Load the Parquet dataset called ``df_name`` from the working directory.

    Args:
        df_name: Folder name of the dataset under ``{working_directory}/parquet``.

    Returns:
        The DataFrame returned by ``spark.read.parquet`` for that location.
    """
    parquet_root = f"{working_directory}/parquet"
    return spark.read.parquet(f"{parquet_root}/{df_name}")

# HIV dataset as stored under the "df_hiv" Parquet folder.
df_hiv_silver = read_from_parquet("df_hiv")

# Years for which a poverty dataset is loaded (2016 is not part of this list).
available_years = [2011, 2012, 2013, 2014, 2015, 2017, 2018]

# One poverty DataFrame per year, keyed by that year.
silver_year_dataframes = {
    year: read_from_parquet(f"df_poverty_{year}") for year in available_years
}




In [0]:
display(df_hiv_silver)

## Concat Poverty Datasets with years

In [0]:
from pyspark.sql.functions import lit

# Tag every yearly poverty DataFrame with the year it was loaded for.
poverty_frames_with_year = [
    yearly_df.withColumn("year", lit(year_value))
    for year_value, yearly_df in silver_year_dataframes.items()
]

# Stack the tagged frames in dictionary order; the result stays None when
# there is nothing to stack.
# NOTE(review): DataFrame.union pairs columns by position, not by name —
# confirm that every yearly dataset shares the same column order.
df_poverty_silver = None
for tagged_df in poverty_frames_with_year:
    df_poverty_silver = (
        tagged_df if df_poverty_silver is None else df_poverty_silver.union(tagged_df)
    )

In [0]:
def get_shape(df):
    """Return ``(row_count, column_count)`` for ``df``.

    The row count comes from ``df.count()`` and the column count is the
    length of ``df.columns``.
    """
    n_rows = df.count()
    n_cols = len(df.columns)
    return n_rows, n_cols

In [0]:
get_shape(df_poverty_silver)

In [0]:
display(df_poverty_silver)

## HIV Collapsed

In [0]:
from pyspark.sql.functions import sum, avg

# Columns that define one output row of the collapsed HIV table.
hiv_group_keys = ["Year", "Borough", "Gender", "Age", "Race"]

# Each measure paired with the aggregate applied to it, in output-column order.
# `sum` and `avg` are the pyspark.sql.functions aggregates imported at the top
# of this cell, not the Python built-ins.
hiv_measure_aggregates = [
    (sum, "HIV diagnoses"),
    (avg, "HIV diagnosis rate"),
    (sum, "Concurrent diagnoses"),
    (avg, "% linked to care within 3 months"),
    (sum, "AIDS diagnoses"),
    (avg, "AIDS diagnosis rate"),
    (sum, "PLWDHI prevalence"),
    (avg, "% viral suppression"),
    (sum, "Deaths"),
    (avg, "Death rate"),
    (avg, "HIV-related death rate"),
    (avg, "Non-HIV-related death rate"),
]

# Every aggregated column keeps the name of the measure it was computed from.
df_hiv_collapsed = df_hiv_silver.groupBy(*hiv_group_keys).agg(
    *[aggregate(measure).alias(measure) for aggregate, measure in hiv_measure_aggregates]
)


In [0]:
get_shape(df_hiv_collapsed)

In [0]:
display(df_hiv_collapsed)

## AGGREGATIONS YEAR - BORO - GENDER - RACE - SED - AGE

In [0]:
from pyspark.sql import functions as F

def group_age(df):
    """Add an ``AGEP_group`` column that buckets the ``AGEP`` column.

    Buckets are tried in the order listed and the first match wins. Rows that
    match no bucket (for example values below 13) are labelled ``'Unknown'``.

    Args:
        df: DataFrame with an ``AGEP`` column that can be compared against
            integer age bounds.

    Returns:
        ``df`` with the additional ``AGEP_group`` column.
    """
    age = df['AGEP']

    # (condition, label) pairs with non-overlapping bounds.
    age_buckets = [
        ((age >= 13) & (age <= 19), '13-19'),
        # The label stays '18-29' because later cells match on that exact
        # string. Ages 18 and 19 always fell into '13-19' (first match wins),
        # so this bucket only ever received ages above 19; the bound now says
        # so explicitly instead of overlapping with the previous bucket.
        ((age > 19) & (age <= 29), '18-29'),
        ((age >= 30) & (age <= 39), '30-39'),
        ((age >= 40) & (age <= 49), '40-49'),
        ((age >= 50) & (age <= 59), '50-59'),
        (age >= 60, '60+'),
    ]

    # Fold the pairs into a single when(...).when(...) chain.
    bucket_expr = None
    for condition, label in age_buckets:
        if bucket_expr is None:
            bucket_expr = F.when(condition, label)
        else:
            bucket_expr = bucket_expr.when(condition, label)

    return df.withColumn('AGEP_group', bucket_expr.otherwise('Unknown'))

df_poverty_silver = group_age(df_poverty_silver)

In [0]:
display(df_poverty_silver)

In [0]:
# Rename the poverty columns to the names used as grouping keys on the HIV side.
poverty_column_renames = {
    "year": "Year",
    "Boro": "Borough",
    "SEX": "Gender",
    "AGEP_group": "Age",
    "Ethnicity": "Race",
}
for old_name, new_name in poverty_column_renames.items():
    df_poverty_silver = df_poverty_silver.withColumnRenamed(old_name, new_name)



In [0]:
from pyspark.sql.functions import col, when

def _recode_column(df, column, mapping):
    """Replace the values of ``column`` that appear as keys in ``mapping``.

    Values that match no key are kept, cast to string so that every branch of
    the expression has the same type as the string labels instead of relying
    on implicit type coercion.

    Args:
        df: DataFrame to transform.
        column: Name of the column to recode in place.
        mapping: Non-empty dict of ``{raw value: replacement label}``.

    Returns:
        ``df`` with ``column`` replaced by its recoded version.
    """
    source = col(column)
    recoded = None
    for raw_value, label in mapping.items():
        matches = source == raw_value
        recoded = when(matches, label) if recoded is None else recoded.when(matches, label)
    return df.withColumn(column, recoded.otherwise(source.cast("string")))


# Recode 'Borough' in df_poverty_silver to the borough names used in df_hiv_silver.
df_poverty_silver = _recode_column(
    df_poverty_silver,
    "Borough",
    {1: "Bronx", 2: "Brooklyn", 3: "Manhattan", 4: "Queens", 5: "Staten Island"},
)

# 'Year' is left untouched: the previous chain mapped every year to itself,
# so it never changed a value.

# Recode 'Age' in df_poverty_silver to the spaced labels used in df_hiv_silver.
# Labels that are not listed (such as "60+") are already in their final form.
df_poverty_silver = _recode_column(
    df_poverty_silver,
    "Age",
    {
        "13-19": "13 - 19",
        "18-29": "18 - 29",
        "30-39": "30 - 39",
        "40-49": "40 - 49",
        "50-59": "50 - 59",
    },
)

# Recode 'Gender' (originally SEX) in df_poverty_silver to readable labels.
df_poverty_silver = _recode_column(
    df_poverty_silver,
    "Gender",
    {1: "Male", 2: "Female"},
)

# Recode 'Race' (originally Ethnicity) in df_poverty_silver to readable labels.
df_poverty_silver = _recode_column(
    df_poverty_silver,
    "Race",
    {
        1: "Non-Hispanic White",
        2: "Non-Hispanic Black",
        3: "Non-Hispanic Asian",
        4: "Hispanic, Any Race",
        5: "Other Race/Ethnic Group",
    },
)



In [0]:
# Shorten the race/ethnicity labels; any label not listed here is kept as is.
race_label = col("Race")
shortened_race = (
    when(race_label == "Other Race/Ethnic Group", "Other/Unknown")
    .when(race_label == "Hispanic, Any Race", "Latino/Hispanic")
    .when(race_label == "Non-Hispanic Asian", "Asian/Pacific Islander")
    .when(race_label == "Non-Hispanic Black", "Black")
    .when(race_label == "Non-Hispanic White", "White")
    .otherwise(race_label)
)
df_poverty_silver = df_poverty_silver.withColumn("Race", shortened_race)

In [0]:
"""
from pyspark.sql import functions as F



df_aggregated_poverty = df_poverty_silver.groupBy(
    'Year', 'Borough', 'Gender', 'Age', 'Race'
).agg(
    F.avg('NYCgov_Income').alias('avg_NYCgov_Income'),

    F.count(F.when(df_poverty_silver['Off_Pov_Stat'] == 1, 1)).alias('count_In_Poverty'),
    F.count(F.when(df_poverty_silver['Off_Pov_Stat'] == 2, 1)).alias('count_Not_In_Poverty'),

    F.count(F.when(df_poverty_silver['FTPTWork'] == 1, 1)).alias('count_Full_Time_Work_Year_Round'),
    F.count(F.when(df_poverty_silver['FTPTWork'] == 2, 1)).alias('count_Less_Than_Full_Time_Work_Year_Round'),
    F.count(F.when(df_poverty_silver['FTPTWork'] == 3, 1)).alias('count_No_Work'),

    F.count(F.when(df_poverty_silver['EducAttain'] == 1, 1)).alias('count_Less_Than_High_School'),
    F.count(F.when(df_poverty_silver['EducAttain'] == 2, 1)).alias('count_High_School_Degree'),
    F.count(F.when(df_poverty_silver['EducAttain'] == 3, 1)).alias('count_Some_College'),
    F.count(F.when(df_poverty_silver['EducAttain'] == 4, 1)).alias('count_Bachelors_Or_Higher')
)
"""

In [0]:
from pyspark.sql import functions as F

# Columns that define one output row of the aggregated poverty table.
poverty_group_keys = ['Year', 'Borough', 'Gender', 'Age', 'Race']


def _weighted_population(column, value):
    """Sum of PWGTP over the rows where ``column`` equals ``value``.

    Rows that do not match contribute 0, so a group without any matching row
    yields 0 instead of null (which would also make the derived rates null).
    """
    matching_weight = F.when(F.col(column) == value, F.col("PWGTP")).otherwise(F.lit(0))
    return F.sum(matching_weight)


# Step 1: weighted sums per group. Only true aggregates go here; ratios built
# from these sums are derived in step 2, because referring to an alias created
# in the same agg() call is not resolvable on every Spark version.
poverty_weighted_sums = df_poverty_silver.groupBy(*poverty_group_keys).agg(
    F.sum("PWGTP").alias("total_weight"),

    # Numerator of the weighted average income.
    F.sum(F.col("NYCgov_Income") * F.col("PWGTP")).alias("weighted_income_sum"),

    # Weighted population in poverty and not in poverty.
    _weighted_population("Off_Pov_Stat", 1).alias("weighted_poverty_population"),
    _weighted_population("Off_Pov_Stat", 2).alias("weighted_non_poverty_population"),

    # Weighted work-status distribution.
    _weighted_population("FTPTWork", 1).alias("weighted_Full_Time_Work_Year_Round"),
    _weighted_population("FTPTWork", 2).alias("weighted_Less_Than_Full_Time_Work_Year_Round"),
    _weighted_population("FTPTWork", 3).alias("weighted_No_Work"),

    # Weighted educational-attainment distribution.
    _weighted_population("EducAttain", 1).alias("weighted_Less_Than_High_School"),
    _weighted_population("EducAttain", 2).alias("weighted_High_School_Degree"),
    _weighted_population("EducAttain", 3).alias("weighted_Some_College"),
    _weighted_population("EducAttain", 4).alias("weighted_Bachelors_Or_Higher"),
)

# Step 2: derive the ratios and lay the columns out in the intended order.
df_aggregated_poverty = poverty_weighted_sums.select(
    *poverty_group_keys,
    "total_weight",

    # Weighted average income.
    "weighted_income_sum",
    (F.col("weighted_income_sum") / F.col("total_weight")).alias("avg_NYCgov_Income"),

    "weighted_poverty_population",
    "weighted_non_poverty_population",

    # Share of the weighted population in / not in poverty.
    (F.col("weighted_poverty_population") / F.col("total_weight")).alias("poverty_rate"),
    (F.col("weighted_non_poverty_population") / F.col("total_weight")).alias("no_poverty_rate"),

    "weighted_Full_Time_Work_Year_Round",
    "weighted_Less_Than_Full_Time_Work_Year_Round",
    "weighted_No_Work",

    "weighted_Less_Than_High_School",
    "weighted_High_School_Degree",
    "weighted_Some_College",
    "weighted_Bachelors_Or_Higher",
)


&copy; 2025 Thoughtworks. All rights reserved.<br/>

In [0]:
display(df_aggregated_poverty)

In [0]:
# Keep only the HIV rows that can be matched against the poverty data: drop
# the "All" roll-up rows and keep the years for which poverty data was loaded.
# All years are integers so that isin() compares one consistent literal type.
valid_years = [2011, 2012, 2013, 2014, 2015, 2017, 2018]

# No separate "All" filter is needed for Year: the whitelist below already
# excludes every value that is not one of valid_years.
df_hiv_filtered = (
    df_hiv_collapsed
    .filter(~col("Age").isin("All"))
    .filter(~col("Race").isin("All"))
    .filter(~col("Gender").isin("All"))
    .filter(~col("Borough").isin("All"))
    .filter(col("Year").isin(valid_years))
)

# Use the same gender labels as the poverty data.
df_hiv_filtered = df_hiv_filtered.withColumn(
    "Gender",
    when(col("Gender") == "Women", "Female")
    .when(col("Gender") == "Men", "Male")
    .otherwise(col("Gender"))
)

# Relabel the HIV "20 - 29" group with the poverty bucket label "18 - 29" so
# that the two datasets share the value.
# NOTE(review): the two labels name different nominal ranges — confirm that
# treating them as the same group is intended.
df_hiv_filtered = df_hiv_filtered.withColumn(
    "Age",
    when(col("Age") == "20 - 29", "18 - 29").otherwise(col("Age"))
)

# Use the same spelling of the Latino/Hispanic label as the poverty data.
df_hiv_filtered = df_hiv_filtered.withColumn(
    "Race",
    when(col("Race") == "Latinx/Hispanic", "Latino/Hispanic").otherwise(col("Race"))
)




In [0]:
# For every shared key column, print its distinct values in the HIV data and
# then in the poverty data, so mismatching labels are easy to spot.
for key_column in ["Borough", "Year", "Age", "Gender", "Race"]:
    for frame in (df_hiv_filtered, df_poverty_silver):
        frame.select(key_column).distinct().show()


In [0]:
df_hiv_filtered.count()


In [0]:
df_aggregated_poverty.count()


In [0]:
# Columns both tables must agree on for a row to appear in the result.
join_keys = ["Year", "Borough", "Race", "Gender", "Age"]

# Inner join: only key combinations present in both tables are kept.
df_joined = df_hiv_filtered.join(df_aggregated_poverty, on=join_keys, how="inner")

get_shape(df_joined)

In [0]:
display(df_joined)

# WRITE TO PARQUET

In [0]:
def write_to_gold(input_df: DataFrame, name_df: str):
    """Write ``input_df`` as Parquet to ``{working_directory}/gold/{name_df}``.

    The write uses mode ``"overwrite"``.

    Args:
        input_df: DataFrame to persist.
        name_df: Folder name of the dataset under the ``gold`` directory.
    """
    gold_path = f"{working_directory}/gold/{name_df}"
    input_df.write.mode("overwrite").parquet(gold_path)
    


In [0]:
df_name = "df_hiv_poverty"
write_to_gold(df_joined, df_name)