In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an existing session is returned
# by getOrCreate() if there is one), registered under the project's app name.
spark = (
    SparkSession.builder
    .appName("hiv-project")
    .getOrCreate()
)

In [0]:
%pip uninstall -y databricks_helpers exercise_ev_databricks_unit_tests
%pip install git+https://github.com/data-derp/databricks_helpers#egg=databricks_helpers git+https://github.com/data-derp/exercise_ev_databricks_unit_tests#egg=exercise_ev_databricks_unit_tests

In [0]:
from databricks_helpers.databricks_helpers import DataDerpDatabricksHelpers
# Exercise identifier handed to the helper; presumably used to scope this
# exercise's working directory — TODO confirm against databricks_helpers.
exercise_name = "hiv-project"
# `dbutils` is not defined anywhere in this notebook; it is expected to be
# supplied by the Databricks runtime.
helpers = DataDerpDatabricksHelpers(dbutils, exercise_name)

In [0]:
# Base directory reported by the helper; the Parquet writer defined later in
# this notebook builds its output paths from this value.
working_directory = helpers.working_directory()
print(working_directory)

# BRONZE

## HIV Dataset

In [0]:
from pyspark.sql.types import *
from pyspark.sql import DataFrame

def create_dataframe(filepath: str, schema: StructType, from_extension: str) -> DataFrame:
    """Read a comma-delimited file with a header row into a Spark DataFrame.

    Uses the notebook-level ``spark`` session. The schema is applied as given,
    so no type inference takes place.

    Args:
        filepath: Location of the file to load.
        schema: Column names and types to apply to the data.
        from_extension: Spark data source format name, e.g. ``"csv"``.

    Returns:
        The DataFrame produced by the configured reader.
    """
    reader = spark.read.format(from_extension)
    # Options are CSV-oriented: first line is a header, fields are separated
    # by commas, and a backslash is the escape character.
    reader = reader.option("header", True)
    reader = reader.option("delimiter", ",")
    reader = reader.option("escape", "\\")
    return reader.schema(schema).load(filepath)

In [0]:
# Explicit schema for the HIV/AIDS annual report CSV: (column name, type) pairs
# in file order. Every column is declared nullable.
hiv_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("Year", IntegerType()),
        ("Borough", StringType()),
        ("UHF", StringType()),
        ("Gender", StringType()),
        ("Age", StringType()),
        ("Race", StringType()),
        ("HIV diagnoses", IntegerType()),
        ("HIV diagnosis rate", FloatType()),
        ("Concurrent diagnoses", IntegerType()),
        ("% linked to care within 3 months", FloatType()),
        ("AIDS diagnoses", IntegerType()),
        ("AIDS diagnosis rate", FloatType()),
        ("PLWDHI prevalence", FloatType()),
        ("% viral suppression", FloatType()),
        ("Deaths", IntegerType()),
        ("Death rate", FloatType()),
        ("HIV-related death rate", FloatType()),
        ("Non-HIV-related death rate", FloatType()),
    ]
])


In [0]:
# Fetch the HIV/AIDS annual report through the helper, then load it with the
# explicit schema. The helper's return value is used as the path to read.
hiv_url = "https://raw.githubusercontent.com/fernando-soto23/hiv-project-data/refs/heads/main/data/DOHMH_HIV_AIDS_Annual_Report.csv"
from_extension = "csv"
hiv_filepath = helpers.download_to_local_dir(hiv_url)

df_hiv = create_dataframe(
    filepath=hiv_filepath,
    schema=hiv_schema,
    from_extension=from_extension,
)

In [0]:
# Preview the loaded HIV data. `display` is not defined in this notebook;
# presumably the Databricks notebook built-in — TODO confirm.
display(df_hiv)

## Poverty Datasets

In [0]:
import os
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Explicit schema applied to each yearly poverty CSV: (column name, type) pairs
# in declaration order. Every column is declared nullable.
poverty_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("SERIALNO", IntegerType()),
        ("SPORDER", IntegerType()),
        ("PWGTP", DoubleType()),
        ("WGTP", DoubleType()),
        ("AGEP", IntegerType()),
        ("CIT", IntegerType()),
        ("REL", IntegerType()),
        ("SCH", IntegerType()),
        ("SCHG", IntegerType()),
        ("SCHL", IntegerType()),
        ("SEX", IntegerType()),
        ("ESR", IntegerType()),
        ("LANX", IntegerType()),
        ("ENG", IntegerType()),
        ("MSP", IntegerType()),
        ("MAR", IntegerType()),
        ("WKW", IntegerType()),
        ("WKHP", IntegerType()),
        ("DIS", IntegerType()),
        ("JWTR", IntegerType()),
        ("NP", IntegerType()),
        ("TEN", IntegerType()),
        ("HHT", IntegerType()),
        ("AgeCateg", IntegerType()),
        ("Boro", IntegerType()),
        ("CitizenStatus", IntegerType()),
        ("EducAttain", IntegerType()),
        ("EST_Childcare", DoubleType()),
        ("EST_Commuting", DoubleType()),
        ("EST_EITC", DoubleType()),
        ("EST_FICAtax", DoubleType()),
        ("EST_HEAP", DoubleType()),
        ("EST_Housing", DoubleType()),
        ("EST_IncomeTax", DoubleType()),
        ("EST_MOOP", DoubleType()),
        ("EST_Nutrition", DoubleType()),
        ("EST_PovGap", DoubleType()),
        ("EST_PovGapIndex", DoubleType()),
        ("Ethnicity", IntegerType()),
        ("FamType_PU", IntegerType()),
        ("FTPTWork", IntegerType()),
        ("INTP_adj", DoubleType()),
        ("MRGP_adj", DoubleType()),
        ("NYCgov_Income", DoubleType()),
        ("NYCgov_Pov_Stat", IntegerType()),
        ("NYCgov_REL", IntegerType()),
        ("NYCgov_Threshold", DoubleType()),
        ("Off_Pov_Stat", IntegerType()),
        ("Off_Threshold", DoubleType()),
        ("OI_adj", DoubleType()),
        ("PA_adj", DoubleType()),
        ("Povunit_ID", IntegerType()),
        ("Povunit_Rel", IntegerType()),
        ("PreTaxIncome_PU", DoubleType()),
        ("RETP_adj", DoubleType()),
        ("RNTP_adj", DoubleType()),
        ("SEMP_adj", DoubleType()),
        ("SSIP_adj", DoubleType()),
        ("SSP_adj", DoubleType()),
        ("TotalWorkHrs_PU", DoubleType()),
        ("WAGP_adj", DoubleType()),
    ]
])


In [0]:

# Yearly NYCgov Poverty Measure CSV files, one URL per year.
# NOTE(review): there is no 2016 entry — confirm whether the gap is intentional.
file_urls = [
    "https://raw.githubusercontent.com/fernando-soto23/hiv-project-data/refs/heads/main/data/NYCgov_Poverty_Measure_Data__2011__20250316.csv",
    "https://raw.githubusercontent.com/fernando-soto23/hiv-project-data/refs/heads/main/data/NYCgov_Poverty_Measure_Data__2012__20250316.csv",
    "https://raw.githubusercontent.com/fernando-soto23/hiv-project-data/refs/heads/main/data/NYCgov_Poverty_Measure_Data__2013__20250316.csv",
    "https://raw.githubusercontent.com/fernando-soto23/hiv-project-data/refs/heads/main/data/NYCgov_Poverty_Measure_Data__2014__20250316.csv",
    "https://raw.githubusercontent.com/fernando-soto23/hiv-project-data/refs/heads/main/data/NYCgov_Poverty_Measure_Data__2015__20250316.csv",
    "https://raw.githubusercontent.com/fernando-soto23/hiv-project-data/refs/heads/main/data/NYCgov_Poverty_Measure_Data__2017__20250316.csv",
    "https://raw.githubusercontent.com/fernando-soto23/hiv-project-data/refs/heads/main/data/NYCgov_Poverty_Measure_Data__2018__20250315.csv"
]

# Not populated anywhere in the visible notebook; kept so the notebook's
# namespace is unchanged for any cell that may still reference it.
dataframes = []
# Maps the year string taken from each file name (e.g. "2011") to that year's DataFrame.
year_dataframes = {}

for url in file_urls:
    # File names follow "..._Data__<year>__<suffix>.csv", so the year is the
    # second "__"-separated piece of the URL.
    year = url.split('__')[1]

    local_path = helpers.download_to_local_dir(url)

    # Go through the shared reader helper instead of repeating the reader
    # chain here, so both datasets are loaded with the same options. The
    # helper additionally sets escape="\\", which is Spark's default CSV
    # escape character, so parsing is unchanged.
    df_poverty = create_dataframe(local_path, poverty_schema, "csv")

    year_dataframes[year] = df_poverty

In [0]:
# Pull out a single year to spot-check the poverty load.
df_2011 = year_dataframes['2011']
display(df_2011)

In [0]:
def get_shape(input_df):
    """Return ``(row_count, column_count)`` for a DataFrame.

    The row count comes from ``input_df.count()`` and the column count from
    the length of ``input_df.columns``.
    """
    return (input_df.count(), len(input_df.columns))


In [0]:
# Last expression of the cell, so the returned (rows, columns) tuple is the cell output.
get_shape(df_2011)

## WRITE TO PARQUET

In [0]:
def write_to_parqet(input_df: DataFrame, name_df: str, mode_name: str = "overwrite") -> None:
    """Write a DataFrame as Parquet under ``{working_directory}/parquet/{name_df}``.

    Relies on the notebook-level ``working_directory`` variable for the base path.

    NOTE(review): the function name is misspelled ("parqet"); it is left as is
    because other cells in this notebook call it by this name.

    Args:
        input_df: DataFrame to persist.
        name_df: Name of the sub-directory that receives the Parquet files.
        mode_name: Save mode passed to the DataFrame writer. Defaults to
            ``"overwrite"``, which was previously the hard-coded behaviour.

    Returns:
        None.
    """
    out_dir = f"{working_directory}/parquet/{name_df}"
    input_df.write.mode(mode_name).parquet(out_dir)
    


## Write HIV DataFrame to Parquet

In [0]:
# Sub-directory name for the HIV data under the Parquet output directory.
df_hiv_name = "df_hiv"
write_to_parqet(df_hiv, df_hiv_name)

## Write Poverty DataFrames for each year to Parquet

In [0]:
# One Parquet directory per loaded year, named "df_poverty_<year>".
for year, df in year_dataframes.items():
    write_to_parqet(df, f"df_poverty_{year}")


&copy; 2025 Thoughtworks. All rights reserved.<br/>