In [0]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise create it under this app name.
spark = (
    SparkSession.builder
    .appName("hiv-project")
    .getOrCreate()
)

In [0]:
%pip uninstall -y databricks_helpers exercise_ev_databricks_unit_tests
%pip install git+https://github.com/data-derp/databricks_helpers#egg=databricks_helpers git+https://github.com/data-derp/exercise_ev_databricks_unit_tests#egg=exercise_ev_databricks_unit_tests

In [0]:
from databricks_helpers.databricks_helpers import DataDerpDatabricksHelpers
# Name passed to the helper below to identify this exercise.
exercise_name = "hiv-project"
# `dbutils` is not defined anywhere in this notebook — presumably the global that
# Databricks injects into every notebook; this cell will fail outside Databricks.
helpers = DataDerpDatabricksHelpers(dbutils, exercise_name)

In [0]:
# Base path for this exercise; read_from_parquet() builds its Parquet paths from it.
working_directory = helpers.working_directory()
print(working_directory)

# SILVER

## Read from Parquet

In [0]:

from pyspark.sql import DataFrame
def read_from_parquet(df_name: str) -> DataFrame:
    """Load a Parquet dataset stored under the exercise working directory.

    Args:
        df_name: Name of the dataset folder inside ``{working_directory}/parquet``.

    Returns:
        The DataFrame that ``spark.read.parquet`` produces for that path.
    """
    parquet_reader = spark.read
    return parquet_reader.parquet(f"{working_directory}/parquet/{df_name}")

# HIV table plus one poverty table per year, all read from the Parquet area.
df_hiv_silver = read_from_parquet("df_hiv")

# NOTE(review): 2016 is not in this list — presumably there is no dataset for that
# year; confirm before relying on year-over-year comparisons.
available_years = [2011, 2012, 2013, 2014, 2015, 2017, 2018]

# Maps each year to its poverty DataFrame (dataset name pattern: df_poverty_<year>).
silver_year_dataframes = {
    year: read_from_parquet(f"df_poverty_{year}") for year in available_years
}




## Concatenate Poverty Datasets, Tagging Each Row with Its Year

In [0]:
from pyspark.sql.functions import lit

# Stack the per-year poverty tables into a single DataFrame, adding a literal
# "year" column so every row remembers which yearly dataset it came from.
df_poverty_silver = None

for year, df in silver_year_dataframes.items():
    df_with_year = df.withColumn("year", lit(year))

    # unionByName lines columns up by name. Plain union() lines them up by position,
    # so a yearly file with a different column order would be merged into the wrong
    # columns without any error; unionByName raises instead if the names do not match.
    df_poverty_silver = (
        df_with_year
        if df_poverty_silver is None
        else df_poverty_silver.unionByName(df_with_year)
    )

In [0]:
def get_shape(df):
    """Return ``(row_count, column_count)`` for ``df``.

    The row count is obtained by calling ``df.count()``; the column count is the
    length of ``df.columns``.
    """
    n_rows = df.count()
    n_cols = len(df.columns)
    return n_rows, n_cols

In [0]:
# Displays (row count, column count) of the combined poverty data; the row count
# comes from df.count() on the unioned DataFrame.
get_shape(df_poverty_silver)

In [0]:
import matplotlib.pyplot as plt
from pyspark.sql import functions as F

# Average income per year, aggregated in Spark so that only one row per year is
# collected to the driver (calling toPandas() on the raw rows would pull the whole
# dataset into driver memory just to compute a mean).
df_grouped = (
    df_poverty_silver
    .groupBy("year")
    .agg(F.avg("NYCgov_Income").alias("NYCgov_Income"))
    .orderBy("year")  # Spark groups come back unordered; the line plot needs chronological order
    .toPandas()
)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_grouped["year"], df_grouped["NYCgov_Income"], marker='o', linestyle='-')

ax.set_xlabel("Year")
ax.set_ylabel("Average NYCgov_Income")
ax.set_title("NYCgov_Income Over the Years")
ax.grid(True)

plt.show()


## AGGREGATIONS YEAR - BORO - GENDER - RACE - SED - AGE

In [0]:
"""
# NOTE(review): this cell is wrapped in a triple-quoted string, so none of it executes
# and df_aggregated is never defined. Remove the surrounding quotes to enable it, or
# delete the cell.
from pyspark.sql import functions as F

# Inclusive (min_age, max_age, label) buckets; max_age=None means open-ended.
# Buckets must not overlap: a F.when() chain keeps only the first matching branch,
# so an overlapping bucket would make a later one unreachable.
# TODO confirm these labels match the age groups used in df_hiv_silver.
AGE_BUCKETS = [
    (13, 19, '13-19'),
    (20, 29, '20-29'),
    (30, 39, '30-39'),
    (40, 49, '40-49'),
    (50, 59, '50-59'),
    (60, None, '60+'),
]

def group_age(df):
    # Adds an 'AGEP_group' column labelling each row by the bucket its 'AGEP' falls in.
    # Rows matching no bucket (younger than 13, or null AGEP) are labelled 'Unknown'.
    age = df['AGEP']
    label = None
    for min_age, max_age, name in AGE_BUCKETS:
        in_bucket = age >= min_age
        if max_age is not None:
            in_bucket = in_bucket & (age <= max_age)
        # Built from every bucket so that none (e.g. '60+') is left out of the chain.
        label = F.when(in_bucket, name) if label is None else label.when(in_bucket, name)
    return df.withColumn('AGEP_group', label.otherwise('Unknown'))

def count_where(column, value, alias):
    # Per-group count of rows whose `column` equals `value`
    # (F.when yields null for non-matching rows, and F.count skips nulls).
    return F.count(F.when(F.col(column) == value, 1)).alias(alias)

df_poverty_grouped = group_age(df_poverty_silver)

df_aggregated = df_poverty_grouped.groupBy(
    'year', 'Boro', 'SEX', 'AGEP_group', 'Ethnicity'
).agg(
    F.avg('NYCgov_Income').alias('avg_NYCgov_Income'),

    count_where('Off_Pov_Stat', 1, 'count_In_Poverty'),
    count_where('Off_Pov_Stat', 2, 'count_Not_In_Poverty'),

    count_where('FTPTWork', 1, 'count_Full_Time_Work_Year_Round'),
    count_where('FTPTWork', 2, 'count_Less_Than_Full_Time_Work_Year_Round'),
    count_where('FTPTWork', 3, 'count_No_Work'),

    count_where('EducAttain', 1, 'count_Less_Than_High_School'),
    count_where('EducAttain', 2, 'count_High_School_Degree'),
    count_where('EducAttain', 3, 'count_Some_College'),
    count_where('EducAttain', 4, 'count_Bachelors_Or_Higher')
)

"""

&copy; 2025 Thoughtworks. All rights reserved.<br/>