In [0]:
from pyspark.sql import SparkSession

# Get the Spark session for this notebook (an existing session is reused when
# one is already active), registered under the "hiv-project" application name.
spark = (
    SparkSession.builder
    .appName("hiv-project")
    .getOrCreate()
)

In [0]:
%pip uninstall -y databricks_helpers exercise_ev_databricks_unit_tests
%pip install git+https://github.com/data-derp/databricks_helpers#egg=databricks_helpers git+https://github.com/data-derp/exercise_ev_databricks_unit_tests#egg=exercise_ev_databricks_unit_tests

In [0]:
from databricks_helpers.databricks_helpers import DataDerpDatabricksHelpers
# Exercise identifier handed to the helper object below.
exercise_name = "hiv-project"
# NOTE(review): `dbutils` is not defined in this notebook — presumably the
# Databricks-provided global; confirm before running outside Databricks.
helpers = DataDerpDatabricksHelpers(dbutils, exercise_name)

In [0]:
# Base path reported by the helper; read_from_gold() appends "/gold/<name>" to it.
working_directory = helpers.working_directory()
print(working_directory)

# Read from Gold


In [0]:

from pyspark.sql import DataFrame
def read_from_gold(df_name: str) -> DataFrame:
    """Load a Parquet dataset stored under the gold folder of the working directory.

    Args:
        df_name: Name of the dataset directory inside ``<working_directory>/gold``.

    Returns:
        The dataset read by the notebook's Spark session as a DataFrame.
    """
    return spark.read.parquet(f"{working_directory}/gold/{df_name}")

# Load the "df_hiv_poverty" gold dataset that the analysis cells below operate on.
df_hiv_poverty = read_from_gold("df_hiv_poverty")

In [0]:
# Render the loaded dataset for a visual check.
display(df_hiv_poverty)

In [0]:
# List the column names; the correlation cells below refer to columns by these names.
df_hiv_poverty.columns

In [0]:
from pyspark.sql.functions import corr

# Pearson correlation between the poverty count and the HIV diagnosis rate,
# evaluated as a single Spark aggregate and brought back to the driver.
poverty_vs_hiv_rate = corr("count_In_Poverty", "HIV diagnosis rate")
correlation = df_hiv_poverty.select(poverty_vs_hiv_rate).first()[0]

print(f"Correlación entre pobreza y tasa de diagnóstico de VIH: {correlation}")

In [0]:
# Compute the correlation between poverty and the share linked to care within 3 months
poverty_vs_care_linkage = corr("count_In_Poverty", "% linked to care within 3 months")
correlation_treatment = df_hiv_poverty.select(poverty_vs_care_linkage).first()[0]

print(f"Correlación entre pobreza y acceso temprano a tratamiento: {correlation_treatment}")


In [0]:
from pyspark.sql.functions import avg, col

# Average indicators to compute for each borough
borough_averages = [
    avg("HIV diagnosis rate").alias("avg_HIV_rate"),
    avg("count_In_Poverty").alias("avg_poverty"),
    avg("% linked to care within 3 months").alias("avg_care_linkage"),
    avg("% viral suppression").alias("avg_viral_suppression"),
]

# Group by borough and sort so the highest average HIV diagnosis rate comes first
vulnerable_areas = (
    df_hiv_poverty
    .groupBy("Borough")
    .agg(*borough_averages)
    .orderBy(col("avg_HIV_rate").desc())
)

# Show the boroughs ranked from most to least affected
display(vulnerable_areas)


In [0]:
import pandas as pd

# Columns included in the pairwise correlation table.
numeric_cols = [
    "HIV diagnoses", "HIV diagnosis rate", "Concurrent diagnoses", "% linked to care within 3 months",
    "AIDS diagnoses", "PLWDHI prevalence", "% viral suppression", "Deaths", "Death rate",
    "avg_NYCgov_Income", "count_In_Poverty", "count_Not_In_Poverty"
]

# One corr() aggregate per ordered column pair, all evaluated by a single
# select(). The previous nested loop launched a separate Spark job for each of
# the len(numeric_cols)**2 pairs and also overwrote the notebook-level
# `correlation` value computed in an earlier cell.
# Aliases are index-based so that column names containing spaces or "%" never
# end up inside an output column name.
pair_exprs = [
    corr(left, right).alias(f"corr_{i}_{j}")
    for i, left in enumerate(numeric_cols)
    for j, right in enumerate(numeric_cols)
]
pair_values = df_hiv_poverty.select(*pair_exprs).first()

# The result Row is laid out row-major: n_cols consecutive values per matrix row.
# A null correlation (no usable data for the pair) is kept as None; every other
# value is rounded to 3 decimals, as before.
n_cols = len(numeric_cols)
correlation_results = [
    [
        round(value, 3) if value is not None else None
        for value in pair_values[i * n_cols:(i + 1) * n_cols]
    ]
    for i in range(n_cols)
]

correlation_df = pd.DataFrame(correlation_results, index=numeric_cols, columns=numeric_cols)



In [0]:
# Show the rounded pairwise correlation table built in the previous cell.
correlation_df

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql.functions import corr
from pyspark.sql.types import NumericType

# Numeric variables to analyse. Only columns whose Spark type is numeric are
# kept: taking every column of the DataFrame would also feed non-numeric ones
# (for example a string grouping key) into corr(), which is meaningless and can
# fail outright when strict casting is enabled.
numeric_cols = [
    field.name
    for field in df_hiv_poverty.schema.fields
    if isinstance(field.dataType, NumericType)
]

# Build the correlation matrix in PySpark with a single job: one corr()
# aggregate per ordered column pair, all evaluated by one select(), instead of
# one collect() per pair.
pair_exprs = [
    corr(left, right).alias(f"corr_{i}_{j}")
    for i, left in enumerate(numeric_cols)
    for j, right in enumerate(numeric_cols)
]
pair_values = df_hiv_poverty.select(*pair_exprs).first()

# The result Row is laid out row-major: n_cols consecutive values per matrix row.
# NOTE(review): a null correlation is plotted as 0 (unchanged from the previous
# behaviour), which makes "undefined" look like "uncorrelated" in the heatmap.
n_cols = len(numeric_cols)
correlation_results = [
    [
        value if value is not None else 0
        for value in pair_values[i * n_cols:(i + 1) * n_cols]
    ]
    for i in range(n_cols)
]

correlation_df = pd.DataFrame(correlation_results, index=numeric_cols, columns=numeric_cols)

plt.figure(figsize=(12, 8))
# vmin/vmax pin the diverging colour map to the full correlation range so that
# 0 sits at its neutral midpoint, matching the other heatmap in this notebook.
sns.heatmap(
    correlation_df,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
    vmin=-1,
    vmax=1,
)
plt.title("Mapa de Calor de Correlaciones entre Variables")
plt.show()


In [0]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

# Columns fed into the correlation matrix (going by their names: HIV indicators
# plus income, poverty, work and education variables).
numerical_columns = [
    'HIV diagnoses',
    'HIV diagnosis rate',
    'Concurrent diagnoses',
    '% linked to care within 3 months',
    'AIDS diagnoses',
    'AIDS diagnosis rate',
    'PLWDHI prevalence',
    '% viral suppression',
    'Deaths',
    'Death rate',
    'HIV-related death rate',
    'Non-HIV-related death rate',
    'avg_NYCgov_Income',
    'poverty_rate',
    'no_poverty_rate',
    'weighted_Full_Time_Work_Year_Round',
    'weighted_Less_Than_Full_Time_Work_Year_Round',
    'weighted_No_Work',
    'weighted_Less_Than_High_School',
    'weighted_High_School_Degree',
    'weighted_Some_College',
    'weighted_Bachelors_Or_Higher',
]

# Pack the selected columns into one vector column named "features". Rows with
# a null in any of the selected columns are dropped before assembling.
assembler = VectorAssembler(inputCols=numerical_columns, outputCol="features")
complete_rows = df_hiv_poverty.na.drop(subset=numerical_columns)
df_features = assembler.transform(complete_rows)

# Correlation.corr yields a one-row DataFrame holding the matrix; take that
# single cell and convert it to a NumPy array.
correlation_matrix = Correlation.corr(df_features, 'features').head()[0].toArray()


In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Label the correlation array with the column names on both axes.
correlation_df = pd.DataFrame(
    correlation_matrix,
    index=numerical_columns,
    columns=numerical_columns,
)

# Draw the annotated heatmap on an explicit axes, with the colour scale fixed to [-1, 1].
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_df,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    vmin=-1,
    vmax=1,
    cbar=True,
    ax=ax,
)

ax.set_title("Mapa de Calor de Correlación entre Variables de Pobreza y VIH")
plt.show()


&copy; 2025 Thoughtworks. All rights reserved.<br/>