In [1]:
from pathlib import Path
import pandas as pd
import os
import json
import numpy as np
from sqlalchemy import text, create_engine
import tomli

from nvi_etl import setup_logging

# I fixed this, so it works now regardless of execution location.
# This will print all changes like normal, but also leave everything
# printing in the file nvi_etl/logs/nvi_etl.log so you can review as
# you go.
logger = setup_logging()

In [17]:
# Set WORKING_DIR to the root that you'll be accessing the files from. 
# This will avoid errors that can arise from moving the entire execution 
# directory somwhere else.

WORKING_DIR = Path.cwd()
# WORKING_DIR = Path("Q:/NVI/2024/Jordan Working")

logger.info(WORKING_DIR)

[INFO|2056940542.py|<module>|L8] @ 2025-04-04 15:10:32,178: c:\Users\mike\Desktop\2_responsibilities\nvi_etl\primary_survey\2024


In [26]:
def recode(survey_data, indicator_map):
    """
    The survey data has corrupted column values where sometimes it will
    be the answer index and other times it will be a text answer. 

    We have to make sure everything is not only an integer, but that the
    integer matches the answer type provided by Johnson Center.

    The indicator_map file has the 'recode' object that includes the map
    for each column.
    """

    # Recoding
    for column, recode_info in indicator_map["recode"].items():
        if column not in survey_data.columns:
            # Switching from a nested if to a guard statement + warning (MV).
            logger.warning(f"{column} not found in current dataset.")
            continue

        if recode_info["type"] == "categorical":
            # I'm skipping the type conversion on the mapping because
            # they all look correct in the indicator_map (MV).

            survey_data[column] = (
                survey_data[column]
                .astype(str)
                .map(recode_info["mapping"])
                .fillna(survey_data[column])
            )

        elif recode_info["type"] == "numeric":
            try:
                survey_data[column] = pd.to_numeric(survey_data[column], errors="coerce")

            except ValueError:  # FIXME Could this throw other errors?
               logger.warning(f"Column {column} could not be converted to numeric!")
        survey_data.to_csv('recode_test_20250227.csv')
        return survey_data

In [None]:
def aggregate(recoded, indicator_map, geographic_level):
    """
    Aggregate the survey data to a given geography level provided the 
    rules given in the indicator_map.

    You can provide any 'geographic_level' that appears as a column on 
    the 'recoded' dataframe.
    """

    results = []
    for indicator_id, indicator_info in indicator_map["indicators"].items():
        for question_id, question_info in indicator_info["questions"].items():
            question_col = question_info["column"]
            if question_col not in recoded.columns:
                print(f"'{question_col}' doesn't appear in the recoded file.")
                continue
            
            try:
                grouped = recoded.groupby(geographic_level)[question_col]
            except KeyError as e:
                raise KeyError(f"Invalid geography level: '{geographic_level}'!")

            universe = grouped.count()

            # need mapping numbers from recode section to fill survey question option id
            if "recode" in indicator_map and question_col in indicator_map["recode"]:
                recode_mapping = indicator_map["recode"][question_col]["mapping"]
                recode_mapping_numbers = {v: k for k, v in recode_mapping.items()}
            else:
                 recode_mapping_numbers = {}

            for option_id, option_value in question_info["options"].items():
                # I changed everything in 'indicator_map.json' to be lists to skip the
                # type check here (MV)
                count = grouped.apply(lambda x: sum(x.isin(option_value)))

                percentage = ((count * 100) / universe).fillna(0)

                # mapping numbers
                mapping_numbers = []
                for val in option_value:
                     if val in recode_mapping_numbers:
                          mapping_numbers.append(recode_mapping_numbers[val])

                for location, c, u, p in zip(
                    universe.index, count, universe, percentage
                ):
                    results.append(
                        {
                            "indicator_id": indicator_id,
                            "survey_question_id": question_id,
                            "survey_question_option_id": mapping_numbers[0] if mapping_numbers else 'unknown',
                            "location": location,
                            "count": c,
                            "universe": u,
                            "percentage": p,
                        }
                    )
        
        # Aggregate indicator level
        indicator_cols = [q_info["column"] for q_info in indicator_info["questions"].values()]
        indicator_options = [list(q_info["options"].values()) for q_info in indicator_info["questions"].values()]

        def check_row(row):
            for i, col in enumerate(indicator_cols):
                 if isinstance(indicator_options[i][0], list):
                    if isinstance(row[col], pd.Series):
                        if any(val in row[col].values for val in indicator_options[i][0]):
                             return 1  
                    else: # single values
                         if row[col] in indicator_options[i][0]:
                              return 1
                 else:
                    if row[col] in indicator_options[i]:
                         return 1
            return 0
        
        
        # recoded[indicator_id] = recoded[indicator_cols].apply(
        #      lambda row: 1 if any(val in row[indicator_cols].values for val in all_indicator_options) else 0, axis=1
        # )
        recoded[indicator_id] = recoded[indicator_cols].apply(check_row, axis=1)

        indicator_grouped = recoded.groupby(geographic_level)[indicator_id]

        indicator_universe = indicator_grouped.count()
        indicator_count = indicator_grouped.sum()
        indicator_percentage = (indicator_count / indicator_universe * 100).fillna(0)

        for location, c, u, p in zip(indicator_universe.index, indicator_count, indicator_universe, indicator_percentage):
            results.append({
                    "indicator_id": indicator_id,
                    "survey_question_id": "",
                    "survey_question_option_id": "",
                    "location": location,
                    "count": c,
                    "universe": u,
                    "percentage": p,
                } )

    return pd.DataFrame(results)

In [36]:
def agg_survey(df, config, location_map, geographic_level):
    results = []

    print(f"location_map: {location_map}")
    print(f"geographic_level: {geographic_level}")
    # Recode the DataFrame
    recoded_df = df.copy()
    if "recode" in config:
        for col, recode_info in config["recode"].items():
            if col in recoded_df.columns:
                # print(f"Recoding column: {col}")
                # print(f"Original unique values: {df[col].unique()}")
                # print(f"Mapping keys: {recode_info['mapping'].keys()}")

                # This isn't working properly
                # recoded_df[col] = recoded_df[col].astype(str)
                recoded_df[col] = df[col].apply(lambda x: str(int(x)) if isinstance(x, (int, float)) and x.is_integer() else str(x))
                recoded_df[col] = recoded_df[col].map(recode_info["mapping"])

                # print(f"Recoded unique values: {recoded_df[col].unique()}")
    # print("Recoded DataFrame:")
    # print(recoded_df.head())

    recoded_df.to_csv('recode_test_20250227.csv')

    if geographic_level in location_map and isinstance(location_map[geographic_level], dict):
        location_mapping = location_map[geographic_level]
    else:
        print(f"Geographic level {geographic_level} not found in location_map.")
        location_mapping = {}

    # Process indicators
    for indicator_id, indicator_info in config["indicators"].items():
        for question_id, question_info in indicator_info["questions"].items():
            question_col = question_info["column"]
            question_id_config = question_info["question_id"]
            options = question_info["options"]["values"]

            if question_col not in recoded_df.columns:
                print(f"Warning: Column '{question_col}' not found in DataFrame.")
                continue

            # convert to ints
            recoded_df[question_col] = pd.to_numeric(recoded_df[question_col], errors='coerce')
            recoded_df[question_col] = recoded_df[question_col].fillna(0).astype(int)

            grouped = recoded_df.groupby(geographic_level)[question_col]
            universe = grouped.count()

            
            for option_id, option_value_list in question_info["options"].items():
                for option_value in option_value_list:
                    count = grouped.apply(lambda x: sum(x.isin([option_value])))
                    percentage = (count / universe * 100).fillna(0)

                    for location, c, u, p in zip(universe.index, count, universe, percentage):
                        # Let's let this break if the ID isn't found
                        location_id = location_mapping[location]

                        results.extend([{
                                "indicator_id": indicator_id,
                                "survey_question_id": question_id_config,
                                "survey_question_option_id": option_value,
                                "location_id": location_id,
                                "count": c,
                                "universe": u,
                                "percentage": p,
                            }])

        # Indicator Level Aggregation
        indicator_cols = [q_info["column"] for q_info in indicator_info["questions"].values()]
        indicator_options = [q_info["options"]["values"] for q_info in indicator_info["questions"].values()]

        def indicator_check(row):
            for col, opts in zip(indicator_cols, indicator_options):
                if row[col] not in opts:
                    return 0
            return 1

        recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)

        indicator_grouped = recoded_df.groupby(geographic_level)[indicator_id]
        indicator_count = indicator_grouped.sum()
        indicator_universe = indicator_grouped.count()
        indicator_percentage = (indicator_count / indicator_universe * 100).fillna(0)

        for location, c, u, p in zip(indicator_universe.index, indicator_count, indicator_universe, indicator_percentage):
            if isinstance(location_mapping, dict):
                location_id = location_mapping.get(location, location)
            else: 
                location_id = location
                
            results.extend([{
                    "indicator_id": indicator_id,
                    "survey_question_id": "",
                    "survey_question_option_id": "",
                    "location_id": location_id,
                    "count": c,
                    "universe": u,
                    "percentage": p,
                }])
    print(f"location_id: {results}")

    return pd.DataFrame(results)

In [29]:
def append_to_table(df):
    os.chdir('C:/Users/jordan.D3V2/Desktop')
    with open("config.toml", "rb") as f:
        config = tomli.load(f)

    HOST, DBNAME, USER, PASSWORD = config["edw-v2"].values()

    engine = create_engine(f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}/{DBNAME}")

    for col in df.columns:
        if df[col].dtype == 'object':
            if any(df[col] == ""):
                df[col] = df[col].replace("", np.nan)
                df[col] = df[col].astype("Int64")            

    df.to_sql(
        "context_values", 
        engine, 
        schema="nvi", 
        if_exists="append", 
        index=False
    )
    return

In [96]:
def transform_data():
    df = pd.read_csv("Q:/NVI/2024/Raw Survey Response/final_nvi_surveys_complete_20250224.csv", encoding="latin-1")

    geo = (
        pd.read_csv("Q:/NVI/2024/Prelimenary Data/nvi_geocode_tempzones_20250311.csv")
        .rename(columns={"council_districts": "__district", "neighborhood_zones": "zone"})
        .astype({"__district": pd.Int64Dtype()})
        .assign(district=lambda df: np.where(df["__district"].isna(), pd.NA, df["__district"].astype(str)))
        .drop("__district", axis=1)
    )

    # What should we do with missing geocodes?

    logger.info(f"Null Districts: {geo["district"].isna().sum()}")
    logger.info(f"Null Zones: {geo["zone"].isna().sum()}")

    df = df.merge(geo, left_on="Response ID", right_on="id")
    logger.info(df.columns)

    indicator_map = json.loads((WORKING_DIR / "conf" / 'indicator_map.json').read_text())
    location_map = json.loads((WORKING_DIR / "conf" / 'location_map.json').read_text())

    
    city = agg_survey(df, indicator_map, location_map, "citywide")

    geocoded = df.dropna(subset=["district", "zone"])

    district = agg_survey(geocoded, indicator_map, location_map, "district")
    zone = agg_survey(geocoded, indicator_map, location_map, "zone")

    df = pd.concat([city, district, zone], ignore_index=True)



    df.insert(1, "survey_id", 1)
    df.insert(1, "year", 2024)
    df.to_csv("nvi_test_20250327.csv", index=False)

    # didn't get this part to work yet so I just did an insert on the table in sql outside of this to add the rows (see below)
    # append_to_table(df)
    return



In [103]:
transform_data()

  df = pd.read_csv("Q:/NVI/2024/Raw Survey Response/final_nvi_surveys_complete_20250224.csv", encoding="latin-1")


[INFO|1208991613.py|transform_data|L14] @ 2025-04-05 08:50:11,402: Null Districts: 424
[INFO|1208991613.py|transform_data|L15] @ 2025-04-05 08:50:11,405: Null Zones: 424
[INFO|1208991613.py|transform_data|L18] @ 2025-04-05 08:50:11,422: Index(['Response ID', 'Time Started', 'Date Submitted', 'Status', 'Contact ID',
       'Legacy Comments', 'Comments', 'Language', 'Referer', 'SessionID',
       ...
       'GiftCard_State', 'GiftCard_ZipCode', 'Receive_NVI_Update',
       'Receive_NVI_Update_Email', 'id', 'lat', 'lon', 'zone', 'citywide',
       'district'],
      dtype='object', length=283)
location_map: {'citywide': {'Detroit': 1}, 'district': {'1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8}, 'zone': {'1a': 9, '1b': 10, '1c': 11, '2a': 12, '2b': 13, '2c': 14, '3a': 15, '3b': 16, '3c': 17, '4a': 18, '4b': 19, '4c': 20, '5a': 21, '5b': 22, '5c': 23, '5d': 24, '6a': 25, '6b': 26, '6c': 27, '6d': 28, '7a': 29, '7b': 30, '7c': 31}}
geographic_level: citywide


  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  reco

location_id: [{'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 1, 'count': 643, 'universe': 3594, 'percentage': 17.890929326655534}, {'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 21, 'location_id': 1, 'count': 2279, 'universe': 3594, 'percentage': 63.4112409571508}, {'indicator_id': '1', 'survey_question_id': '', 'survey_question_option_id': '', 'location_id': 1, 'count': 2922, 'universe': 3594, 'percentage': 81.30217028380635}, {'indicator_id': '2', 'survey_question_id': 1, 'survey_question_option_id': 3, 'location_id': 1, 'count': 1148, 'universe': 3594, 'percentage': 31.942125765164164}, {'indicator_id': '2', 'survey_question_id': 1, 'survey_question_option_id': 70, 'location_id': 1, 'count': 963, 'universe': 3594, 'percentage': 26.79465776293823}, {'indicator_id': '2', 'survey_question_id': 2, 'survey_question_option_id': 3, 'location_id': 1, 'count': 1121, 'universe': 3594, 'percentage': 31.19087367835281}

  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  reco

location_id: [{'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 2, 'count': 97, 'universe': 575, 'percentage': 16.869565217391305}, {'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 3, 'count': 92, 'universe': 496, 'percentage': 18.548387096774192}, {'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 4, 'count': 65, 'universe': 323, 'percentage': 20.123839009287924}, {'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 5, 'count': 87, 'universe': 540, 'percentage': 16.11111111111111}, {'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 6, 'count': 126, 'universe': 714, 'percentage': 17.647058823529413}, {'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 7, 'count': 103, 'universe': 513, 'percentage': 20.077972709551656}, {'indic

  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  reco

location_id: [{'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 9, 'count': 36, 'universe': 200, 'percentage': 18.0}, {'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 10, 'count': 29, 'universe': 144, 'percentage': 20.13888888888889}, {'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 11, 'count': 32, 'universe': 231, 'percentage': 13.852813852813853}, {'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 12, 'count': 51, 'universe': 200, 'percentage': 25.5}, {'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 13, 'count': 19, 'universe': 168, 'percentage': 11.30952380952381}, {'indicator_id': '1', 'survey_question_id': 1, 'survey_question_option_id': 20, 'location_id': 14, 'count': 22, 'universe': 128, 'percentage': 17.1875}, {'indicator_id': '1', 'survey_question_id': 

  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)
  recoded_df[indicator_id] = recoded_df.apply(indicator_check, axis=1)


In [None]:
#  SQL insert
# insert into nvi.context_values(
#         indicator_id,
#         location_id,
#         year,
#         survey_id,
#         survey_question_id,
#         survey_question_option_id,
#         count,
#         universe,
#         percentage
# )
# select indicator_id,
#        location_id,
#        year,
#        survey_id,
#        survey_question_id,
#        survey_question_option_id,
#        count,
#        universe,
#        percentage
# from data_dua.nvi_survey_context_values_20250327

In [104]:
output = pd.read_csv("nvi_test_20250327.csv").astype({
    "survey_question_id": pd.Int64Dtype(),
    "survey_question_option_id": pd.Int64Dtype(),
})

In [105]:
output.to_csv(Path.cwd() / "output" / "primary_output.csv")