In [1]:
from pathlib import Path
import json
import configparser
import pandas as pd
import geopandas as gpd
from openpyxl.utils import get_column_letter

from nvi_etl import setup_logging
from nvi_etl.geo_reference import (
    pull_city_boundary, pull_council_districts, pull_zones
)

In [None]:
# Build, for every indicator in the indicator map, a pair of boolean columns
# (`universe_<id>` and `count_<id>`) indexed by "Response ID", then place all
# indicators side by side in `complete`.
columns = []
for indicator_id, indicator_details in indicator_map["indicators"].items():
    # For now every respondent is in the universe, but this may change in the future.
    # The series is built on recoded's own index so that the `&` and `.assign`
    # below line up row-for-row even if recoded does not carry a 0..n-1 index.
    universe = pd.Series(True, index=recoded.index)

    # Count any respondent who meets the criteria of at least one question
    count = pd.concat(
        [
            recoded[question_details["column"]].isin(question_details["options"]["values"])
            for question_details in indicator_details["questions"].values()
        ],
        axis=1,
    ).any(axis=1) & universe  # but only include people who are in the universe

    indicator_cols = (
        recoded[["Response ID"]]  # Leave as a one column DataFrame
        .assign(
            **{
                f"universe_{indicator_id}": universe,
                f"count_{indicator_id}": count,
            }
        )
        .set_index("Response ID")
    )

    columns.append(indicator_cols)

complete = pd.concat(columns, axis=1)

In [None]:
# For each geography level, attach that level's identifier to every response,
# relabel it `location_id`, and sum the universe_/count_ columns per location.
# The three per-level tables are stacked on top of one another.
aggregations = pd.concat(
    [
        (
            complete
            .join(recoded[["Response ID", level]].set_index("Response ID"))
            .rename(columns={level: "location_id"})
            .groupby("location_id")
            .sum()
            .reset_index()
        )
        for level in ("citywide", "district", "zone")
    ],
    axis=0,
)

# Reshape from one column per (stub, indicator) to one row per
# (location_id, indicator_id) with a `count` and a `universe` column.
long_file = pd.wide_to_long(
    aggregations,
    ["count", "universe"],
    i="location_id",
    j="indicator_id",
    sep="_",
)

# Working out using new data dictionary setup

In [None]:
normalized_data_dictionary = pd.read_excel(
    Path.cwd() / "conf" / "normalized_data_dict.xlsx"
)
recoded = pd.read_csv(
    Path.cwd() / "output" / "nvi_2024_analysis_source.csv",
    low_memory=False,
)

# Rebuild the survey column name each data-dictionary row corresponds to.
# A set is used, so repeated names collapse automatically.
reconstructed_columns = set()
for _, row in normalized_data_dictionary.iterrows():
    question, answer, group = row["question"], row["answer"], row["group"]
    is_multiselect = row["multiselect"]

    if is_multiselect and group != question and question != answer:
        # Grid of death: answer, question and group all appear in the name
        new_col = ":".join((answer, question, group))
    elif group == question:
        # Ungrouped single-select questions: the name is the question itself
        new_col = question
    elif not is_multiselect:
        # Likert / other single-select
        new_col = ":".join((question, group))
    else:
        # Grouped multi-select
        new_col = ":".join((answer, group))

    reconstructed_columns.add(new_col)


# Columns of the recoded survey file that are deliberately left out when
# listing columns the data dictionary failed to reconstruct (`unbuilt`).
ignore_columns = [
    'Response ID',
    'Time Started',
    'Date Submitted',
    'Status',
    'Contact ID',
    'Legacy Comments',
    'Comments',
    'Language',
    'Referer',
    'SessionID',
    'User Agent',
    'Tags',
    'IP Address',
    'Longitude',
    'Latitude',
    'Country',
    'City',
    'State/Region',
    'Postal',
    'By selecting this box, I confirm that I am at least 18 years old, that my primary residence is in the City of Detroit, and that I have read and understand the above information.:Confirm_18_Resident_Consent',
    'Detroit_Residence_Address',
    'Detroit_Residence_ZipCode',
    'Geocode_address',
    'Geocode_zip',
    'City_D',
    'State_M',
    'USER_Response_ID',
    'successful_geocode',
    'X',
    'Y',
    'citywide',
    'district',
    'zone',
]


# Reconstructed names that actually exist as columns of the recoded file.
survey_columns = recoded.columns
within = [name for name in reconstructed_columns if name in survey_columns]

# Survey columns the dictionary did not reproduce and that are not explicitly ignored.
unbuilt = {
    name
    for name in recoded.columns
    if not (name in within or name in ignore_columns)
}


print(len(within))
# GOAL: 246 columns

246


In [59]:
# NOTE(review): the loop visible in this notebook adds plain strings to
# `reconstructed_columns`, while this cell asks for two columns
# ("case", "column"). It presumably ran against an earlier version that stored
# (case, column) tuples — confirm before re-running on a fresh kernel.
case_results = pd.DataFrame(reconstructed_columns, columns=["case", "column"])

In [64]:
# Display only the rows whose "case" value is 4.
case_results[case_results["case"] == 4]

Unnamed: 0,case,column
0,4,"Yes, my household uses a program to help pay f..."
2,4,Submitting an online job application:Comfortab...
6,4,Other (if you use a program not listed above t...
7,4,Cleaned up or improved alley ways:In_The_Last_...
14,4,"Using video conferencing software (Zoom, Googl..."
...,...,...
238,4,From my local Community Development Organizati...
239,4,Other:Hear_Of_Survey
243,4,Advanced computer skills training:Resources_Im...
244,4,Community development organization(s):Communit...


In [None]:
# Write a query that shows what's missing from context / primary indicators
# What do we do with non-context primary survey answers?
# - Can we use the value type field to capture the difference between survey questions used in NVI and
#   survey questions that aren't?

In [None]:
# Collect every (indicator id, survey column, question id) combination the
# indicator map refers to. The indicators are read through the "indicators"
# key when it is present (the layout used where the universe_/count_ columns
# are built) and otherwise the map is treated as a flat
# {indicator_id: details} mapping.
indicators = indicator_map.get("indicators", indicator_map)

indicator_columns = set()
for ind_id, indicator in indicators.items():
    for question in indicator["questions"].values():
        indicator_columns.add((ind_id, question["column"], question["question_id"]))

# One small frame per combination listing each distinct answer value found in
# the column, tagged with the column, indicator and question it belongs to.
result = []
for ind_id, col, quest_id in indicator_columns:
    result.append(
        recoded[col]
        .value_counts()
        # Name the answer values and the counts explicitly so the "value"
        # column holds the answers regardless of how the installed pandas
        # version labels value_counts output.
        .rename_axis("value")
        .reset_index(name="count")
        .assign(
            column_name=col,
            indicator_id=ind_id,
            question_id=quest_id,
        )[["column_name", "value", "indicator_id", "question_id"]]
    )

def sort_key(col):
    """
    Sort key for `DataFrame.sort_values(..., key=sort_key)`.

    Numeric columns (plain int/float as well as nullable integer dtypes) are
    returned unchanged so they sort by value. Text columns are split on ":"
    and the key becomes the right-most segment followed by the left-most
    segment, so names sharing a trailing group sort next to each other. A
    double colon needs no special handling: the empty middle segment never
    ends up first or last.

    Parameters
    ----------
    col : pd.Series
        The column handed over by `sort_values`.

    Returns
    -------
    pd.Series
        Values to sort by, aligned with `col`.
    """
    # If you get the 'value' col just return it as-is
    if pd.api.types.is_numeric_dtype(col):
        return col

    # Rows with fewer segments are padded on the right with their last
    # segment, so the final column always holds each row's right-most piece.
    filled = (
        col.str.split(":", expand=True)
        .ffill(axis=1)
    )

    return filled.iloc[:, -1] + filled.iloc[:, 0]


# Stack the per-column value tables, order them with sort_key, add an empty
# question_option_id column and write the result out as a CSV.
(
    pd.concat(result)
    .sort_values(by=["column_name", "value"], key=sort_key)
    .assign(question_option_id=pd.NA)
    .to_csv(WORKING_DIR / "conf" / "variable_questions.csv", index=False)
)

# District-by-answer counts for "Will_Meet_Needs_10_Years:Current_Housing".
# The column selected here must be the same one the pivot spreads across the
# columns. Answer codes 1-5 become Likert labels; every other value, including
# a missing answer, is labelled "not_answered". A per-district total is added
# before the table is written out.
(
    recoded[["Will_Meet_Needs_10_Years:Current_Housing", "district"]]
    .fillna(pd.NA)
    .groupby("district")
    .value_counts(dropna=False)
    .reset_index(level=1)
    .pivot(columns="Will_Meet_Needs_10_Years:Current_Housing")
    .rename(columns=lambda c: {
        1: "strongly_disagree",
        2: "somewhat_disagree",
        3: "neither_agree_nor_disagree",
        4: "somewhat_agree",
        5: "strongly_agree",
    }.get(c, 'not_answered'))
    .fillna(0)
    .droplevel(level=0, axis=1)
    .assign(
        total=lambda df: df.sum(axis=1)
    )
    .astype(pd.Int64Dtype())
    .to_csv("current_housing_will_meet_needs_10_yrs.csv")
)

# Zone-by-answer counts for "Meets_Needs_Now:Current_Housing". Answer codes
# 1-5 become Likert labels; every other value, including a missing answer, is
# labelled "not_answered". A per-zone total is added before writing.
# The file is overwritten rather than appended to, so re-running the cell
# always leaves exactly one copy of the table on disk.
(
    recoded[["Meets_Needs_Now:Current_Housing", "zone"]]
    .fillna(pd.NA)
    .groupby("zone")
    .value_counts(dropna=False)
    .reset_index(level=1)
    .pivot(columns="Meets_Needs_Now:Current_Housing")
    .rename(columns=lambda c: {
        1: "strongly_disagree",
        2: "somewhat_disagree",
        3: "neither_agree_nor_disagree",
        4: "somewhat_agree",
        5: "strongly_agree",
    }.get(c, 'not_answered'))
    .fillna(0)
    .droplevel(level=0, axis=1)
    .assign(
        total=lambda df: df.sum(axis=1)
    )
    .astype(pd.Int64Dtype())
    .to_csv("current_housing_meets_needs_now.csv")
)


def zebra(s):
    """
    Return one CSS string per element of `s`, shading every other position.

    Positions 0, 2, 4, ... get a grey background (#d9d9d9, close to Excel's
    "Gray 2"); the remaining positions get an empty style.
    """
    shaded = 'background-color:#d9d9d9'
    styles = []
    for position in range(len(s)):
        styles.append('' if position % 2 else shaded)
    return styles

# CSS fragments shared by the styled Excel exports.
base = "font-family:'IBM Plex Sans';font-size:13pt;border:1px solid black"
header = f"{base};font-weight:bold"
gray = "background-color:#d9d9d9"


# Every survey column whose name ends with the environmental-severity group.
environmental_columns = list(
    filter(
        lambda name: name.endswith("Neighborhood_Environmental_Severity"),
        recoded.columns,
    )
)


# Answer code -> label used for the environmental-severity tables.
renames = dict(
    enumerate(
        [
            "Severe",
            "Major",
            "Moderate",
            "Minor",
            "Insignificant/Not a Problem",
        ]
    )
)


# Write one formatted worksheet per environmental-severity question into a
# single workbook. The writer is opened once for the whole loop instead of
# being reopened for every sheet, so no write/append toggle is needed.
# An empty workbook cannot be saved, so nothing is written when there are no
# matching columns.
# NOTE(review): sheet names come straight from the text before the first ":"
# in the column name; Excel restricts sheet-title length and characters —
# confirm none of the question names run into that.
if environmental_columns:
    with pd.ExcelWriter(
            "environmental_questions.xlsx",
            mode="w",
            engine="openpyxl",
    ) as writer:
        for column in environmental_columns:
            concern, *_ = column.split(":")

            # District-by-answer counts with a per-district total
            q_table = (
                recoded[[column, "district"]]
                .fillna(pd.NA)
                .groupby("district")
                .value_counts(dropna=False)
                .reset_index(level=1)
                .pivot(columns=column)
                .rename(columns=lambda c: renames.get(c, 'not_answered'))
                .fillna(0)
                .droplevel(level=0, axis=1)
                .assign(
                    total=lambda df: df.sum(axis=1)
                )
                .astype(pd.Int64Dtype())
                .reset_index()
            )

            (
                q_table
                .style
                .set_properties(
                    **{
                        "font-family": "IBM Plex Sans",   # Excel falls back if font not installed
                        "font-size":   "13pt",
                        "border":      "1px solid black"
                    }
                )
                .apply(zebra)                 # alternating row shade
                .apply_index(lambda s: [header]*len(s), axis="columns")  # column headers
                .apply_index(lambda s: [base]*len(s),   axis="index")    # row index cells
                .set_table_styles(
                    [
                        {
                            "selector": "th",
                            "props": [("font-weight", "bold"), ("border", "1px solid black")]
                        }
                    ]
                )
                .to_excel(writer, sheet_name=concern, index=False)
            )

            ws = writer.sheets[concern]                 # openpyxl worksheet

            # Every column except the first gets a fixed width of 25; the
            # first column keeps the default width.
            for idx in range(2, ws.max_column + 1):
                ws.column_dimensions[get_column_letter(idx)].width = 25

In [None]:
# Export the normalized data dictionary as a formatted workbook in the current
# working directory: uniform cell font/border, alternating row shading, bold
# header cells.
cell_properties = {
    "font-family": "IBM Plex Sans",   # Excel falls back if font not installed
    "font-size": "13pt",
    "border": "1px solid black",
}
header_cell_rule = {
    "selector": "th",
    "props": [("font-weight", "bold"), ("border", "1px solid black")],
}

dictionary_styler = (
    normalized_data_dictionary
    .reset_index()
    .drop(columns="index")
    .style
    .set_properties(**cell_properties)
    .apply(zebra)                                                        # alternating row shade
    .apply_index(lambda labels: [header] * len(labels), axis="columns")  # column headers
    .apply_index(lambda labels: [base] * len(labels), axis="index")      # row index cells
    .set_table_styles([header_cell_rule])
)
dictionary_styler.to_excel("normalized_data_dict.xlsx", sheet_name="dictionary", index=False)


col_ref = pd.read_excel(Path.cwd().parent.parent / "survey_col_reference.xlsx")

# Expand every reference row into one record per (column alias, answer option).
# Both cells hold newline-separated lists: aliases look like "question" or
# "question:group" (a double colon is treated as a single one), answers look
# like "code=text" or just "text".
result = []
for _, row in col_ref.dropna().iterrows():
    column_names = row["Question Alias"].split("\n")
    answers = row["Response Options and Coding"].split("\n")

    groups = []
    questions = []
    for col_name in column_names:
        q, *group = col_name.replace("::", ":").split(":")
        # Aliases without a group part get an empty group
        groups.append(group[0] if group else "")
        questions.append(q)

    codes = []
    text_values = []
    for answer in answers:
        # Split on the first "=" only, so answer text that itself contains
        # "=" is kept whole instead of being cut down to its last piece.
        *code, text_val = answer.split("=", 1)
        codes.append(code[0] if code else None)
        text_values.append(text_val)

    # Every alias on the row is paired with every answer option on the row
    records = [
        {"group": group, "question": question, "answer": text_val, "code": code}
        for group, question in zip(groups, questions)
        for code, text_val in zip(codes, text_values)
    ]

    result.append(
        pd.DataFrame(records, columns=["group", "question", "answer", "code"])
    )

normalized_data_dictionary = pd.concat(result)


import re

# Data dictionary (group / question / answer / code rows) that drives the
# per-question district tabulations below; read from the working directory.
tabulate_info = pd.read_excel("normalized_data_dict.xlsx")

def slugify(txt: str) -> str:
    """
    Turn free text into a lowercase, underscore-separated slug.

    Every run of characters other than a-z and 0-9 (after lowercasing)
    collapses into a single underscore, and underscores at either end are
    removed.
    """
    lowered = txt.lower()
    collapsed = re.sub(r'[^a-z0-9]+', '_', lowered)
    return collapsed.strip('_')


# Destination folder for the per-question CSVs; created up front so the loop
# does not fail on the first write when the folder is missing.
output_dir = Path.home() / "Desktop" / "1_projects" / "nvi_districts_tabulated"
output_dir.mkdir(parents=True, exist_ok=True)

# For every (group, question) pair in the data dictionary, tabulate the
# matching survey column by district and write the table to its own CSV.
output_tables = []
counter = 0
for (q_group, question), rows in tabulate_info.fillna("").groupby(["group", "question"]):
    data_column = question + ":" + q_group if q_group else question

    if data_column not in recoded.columns:
        # Report dictionary entries that have no matching survey column
        print(data_column)
        continue

    # Answer code -> answer text. Only rows with no code at all (blank after
    # the fillna above) are skipped; a code of 0 is a real answer code (the
    # environmental-severity labels in this notebook use 0 for "Severe") and
    # must not be dropped by a truthiness test.
    renames = {
        int(item["code"]): item["answer"]
        for _, item in rows.iterrows()
        if item["code"] != ""
    }

    q_table = (
        recoded[[data_column, "district"]]
        .fillna(pd.NA)
        .groupby("district")
        .value_counts(dropna=False)
        .reset_index(level=1)
        .pivot(columns=data_column)
        .rename(columns=lambda c: renames.get(c, 'not_answered'))
        .fillna(0)
        .droplevel(level=0, axis=1)
        .assign(
            total=lambda df: df.sum(axis=1)
        )
        .astype(pd.Int64Dtype())
        .reset_index()
    )

    q_table.to_csv(output_dir / f"{slugify(q_group)}_{slugify(question)}.csv")

    counter += 1
    
def expected_values(table):
    """
    Compute, for each cell of a contingency table, the row's share of the
    grand total multiplied by the column total.

    Parameters
    ----------
    table : pd.DataFrame
        Numeric table of counts.

    Returns
    -------
    pd.DataFrame
        Same index and columns as `table`, values rounded to two decimals.
    """
    row_totals = table.sum(axis=1)
    column_totals = table.sum(axis=0)
    grand_total = column_totals.sum()

    # Column vector of row shares times the row vector of column totals
    row_shares = (row_totals / grand_total).values.reshape(-1, 1)
    expected = row_shares * column_totals.values

    return pd.DataFrame(expected, index=table.index, columns=table.columns).round(2)


# Cross-tabulate age bracket against one "hesitate reporting crime" answer column.
group_cols = [
    "Age", 
    "General distrust of law enforcement:Hesitate_Reporting_Crime_Reason"
]

# Answer code -> text for the crime-reporting question
recodes = {
    1: "General distrust of law enforcement",
    2: "Concern that I won’t be taken seriously, or no action will be taken ",
    3: "Fear of retaliation",
    4: "Risk of consequences from immigration (such as deportation)",
    5: "Text Entry",
    6: "I would not hesitate or avoid reporting crime ",
}

# Age code -> bracket label
renames = {
    1: "18-24",
    2: "25-34",
    3: "35-44",
    4: "45-54",
    5: "55-64",
    6: "65+",
    7: "Prefers not to answer age",
}

coding_crosstab = (
    recoded[group_cols]
    .value_counts(dropna=False)
    # Name the counts explicitly so the pivot below does not depend on how the
    # installed pandas version labels value_counts output.
    .rename("count")
    .reset_index()
    .pivot(
        index=group_cols[1], 
        columns=group_cols[0], 
        values="count"
    )
    # Age codes outside the mapping (including missing) share one column name
    .rename(columns=lambda c: renames.get(c, 'not_answered_crime'))
    .reset_index()
    .pipe(
        lambda d: d.assign(
            concerns_reporting_crime=d[group_cols[1]].map(
                lambda v: recodes.get(v, "Option Not selected")
            )
        )
    )
    .drop(group_cols[1], axis=1)
    # That catch-all column only exists when some age value was unmapped;
    # tolerate its absence instead of raising.
    .drop("not_answered_crime", axis=1, errors="ignore")
    .set_index("concerns_reporting_crime")
)


aggregation_categories = [
    "citywide",
    "district",
    "zone"
]

columns = pd.read_excel(Path.cwd() / "normalized_data_dict.xlsx")
aggregate_to = "Age"

# Code -> label for the question the answers are aggregated by
aggregate_rename = {
    row["code"]: row["answer"] for _, row in
    columns[columns["question"] == aggregate_to].iterrows()
}

# For each multiselect group: a copy of its survey columns (renamed to the text
# before the first ":") next to the aggregation column, plus the renamed names.
groups = []
for (group, multiselect), rows in columns.groupby(["group", "multiselect"]):
    if not group:
        # Placeholder: ungrouped questions are not handled yet
        ...

    elif multiselect:
        # Match on the ":<group>" suffix (or the bare group name) so a group
        # whose name merely ends with the same letters is not swept in.
        questions = [
            col for col in recoded
            if col == group or col.endswith(":" + group)
        ]
        renames = {col: col.split(":")[0] for col in questions}
        groups.append((recoded[[aggregate_to] + questions].rename(columns=renames).copy(), list(renames.values())))
    
    else:
        print(group)


# Only the first multiselect group is aggregated here
df, questions = groups[0]
# An option counts as selected when its cell is not missing
df[questions] = df[questions].notna()

aggregated = pd.concat([
    df.groupby(aggregate_to, dropna=False).aggregate("sum"),
    df[aggregate_to].value_counts(dropna=False).rename("Total in Aggregation Group"),
], axis=1)

# Show the table with the aggregation codes replaced by their labels
aggregated.rename(index=lambda v: aggregate_rename.get(v, "No Answer Provided"))


grid_of_death = pd.read_excel(Path.cwd().parent.parent / "grid_of_death.xlsx")

# Flatten each grid row (newline-separated column names and "code=answer"
# options) into one record per column, checking that the answer embedded in
# the column name agrees with the answer in the coding list.
result = []
for _, row in grid_of_death.iterrows():
    columns = row["column_name"].split("\n")
    answers = row["answer"].split("\n")

    if len(columns) != len(answers):
        # zip() below stops at the shorter list; report the mismatch rather
        # than dropping the extra entries silently.
        print(f"{len(columns)} columns != {len(answers)} answers")

    for col, ans in zip(columns, answers):
        # Treat a double colon as a single separator, the same way the
        # survey column reference is parsed elsewhere in this notebook.
        answer_left, question, group = col.replace("::", ":").split(":")
        # Split on the first "=" only so answer text containing "=" survives
        code, answer_right = ans.split("=", 1)

        if answer_left != answer_right:
            print(f"{answer_left} != {answer_right}")

        result.append({
            "group": group,
            "question": question,
            "answer": answer_right,
            "code": code,
        })

pd.DataFrame(result).to_excel("grid_of_life.xlsx")
    