### **Table of Contents**
    
* [Error Handling](#error-handling)
* [Exploration and Cleaning](#exploration-and-cleaning)
  * [EDA Function](#eda-function)
  * [K Count](#k-count)
  * [HUD](#hud)
* [Keep this file in your project](#keep-this-file-in-your-project)

In [10]:
import pandas as pd
import os
import json


# Error Handling

## No need to import your packages again. you always import them at the top. 

In [11]:
def safe_read_excel(filepath, **kwargs):
    """Read Excel safely and handle both single- and multi-sheet files."""
    import os
    import pandas as pd

    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        return {}

    try:
        df = pd.read_excel(filepath, **kwargs)
        if isinstance(df, dict):
            print(f"Loaded {len(df)} sheets from {os.path.basename(filepath)}")
        else:
            print(f"Loaded {os.path.basename(filepath)} — shape {df.shape}")
        return df
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return {}

In [12]:
def safe_read_excel(filepath, **kwargs):
    """Read Excel safely and handle both single- and multi-sheet files."""

    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        return {}

    try:
        df = pd.read_excel(filepath, **kwargs)
        if isinstance(df, dict):
            print(f"Loaded {len(df)} sheets from {os.path.basename(filepath)}")
        else:
            print(f"Loaded {os.path.basename(filepath)} — shape {df.shape}")
        return df
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return {}

# Exploration and Cleaning

## EDA Function

In [13]:
def basic_eda(df: pd.DataFrame, show_head: bool = True, show_tail: bool = True, show_info: bool = True) -> None:
    """
    Prints a quick summary of a DataFrame, including shape, column names, data types,
    missing values, and optional previews of the data.

    Args:
        df: The pandas DataFrame to summarize.
        show_head: Whether to display df.head(). Default is True.
        show_tail: Whether to display df.tail(). Default is True.
        show_info: Whether to display df.info(). Default is True.

    Returns:
        None. Creates a printed summary.
    """

    print("DataFrame Shape:", df.shape)
    print("\n Column Names:")
    print(df.columns.tolist())

    print("\n Data Types:")
    print(df.dtypes)

    if show_info:
        print("\n DataFrame Info:")
        df.info()

    print("\n Null Values (%):")
    nulls = df.isnull().mean() * 100 
    print(nulls[nulls > 0].round(2).sort_values(ascending=False))

    if show_head:
        print("\n Preview (Head):")
        print(df.head())

    if show_tail:
        print("\n Preview (Tail):")
        print(df.tail())

## K Count

We should put our notes here to keep it clean :) 

In [14]:
# Importing K count data (starts on line 2)
kcount_df = safe_read_excel("../ashley/data/k_count.xlsx", header=2)

# Calling EDA function

basic_eda(kcount_df)

Loaded k_count.xlsx — shape (133, 15)
DataFrame Shape: (133, 15)

 Column Names:
['County', 'KY Balance of State CoC Region', 'KY Area Development District (Balance of State Coordinated Entry Local Prioritization Community)', 'Total   Homeless ', 'Population Estimates 2024* (most recent available in 2025)', 'Percentage of Population', 'Total Number of persons', 'Total Number of persons.1', 'Total Number of persons.2', 'Total Number of Veterans', 'Total Number of Chronically Homeless Persons', 'Adults with Serious Mental Illness (self-reported)', 'Adults with a Substance Use Disorder (self-reported)', 'Adults with HIV/AIDS (self-reported)', 'Adults whose current episode of homelessness caused by domestic violence (self-reported)']

 Data Types:
County                                                                                               object
KY Balance of State CoC Region                                                                       object
KY Area Development District (

# check this out
notice I removed `print()` from the code below. 

```python 
print(kcount_df.head())
```

If you just use `kcount_df.head()` it prints a nice dataframe that is easy to read. 

### I would break this up into several cells
This is up to you but typically I would make each cell its own concern. 
- So Remove rows 
- remove columns 
- map... 

In [15]:
# Removing rows without a county name
kcount_df = kcount_df[kcount_df['County'].notna()].copy()

# Remove whitespace and special characters from column names
kcount_df.columns = (
    kcount_df.columns
    .str.strip()                      # remove extra spaces
    .str.replace(r"\s+", "_", regex=True)  # replace spaces with underscores
    .str.replace(r"[\*\(\)]", "", regex=True)  # remove parentheses and asterisks
)

# Dropping duplicate columns
kcount_df = kcount_df.drop(columns=["Total_Number_of_persons.1", "Total_Number_of_persons.2"], errors="ignore")

# Renaming long column names
rename_map = {
    "County": "county",
    "KY_Balance_of_State_CoC_Region": "coc_region",
    "KY_Area_Development_District_Balance_of_State_Coordinated_Entry_Local_Prioritization_Community": "add_region",
    "Total___Homeless": "total_homeless",
    "Population_Estimates_2024_most_recent_available_in_2025": "population_2024",
    "Percentage_of_Population": "percent_of_population",
    "Total_Number_of_persons": "total_persons",
    "Total_Number_of_Veterans": "veterans",
    "Total_Number_of_Chronically_Homeless_Persons": "chronically_homeless",
    "Adults_with_Serious_Mental_Illness_self-reported": "serious_mental_illness",
    "Adults_with_a_Substance_Use_Disorder_self-reported": "substance_use_disorder",
    "Adults_with_HIV/AIDS_self-reported": "hiv_aids",
    "Adults_whose_current_episode_of_homelessness_caused_by_domestic_violence_self-reported": "domestic_violence"
}
kcount_df = kcount_df.rename(columns=rename_map)

# Converting numeric columns to numeric datatype
num_cols = kcount_df.select_dtypes(include="number").columns
kcount_df[num_cols] = kcount_df[num_cols].apply(pd.to_numeric, errors="coerce")

# Removing lines that are summaries
kcount_df = kcount_df[~kcount_df["county"].isin(["Kentucky", "Balance of State"])]

# Resetting index
kcount_df = kcount_df.reset_index(drop=True)

print("Cleaned DataFrame Shape:", kcount_df.shape)
print("\n Columns:", kcount_df.columns.tolist())
print("\n Preview:")
kcount_df.head()

Cleaned DataFrame Shape: (120, 13)

 Columns: ['county', 'coc_region', 'add_region', 'Total_Homeless', 'population_2024', 'percent_of_population', 'total_persons', 'veterans', 'chronically_homeless', 'serious_mental_illness', 'substance_use_disorder', 'hiv_aids', 'domestic_violence']

 Preview:


Unnamed: 0,county,coc_region,add_region,Total_Homeless,population_2024,percent_of_population,total_persons,veterans,chronically_homeless,serious_mental_illness,substance_use_disorder,hiv_aids,domestic_violence
0,Adair,5,Lake Cumberland,5.0,19239.0,0.00026,5.0,0.0,0.0,1.0,2.0,0.0,0.0
1,Allen,2,Barren River,0.0,22037.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Anderson,6,Bluegrass,2.0,24883.0,8e-05,2.0,0.0,0.0,2.0,0.0,0.0,0.0
3,Ballard,1,Purchase,0.0,7626.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Barren,2,Barren River,0.0,45609.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## HUD

In [16]:
# Importing HUD data as a dictionary because it contains a sheet for each year
hud_sheets = safe_read_excel("../ashley/data/hud.xlsx", sheet_name=None, header=1)

hud_list = []

for year, df in hud_sheets.items():
    df = df.copy()
    df["year"] = str(year)  # adding year column
    hud_list.append(df)

hud_all = pd.concat(hud_list, ignore_index=True) # Putting all years into a single dataframe

# Calling EDA function
basic_eda(hud_all)

Loaded 9 sheets from hud.xlsx
DataFrame Shape: (3540, 138)

 Column Names:
['State', 'Continuum of Care (CoC)', 'HUD CoC Number', '2015 CoC Award Amount\n($0 indicates that an application was not submitted)', 'AHAR Part 1 CoC Category', 'Total Non-DV Beds on 2015 HIC ES+TH', 'Total Non-DV HMIS Beds on 2015 HIC ES+TH', '2015 Bed coverage Percent on HMIS for ES-TH Combined', 'ES-SH Avg (Days)', 'ES-SH Median (Days)', 'ES-SH-TH Avg (Days)', 'ES-SH-TH Median (Days)', 'Total Persons Exited (SO+ES+TH+SH+PH)', 'Total Persons Returns in 6 mths', 'Total Persons Returns in 12 mths (should include the 6-month cohort)', 'Total Persons Returns in 24 mths (should include both the 6- and 12-month cohort)', 'Percent Returns in 6 mths', 'Percent Returns in 12 mths (should include the 6-month cohort)', 'Percent Returns in 24 mths (should include both the 6- and 12-month cohort)', 'Total HMIS Count', 'Total Stayers (persons)', 'Total Stayers increased earned income', 'Percent Stayers increased earned inc

In [17]:
# Removing footnote rows
hud_all = hud_all[hud_all["State"].str.len() == 2]  # keep only 2-letter state abbreviations

# Normalize column names
hud_all.columns = (
    hud_all.columns
    .str.strip()
    .str.replace(r"\s+", "_", regex=True)
    .str.replace(r"[\(\)\n]", "", regex=True)
    .str.replace(r"[^A-Za-z0-9_]+", "", regex=True)
    .str.lower()
)

# Change numeric columns datatype
num_cols = hud_all.columns.drop(["state", "continuum_of_care_coc", "hud_coc_number", "ahar_part_1_coc_category"])
hud_all[num_cols] = hud_all[num_cols].apply(pd.to_numeric, errors="coerce")

# Drop columns with missing values >30%
threshold = len(hud_all) * 0.7
hud_df = hud_all.dropna(thresh=threshold, axis=1)

# Reset index
hud_all = hud_all.reset_index(drop=True)

# Filter for Kentucky
hud_ky = hud_all[hud_all["state"] == "KY"].copy()

print("Cleaned HUD Data Shape:", hud_df.shape)
print("\n Columns:", hud_df.columns.tolist())
print("\n Kentucky Rows Preview:")
hud_ky.head()

Cleaned HUD Data Shape: (1592, 43)

 Columns: ['state', 'continuum_of_care_coc', 'hud_coc_number', 'ahar_part_1_coc_category', 'essh_avg_days', 'essh_median_days', 'esshth_avg_days', 'esshth_median_days', 'total_persons_exited_soesthshph', 'total_persons_returns_in_6_mths', 'total_persons_returns_in_12_mths_should_include_the_6month_cohort', 'total_persons_returns_in_24_mths_should_include_both_the_6_and_12month_cohort', 'percent_returns_in_6_mths', 'percent_returns_in_12_mths_should_include_the_6month_cohort', 'percent_returns_in_24_mths_should_include_both_the_6_and_12month_cohort', 'total_hmis_count', 'total_stayers_persons', 'total_stayers_increased_earned_income', 'percent_stayers_increased_earned_income', 'total_stayers_increased_nonemployment_cash_income', 'percent_stayers_increased_nonemployment_cash_income', 'total_stayers_increased_total_income', 'percent_stayers_increased_total_income', 'total_leavers_persons', 'total_leavers_increased_earned_income', 'percent_leavers_increa

Unnamed: 0,state,continuum_of_care_coc,hud_coc_number,2015_coc_award_amount_0_indicates_that_an_application_was_not_submitted,ahar_part_1_coc_category,total_nondv_beds_on_2015_hic_esth,total_nondv_hmis_beds_on_2015_hic_esth,2015_bed_coverage_percent_on_hmis_for_esth_combined,essh_avg_days,essh_median_days,...,unnamed_3,total_nondv_beds_on_2021_hic_esth,total_nondv_hmis_beds_on_2021_hic_esth,2021_bed_coverage_percent_on_hmis_for_esth_combined,total_nondv_beds_on_2022_hic_esth,total_nondv_hmis_beds_on_2022_hic_esth,2022_bed_coverage_percent_on_hmis_for_esth_combined,total_year_round_nonehv_and_nondisaster_beds_in_2023_hic,total_year_round_beds_in_hmis_or_comparable_databases_in_2023_hic,2023_bed_coverage_percent
141,KY,Kentucky Balance of State CoC,KY-500,7981008.0,Balance of State and Statewide CoCs,1299.0,869.0,0.668976,39.0,20.0,...,,,,,,,,,,
142,KY,Louisville/Jefferson County CoC,KY-501,9064742.0,Major Cities,1018.0,1018.0,1.0,41.0,13.0,...,,,,,,,,,,
143,KY,Lexington/Fayette County CoC,KY-502,1589681.0,"Smaller Cities, Counties, and Regional CoCs",1210.0,747.0,0.617355,39.0,10.0,...,,,,,,,,,,
543,KY,Kentucky Balance of State CoC,KY-500,,Balance of State and Statewide CoCs,,,,39.0,19.0,...,,,,,,,,,,
544,KY,Louisville-Jefferson County CoC,KY-501,,Major Cities,,,,46.0,13.0,...,,,,,,,,,,


# Keep this file in your project

keep at the bottom of your project

We will go over it later, its a script that will generate a markdown table of contents based on your headings created here in markdown.

In [19]:
def generate_toc_from_notebook(notebook_path):
    """
    Parses a local .ipynb file and generates Markdown for a Table of Contents.
    """
    if not os.path.isfile(notebook_path):
        print(f"❌ Error: File not found at '{notebook_path}'")
        return

    with open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)

    toc_markdown = "### **Table of Contents**\n"
    for cell in notebook.get('cells', []):
        if cell.get('cell_type') == 'markdown':
            for line in cell.get('source', []):
                if line.strip().startswith('#'):
                    level = line.count('#')
                    title = line.strip('#').strip()
                    link = title.lower().replace(' ', '-').strip('-.()')
                    indent = '  ' * (level - 1)
                    toc_markdown += f"{indent}* [{title}](#{link})\n"

    print("\n--- ✅ Copy the Markdown below and paste"
          "it into a new markdown cell ---\n")
    print(toc_markdown)


if __name__ == "__main__":
    # Example usage
    notebook_path = 'ashley.ipynb'  # Replace with your notebook path
    generate_toc_from_notebook(notebook_path)


--- ✅ Copy the Markdown below and pasteit into a new markdown cell ---

### **Table of Contents**
    * [**Table of Contents**](#**table-of-contents**)
* [Error Handling](#error-handling)
* [Exploration and Cleaning](#exploration-and-cleaning)
  * [EDA Function](#eda-function)
  * [K Count](#k-count)
  * [HUD](#hud)
* [Keep this file in your project](#keep-this-file-in-your-project)

