In [28]:
import os
from datetime import datetime, timedelta
import subprocess
import time
from gcsfs import GCSFileSystem
import pandas as pd
import tempfile
import pyreadstat
from typing import Tuple, Optional, Any
import json

In [6]:
# TODO: Place credentials path in a config.py file
# Location of gcloud's application-default credentials (ADC) file.
# gcloud stores it under %APPDATA%\gcloud on Windows and ~/.config/gcloud on
# Linux/macOS, so the path is derived from the environment rather than being
# hard-coded to a single user's profile directory.
CREDENTIALS_PATH = os.path.join(
    os.environ.get("APPDATA") or os.path.join(os.path.expanduser("~"), ".config"),
    "gcloud",
    "application_default_credentials.json",
)
# Google client libraries read this variable to locate the credentials file
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(CREDENTIALS_PATH)

# Verify credentials
# TODO: This goes in a utils script later
def check_and_authenticate(json_path):
    '''
    Check the Google application-default credentials file and re-generate it
    if it is missing or older than 24 hours.

    Freshness is judged only from the file's modification time; the token
    stored inside the file is not inspected. When re-authentication is needed,
    a separate console window is opened (Windows `start cmd /c`) running
    `gcloud auth application-default login`, and the function then polls for
    up to 5 minutes until the file is created or rewritten.

    Args:
        json_path: Path to the credentials JSON file.

    Returns:
        None. Progress and failures are reported with print(); no exception
        is propagated to the caller.
    '''
    try:
        if os.path.exists(json_path):
            # Get modification time of the file
            file_mod_time = datetime.fromtimestamp(os.path.getmtime(json_path))

            # Nothing to do while the file is at most 24 hours old
            if datetime.now() - file_mod_time <= timedelta(hours=24):
                print("Credentials file is valid.")
                return
            print("Credentials file is older than 24 hours. Re-authenticating...")
        else:
            # A missing file is handled like an expired one: any file that
            # appears afterwards counts as a successful login.
            file_mod_time = None
            print("Credentials file not found. Re-authenticating...")

        # Re-authenticate
        try:
            print("Trying reauthentication on gcloud server using shell command...")
            # `start` returns as soon as the new console is launched, so the
            # login itself finishes later; hence the polling below.
            subprocess.run("start cmd /c gcloud auth application-default login", shell=True, check=True)
            print('Login window opened...please complete authentication')

            # Poll for file creation/modification
            print("Waiting for credentials file to update...")
            max_wait = 300  # seconds
            check_interval = 2  # seconds
            start_time = datetime.now()

            while (datetime.now() - start_time).total_seconds() < max_wait:
                try:
                    new_mod_time = datetime.fromtimestamp(os.path.getmtime(json_path))
                except OSError:
                    # File not there (yet) or momentarily absent while it is
                    # being rewritten; keep waiting instead of aborting.
                    new_mod_time = None

                if new_mod_time is not None and (
                    file_mod_time is None or new_mod_time > file_mod_time
                ):
                    print("Authentication confirmed! Credentials file updated.")
                    break
                time.sleep(check_interval)
            else:
                # Loop ended without `break`: the file never changed in time
                print("Timed out waiting for credentials file update.")

        except subprocess.CalledProcessError as e:
            print(f"Error during re-authentication: {e}")
        except Exception as e:
            print(f'Authentication failed because of {e}')
    except Exception as e:
        print(f"Error: {e}")

# Verify credentials: run the check defined above against the configured credentials file
check_and_authenticate(CREDENTIALS_PATH)

Credentials file is valid.


In [9]:
# Define the GCS filesystem handle and the base GCS path
# TODO: Transfer these to a config file
# No token is passed here; per the note below, that is sufficient as long as the
# authentication step above has been completed.
# However, it is better to be explicit. Possible ways to define token are here: https://gcsfs.readthedocs.io/en/latest/api.html
fs = GCSFileSystem(project='clgx-gis-app-dev-06e3')

# Base path (bucket/folder) in GCS
gcs_path = "gs://geospatial-projects/location_inc"

In [29]:
def read_sav(
    local_path: str,
    gcs_folder_path: str = "",
    fs: Optional[Any] = None
) -> Tuple[pd.DataFrame, Any]:
    """
    Reads an SPSS .sav file. It prioritizes reading from a local directory;
    if the file is not found locally, it downloads the file of the same name
    from GCS using the provided filesystem object and reads that copy.

    Args:
        local_path (str): The local path to the .sav file (e.g., "data/municipalities.sav").
        gcs_folder_path (str): The GCS bucket/folder path (e.g., "bucket_name/raw_data").
                               Required only if reading from GCS. Trailing slashes
                               are ignored.
        fs (Optional[GCSFileSystem]): The GCS FileSystem object. Required only if
                                      the file is not found locally.

    Returns:
        Tuple[pd.DataFrame, Any]: A tuple containing the pandas DataFrame and
                                  the pyreadstat metadata object.

    Raises:
        FileNotFoundError: If file is not local and 'fs' is not provided.
        ValueError: If file is not local, 'fs' is provided, but
                    'gcs_folder_path' is empty.
        RuntimeError: If downloading from GCS or reading the downloaded
                      file fails (the original error is chained).
    """

    # 1. Construct the expected local path
    local_path = os.path.abspath(local_path)

    # 2. Check if the file exists locally
    if os.path.exists(local_path):
        print(f"Reading local file from: {local_path}")
        df, meta = pyreadstat.read_sav(local_path)
        return df, meta

    # 3. If not local, attempt GCS download
    print(f"File not found locally at '{local_path}'. Attempting GCS download...")

    if fs is None:
        raise FileNotFoundError(
            f"File '{local_path}' not found locally and no GCSFileSystem (fs) was provided."
        )

    if not gcs_folder_path:
        raise ValueError(
            f"File '{local_path}' not found locally and no gcs_folder_path was provided."
        )

    # GCS object names always use forward slashes, so the path is joined
    # manually; os.path.join would insert backslashes on Windows.
    filename = os.path.basename(local_path)
    full_gcs_path = f"{gcs_folder_path.rstrip('/')}/{filename}"

    chunk_size = 16 * 1024 * 1024  # stream in 16 MiB pieces instead of loading the whole file in memory

    try:
        # pyreadstat reads from a file path, so the object is staged in a
        # temporary directory that is removed again when the block exits.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = os.path.join(temp_dir, filename)
            print(f"Downloading from GCS: {full_gcs_path}")
            with fs.open(full_gcs_path, "rb") as gcs_file, open(temp_path, "wb") as temp_file:
                for chunk in iter(lambda: gcs_file.read(chunk_size), b""):
                    temp_file.write(chunk)

            # The temp file is closed at this point, so it can be opened safely
            df, meta = pyreadstat.read_sav(temp_path)
    except Exception as e:
        raise RuntimeError(f"Failed to read from GCS path '{full_gcs_path}': {e}") from e

    return df, meta

In [38]:
# Local path of the municipalities ACS .sav file
# TODO: If this data is inserted in a sub-folder inside data, that will need to be reflected in the path
# NOTE(review): the recorded output of the load cell shows a path directly under data\ (without
# raw\demographics), so that output presumably predates this value; re-run the load cell to confirm.
muni_acs_path = r'C:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\2026\geo similarity teu\data\raw\demographics\location_inc_demographic_acs_5_2023_muni_acs.sav'

In [21]:
# Load the municipalities ACS data (DataFrame + metadata) and report how long the read took
start_time = time.time()
muni_acs_df, muni_acs_meta = read_sav(local_path=muni_acs_path)
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f'Data loaded in {elapsed_seconds:.2f} seconds')

Reading local file from: C:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\2026\geo similarity teu\data\location_inc_demographic_acs_5_2023_muni_acs.sav
Data loaded in 4.65 seconds


In [33]:
# Examine muni_acs columns (bare last expression, so the notebook displays the column Index)
muni_acs_df.columns

Index(['geoid', 'NAME', 'STUSAB', 'GEO_ID', 'veteran_pct',
       'kids_working_parents_pct', 'kids_u6_working_parents_pct',
       'working_parents_hh_pct', 'kids_parent_ratio', 'md_hhinc',
       ...
       'socc_admin_m', 'mocc_manager_m', 'mocc_prof_m', 'mocc_service_m',
       'mocc_farm_m', 'farm_pct_m', 'mocc_constr_m', 'mocc_manu_m',
       'mocc_trans_m', 'mocc_sales_m'],
      dtype='object', length=635)

In [34]:
# Write the first 1,000 rows (without the index) to a CSV in the current working directory,
# presumably for quick inspection outside the notebook
muni_acs_df.head(1000).to_csv("muni_acs_head_1000.csv", index=False)

In [26]:
# Inspect the municipalities ACS data: structure summary first, then the first rows.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
muni_acs_df.info()
#print(muni_acs_df.dtypes)
print(muni_acs_df.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69948 entries, 0 to 69947
Columns: 635 entries, geoid to mocc_sales_m
dtypes: float64(630), object(5)
memory usage: 338.9+ MB
None
     geoid                      NAME STUSAB            GEO_ID  veteran_pct  \
0    01001   Autauga County, Alabama     AL    0500000US01001     7.811419   
1  0100100       Abanda CDP, Alabama     AL  1600000US0100100     0.000000   
2  0100124   Abbeville city, Alabama     AL  1600000US0100124     4.412955   
3    01003   Baldwin County, Alabama     AL    0500000US01003     8.622809   
4  0100460  Adamsville city, Alabama     AL  1600000US0100460     7.527383   

   kids_working_parents_pct  kids_u6_working_parents_pct  \
0                 67.373547                    61.585835   
1                100.000000                          NaN   
2                 64.691358                    66.666667   
3                 69.194412                    66.365550   
4                 24.843945                    10.

In [None]:
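# Examine municipalities ACS metadata
# The dump below is disabled because it raises
# "TypeError: Object of type datetime is not JSON serializable" when serializing dict item 'creation_time'.
# TODO: Handle datetime serialization for JSON, e.g. by passing default=str to json.dump
#       (and open the output file in a `with` block so it is closed after writing).
# json.dump(muni_acs_meta.__dict__, open("muni_acs_metadata.json", "w"), indent=4)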

In [44]:
# For each state (STUSAB), check split between GEO_ID starting with 1600 (municipalities) and 060 (Townships)
# In the printed table, have 3 columns: state, count of municipalities and count of townships
# We use a simple function to tag each row based on how GEO_ID starts
def classify_geo(geo_id):
    """
    Tag a GEO_ID value according to the characters it starts with.

    The value is converted with str() first, so non-string inputs are accepted.

    Returns:
        'Place_Municipality' for a '1600' prefix,
        'Township_County_SubDivision' for a '060' prefix,
        'Other' for anything else.
    """
    # Checked in order; the first matching prefix wins
    prefix_labels = (
        ('1600', 'Place_Municipality'),
        ('060', 'Township_County_SubDivision'),
    )
    geo_text = str(geo_id)
    for prefix, label in prefix_labels:
        if geo_text.startswith(prefix):
            return label
    return 'Other'

# Apply the classification (adds a 'Type' column to muni_acs_df)
muni_acs_df['Type'] = muni_acs_df['GEO_ID'].apply(classify_geo)

# Group by State and Type, then reshape into one row per state
# - groupby + size: counts occurrences of each Type per State
# - unstack(fill_value=0): pivots 'Type' from rows to columns, with 0 for
#   states that lack one type
# - reindex(columns=...): keeps only the two columns we want and adds a
#   zero-filled column if a type never occurs in the data, where a plain
#   column selection would raise KeyError
summary = (
    muni_acs_df.groupby(['STUSAB', 'Type'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=['Place_Municipality', 'Township_County_SubDivision'], fill_value=0)
    .reset_index()
)

# Rename columns for the final clean output
summary.columns.name = None # Remove the index name 'Type'
summary = summary.rename(columns={
    'Place_Municipality': 'count_municipalities_places',
    'Township_County_SubDivision': 'count_townships_county_subdivisions'
})

print(summary)

   STUSAB  count_municipalities_places  count_townships_county_subdivisions
0                                    0                                    0
1      AK                          355                                   37
2      AL                          593                                  390
3      AR                          625                                 1095
4      AZ                          467                                   80
5      CA                         1618                                  404
6      CO                          482                                  209
7      CT                          215                                  170
8      DC                            0                                    1
9      DE                           79                                   27
10     FL                          955                                  316
11     GA                          675                                  586
12     HI   

In [None]:
# State FIPS codes of the "Strong MCD" states where we want to include
# Townships (SumLev 060)
STRONG_MCD_STATES = [
    "09",  # CT
    "25",  # MA
    "23",  # ME
    "26",  # MI
    "27",  # MN
    "33",  # NH
    "34",  # NJ
    "36",  # NY
    "42",  # PA
    "44",  # RI
    "50",  # VT
    "55",  # WI
]

# Names of the geography ID column in each source, for consistent ID handling
GEO_ID_COL_ACS = "GEO_ID"  # Usually '1600000US0100100'
GEO_ID_COL_SHP = "GEOID"  # Usually '0100100'