In [30]:
from pathlib import Path
import pandas as pd
import geopandas as gpd
from shapely import wkt
from google.cloud import bigquery
import pandas_gbq
import os
import uuid

In [None]:
# Function to find project root
def find_project_root(start:Path | None=None) -> Path:
    start = start or Path.cwd()
    for p in [start, *start.parents]:
        # print(p)
        if (p / 'pyproject.toml').exists() or (p / '.git').exists() or (p / 'data').exists():
            return p
    raise FileNotFoundError('Could not find root directory')

In [56]:
# Constants describing which BigQuery table to read and where extracts are written
# TODO: store these in a separate config.py file and load from there
PROJECT = "clgx-gis-app-dev-06e3"        # GCP project id
DATASET = "teu_site_similarity"          # dataset inside PROJECT
TABLE = "acs_5yr_place_features_v1"      # table inside DATASET
# <project root>/data/intermediate/features
DATA_DIR = find_project_root().joinpath("data", "intermediate", "features")

c:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\2026\geo similarity teu\teu-site-similarity\src
c:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\2026\geo similarity teu\teu-site-similarity


In [57]:
# Display the resolved features directory to confirm the project root was detected as expected
DATA_DIR

WindowsPath('c:/Users/eprashar/OneDrive - CoreLogic Solutions, LLC/github/2026/geo similarity teu/teu-site-similarity/data/intermediate/features')

In [58]:
# Function to import data from a BQ table
# Function needs authentication to gcloud before function call
def load_data_from_bq(
    project:str,
    dataset:str,
    table:str,
    save:bool=True,
    data_dir:Path=DATA_DIR) -> gpd.GeoDataFrame:
    '''
    Load a whole BigQuery table into a GeoDataFrame and optionally save it as parquet.

    Runs ``SELECT *`` against ``project.dataset.table``, parses the ``geometry``
    column from WKT text into shapely objects and wraps the result in a
    GeoDataFrame labelled EPSG:4326. No reprojection is performed, so the
    source coordinates are assumed to already be in that CRS.

    Args:
        project: GCP project id, used for the client and in the table reference.
        dataset: BigQuery dataset name.
        table: BigQuery table name; also used as the parquet file name.
        save: When True, write the result to ``data_dir/<table>.parquet``.
        data_dir: Output directory, created if missing. Required when ``save`` is True.

    Returns:
        gpd.GeoDataFrame: The table contents with a parsed ``geometry`` column.

    Raises:
        ValueError: If ``save`` is True but ``data_dir`` is empty, or if the
            table has no ``geometry`` column.
    '''
    # Validate the save target up front so a bad call fails before the query runs
    if save and not data_dir:
        raise ValueError("data_dir must be provided when save=True.")

    client = bigquery.Client(project=project)
    # Table identifiers cannot be bound as query parameters, so they are interpolated;
    # callers are expected to pass trusted names.
    query = f"SELECT * FROM `{project}.{dataset}.{table}`"

    # Load to pandas
    df = client.query(query).to_dataframe()

    if 'geometry' not in df.columns:
        raise ValueError("Geometry column not found in input dataframe")

    # Parse WKT into geometry objects; NULL rows stay None because wkt.loads would raise on them
    df['geometry'] = df['geometry'].apply(
        lambda value: wkt.loads(value) if pd.notna(value) else None)

    gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
    print(f'The crs of the dataframe is {gdf.crs}')
    print(f'Loaded dataframe has shape: {gdf.shape}')

    if save:
        # Ensure directory exists
        out_dir = Path(data_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        file_path = out_dir / f'{table}.parquet'
        gdf.to_parquet(file_path)
        print(f'Data saved to: {file_path}!')
    return gdf

In [59]:
# Query the feature table from BigQuery and write it to DATA_DIR as a parquet file
gdf = load_data_from_bq(PROJECT, DATASET, TABLE, save=True, data_dir=DATA_DIR)

The crs of the dataframe is EPSG:4326
Loaded dataframe has shape: (27056, 21)
Data saved to: c:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\2026\geo similarity teu\teu-site-similarity\data\intermediate\features\acs_5yr_place_features_v1.parquet!


In [60]:
# Summarise the loaded frame: column names, non-null counts and dtypes
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 27056 entries, 0 to 27055
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   geoidfq                    27056 non-null  object  
 1   state_fips                 27056 non-null  object  
 2   state_name                 27056 non-null  object  
 3   stusps                     27056 non-null  object  
 4   namelsad                   27056 non-null  object  
 5   pop_2024                   27056 non-null  Int64   
 6   households_2024            27056 non-null  Int64   
 7   median_income_2024         27056 non-null  Int64   
 8   median_home_value_2024     27056 non-null  Int64   
 9   unq_parcel_count           26842 non-null  Int64   
 10  median_parcel_area_sq_mtr  26842 non-null  float64 
 11  parcel_density             26842 non-null  float64 
 12  unq_clips                  26537 non-null  Int64   
 13  unq_addr_count         

In [None]:
# Initialize customer in the loaded dataframe
def initialize_customer(gdf, customer_name: str):
    """
    Tag a copy of the dataframe with a customer name and a freshly generated customer id.

    The input frame is left untouched; a new frame carrying the two extra
    columns is returned instead.

    Args:
        gdf: Dataframe (pandas or geopandas) to tag.
        customer_name: Name written to every row of the 'customer_name' column.

    Returns:
        tuple: ``(tagged_gdf, cust_id)`` where ``tagged_gdf`` is a copy of ``gdf``
        with 'customer_name' and 'customer_id' columns appended, and ``cust_id``
        is the generated id string.
    """
    # First 8 hex characters of a random UUID4: short and readable, but random
    # on every call and not guaranteed to be collision-free.
    cust_id = uuid.uuid4().hex[:8]

    # assign() returns a new frame, so the caller's dataframe is not mutated
    tagged_gdf = gdf.assign(customer_name=customer_name, customer_id=cust_id)

    return tagged_gdf, cust_id

In [6]:
# Create reference group for analysis
def generate_reference_vector(gdf, reference_geoids, feature_pct_cols):
    """
    Placeholder for building the 'Gold Standard' reference profile.

    Not implemented yet: the body does no work and the call returns None.
    Planned behaviour is to average the vectors of the reference cities into
    a single target vector.

    Args:
        gdf (gpd.GeoDataFrame): Master dataframe.
        reference_geoids (list): GEOIDs selected in the Reference UI.
        feature_pct_cols (list): The percentile columns to include in the vector.

    Returns:
        None for now; planned to be a 1D np.array target vector.
    """
    return None

In [7]:
# Function to select comparison group
def apply_comparison_mask(gdf, target_states, pop_floor=500):
    """
    Placeholder for building the 'Comparison Mask'.

    Not implemented yet: the body does no work and the call returns None.
    Planned behaviour is to filter the universe down to specific states and
    a minimum population size.

    Args:
        gdf (gpd.GeoDataFrame): Master dataframe.
        target_states (list): States selected in the Comparison UI.
        pop_floor (int): Minimum population threshold.

    Returns:
        None for now; planned to be a gpd.GeoDataFrame subset for candidate ranking.
    """
    return None

In [8]:
# Calculate percentiles based on comparison mask
def calculate_local_percentiles(subset_gdf, feature_list):
    """
    Placeholder for re-ranking features within the comparison subset.

    Not implemented yet: the body does no work and the call returns None.
    Planned behaviour is to re-calculate percentiles (0-100) on the comparison
    subset only, so that similarity is relative to the chosen universe.

    Args:
        subset_gdf (gpd.GeoDataFrame): The masked comparison dataframe.
        feature_list (list): Raw feature columns to be ranked.

    Returns:
        None for now; planned to be a gpd.GeoDataFrame with added '_pct' columns.
    """
    return None

In [None]:
# Function to calculate similarity
def run_similarity_engine(comparison_gdf, target_vector, feature_pct_cols, weights=None):
    """
    Placeholder for the similarity calculation.

    Not implemented yet: the body does no work and the call returns None.
    Planned behaviour is to compute the Euclidean distance between
    target_vector and each row of comparison_gdf.

    Args:
        comparison_gdf (gpd.GeoDataFrame): The comparison universe.
        target_vector (np.array): The reference profile.
        feature_pct_cols (list): Percentile columns used for math.
        weights (dict): Weighting multipliers per feature.

    Returns:
        None for now; planned to be a sorted gpd.GeoDataFrame with a
        'similarity_score' column.
    """
    return None

**Streamlit functionality**

In [None]:
# Reference set
def ui_sidebar_reference(gdf):
    """
    Placeholder for rendering the Reference Tab.

    Not implemented yet: the body does no work and the call returns None.
    Planned flow:
    1. Select Customer -> Filters State -> Filters Places.
    2. Collect geoidfq values for the 'Gold Standard'.
    """
    return None

# Comparison parameters
def ui_sidebar_comparison(gdf):
    """
    Placeholder for rendering the Comparison Tab.

    Not implemented yet: the body does no work and the call returns None.
    Planned flow:
    1. Select Target States (e.g., TX, FL).
    2. Set Population Threshold.
    """
    return None

def main():
    """
    Placeholder for the main app loop.

    Not implemented yet: the body does no work and the call returns None.
    Planned steps:
    1. Load master data.
    2. Run Reference UI to get Target Vector.
    3. Run Comparison UI to get Comparison Mask.
    4. Trigger Analytics Engine.
    5. Display Visualizer (Table & Map).
    """
    return None

In [None]:
# Small demo frame with two integer columns; the bare name below shows it as the cell output
df = pd.DataFrame(
    {
        "first column": [1, 2, 3, 4],
        "second column": [10, 20, 30, 40],
    }
)

df

2026-02-09 16:06:24.245 
  command:

    streamlit run c:\Users\eprashar\AppData\Local\miniforge3\envs\geo_env_v2\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
