In [None]:
"""Script to load county names"""

import os

# Google Maps Platform key sent with the Places Text Search and Geocoding
# requests made further down. It is read from the GOOGLE_MAPS_API_KEY
# environment variable so the real key is never stored in the saved notebook;
# "xxx" remains the placeholder when the variable is not set.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "xxx")

#%pip install requests

In [41]:
"""Load data"""
import pandas as pd

# The input extracts are named df1.csv ... df12.csv
files = [f"df{i}.csv" for i in range(1, 13)]

# Read the extracts one at a time (comma-separated, Latin-1 encoded)
dfs = []
for file in files:
    dfs.append(pd.read_csv(file, sep=",", encoding="latin1"))

# Stack everything into a single frame with a fresh 0..n-1 index
df_input = pd.concat(dfs, ignore_index=True)

print(df_input.shape)

(1537584, 30)


In [42]:
"""Keep only relevant columns"""

# Keep one row per distinct PWSName; the first State seen for a name is retained
df_locs = df_input.loc[:, ["State", "PWSName"]].drop_duplicates(
    subset=["PWSName"], keep="first"
)
print(df_locs.shape)

import re
# Function to clean PWSName (removes numbers & parentheses)
def clean_pws_name(name):
    """Strip parenthesised groups and digits from a water-system name.

    Removes every "(...)" group together with the whitespace directly before
    it, removes every run of digits, then trims leading/trailing whitespace.
    Whitespace left in the middle of the name is not collapsed.

    Args:
        name: The raw PWSName value.

    Returns:
        The cleaned name. Non-string values (e.g. NaN for a missing name) are
        returned unchanged instead of raising TypeError inside ``re.sub``.
    """
    if not isinstance(name, str):
        return name
    return re.sub(r"\s*\([^)]*\)|\d+", "", name).strip()

# Remove all rows whose State value contains the character "0"
state_has_zero = df_locs["State"].astype(str).str.contains("0", regex=True)

# Take an explicit copy: assigning a column on a boolean-filtered frame is the
# pattern that triggers pandas' SettingWithCopyWarning.
df_locs = df_locs[~state_has_zero].copy()

# Apply the cleaning function to the PWSName column
df_locs["PWSName"] = df_locs["PWSName"].apply(clean_pws_name)

print(df_locs.shape)


(12483, 2)
(12400, 2)


In [None]:
"""Check for missing entries"""

import pandas as pd

import os

# Compare the cleaned names in df_locs against the results already on disk
CSV_FILE = "pws_with_counties.csv"
SAVE_MISSING = False  # Set to True to also write the missing rows to MISSING_FILE
MISSING_FILE = "missing_pws_entries.csv"

if os.path.exists(CSV_FILE):
    df_csv = pd.read_csv(CSV_FILE)
else:
    # First run: nothing has been fetched yet, so compare against an empty
    # table and every row of df_locs counts as missing.
    df_csv = pd.DataFrame(columns=["State", "PWSName", "County"])

# Find missing rows by checking which PWSName values are not in the CSV
df_missing = df_locs[~df_locs["PWSName"].isin(df_csv["PWSName"])]

# Display the missing rows
print(f"Found {len(df_missing)} missing rows that are not in '{CSV_FILE}'.")
print(df_missing.head())  # Show first few missing rows

# Only report a save when one actually happened
if SAVE_MISSING:
    df_missing.to_csv(MISSING_FILE, index=False)
    print("Missing data saved to 'missing_pws_entries.csv'.")

Found 0 missing rows that are not in 'pws_with_counties.csv'.
Empty DataFrame
Columns: [State, PWSName]
Index: []
Missing data saved to 'missing_pws_entries.csv'.


In [None]:
"""Create a sample to check script"""
#df_locs = df_locs.sample(3)

#display(df_locs)

Unnamed: 0,State,PWSName,FacilityName,SamplePointName
991201,NY,CLIFTON PARK WATER AUTHORITY,SCWA Intertie,Entry Point to Dist. System
1443172,VT,Jay Peak Basin Complex,Treatment Plant #1,EPTDS from Treatment Plant
885289,NH,UNH - Durham Water System,Treatment Plant,Finished Water Sample @ TP


In [None]:
"""Main script to fetch counties"""

import requests
import pandas as pd
import os
import time

# Function to get lat/lng from Google Places API
def get_lat_lng(pws_name, state, timeout=10):
    """Look up coordinates for a water system via the Places Text Search API.

    The query sent is "<pws_name>, <state>" and only the first result is used.

    Args:
        pws_name: Water-system name to search for.
        state: State value appended to the query.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the calling loop indefinitely.

    Returns:
        A ``(lat, lng)`` tuple from the first result, or ``(None, None)`` when
        the response holds no results.

    Raises:
        requests.RequestException: On network failure, timeout, or a 4xx/5xx
            HTTP status.
    """
    url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
    params = {"query": f"{pws_name}, {state}", "key": API_KEY}

    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    results = data.get("results") or []
    if not results:
        # Surface API-level failures (quota, bad key, ...) so they are not
        # mistaken for a genuine "no match" by whoever reads the log.
        status = data.get("status")
        if status not in (None, "OK", "ZERO_RESULTS"):
            print(f"Places API returned status {status!r} for '{pws_name}, {state}'")
        return None, None

    # Tolerate a result without geometry instead of raising KeyError
    location = results[0].get("geometry", {}).get("location", {})
    return location.get("lat"), location.get("lng")

# Function to get county name from Google Geocoding API
def get_county_name(lat, lng, timeout=10):
    """Reverse-geocode a coordinate pair and return its county name.

    Scans every address component of every result and returns the
    ``long_name`` of the first one typed ``administrative_area_level_2``.

    Args:
        lat: Latitude of the point.
        lng: Longitude of the point.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the calling loop indefinitely.

    Returns:
        The county name, or the string "County not found" when no component
        of that type is present. Note this sentinel differs from the
        "Not Found" value the driver loop writes when no coordinates exist.

    Raises:
        requests.RequestException: On network failure, timeout, or a 4xx/5xx
            HTTP status.
    """
    url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {"latlng": f"{lat},{lng}", "key": API_KEY}

    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # Surface API-level failures (quota, bad key, ...) instead of silently
    # reporting them as a missing county.
    status = data.get("status")
    if status not in (None, "OK", "ZERO_RESULTS"):
        print(f"Geocoding API returned status {status!r} for {lat},{lng}")

    for result in data.get("results", []):
        for component in result.get("address_components", []):
            # .get: a component without a "types" list is skipped, not fatal
            if "administrative_area_level_2" in component.get("types", []):
                return component["long_name"]  # County name

    return "County not found"





CSV_FILE = "pws_with_counties.csv"  # File where data is saved
SAVE_EVERY = 50  # Flush results to disk after this many new rows
CSV_COLUMNS = ["State", "PWSName", "County"]


def _append_rows(rows):
    """Append result rows to CSV_FILE, writing the header only if the file is new."""
    pd.DataFrame(rows, columns=CSV_COLUMNS).to_csv(
        CSV_FILE, mode="a", header=not os.path.exists(CSV_FILE), index=False
    )


# **Step 1: Load existing processed data (if file exists)**
if os.path.exists(CSV_FILE):
    processed_df = pd.read_csv(CSV_FILE)
    processed_pws = set(processed_df["PWSName"])  # Set for fast lookup
    print(f"Loaded {len(processed_pws)} processed PWS entries from CSV.")
else:
    processed_pws = set()


# **Step 2: Load full dataset and remove already processed entries**
# df_missing comes from the "Check for missing entries" cell above.
df = df_missing #df_locs.copy()
# Results are keyed by PWSName, so a name repeated within this batch is looked
# up only once instead of spending one API call per duplicate.
df = df[~df["PWSName"].isin(processed_pws)].drop_duplicates(subset=["PWSName"])

print("To be processed:",df.shape[0])

# **Step 3: Process remaining PWS entries**
new_data = []  # Store results in a list before saving to CSV

if df.empty:
    # Nothing to do. Do not call exit() here: inside a notebook it shuts the
    # kernel down, which breaks every cell that follows.
    print("All PWS entries have already been processed!")
else:
    try:
        for index, row in df.iterrows():
            pws_name, state = row["PWSName"], row["State"]

            lat, lng = get_lat_lng(pws_name, state)
            # Compare against None explicitly: a coordinate of 0.0 is valid
            if lat is not None and lng is not None:
                county = get_county_name(lat, lng)
            else:
                county = "Not Found"

            new_data.append([state, pws_name, county])

            # Save every SAVE_EVERY rows to prevent data loss
            if len(new_data) >= SAVE_EVERY:
                print(f"Saving {len(new_data)} new rows to CSV...")
                _append_rows(new_data)
                new_data = []  # Clear list after saving

            time.sleep(0.2)  # Prevent hitting API rate limits
    finally:
        # **Step 4: Save remaining data after loop**
        # Runs on normal completion and when the loop is interrupted (network
        # error, Ctrl-C), so rows already fetched are never thrown away.
        if new_data:
            print(f"Saving final {len(new_data)} rows to CSV...")
            _append_rows(new_data)
            new_data = []

    print("Processing complete. Data saved to 'pws_with_counties.csv'.")

Loaded 12535 processed PWS entries from CSV.
To be processed: 634
Saving 50 new rows to CSV...
Saving 50 new rows to CSV...
Saving 50 new rows to CSV...
Saving 50 new rows to CSV...
Saving 50 new rows to CSV...
Saving 50 new rows to CSV...
Saving 50 new rows to CSV...
Saving 50 new rows to CSV...
Saving 50 new rows to CSV...
Saving 50 new rows to CSV...
Saving 50 new rows to CSV...
Saving 50 new rows to CSV...
Saving final 34 rows to CSV...
Processing complete. Data saved to 'pws_with_counties.csv'.


In [None]:
"""Check for missing Counties"""

CSV_FILE = "pws_with_counties.csv"

# One row per PWSName from the results written so far
df = pd.read_csv(CSV_FILE).drop_duplicates(subset=["PWSName"])

# Row-level flags shared by the two selections below
is_not_found = df["County"].eq("Not Found")
state_has_zero = df["State"].astype(str).str.contains("0", regex=True)

# Lookups that failed, ignoring rows whose State contains the character "0"
df_notfounds = df[is_not_found & ~state_has_zero]

# Everything that did receive some county value
df_cleaned = df[~is_not_found]

print(len(df_notfounds))
print(len(df))
print(len(df_cleaned))

print(len(df_locs))

display(df_notfounds)


56
12677
12584
12483


Unnamed: 0,State,PWSName,County
12282,AZ,Town of Buckeye Sundance,Not Found
12286,AZ,CITY OF BUCKEYE SONORA - SUNDANCE,Not Found
12289,CA,Suburban Water Systems - San Jose,Not Found
12290,CA,Baseline Gardens MWC,Not Found
12294,CA,LSID - Page Moore System,Not Found
12298,CA,Golden State WC - Cowan Heights,Not Found
12310,CA,"NEVADA ID - E. GEORGE, BANNER MOUNTAIN",Not Found
12311,CA,SQUAW VALLEY WATER SYSTEM,Not Found
12313,CA,Cal Am - Suburban Rosemont,Not Found
12325,CA,SUBURBAN WATER SYSTEMS-SAN JOSE,Not Found


In [None]:
"""Remove missing counties"""

import pandas as pd

# Read the results file as it currently stands
CSV_FILE = "pws_with_counties.csv"
df = pd.read_csv(CSV_FILE)

# Keep every row whose County is anything other than "Not Found"
keep_mask = df["County"].ne("Not Found")
df_cleaned = df.loc[keep_mask]

# Overwrite the same CSV so the dropped names count as unprocessed again
df_cleaned.to_csv(CSV_FILE, index=False)

print(f"Removed {len(df) - len(df_cleaned)} entries where County was 'Not Found'.")
print(f"Saved cleaned data with {len(df_cleaned)} rows to '{CSV_FILE}'.")

Removed 258 entries where County was 'Not Found'.
Saved cleaned data with 12225 rows to 'pws_with_counties.csv'.
