In [1]:
# !pip install pytesseract

In [2]:
# PYTHON IMPORTS
import os, copy, math, re
from tqdm.notebook import trange, tqdm

# IMAGE IMPORTS 
from PIL import Image

# DATA IMPORTS 
import random, h5py, glob
import numpy as np
import requests
import pandas as pd

# PLOTTING
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# SHAPES IMPORTS
import shapely
import geopandas as gpd
from shapely.ops import unary_union
from shapely.geometry import LineString, Polygon, Point

# MY OWN CLASSES
from FindGrid import *

# OCR libraries
from fuzzywuzzy import fuzz, process

# PREFERENCES
Image.MAX_IMAGE_PIXELS = 933120000



Download Places Datasets

In [3]:
if False:
    # One-off download of the TIGER/Line 2022 "place" archives; disabled by default.
    # The URL template takes a zero-padded, two-digit integer.
    base_url = "https://www2.census.gov/geo/tiger/TIGER2022/PLACE/tl_2022_{:02d}_place.zip"
    savedir  = r"C:\Users\fhacesga\OneDrive - University Of Houston\AAA_RECTDNN\data\ReferenceDatasets\Places"
    # Try every integer from 1 to 100; numbers without an archive simply report a failure
    for i in range(1, 101):
        # Construct the URL by formatting the integer into the base URL
        url = base_url.format(i)

        try:
            # A timeout keeps a single unresponsive request from stalling the whole loop
            response = requests.get(url, timeout=60)

            # Only a 200 response carries an archive worth saving
            if response.status_code == 200:
                # Save the response content to a file named after the integer
                filename = f"{i}.zip"
                with open(f"{savedir}/{filename}", "wb") as file:
                    file.write(response.content)
                print(f"Saved {filename}")
            else:
                print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        except (requests.RequestException, OSError) as e:
            # Best effort: report network or file-system failures and move on to the next URL
            print(f"An error occurred while processing {url}: {str(e)}")
            
def strip_non_numeric_and_convert(field_value):
    """Remove every non-digit character from a string Series and convert it to numbers.

    Parameters
    ----------
    field_value : pandas.Series
        Series of strings (accessed through the ``.str`` accessor).

    Returns
    -------
    The result of ``pd.to_numeric`` applied to the digit-only strings.
    """
    # regex=True must be explicit: since pandas 2.0 the default is regex=False,
    # which would treat the character class as a literal string and strip nothing.
    digits_only = field_value.str.replace(r'[^0-9]', '', regex=True)
    return pd.to_numeric(digits_only)

Read Data and Preprocess

In [4]:
# Root folder of the project data; it ends with a separator because the paths
# below are built by plain string concatenation.
data_dir = r"C:\Users\franc\OneDrive - University Of Houston\AAA_RECTDNN\data/"

# Community table: reduce CID to its digits, then derive ST as floor(CID / 10000).
# Keyword order matters: ST is computed from the already-cleaned CID column.
CIDs = pd.read_csv(f"{data_dir}ReferenceDatasets/CommunityNumbers.csv").assign(
    CID=lambda frame: strip_non_numeric_and_convert(frame["CID"]),
    ST=lambda frame: np.floor(frame["CID"] / 1e4),
)

# County polygons; STATEFP is cast to float32 so it can be compared with the numeric ST
counties = gpd.read_file(f"{data_dir}ReferenceDatasets/Counties/tl_2019_us_county.shp")
counties['STATEFP'] = counties['STATEFP'].to_numpy().astype(np.float32)



Fuzzy matching between DataFrame keys

In [5]:
# Function to find the best fuzzy match
def find_best_match(left_value, choices):
    """Look up the closest fuzzy match for ``left_value`` among ``choices``.

    Candidates are scored with ``fuzz.ratio`` and the search is capped at one
    result, so the value returned is whatever ``process.extract`` yields for
    ``limit=1``. Callers in this notebook read the matched string from
    ``result[0][0]``.
    """
    top_matches = process.extract(
        left_value,
        choices,
        limit=1,
        scorer=fuzz.ratio,
    )
    return top_matches

# Candidate county names and their state codes, extracted once so the loop
# does not rebuild a pandas comparison for every row.
county_options = counties['NAMELSAD'].to_numpy()
state_codes = counties['STATEFP'].to_numpy()

# One [county text, matched county name, state code] entry per matched CIDs row
merged_data = []

# (county text, state code) -> matched county name, or None when the state has no
# candidates. Pairs repeat across rows, so each one is fuzzy-matched only once.
match_cache = {}

for idx, left_row in tqdm(CIDs.iterrows(), total=CIDs.shape[0]):
    county_name = left_row["County"]
    state_code = left_row["ST"]

    # Rows without a textual county name cannot be matched
    if not isinstance(county_name, str):
        continue
    # A missing state code equals no STATEFP value, so there would be no candidates
    if pd.isna(state_code):
        continue

    pair = (county_name, state_code)
    if pair not in match_cache:
        # Restrict the candidates to counties of the same state
        candidates = county_options[state_codes == state_code].tolist()
        best_match = find_best_match(county_name, candidates)
        match_cache[pair] = best_match[0][0] if len(best_match) > 0 else None

    matched_name = match_cache[pair]
    if matched_name is None:
        continue
    merged_data.append([county_name, matched_name, state_code])
        

  0%|          | 0/25512 [00:00<?, ?it/s]

Create DataFrame for merges and drop duplicates

In [6]:
# Build the lookup table of matches: original county text ('CIDs'), matched
# county name ('Counties') and state code ('ST'); the two text columns become
# pandas strings and repeated rows are collapsed.
merged_df = (
    pd.DataFrame(merged_data, columns=['CIDs', 'Counties', "ST"])
    .astype({'CIDs': pd.StringDtype(), 'Counties': pd.StringDtype()})
    .drop_duplicates()
)

Convert to Strings

In [7]:
# Cast the community ID and the county name columns to the pandas string dtype
for frame, column in ((CIDs, 'CID'), (counties, 'NAMELSAD')):
    frame[column] = frame[column].astype(pd.StringDtype())

Merge DataFrames and add Geometry

In [8]:
# Attach the fuzzy-matched county name to each community row
# (keys: county text + state code).
result_df = CIDs.merge(
    merged_df,
    how='inner',
    left_on=['County', 'ST'],
    right_on=['CIDs', 'ST'],
)

# Bring in the county attributes, keyed on matched county name + state code
result_df_1 = result_df.merge(
    counties,
    how='inner',
    left_on=['Counties', 'ST'],
    right_on=['NAMELSAD', 'STATEFP'],
)

# Wrap the merged table as a GeoDataFrame using its "geometry" column
result_df_1 = gpd.GeoDataFrame(result_df_1, geometry=result_df_1["geometry"])

In [9]:
# Keep only the identifier and name columns for export; the copy detaches the
# selection from result_df_1.
output_columns = ["CID", "County", "STATEFP", "COUNTYFP", "GEOID", "Counties", "NAME"]
result_df_final = result_df_1[output_columns].copy()

Save files

In [10]:
# Write the community-to-county table next to the other reference datasets
output_csv = f"{data_dir}ReferenceDatasets/CountyCIDs.csv"
result_df_final.to_csv(output_csv)