In [1]:
import requests
import pandas as pd
import time
from config import api_key, census_api_key
from pprint import pprint
from census import Census
import censusdata
import re
import numpy
import hvplot.pandas

In [2]:
# List of states with their abbreviations
states = {
    "AK": "Alaska", "AL": "Alabama", "AR": "Arkansas", "AZ": "Arizona",
    "CA": "California", "CO": "Colorado", "CT": "Connecticut", "DC": "District of Columbia",
    "DE": "Delaware", "FL": "Florida", "GA": "Georgia", "HI": "Hawaii",
    "IA": "Iowa", "ID": "Idaho", "IL": "Illinois", "IN": "Indiana",
    "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "MA": "Massachusetts",
    "MD": "Maryland", "ME": "Maine", "MI": "Michigan", "MN": "Minnesota",
    "MO": "Missouri", "MS": "Mississippi", "MT": "Montana", "NC": "North Carolina",
    "ND": "North Dakota", "NE": "Nebraska", "NH": "New Hampshire", "NJ": "New Jersey",
    "NM": "New Mexico", "NV": "Nevada", "NY": "New York", "OH": "Ohio",
    "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island",
    "SC": "South Carolina", "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas",
    "UT": "Utah", "VA": "Virginia", "VT": "Vermont", "WA": "Washington",
    "WI": "Wisconsin", "WV": "West Virginia", "WY": "Wyoming"
}

# Function to fetch agency data from the API
def fetch_agency_data(state_abbr):
    base_url = f'https://api.usa.gov/crime/fbi/cde/agency/byStateAbbr/{state_abbr}'
    response = requests.get(base_url, api_key)
    if response.status_code != 200:
        raise ValueError(f"API call failed for {state_abbr} with status {response.status_code}")
    return response.json()

# List to hold filtered data from all states
all_filtered_data = []

# Iterate over each state and process the data
for state_abbr, state_name in states.items():
    try:
        # Fetch data for the current state
        response = fetch_agency_data(state_abbr)
        
        # Iterate through all counties in the response data
        for county, agencies in response.items():
            for agency in agencies:
                # Filter agencies with `agency_type_name` == "City" and valid coordinates
                if (
                    agency.get('agency_type_name') == 'City' and
                    agency.get('latitude') is not None and
                    agency.get('longitude') is not None
                ):
                    # Append filtered data to the list
                    all_filtered_data.append({
                        'State': state_abbr,
                        'Agency Name': agency.get('agency_name'),
                        'Latitude': agency.get('latitude'),
                        'Longitude': agency.get('longitude'),
                        'ORI': agency.get('ori'),
                    })
        print(f"Processed data for {state_name} ({state_abbr})")
    except Exception as e:
        print(f"Error processing {state_name} ({state_abbr}): {e}")
    
    # Pause to avoid overloading the API
    time.sleep(.2)

# Convert the collected data into a single Pandas DataFrame
df_all_agencies = pd.DataFrame(all_filtered_data)

# Add a new "City" column by removing "Police Department" from the "Agency Name"
df_all_agencies["City"] = (
    df_all_agencies["Agency Name"]
    .str.replace(r"Police Department", "", regex=True)  # Remove "Police Department"
    .str.strip()  # Remove leading/trailing spaces
)

# Save the DataFrame to a CSV file
output_path = "../Resources/Agency/Agency_Data/filtered_city_agencies.csv"
df_all_agencies.to_csv(output_path, index=False)

print(f"All filtered agency data saved to {output_path}")


Processed data for Alaska (AK)
Processed data for Alabama (AL)
Processed data for Arkansas (AR)
Processed data for Arizona (AZ)
Processed data for California (CA)
Processed data for Colorado (CO)
Processed data for Connecticut (CT)
Processed data for District of Columbia (DC)
Processed data for Delaware (DE)
Processed data for Florida (FL)
Processed data for Georgia (GA)
Processed data for Hawaii (HI)
Processed data for Iowa (IA)
Processed data for Idaho (ID)
Processed data for Illinois (IL)
Processed data for Indiana (IN)
Processed data for Kansas (KS)
Processed data for Kentucky (KY)
Processed data for Louisiana (LA)
Processed data for Massachusetts (MA)
Processed data for Maryland (MD)
Processed data for Maine (ME)
Processed data for Michigan (MI)
Processed data for Minnesota (MN)
Processed data for Missouri (MO)
Processed data for Mississippi (MS)
Processed data for Montana (MT)
Processed data for North Carolina (NC)
Processed data for North Dakota (ND)
Processed data for Nebraska 

In [3]:
df_all_agencies.head()

Unnamed: 0,State,Agency Name,Latitude,Longitude,ORI,City
0,AK,Nome Police Department,64.783686,-164.188912,AK0010600,Nome
1,AK,Sitka Police Department,57.052124,-135.33418,AK0010900,Sitka
2,AK,Bethel Police Department,60.928916,-160.15335,AK0011300,Bethel
3,AK,Haines Police Department,59.098771,-135.576936,AK0012100,Haines
4,AK,Juneau Police Department,58.356556,-134.50731,AK0010300,Juneau


In [4]:
c = Census(census_api_key, year=2021)

# Dictionary mapping state abbreviations to FIPS codes
state_abbr_to_fips = {
    'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', 'CO': '08', 'CT': '09',
    'DE': '10', 'DC': '11', 'FL': '12', 'GA': '13', 'HI': '15', 'ID': '16', 'IL': '17',
    'IN': '18', 'IA': '19', 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24',
    'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29', 'MT': '30', 'NE': '31',
    'NV': '32', 'NH': '33', 'NJ': '34', 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38',
    'OH': '39', 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 'SC': '45', 'SD': '46',
    'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', 'VA': '51', 'WA': '53', 'WV': '54',
    'WI': '55', 'WY': '56'
}

def get_poverty_for_city(city_population_data, city_name):
    city_data = city_population_data.get(city_name)
    if city_data:
        poverty = city_data[1]  # Access the second element of the list (poverty data)
    else:
        poverty = None
    return poverty

def calculate_poverty_rate(population, poverty):
    if population == 0 or poverty == 0:
        return None  # or return a specific value to indicate unknown/undefined
    poverty_rate = round((poverty / population) * 100, 2)
    return poverty_rate

# Function to get population data for all cities in a state at once
def get_population_for_state(state_abbr):
    # Convert state abbreviation to FIPS code
    state_fips = state_abbr_to_fips.get(state_abbr)
    if not state_fips:
        raise ValueError(f"Invalid state abbreviation: {state_abbr}")
    
    # Get population data for all cities in the state
    census_data = c.acs5.get(
        (
            "NAME",          # City name
            "B01003_001E",   # Total population
            "B17001_002E"   # Poverty count
        ),
        {'for': 'place:*', 'in': f'state:{state_fips}'}
    )
    
    # Convert to DataFrame
    census_pd = pd.DataFrame(census_data[1:], columns=census_data[0])
    
    # Clean city names by removing unnecessary information
    census_pd['NAME'] = census_pd['NAME'].apply(lambda x: re.sub(r'\(.*?\)', '', x).replace('CDP', '').replace('City', '').strip().split(',')[0].strip())
    
    # Return as a dictionary for easy access (City -> {'Population': ..., 'Poverty': ...})
    return dict(zip(census_pd['NAME'], census_pd[['B01003_001E', 'B17001_002E']].values.tolist()))

# List to store population data for all states
state_population_data = {}

# Iterate over rows in df_all_agencies and append the population data
population_list = []
poverty_list = []
poverty_rate_list = []

# Process one state at a time
for index, row in df_all_agencies.iterrows():
    city = row['City']
    state_abbr = row['State']

    # Check if we've already fetched the population data for this state
    if state_abbr not in state_population_data:
        try:
            # Fetch the population data for the current state
            state_population_data[state_abbr] = get_population_for_state(state_abbr)
            print(f"Fetched population data for {state_abbr}")
        except Exception as e:
            print(f"Error fetching population data for {state_abbr}: {e}")
            state_population_data[state_abbr] = {}  # Empty dict in case of error

    # Retrieve the population for the city (flexible match)
    city_population_data = state_population_data.get(state_abbr, {})
    city_population_data = {key.replace("  ", " "): value 
                           for key, value in state_population_data.get(state_abbr, {}).items()}
    city = city.lower()
    matching_cities = [city_name for city_name in city_population_data 
                     if city.lower() in city_name.lower() 
                     or city_name.lower() in city.lower()]
    if matching_cities:
        population = city_population_data.get(matching_cities[0])[0]
        poverty = get_poverty_for_city(city_population_data, matching_cities[0])
        poverty_rate = calculate_poverty_rate(population, poverty)
    else:
        population = None
        poverty = None
        poverty_rate = None

    population_list.append(population)
    poverty_list.append(poverty)
    poverty_rate_list.append(poverty_rate)

# Add the population and poverty data as new columns in df_all_agencies
df_all_agencies['Population'] = population_list
df_all_agencies['Poverty'] = poverty_list
df_all_agencies['Poverty Rate'] = poverty_rate_list

# Save the final DataFrame with population data to a new CSV file
output_path = "../Resources/Agency/Agency_Data/filtered_city_agencies.csv"
df_all_agencies.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")

Fetched population data for AK
Fetched population data for AL
Fetched population data for AR
Fetched population data for AZ
Fetched population data for CA
Fetched population data for CO
Fetched population data for CT
Fetched population data for DC
Fetched population data for DE
Fetched population data for FL
Fetched population data for GA
Fetched population data for HI
Fetched population data for IA
Fetched population data for ID
Fetched population data for IL
Fetched population data for IN
Fetched population data for KS
Fetched population data for KY
Fetched population data for LA
Fetched population data for MA
Fetched population data for MD
Fetched population data for ME
Fetched population data for MI
Fetched population data for MN
Fetched population data for MO
Fetched population data for MS
Fetched population data for MT
Fetched population data for NC
Fetched population data for ND
Fetched population data for NE
Fetched population data for NH
Fetched population data for NJ
Fetched 

In [5]:
#Re-reading in the csv to make sure it's cleaned.
df_all_agencies = pd.read_csv("../Resources/Agency/Agency_Data/filtered_city_agencies.csv")

# Get the original number of rows
original_rows = df_all_agencies.shape[0]

# Remove rows with null population values
df_all_agencies = df_all_agencies.dropna(subset=['Population'])

# Get the new number of rows
new_rows = df_all_agencies.shape[0]

# Print the number of nulls dropped
nulls_dropped = original_rows - new_rows
print(f"Dropped {nulls_dropped} rows with null population values from filtered_city_agencies.csv")
output_path = "../Resources/Agency/Agency_Data/filtered_city_agencies.csv"
df_all_agencies.to_csv(output_path, index=False)
# Sort the DataFrame by state and population in descending order
df_sorted = df_all_agencies.sort_values(by=['State', 'Population'], ascending=[True, False])

# Group the DataFrame by state and get the top 25 population cities in each state
df_top_cities = df_sorted.groupby('State').head(25)

# Write the filtered DataFrame to a new CSV file
output_path = "../Resources/Agency/Agency_Data/top_25_state_cities.csv"
df_top_cities.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")

# Get the top 50 cities in the whole country by population
df_top_50_cities = df_sorted.nlargest(50, 'Population')
df_top_1_city = df_sorted.nlargest(1, 'Population')
# Write the top 50 cities to a new CSV file
output_path = "../Resources/Agency/Agency_Data/top_50_US_cities.csv"
df_top_50_cities.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")
output_path = "../Resources/Agency/Agency_Data/testing_cities.csv"
df_top_1_city.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")

Dropped 1154 rows with null population values from filtered_city_agencies.csv
Data saved to ../Resources/Agency/Agency_Data/top_25_state_cities.csv
Data saved to ../Resources/Agency/Agency_Data/top_50_US_cities.csv
Data saved to ../Resources/Agency/Agency_Data/testing_cities.csv


In [18]:
# Record the start time of the script
start_time = time.time()

# Load the top_cities.csv file
df_top_cities = pd.read_csv("../Resources/Agency/Agency_Data/top_25_state_cities.csv")

# Define the crime types
violent_crimes = ["homicide", "rape", "robbery", "aggravated-assault"]
property_crimes = ["arson", "burglary", "larceny", "motor-vehicle-theft"]
retries = 0

# Define the date range
begin_date = "01-2023"
end_date = "12-2023"
time_frame = f'?from={begin_date}&to={end_date}'

def countdown(t):
    while t:
        mins, secs = divmod(t, 60)
        timer = '{:02d}m {:02d}s'.format(mins, secs)
        print(timer, end="\r")
        time.sleep(30)
        t -= 30

def fetch_crime_data(ori, crime_type):
    base_url = f'https://api.usa.gov/crime/fbi/cde/summarized/agency/{ori}/{crime_type}{time_frame}{api_key}'
    max_retries = 5
    retries = 0

    while retries < max_retries:
        try:
            response = requests.get(base_url)
            time.sleep(2)  # Slow down for lazy API 
            response.raise_for_status()  # Raise an exception for HTTP errors
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for {ori}, {crime_type}: {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying in 10 minutes... (Attempt {retries}/{max_retries})")
                countdown(600)  # 10 minute countdown
            else:
                print("Max retries exceeded. Giving up.")
                return None

# List to hold processed crime data
crime_data_list = []
total_agencies = len(df_top_cities)
# Process each city in the DataFrame
for iteration, row in df_top_cities.iterrows():
    # Record the start time for this iteration
    iteration_start_time = time.time()
    
    ori = row['ORI']
    agency_name = row['Agency Name']
    population = row['Population']

    # Initialize crime totals
    violent_crime_total = 0
    property_crime_total = 0

    # Fetch and process crime data
    for crime_type in violent_crimes + property_crimes:
        data = fetch_crime_data(ori, crime_type)
        if data and 'offenses' in data:
            # Extract data using the 'Agency Name' as the key
            actuals = data['offenses'].get('actuals', {})
            agency_data = {key: int(value) if isinstance(value, int) else 0 for key, value in actuals.get(agency_name, {}).items()} # Came across sum error 2 hours in to my pull, made sure all values are int, if not return 0.
            total_crimes = sum(agency_data.values())
            
            if crime_type in violent_crimes:
                violent_crime_total += total_crimes
            elif crime_type in property_crimes:
                property_crime_total += total_crimes

    # Calculate crime rate per 100,000 population
    crime_rate = round(((violent_crime_total + property_crime_total) / population) * 100000,2)
    violent_crime_rate = round((violent_crime_total / population) * 100000,2)
    property_crime_rate = round((property_crime_total / population) * 100000,2)
    # Get rid of any redundant decimals
    df_top_cities["Population"] = df_top_cities["Population"].round(0).astype(int)
    df_top_cities["Poverty"] = df_top_cities["Poverty"].round(0).astype(int)
    # Append the calculated data to the list
    crime_data_list.append({
        'ORI': ori,
        'Violent Crime': violent_crime_total,
        'Property Crime': property_crime_total,
        'Violent Crime(Per 100k)': violent_crime_rate,
        'Property Crime(Per 100k)': property_crime_rate,
        'Crime Rate(Per 100k)': crime_rate
    })

    # Calculate elapsed time
    elapsed_time = time.time() - start_time
    elapsed_minutes, elapsed_seconds = divmod(int(elapsed_time), 60)

    # Calculate time taken for this iteration
    iteration_elapsed_time = time.time() - iteration_start_time
    iteration_elapsed_minutes, iteration_elapsed_seconds = divmod(int(iteration_elapsed_time), 60)

    # Calculate estimated time remaining.
    estimated_remaining = iteration_elapsed_seconds * (total_agencies - (iteration + 1))
    remaining_minutes, remaining_seconds = divmod(estimated_remaining, 60)
    formatted_remaining_time = f"{remaining_minutes}m {remaining_seconds}s"

    # Spent some time showing time data so that I can see how long it's going to take and where exactly it's getting stuck if it doesn't succeed.
    print(f"Processed data for {agency_name}, {row["State"]} - This Process: {iteration_elapsed_seconds:.2f}s ({iteration + 1}/{total_agencies}) - {elapsed_minutes}m {elapsed_seconds}s - Est Remaining: {remaining_minutes}m {remaining_seconds}s {'Retries: ' + str(retries) if retries != 0 else ''}")

# Create a DataFrame from the crime data list
df_crime_data = pd.DataFrame(crime_data_list)

# Merge the crime data with the original DataFrame
df_merged = pd.merge(df_top_cities, df_crime_data, on='ORI')

# Save the merged DataFrame to a new CSV file
output_path = "../Resources/Agency/top_25_state_cities_crime_data.csv"
df_merged.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")


Processed data for Anchorage Police Department, AK - This Process: 23.00s (1/1204) - 0m 23s - Est Remaining: 461m 9s
Processed data for Fairbanks Police Department, AK - This Process: 23.00s (2/1204) - 0m 47s - Est Remaining: 460m 46s
Processed data for Juneau Police Department, AK - This Process: 23.00s (3/1204) - 1m 11s - Est Remaining: 460m 23s
Processed data for Wasilla Police Department, AK - This Process: 23.00s (4/1204) - 1m 34s - Est Remaining: 460m 0s
Processed data for Sitka Police Department, AK - This Process: 22.00s (5/1204) - 1m 57s - Est Remaining: 439m 38s
Processed data for Ketchikan Police Department, AK - This Process: 21.00s (6/1204) - 2m 18s - Est Remaining: 419m 18s
Processed data for Kenai Police Department, AK - This Process: 22.00s (7/1204) - 2m 41s - Est Remaining: 438m 54s
Processed data for Bethel Police Department, AK - This Process: 21.00s (8/1204) - 3m 3s - Est Remaining: 418m 36s
Processed data for Palmer Police Department, AK - This Process: 22.00s (9/1

In [None]:
# Load the data
df_cities_plot = pd.read_csv("../Resources/Agency/top_25_state_cities_crime_data.csv")

# Remove rows where the State is "AK" or "HI"
df_cities_plot = df_cities_plot[~df_cities_plot['State'].isin(['AK', 'HI'])]

# Remove rows where both "Violent Crime" and "Property Crime" are 0
df_cities_plot = df_cities_plot[~((df_cities_plot['Violent Crime'] == 0) & (df_cities_plot['Property Crime'] == 0))]
df_cities_plot["Crime Rate (Per 100k)"] = df_cities_plot["Crime Rate (Per 100k)"].map("{:.2f}".format)

# Create the map plot
map_plot_1 = df_cities_plot.hvplot.points(
    "Longitude",
    "Latitude",
    geo=True,
    tiles="OSM",
    size="Crime Rate(Per 100k)",
    color="Poverty Rate",
    cmap="RdYlBu_r",
    title="Top 25 Cities by State - Crime Data",
    width=1600,
    height=800,
    responsive=True,  # Add this parameter
    shared_axes=False,  # Add this parameter
    xaxis=None,  # Add this parameter
    yaxis=None,  # Add this parameter
    hover_cols=["City", "State"],
    scale = .1
)

# Display the plot
map_plot_1

