In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os
import datetime

In [2]:
def get_save_name(county: str):
    return "_".join(county.split(" ")).lower()

In [3]:
def recover_proper_name(county_save_name: str):
    words = county_save_name.split("_")
    words = [word.capitalize() for word in words]
    return " ".join(words)

In [4]:
recover_proper_name("san_francisco")

'San Francisco'

In [76]:
queer_areas = {
    "San Francisco": {
        "state": "California",
        "neighboring_counties": ['Alameda', 'Contra Costa', 'Marin', 'Napa', 'San Mateo', 'Santa Clara', 'Solano', 'Sonoma']
    },
    "Multnomah": {
        "state": "Oregon",
        "neighboring_counties": ['Columbia', 'Hood River', 'Clackamas', 'Washington']
    },
    "King": {
        "state": "Washington",
        "neighboring_counties": ['Snohomish', 'Kitsap', 'Kittitas', 'Yakima', 'Pierce', 'Chelan']
    }
}

## Case Data

### Reformat data
- Make sure to download the raw data file and save it to the local directory before running this cell
- If "daily_cases.csv", the cleaned data, is already in the directory, don't bother running this

In [None]:
#Source: https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv
case_data = pd.read_csv("time_series_covid19_confirmed_US.csv")

# Remove unnecessary columns
case_data.drop(["UID", "iso2", "iso3", "code3", "Country_Region", "Lat", "Long_", "Combined_Key"], axis=1, inplace=True)

# Rename columns to better names
case_data.rename(columns={"Admin2": "county", "FIPS": "fips", "Province_State": "state"}, inplace=True)

melted_data = pd.melt(
    case_data,
    id_vars=["fips", "county", "state"],  # Columns to keep
    var_name="date",                      # Name for the new 'date' column
    value_name="cases"                    # Name for the new 'cases' column
)

melted_data["date"] = pd.to_datetime(melted_data["date"], errors="coerce")

melted_data.to_csv("daily_cases.csv", index=False)

Unnamed: 0,fips,county,state,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
0,1001.0,Autauga,Alabama,0,0,0,0,0,0,0,...,19732,19759,19759,19759,19759,19759,19759,19759,19790,19790
1,1003.0,Baldwin,Alabama,0,0,0,0,0,0,0,...,69641,69767,69767,69767,69767,69767,69767,69767,69860,69860
2,1005.0,Barbour,Alabama,0,0,0,0,0,0,0,...,7451,7474,7474,7474,7474,7474,7474,7474,7485,7485
3,1007.0,Bibb,Alabama,0,0,0,0,0,0,0,...,8067,8087,8087,8087,8087,8087,8087,8087,8091,8091
4,1009.0,Blount,Alabama,0,0,0,0,0,0,0,...,18616,18673,18673,18673,18673,18673,18673,18673,18704,18704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3337,56039.0,Teton,Wyoming,0,0,0,0,0,0,0,...,12130,12130,12130,12130,12130,12130,12130,12134,12134,12134
3338,56041.0,Uinta,Wyoming,0,0,0,0,0,0,0,...,6401,6401,6401,6401,6401,6401,6401,6406,6406,6406
3339,90056.0,Unassigned,Wyoming,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3340,56043.0,Washakie,Wyoming,0,0,0,0,0,0,0,...,2750,2750,2750,2750,2750,2750,2750,2755,2755,2755


## Population Data

### Clean data
- Same idea as for the case data above - don't run this if "pop_data_cleaned.csv" is already in directory
- Otherwise, download raw data from link and run this cell

In [48]:
# Source: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-total.html
pop_data = pd.read_csv("co-est2023-pop.csv", delimiter=',')

# Remove leading '.' and remove the word "County" from each Geographic Area entry
pop_data['Geographic Area'] = (
    pop_data['Geographic Area']
    .str.lstrip('.')
    .str.replace(' County,', ',', regex=False)
)

# Split Geographic Area column into county and state columns, remove Geographic Area column
pop_data[['county', 'state']] = pop_data['Geographic Area'].str.split(', ', expand=True)
pop_data = pop_data.drop("Geographic Area", axis=1)

pop_data = pop_data.drop("April 1, 2020 Estimates Base", axis=1)

# The data isn't properly formatted for numbers - remove commas and change datatype to int for each column
numeric_columns = ['2020', '2021', '2022', '2023']

for column in numeric_columns:
    pop_data[column] = (
        pop_data[column]
        .str.replace(',', '', regex=False)
        .astype(int)
    )

pop_data.to_csv("pop_data_cleaned.csv", index=False)

### Create new csv for each county using population and case data

In [61]:
case_data = pd.read_csv("daily_cases.csv", delimiter=',')
pop_data = pd.read_csv("pop_data_cleaned.csv", delimiter=',')

start_date = pd.to_datetime("2020-03-10")
end_date = pd.to_datetime("2023-03-10")
weekly_dates = pd.date_range(start=start_date, end=end_date, freq='W') - pd.DateOffset(days=5) # subtract 5 days to get to weekday of 3/10/2020; Tuesday

In [73]:
# return the population for the county given the year
def get_population(entry):
    area_pop_data = pop_data[(pop_data['county'] == entry['county']) & (pop_data['state'] == entry['state'])]
    return area_pop_data[str(entry['year'])].values[0]

In [77]:
def save_data(parent_folder: str, county: str, state: str):
    # case data
    area_data = case_data[(case_data['county'] == county) & (case_data['state'] == state)]
    area_data['date'] = pd.to_datetime(area_data['date'])
    area_data = area_data[(area_data['date'] >= start_date) & (area_data['date'] <= end_date)]
    
    area_data['year'] = area_data['date'].dt.year
    area_data['population'] = area_data.apply(get_population, axis=1)

    area_data['delta_cases'] = area_data['cases'].diff().fillna(0)
    area_data['delta_cases_per_100k'] = (area_data['delta_cases'] / area_data['population']) * 100000

    
    weekly_area_data = area_data[area_data['date'].isin(weekly_dates)]
    weekly_area_data['delta_cases'] = weekly_area_data['cases'].diff().fillna(0)
    weekly_area_data['delta_cases_per_100k'] = (weekly_area_data['delta_cases'] / weekly_area_data['population']) * 100000
    
    county_save_name = get_save_name(county)
    
    if not os.path.exists(f"{parent_folder}/{county_save_name}"):
        os.mkdir(f"{parent_folder}/{county_save_name}")
    
    area_data.to_csv(f"{parent_folder}/{county_save_name}/{county_save_name}_daily.csv", index=False)
    weekly_area_data.to_csv(f"{parent_folder}/{county_save_name}/{county_save_name}_weekly.csv", index=False)

In [78]:
for county, info in queer_areas.items():
    county_save_name = get_save_name(county)
    save_data("queer_areas", county, info['state'])
    for neighbor in info['neighboring_counties']:
        if not os.path.exists(f"queer_areas/{county_save_name}/neighbors"):
            os.mkdir(f"queer_areas/{county_save_name}/neighbors")
        save_data(f"queer_areas/{county_save_name}/neighbors", neighbor, info['state'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  area_data['date'] = pd.to_datetime(area_data['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekly_area_data['delta_cases'] = weekly_area_data['cases'].diff().fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekly_area_data['delta_cases_per_100k'] = (weekly_area_data['delta_cases'

In [81]:
def plot_cases_over_time(parent_folder: str, county_save_name: str, state: str, weekly: bool=False):
    data_sample = "daily"
    if weekly:
        data_sample = "weekly"
        
    area_data = pd.read_csv(f"{parent_folder}/{county_save_name}/{county_save_name}_{data_sample}.csv", delimiter=',')
    area_data['date'] = pd.to_datetime(area_data['date'])
    
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(area_data['date'], area_data['delta_cases_per_100k'])
    
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))  # Every 3 months
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))  # Format as 'Jan 2020', etc.
    
    ax.set_xlabel("Date")
    ax.set_ylabel(f"{data_sample.capitalize()} new COVID Cases per 100k")
    
    ax.set_title(f"{data_sample.capitalize()} new COVID Cases per 100k for {recover_proper_name(county_save_name)} County, {state}")
    
    plt.savefig(f"{parent_folder}/{county_save_name}/new_cases_{county_save_name}_{data_sample}.png")

In [None]:
for county, info in queer_areas.items():
    county_save_name = get_save_name(county)
    for weekly in [False, True]:
        plot_cases_over_time("queer_areas", county_save_name, info['state'], weekly)
        for neighbor in info['neighboring_counties']:
            neighbor_save_name = get_save_name(neighbor)
            plot_cases_over_time(f"queer_areas/{county_save_name}/neighbors", neighbor_save_name, info['state'], weekly)
