In [47]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [52]:
print(os.getcwd())
%cd elizagoler/documents/senior-thesis



/Users
/Users/elizagoler/Documents/senior-thesis


In [53]:
# File structure defined based on SEER Data Dictionary

# Column widths in the txt file
widths = [4, 2, 2, 3, 2, 1, 1, 1, 2, 8]

# Column names
names = [
    'year',
    'state_abbrev', 
    'state_fips', 
    'fips_county_suffix', 
    'race', 
    'gap',      # the two-space gap in the text file (junk column)
    'origin', 
    'sex', 
    'age', 
    'population'
]

seer_file_path = 'datastore/raw/seer_population/data/us.1969_2023.20ages.adjusted.txt'

population_df = pd.read_fwf(
    seer_file_path, 
    widths=widths, 
    names=names
)

# Drop the unused gap column
population_df = population_df.drop(columns=['gap'])

population_df['population'] = population_df['population'].astype(int)

# Keep the leading zeros so we can concatenate with the state fips code to get full county-level fips
population_df['state_fips'] = population_df['state_fips'].apply(lambda x: f"{int(x):02d}")
population_df['fips_county_suffix'] = population_df['fips_county_suffix'].apply(lambda x: f"{int(x):03d}")

population_df.head()


Unnamed: 0,year,state_abbrev,state_fips,fips_county_suffix,race,origin,sex,age,population
0,1969,AL,1,1,,9,1,0,159
1,1969,AL,1,1,,9,1,1,657
2,1969,AL,1,1,,9,1,2,1137
3,1969,AL,1,1,,9,1,3,956
4,1969,AL,1,1,,9,1,4,721


Need to filter out any codes that are used by SEER that do not correspond to FIPS. 

From the documentation (see docs folder 'seer_county_aggregations' and link here: https://seer.cancer.gov/popdata/modifications.html):
- 900: All Hawaii counties 1969-1999
- 910: Cibola County (FIPS 006) and Valencia County (FIPS 061), NM (1969-1981) // Also, from 1969-1979: aggregation of several NY counties including Bronx (005), Kings (047), NY (061), Queens (081), and Richmond (085) // Also, from 1969-1981: aggregation of several Virginia counties Prince Williams (153), Manassas City (683), Manassas Park City (685)
- 911: Aggregation of James City County (095), York County (199), Poquoson City (735), and Williamsburg City (830 VA) from 1969-81
- 912 ... 
- 913 ...

Bottom line: Need to filter out the combos of these codes with their state x years because they are aggregations of counties


In [56]:
#print([x for x in population_df['fips_county_suffix'].unique() if int(x) >= 900])

At least for now, just getting rid of all county codes greater than or equal to 900 since these appear to be aggregations of counties:

In [54]:
population_df = population_df[population_df['fips_county_suffix'].astype(int) < 900]

In [57]:
population_df['fips_county_suffix'] = population_df['fips_county_suffix'].apply(lambda x: f"{int(x):03d}")
population_df['state_fips'] = population_df['state_fips'].apply(lambda x: f"{int(x):02d}")

In [58]:
population_df['age'].unique()

# Age codes from the data dictionary:
# 00 = 0 years
# 01 = 1-4 years
# 02 = 5-9 years
# 03 = 10-14 years
# 04 = 15-19 years
# ...
# 17 = 80-84 years
# 18 = 85-89 years
# 19 = 90+ years

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

To get the full county fips code, need to concatenate the state and county fips codes.

In [59]:
population_df['county_fips'] = population_df['state_fips'] + population_df['fips_county_suffix']

In [60]:
population_df['county_fips'].unique()
population_df['county_fips'].nunique()

3146

There are 3,146 unique county fips codes but 3,143 counties in the US today. Need to figure out why this is.

GFR is defined for women ages 15-44. These are the following age groups:
 - 04: 15-19
 - 05: 20-24
 - 06: 25-29
 - 07: 30-34
 - 08: 35-39
 - 09: 40-44

Women is sex = 2

All race and origin (Hispanic/non-Hispanic) for each of these age groups

In [61]:
# Select rows for women (sex = 2) and ages 15-44 (age codes 4 through 9)
gfr_age_codes = [4, 5, 6, 7, 8, 9]
gfr_women_df = (
    population_df[
        (population_df['sex'] == 2) &
        (population_df['age'].isin(gfr_age_codes))
    ]
    .groupby(['year', 'county_fips'], as_index=False)['population']
    .sum()
    .rename(columns={'population': 'gfr_women_pop'})
)

# Add 'fips_county_suffix' back from the original dataframe by merging on 'county_fips'
gfr_women_df = gfr_women_df.merge(
    population_df[['county_fips', 'fips_county_suffix']].drop_duplicates(),
    on='county_fips',
    how='left'
)

gfr_women_df = gfr_women_df[['year', 'county_fips', 'fips_county_suffix', 'gfr_women_pop']]

gfr_women_df.head()


Unnamed: 0,year,county_fips,fips_county_suffix,gfr_women_pop
0,1969,1001,1,4941
1,1969,1003,3,11553
2,1969,1005,5,4311
3,1969,1007,7,2557
4,1969,1009,9,5228


In [None]:
print(len(gfr_women_df))

171386


Must be missing some county x year pairings because if we multiply 55 years x 3,143 counties we should have 172,865 rows. This suggests I am missing 1,479 county x years.

Martha Bailey includes the names of counties missing at least one year here: https://pmc.ncbi.nlm.nih.gov/articles/PMC3348617/#R40 