OurCup Census Data Generation
=============================

In [170]:
import pandas as pd
import json

## Build a Table of Census Tract and Zip Codes

In [148]:
# Load the pipe-delimited ZCTA/tract relationship file. Both GEOID columns are
# read as strings so the identifiers keep their exact text (e.g. leading zeros).
# Only those two columns are kept, renamed to the short names used by later cells.
df = (
    pd.read_csv(
        'data/tab20_zcta520_tract20_natl.txt',
        delimiter="|",
        dtype={'GEOID_TRACT_20': str, 'GEOID_ZCTA5_20': str},
    )
    .loc[:, ['GEOID_TRACT_20', 'GEOID_ZCTA5_20']]
    .rename(columns={'GEOID_TRACT_20': 'tract', 'GEOID_ZCTA5_20': 'zipcode5'})
)

# Persist the trimmed lookup table (without the positional index).
df.to_csv('tract-to-zipcode.csv', index=False)

In [149]:
# Map each tract id (as a string) to its zipcode5 value. Rows are consumed in
# frame order, so when a tract appears on several rows the last row wins.
tract2zipcode = dict(zip(map(str, df['tract']), df['zipcode5']))

## Figure Out Top Populations by County

In [150]:
# Lookup table pairing each team code with a census column. Later cells read
# its 'FIFA-alpha3' and 'Census-Column-Name' columns.
country2col_df = pd.read_csv("data/country-2-census-column.csv")

In [151]:
# Census export whose rows carry a 'GEO_ID' column (read by the tract loop below).
# The path suggests ACS 5-year 2020, table B05006 — confirm against the data source.
# low_memory=False makes pandas process the file in one pass instead of in
# chunks, so each column's dtype is inferred from the whole column.
census_df = pd.read_csv('data/ACSDT5Y2020.B05006_2022-11-13T170212/ACSDT5Y2020.B05006-Data.csv', low_memory=False)

In [172]:
# Build one record per census row: FIPS pieces sliced out of GEO_ID, the
# zipcode looked up in tract2zipcode, and one population value per team code.

# (team code, census column) pairs are the same for every row, so build them
# once instead of re-iterating country2col_df inside the row loop.
country_columns = list(zip(country2col_df['FIFA-alpha3'], country2col_df['Census-Column-Name']))

pop_data = []
for census_idx, census_row in census_df.iterrows():  # census_df.sample(n=1000).iterrows()
    if census_idx == 0:
        # Index 0 is deliberately skipped; presumably it holds the descriptive
        # column labels rather than data — confirm against the CSV.
        continue
    raw_geo_id = census_row['GEO_ID']
    # Everything after the 9-character prefix is used as the tract id. Computed
    # outside the try so the failure message can never raise on a bad GEO_ID.
    geo_id = raw_geo_id[9:] if isinstance(raw_geo_id, str) else ''
    try:
        if not isinstance(raw_geo_id, str):
            raise TypeError("GEO_ID is not a string: {!r}".format(raw_geo_id))
        # Key order is kept stable because it determines the CSV column order.
        tract_pop_data = {
            'county': geo_id[0:5],        # state (2) + county (3) digits
            'geo_id': geo_id,
            'state_fips': geo_id[0:2],
            'county_fips': geo_id[2:5],
            'tract_fips': geo_id[5:],
            # Raises KeyError when the id has no zipcode mapping.
            'zipcode5': tract2zipcode[geo_id],
        }
        for team_code, census_column in country_columns:
            tract_pop_data[team_code] = census_row[census_column]
        pop_data.append(tract_pop_data)
    except Exception as e:
        # Best effort: report the row that could not be processed and move on.
        print("Failed on row {}, tract {} - {}".format(census_idx, geo_id, e))
pop_df = pd.DataFrame(pop_data)
# NOTE: "populaton" is the established output file name; kept as-is.
pop_df.to_csv('tract-populaton-data.csv', index=False)

Failed on row 85396, tract  - ''


In [173]:
# Sum every team's population over all tracts of a county.
#
# The loop runs once per tract row, so the same county comes up many times.
# Totals are memoized per county so the filter-and-sum work happens once per
# distinct county instead of once per tract.
#
# NOTE(review): one output row is still emitted per tract (counties repeat),
# exactly as before. Emitting one row per county would be cleaner, but is only
# safe once every consumer selects rows of the aggregated frame by its 'fips'
# column rather than by a mask aligned with pop_df.
team_codes = list(country2col_df['FIFA-alpha3'])
county_totals = {}  # county id -> {'fips': ..., <team code>: total, ...}

aggregated = []
for county in list(pop_df['county']):
    if county not in county_totals:
        this_county_pop_df = pop_df[pop_df['county']==county]
        county_data = {'fips': county}
        for team_code in team_codes:
            # Values are coerced with to_numeric before summing.
            county_data[team_code] = pd.to_numeric(this_county_pop_df[team_code]).sum()
        county_totals[county] = county_data
    # Append a copy so no two output rows share the same dict object.
    aggregated.append(dict(county_totals[county]))

In [174]:
# One row per entry of `aggregated`: a 'fips' column plus one summed
# population column per team code.
aggregated_pop_df = pd.DataFrame(aggregated)
# Written without the positional index. NOTE: "populaton" is the actual
# output file name.
aggregated_pop_df.to_csv('county-populaton-data.csv', index=False)

## Pre-compute Rankings for each County

In [177]:
# For every county, list up to three team codes with the largest positive
# population totals, highest first.
#
# Totals are looked up by the aggregated frame's own 'fips' column. Masking
# aggregated_pop_df with a condition built from pop_df is only correct while
# the two frames happen to be row-aligned. drop_duplicates keeps the first row
# per county, which is the row `.iloc[0]` would have picked.
team_codes = list(country2col_df['FIFA-alpha3'])
totals_by_county = aggregated_pop_df.drop_duplicates(subset='fips').set_index('fips')

county_rankings = {}
# unique() keeps first-appearance order, so the dict's key order matches a
# pass over every tract row while each county is ranked only once.
for county in pop_df['county'].unique():
    county_data = totals_by_county.loc[county]
    this_county_ranks = [
        {'county': county, 'team': team_code, 'pop': int(county_data[team_code])}
        for team_code in team_codes
        if county_data[team_code] > 0  # teams with no population are not ranked
    ]
    # Stable sort: teams with equal totals keep their country2col_df order.
    this_county_ranks.sort(key=lambda x: x['pop'], reverse=True)
    county_rankings[county] = [r['team'] for r in this_county_ranks][:3]

In [178]:
# Serialize the per-county recommendations to JSON text, then write the file.
serialized_rankings = json.dumps(county_rankings)
with open('county-recs.json', 'w') as out_file:
    out_file.write(serialized_rankings)