## Population Data

### NC 1990 Census table

https://www.osbm.nc.gov/facts-figures/demographics/1990-2000-county-growth

### Census demographic api

https://demography.osbm.nc.gov/explore/?sort=modified

* state-wide population: https://demography.osbm.nc.gov/explore/dataset/historic-census/table/?disjunctive.areatype&disjunctive.areaname&rows=30&q.timerange.year=year:%5B1979-01-01T05:00:00Z+TO+2020-01-02T04:59:59Z%5D&refine.areatype=State

* Downloaded historical NC census data for the census years: historic-census.json

### Census dot gov
source(2000~2010) API : https://www.census.gov/data/developers/data-sets/popest-popproj/popest.2000-2010_Intercensals.html

source(1990-2000) : Can't find county pop.
https://www.census.gov/data/developers/data-sets/popest-popproj/popest.1990-2000_Intercensals.html
This dataset does NOT have a geographic variable, so county-level figures cannot be queried from it.

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import requests
import json
import time
import csv

# Import API key
from config import census_api_key

#### NC - County population (1980, 1990, 2000)
* NC census demographic data

    https://demography.osbm.nc.gov/explore/?sort=modified
    
  from the downloaded file(county data), build up a population array

In [4]:
# Historic NC census data
query_years = ['1980', '1990', '2000']

# Header row followed by the hard-coded state-wide rows; the per-area rows
# read from the JSON file are appended after them.
years_pop1 = [
    ['year', 'county', 'population'],
    ['1990', 'STATE', '6628637'],
    ['1980', 'STATE', '5881766'],
    ['2000', 'STATE', '8046668']
]

with open("historic-census.json") as f:
    pop_data = json.load(f)

# Keep only the records for the queried years. The population goes through
# int() before str() so a value such as 103471.0 is stored as '103471'.
years_pop1.extend(
    [
        record['fields']['year'],
        record['fields']['areaname'],
        str(int(record['fields']['population'])),
    ]
    for record in pop_data
    if record['fields']['year'] in query_years
)

In [5]:
# Spot-check: show every collected row for one county
cty = 'New Hanover'
[row for row in years_pop1 if row[1] == cty]

[['1980', 'New Hanover', '103471'],
 ['2000', 'New Hanover', '160327'],
 ['1990', 'New Hanover', '120284']]

####  NC - County population (2001 ~ 2009)

    source: census dot gov

In [2]:
## API call for the population of counties in NC from 2000~2010
year = 2000
pop_url = f'https://api.census.gov/data/{year}/pep/int_population'
pop_var = '?get=GEONAME,POP,DATE_DESC&for=county:*&in=state:37'
apikey = '&key=' + census_api_key
# Print the request WITHOUT the key so the secret never ends up in saved
# notebook output.
print(pop_url + pop_var)
## Request
try:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(pop_url + pop_var + apikey, timeout=30)
    print(response)
    # Turn 4xx/5xx answers into an exception instead of parsing an error page.
    response.raise_for_status()
    pop_data = response.json()
except (requests.RequestException, ValueError) as err:
    # Stop here: continuing would build the DataFrame from whatever an earlier
    # cell left in `pop_data`, or fail with a confusing NameError.
    # requests' own messages can embed the full URL (including the key), so
    # only the error type is reported and the chained exception is dropped.
    raise RuntimeError(
        f"Census API request failed ({type(err).__name__}): {pop_url + pop_var}"
    ) from None

# First element of the response is the header row; the rest are data rows.
pop_df = pd.DataFrame(pop_data[1:], columns=pop_data[0])
pop_df.head()

https://api.census.gov/data/2000/pep/int_population?get=GEONAME,POP,DATE_DESC&for=county:*&in=state:37&key=<REDACTED>
<Response [200]>


Unnamed: 0,GEONAME,POP,DATE_DESC,state,county
0,"Greene County, North Carolina",19848,7/1/2003 population estimate,37,79
1,"Greene County, North Carolina",20132,7/1/2004 population estimate,37,79
2,"Greene County, North Carolina",20146,7/1/2005 population estimate,37,79
3,"Greene County, North Carolina",20742,7/1/2006 population estimate,37,79
4,"Greene County, North Carolina",21178,7/1/2007 population estimate,37,79


In [3]:
# County name: the part of GEONAME before ' County'
# (e.g. 'Greene County, North Carolina' -> 'Greene').
counties = [geoname.split(' County')[0] for geoname in pop_df['GEONAME']]

# Year: last component of the leading m/d/yyyy date in DATE_DESC
# (e.g. '7/1/2003 population estimate' -> '2003').
years = [desc.split()[0].split('/')[2] for desc in pop_df['DATE_DESC']]

# Population as an integer, used for the numeric 'population' column.
pop_array = [int(raw_pop) for raw_pop in pop_df['POP']]

# [year, county, population] rows for the years after 2000, excluding 2010.
# The population is stored exactly as it appears in the POP column.
years_pop2 = [
    [row_year, row_county, raw_pop]
    for row_year, row_county, raw_pop in zip(years, counties, pop_df['POP'])
    if int(row_year) > 2000 and row_year != '2010'
]

pop_df['YEAR'] = years
pop_df['CountyName'] = counties
pop_df['population'] = pop_array

In [4]:
# Preview the first rows whose YEAR is '2001'
is_2001 = pop_df['YEAR'] == '2001'
pop_df[is_2001].head()

Unnamed: 0,GEONAME,POP,DATE_DESC,state,county,YEAR,CountyName,population
10,"Guilford County, North Carolina",428381,7/1/2001 population estimate,37,81,2001,Guilford,428381
22,"Halifax County, North Carolina",56788,7/1/2001 population estimate,37,83,2001,Halifax,56788
34,"Harnett County, North Carolina",93168,7/1/2001 population estimate,37,85,2001,Harnett,93168
46,"Haywood County, North Carolina",54802,7/1/2001 population estimate,37,87,2001,Haywood,54802
58,"Henderson County, North Carolina",91150,7/1/2001 population estimate,37,89,2001,Henderson,91150


In [9]:
# State-wide totals: sum the county populations per year and add them to
# years_pop2 as 'STATE' rows (2000 and 2010 are left out).
pop_nums = pop_df.groupby(['YEAR'])['population'].sum()
years_pop2.extend(
    [total_year, 'STATE', total_pop]
    for total_year, total_pop in pop_nums.items()
    if total_year not in ('2000', '2010')
)

In [10]:
# Spot-check: show every years_pop2 row for one county
cty = 'New Hanover'
[row for row in years_pop2 if row[1] == cty]

[['2001', 'New Hanover', '164386'],
 ['2002', 'New Hanover', '168147'],
 ['2003', 'New Hanover', '171928'],
 ['2004', 'New Hanover', '178362'],
 ['2005', 'New Hanover', '185412'],
 ['2006', 'New Hanover', '190791'],
 ['2007', 'New Hanover', '194339'],
 ['2008', 'New Hanover', '197709'],
 ['2009', 'New Hanover', '200178']]

### Append the population data of 2010~2019

In [11]:
# Collect population data for all years

recent_yrs = ['2010','2011','2012','2013', '2014', '2015', '2016', '2017', '2018', '2019']
years_pop3 = []

# Number of preamble rows in the source file before the first data row.
header_rows = 4

# newline='' is what the csv module expects for files it reads, so quoted
# fields containing line breaks are parsed correctly.
with open('countytotals_2010_2019.csv', mode='r', newline='') as file:

    # reading the CSV file
    csvFile = csv.reader(file)
    # skip the preamble; the default keeps a too-short file from raising
    for _ in range(header_rows):
        next(csvFile, None)
    # loop through the file; the first column is the area name (county or
    # state) and the remaining columns are paired with recent_yrs in order.
    # NOTE(review): assumes the value columns are ordered 2010..2019 -- confirm
    # against the source file.
    for lines in csvFile:
        # csv.reader yields [] for blank lines; nothing to collect from them
        if not lines:
            continue
        cty = lines[0]
        first_word = cty.split()[:1]
        # Stop at the footer of the source file
        if first_word == ['Source:']:
            break
        # A row without an area name cannot be attributed to any county
        if not first_word:
            continue
        # zip stops at the shorter sequence, so extra trailing columns
        # no longer index past the end of recent_yrs
        for pop_year, pop_value in zip(recent_yrs, lines[1:]):
            years_pop3.append([pop_year, cty, pop_value])

In [28]:
# Spot-check: show every years_pop3 row for one county
cty = 'Wake'
[row for row in years_pop3 if row[1] == cty]

[['2010', 'Wake', '906882'],
 ['2011', 'Wake', '924330'],
 ['2012', 'Wake', '944622'],
 ['2013', 'Wake', '963856'],
 ['2014', 'Wake', '983918'],
 ['2015', 'Wake', '1006139'],
 ['2016', 'Wake', '1028509'],
 ['2017', 'Wake', '1049943'],
 ['2018', 'Wake', '1070197'],
 ['2019', 'Wake', '1089579']]

In [13]:
# Save the population data to a file

# newline='' lets csv.writer control the line endings itself; without it the
# output gets an extra blank line after every row on Windows.
with open("counties_pop_1990_2019.csv", "w", newline="") as cfile:
    popwriter = csv.writer(cfile)
    # years_pop1 starts with the header row, so it has to be written first.
    popwriter.writerows(years_pop1)
    popwriter.writerows(years_pop2)
    popwriter.writerows(years_pop3)

### Check the population data


In [33]:
# Read "counties_pop_1990_2019.csv" back and keep the rows for one county
# up to and including the chosen year.
cty = 'Rowan'
cyr = 2018
with open("counties_pop_1990_2019.csv") as cfile:
    pp_data = csv.reader(cfile)
    # Skip the header row
    next(pp_data)
    sel_pop = []
    for record in pp_data:
        county_matches = record[1] == cty
        year_in_range = int(record[0]) <= cyr
        if county_matches and year_in_range:
            sel_pop.append(record)
        

In [34]:
# Split the selected rows into parallel lists of years and population sizes
sel_pop_yrs = []
sel_pop_size = []
for record in sel_pop:
    sel_pop_yrs.append(record[0])
    sel_pop_size.append(record[2])
print(sel_pop_yrs)
print(sel_pop_size)

['1990', '1980', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']
['110605', '99186', '130340', '131852', '132765', '132562', '132202', '132527', '134045', '135626', '137721', '138562', '138368', '137841', '137765', '138118', '138897', '139639', '140557', '141445', '141802']


In [35]:
# Order the rows by their year string, replacing the list contents in place
sel_pop[:] = sorted(sel_pop, key=lambda record: record[0])

In [36]:
# Show the selected rows one per line
for record in sel_pop:
    print(record)

['1980', 'Rowan', '99186']
['1990', 'Rowan', '110605']
['2000', 'Rowan', '130340']
['2001', 'Rowan', '131852']
['2002', 'Rowan', '132765']
['2003', 'Rowan', '132562']
['2004', 'Rowan', '132202']
['2005', 'Rowan', '132527']
['2006', 'Rowan', '134045']
['2007', 'Rowan', '135626']
['2008', 'Rowan', '137721']
['2009', 'Rowan', '138562']
['2010', 'Rowan', '138368']
['2011', 'Rowan', '137841']
['2012', 'Rowan', '137765']
['2013', 'Rowan', '138118']
['2014', 'Rowan', '138897']
['2015', 'Rowan', '139639']
['2016', 'Rowan', '140557']
['2017', 'Rowan', '141445']
['2018', 'Rowan', '141802']


In [39]:
years= ['1986','1988','1990','1992','1994','1996','1998','2000','2002','2004','2006','2008','2010','2012','2013','2014','2015','2016','2017','2018']
yrs = []
vls = []

# Index the selected rows by year so the result does not depend on sel_pop
# having been sorted first (the CSV stores 1990 before 1980). setdefault keeps
# the first row when a year appears more than once.
pop_by_year = {}
for record in sel_pop:
    pop_by_year.setdefault(record[0], record)

for yr in years:
    if int(yr) > 2018:
        break
    # Only census counts exist before 2000, so earlier years reuse the most
    # recent census: 1980 for years before 1990, 1990 for the 1990s.
    if int(yr) < 1990:
        source_yr = '1980'
    elif int(yr) < 2000:
        source_yr = '1990'
    else:
        source_yr = yr
    # Raises KeyError naming the year when sel_pop has no row for it.
    yr_row = pop_by_year[source_yr]
    yrs.append(yr)
    vls.append(yr_row[2])
    print(yr, yr_row)

1986 ['1980', 'Rowan', '99186']
1988 ['1980', 'Rowan', '99186']
1990 ['1990', 'Rowan', '110605']
1992 ['1990', 'Rowan', '110605']
1994 ['1990', 'Rowan', '110605']
1996 ['1990', 'Rowan', '110605']
1998 ['1990', 'Rowan', '110605']
2000 ['2000', 'Rowan', '130340']
2002 ['2002', 'Rowan', '132765']
2004 ['2004', 'Rowan', '132202']
2006 ['2006', 'Rowan', '134045']
2008 ['2008', 'Rowan', '137721']
2010 ['2010', 'Rowan', '138368']
2012 ['2012', 'Rowan', '137765']
2013 ['2013', 'Rowan', '138118']
2014 ['2014', 'Rowan', '138897']
2015 ['2015', 'Rowan', '139639']
2016 ['2016', 'Rowan', '140557']
2017 ['2017', 'Rowan', '141445']
2018 ['2018', 'Rowan', '141802']


In [32]:
# Show the population values collected in `vls` (one entry per year in `yrs`)
print(vls)

['301429', '301429', '426301', '426301', '426301', '426301', '426301', '627846', '676392', '716336', '784038', '856492', '906882', '944622', '963856', '983918', '1006139', '1028509', '1049943']
