In [11]:
import pandas as pd
import numpy as np
import math 

<h1> Helper Funcitons </h1>

<h3>To impute missing populations, I used US growth rates from the following website, which was constructed using Census Data. Website: </h3> 
<href> https://www.macrotrends.net/countries/USA/united-states/population-growth-rate </href> 
<h3>Growth rates for each year.</h3>

In [12]:
rateByYear = {2019:.0066,2018:.0071,2017:.0079,2016:.008,2015:.008,2014:.0083,2013:.0086,2012:.0088,2011:.0087,2010:.0087,2009:.0092,2008:.0097,2007:.01,2006:.0098,2005:.0098}

In [13]:
def format(df):

    df.columns = [x.lower().replace(' ','_') for x in df.columns]
    df['year'] = df['year'].astype(int)
    df['county'] = df['county'].astype(int)
    df['state'] = df['state'].astype(int)

    return df

def recordsCheck(df,row0,regions0):

    rows = len(df[['county_fips','year']])
    regions = df['county_fips'].nunique()

    print("Percentage of rows retained: " + str(rows/row0))
    print("Percentage of regions retained: " + str(regions/regions0))

def imputeBack(cFip,yr):

    cnt = 1
    pop0 = -1 #Set nonsensical initial value

    #Get last entry with a population
    while(pop0 < 0):
        
        try: # Try to fetch previous year's population

            pop0 = df.loc[ ( (df['county_fips'] == cFip) & (df['year'] == yr-cnt) ), 'pop'].values[0]
            break
        
        except: # If also null try for even further back
        
            pop0 = -1

        if(yr-cnt <= 2005): #If we go out of our data range, give up. 

            break

        cnt += 1

    if(pop0 < 0): #If there was no data from behind to use for impuatation return -1 value to indicate nothing was usable for imputation.

        return pop0 
        
    else: #If we were able to pull data, then calculate new population

        pop = pop0 #Set population equal to the inital population we pulled

        for i in range(cnt): #For each year we went back

            rt = rateByYear[((yr-1)-i)] #Get the growth rate
            ntMig = df.loc[ ( (df['county_fips'] == cFip) & (df['year'] == ((yr-1)-i)) ), 'net_migration'].values[0] #Get the net migration for that year

            pop = (pop + (pop * rt)) + ntMig #Population is equal to current population plus it's natural increase and net migration. Calculate next year's population

        return pop

def imputeForward(cFip,yr):

    cnt = 1
    pop0 = -1 #Set nonsensical initial value

    #Get last entry with a population
    while(pop0 < 0):
        
        try: # Try to fetch previous year's population

            pop0 = df.loc[ ( (df['county_fips'] == cFip) & (df['year'] == yr+cnt) ), 'pop'].values[0]
            break

        except: # If also null try for even further back
        
            pop0 = -1

        if(yr+cnt >= 2019): #If we go out of our data range, give up. 

            break

        cnt += 1

    if(pop0 < 0): #If there was no data from behind to use for impuatation return -1 value to indicate nothing was usable for imputation.

        return pop0 
        
    else: #If we were able to pull data, then calculate new population

        pop = pop0 #Set population equal to the inital population we pulled

        for i in range(cnt): #For each year we went forward

            rt = rateByYear[((yr)+i)] #Get the growth rate
            ntMig = df.loc[ ( (df['county_fips'] == cFip) & (df['year'] == ((yr)+i)) ), 'net_migration'].values[0] #Get the net migration for that year
 
            pop = (pop * (1/(1+rt))) - ntMig #Population is scaled by inverse of growth rate from previous year and net migration is negated to give last year's projection

        return pop

<h1> Integrating Data </h1>

In [14]:
mig = pd.read_parquet('/Users/gpoulsen@apamail.org/Desktop/Grant Training/Urban-Growth/Project/Integration/Extracted Data/Migration.gzip') #Load
hpi = pd.read_parquet('/Users/gpoulsen@apamail.org/Desktop/Grant Training/Urban-Growth/Project/Integration/Extracted Data/Housing Index.gzip')
pop = pd.read_parquet('/Users/gpoulsen@apamail.org/Desktop/Grant Training/Urban-Growth/Project/Integration/Extracted Data/Population.gzip')

hpi = hpi.rename(columns={'yr':'year'}) #Rename columns & split county_fips
hpi['state'] = hpi['county_fips'].str[0:2]
hpi['county'] = hpi['county_fips'].str[2:5]

fms = [mig,hpi,pop]

for df in fms:

    df = format(df)

mig, hpi, pop = fms #Standardize syntax

df = mig.merge(hpi,'left',['state','county','year']) #Merge on HPI
df = df.merge(pop,'left',['state','county','year']) #Merge on Population

rows0 = len(df[['county_fips','year']])
regions0 = df['county_fips'].nunique()

<h1> Some Cleaning & Meta-Analysis </h1>

In [15]:
df = df[df['city'].notnull()] #Drop records missing HPI data

recordsCheck(df,rows0,regions0) #Call printing function

#Casting to proper dtypes
df['net_migration'].astype(int)
df['place_id'] = df['place_id'].astype(int) 
df['county_fips'] = df['county_fips'].astype(int)
df['pop'] = df['pop'].fillna(-1).astype(int)

Percentage of rows retained: 0.12807777660361408
Percentage of regions retained: 1.0


<h3> Imputing Populations Using National Growth Rates & Migration </h3>

In [16]:
for idx, row in df[df['pop'] < 0][['county_fips','year']].iterrows():

    #FETCH DATA

    cFip, yr = row
    pop = df.loc[idx,'pop']

    #IMPUTE

    pop = imputeBack(cFip,yr)

    if(pop < 0):
        pop = imputeForward(cFip,yr)

    df.loc[idx,'pop'] = pop


df = df[df['pop'] >= 0] #Drop 
df['pop'] = df['pop'].astype(int)

recordsCheck(df,rows0,regions0) #Call printing function

df['name'] = df['name'].fillna('')

Percentage of rows retained: 0.1237028426503223
Percentage of regions retained: 0.9678217821782178


In [17]:
df = df[['state','county','county_fips','city','year','net_migration','pop','index_nsa']]
df['net_migration_percentage'] = df['net_migration'] / df['pop']

df.to_csv('/Users/gpoulsen@apamail.org/Desktop/Grant Training/Urban-Growth/Project/Analysis/MASTER_TABLE.csv',index=0)

In [18]:
df

Unnamed: 0,state,county,county_fips,city,year,net_migration,pop,index_nsa,net_migration_percentage
22,1,3,1003,Daphne,2005,3830,160354,181.5025,0.023885
23,1,3,1003,Daphne,2006,2433,169162,212.5825,0.014383
24,1,3,1003,Daphne,2007,2119,171769,217.1225,0.012336
25,1,3,1003,Daphne,2008,2431,174439,208.8650,0.013936
26,1,3,1003,Daphne,2009,2808,179878,192.8200,0.015611
...,...,...,...,...,...,...,...,...,...
47161,56,25,56025,Casper,2015,-1404,82178,277.9925,-0.017085
47162,56,25,56025,Casper,2016,-1979,81039,273.0200,-0.024420
47163,56,25,56025,Casper,2017,-380,79547,270.0375,-0.004777
47164,56,25,56025,Casper,2018,719,79115,273.2475,0.009088


In [19]:
ag = df.groupby('county_fips')['net_migration_percentage'].mean().to_frame().reset_index()
ag = ag.rename(columns={'net_migration_percentage':'avg_net_migration_rate'})
ag['percentile'] = ag['avg_net_migration_rate'].rank(pct=True) * 100
ag.to_csv('/Users/gpoulsen@apamail.org/Desktop/Grant Training/Urban-Growth/Project/Analysis/POPULATION_TABLE.csv',index=0)