In [1]:
import pandas as pd
import seaborn as sns
import fbprophet 
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# NHGIS nominal time-series extract: total population for each place
pop_by_place = pd.read_csv(
    '../../data/NHGIS/nhgis0002_csv/nhgis0002_ts_nominal_place.csv',
    encoding='ISO-8859-1',
)

In [3]:
# column inventory: dtypes and non-null counts for every column of the raw table
pop_by_place.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31436 entries, 0 to 31435
Data columns (total 25 columns):
NHGISCODE    31436 non-null object
GJOIN1970    20449 non-null object
GJOIN1980    22529 non-null object
GJOIN1990    23435 non-null object
GJOIN2000    25150 non-null object
GJOIN2010    29514 non-null object
GJOIN2012    29509 non-null object
STATE        31436 non-null object
STATEFP      31436 non-null int64
STATENH      31436 non-null int64
PLACE        31436 non-null object
PLACEA       31436 non-null int64
NAME1970     20449 non-null object
NAME1980     22529 non-null object
NAME1990     23435 non-null object
NAME2000     25150 non-null object
NAME2010     29514 non-null object
NAME2012     29509 non-null object
AV0AA1970    6435 non-null float64
AV0AA1980    22529 non-null float64
AV0AA1990    23435 non-null float64
AV0AA2000    25150 non-null float64
AV0AA2010    29514 non-null float64
AV0AA125     29509 non-null float64
AV0AA125M    29509 non-null float64
dtypes: float

In [2]:
# positional row 18672 is New York city (NHGISCODE G36051000) in this extract;
# the double brackets keep the result a one-row DataFrame rather than a Series
# NOTE(review): the position is specific to this CSV -- re-check it if the extract changes
nyc=pop_by_place.iloc[[18672]]
nyc

Unnamed: 0,NHGISCODE,GJOIN1970,GJOIN1980,GJOIN1990,GJOIN2000,GJOIN2010,GJOIN2012,STATE,STATEFP,STATENH,...,NAME2000,NAME2010,NAME2012,AV0AA1970,AV0AA1980,AV0AA1990,AV0AA2000,AV0AA2010,AV0AA125,AV0AA125M
18672,G36051000,G3602505,G3602505,G36051000,G36051000,G36051000,G36051000,New York,36,360,...,New York city,New York city,"New York city, New York",7894862.0,7071639.0,7322564.0,8008278.0,8175133.0,8199221.0,0.0


In [9]:
# independent copy holding only the place id and the population counts used for forecasting
# NOTE(review): AV0AA1990 exists in the raw table but is not selected here -- confirm that is intended
pop_by_place_ = pop_by_place.loc[
    :, ['NHGISCODE','AV0AA1970','AV0AA1980','AV0AA2000','AV0AA2010']
].copy()
pop_by_place_

Unnamed: 0,NHGISCODE,AV0AA1970,AV0AA1980,AV0AA2000,AV0AA2010
0,G01000100,,,,192.0
1,G01000124,2996.0,3155.0,2987.0,2688.0
2,G01000460,,2498.0,4965.0,4522.0
3,G01000484,,746.0,723.0,758.0
4,G01000676,,604.0,521.0,356.0
5,G01000820,2642.0,7079.0,22619.0,30352.0
6,G01000988,9963.0,12039.0,17247.0,21160.0
7,G01001132,12358.0,13807.0,15008.0,14875.0
8,G01001180,,,3692.0,3917.0
9,G01001228,2851.0,3207.0,2567.0,2486.0


- ***notes***:
    - drop places with fewer than 2 population measurements (a trend cannot be fit from a single data point)

In [None]:
def population_by_place(years=20,n_places=1000,changepoint_prior=0.15):
    """
    Forecast total population for individual places with one Prophet model per place.

    inputs)
    >> years
        > number of yearly periods to forecast past the last measurement
    >> n_places
        > number of places to forecast, taken from the top of the filtered table
            >> e.g. 100 = first 100 places ; None = every measurable place
    >> changepoint_prior
        > passed to the prophet model as changepoint_prior_scale

    steps)
    - load the NHGIS nominal place time series from CSV
    - keep NHGISCODE plus the population counts for 1970, 1980, 2000 and 2010
    - drop
        > places with fewer than 2 measurements
            > a trend cannot be fit from a single data point
    - treat the remaining missing measurements as 0
    - build a history DataFrame for each place
        > column0='ds' (measurement date) , column1='y' (population)
    - make and fit a prophet model on each

    returns)
    >> list of forecast DataFrames (output of Prophet.predict)
        > one per place, in the row order of the filtered table
        > each covers the measured dates plus {years} future periods
    """
    # total population by place
    pop_by_place = pd.read_csv('../../data/NHGIS/nhgis0002_csv/nhgis0002_ts_nominal_place.csv',encoding='ISO-8859-1')

    # NOTE(review): AV0AA1990 exists in the raw table but is not used here -- confirm that is intended
    population_columns = ['AV0AA1970','AV0AA1980','AV0AA2000','AV0AA2010']

    # place id + one population column per measured year
    unique_places = pop_by_place[['NHGISCODE'] + population_columns]

    # thresh=3 because NHGISCODE is never NaN: 3 non-null values == id + at least 2 measurements
    measureable_unique_places = unique_places.dropna(axis=0,thresh=3)
    # missing measurements become 0, i.e. the model sees a population of zero for that year
    # NOTE(review): Prophet could instead be given only the measured years -- confirm 0 is the intended encoding
    measureable_unique_places = measureable_unique_places.fillna(0)

    # slice BEFORE building per-place frames so only the requested places are materialised
    measurements = measureable_unique_places[population_columns].iloc[:n_places]

    # 'AV0AA1970' -> '1970' -> Timestamp('1970-01-01'); identical for every place, so computed once
    measurement_dates = pd.to_datetime([column[5:] for column in population_columns],format='%Y')

    # set collection of prophets
    prophet_by_place = []

    for _, place in measurements.iterrows():
        # history in the layout prophet expects: 'ds' = date, 'y' = value
        place_history = pd.DataFrame({'ds': measurement_dates, 'y': place.values})

        # a prophet model can only be fit once, so make a fresh one per place
        place_prophet = fbprophet.Prophet(changepoint_prior_scale=changepoint_prior)
        place_prophet.fit(place_history)

        # extend the measured dates by {years} yearly periods
        place_future = place_prophet.make_future_dataframe(periods=years,freq='Y')
        # predict with the model fitted for THIS place and add it to the collection
        prophet_by_place.append(place_prophet.predict(place_future))

    return prophet_by_place

In [None]:
# run with the defaults: 20-year forecast for the first 1000 places, changepoint_prior=0.15
by_place = population_by_place()

In [14]:
# places in the raw table (31436) minus places left after dropna(thresh=3) (25103)
# = places dropped for having fewer than 2 measurements
31436-25103

6333