In [None]:
import numpy as np
import fbprophet 
import pandas as pd
from math import sqrt
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
# don't do this at home
warnings.filterwarnings("ignore")

# def validate():
"""
>> takes in 
    > Census 1970-2010 dataframe (1 df)
        >> total population by Place measurements
    > American Community Survey (ACS) 2011-2017 dataframes (7 dfs)
        >> total population (age & sex) by Place 

>> forges DataFrame of places that have 
    > at least one (1) recording for Census years 1970-2010
    > at least one (1) recording for ACS years 2011-2015

>> test our model v. base on
    > random sample 100 Places
    > random sample 100 Places from bottom half population size
    > random sample 100 Places from top half population size
"""

'''load Train data'''
# Every source CSV is read with the ISO-8859-1 encoding.
def _load_census(path):
    """Read the Census time-series CSV found at `path`."""
    return pd.read_csv(path, encoding='ISO-8859-1')


def _load_acs(path):
    """Read one ACS CSV found at `path` (low_memory disabled, as for every ACS file)."""
    return pd.read_csv(path, encoding='ISO-8859-1', low_memory=False)


# --- Train data ---
# Census 1970-2010: population by Place
load_census_place = _load_census(
    '../../data/NHGIS/nhgis0002_csv/nhgis0002_ts_nominal_place.csv'
)
# ACS 2011: population by Place
load_acs_20l1 = _load_acs(
    '../../data/American_Community_Survey/ACS_11_5YR_S0101/ACS_11_5YR_S0101_with_ann.csv'
)
# ACS 2012: population by Place
load_acs_20l2 = _load_acs(
    '../../data/American_Community_Survey/ACS_12_5YR_S0101/ACS_12_5YR_S0101_with_ann.csv'
)
# ACS 2013: population by Place
load_acs_20l3 = _load_acs(
    '../../data/American_Community_Survey/ACS_13_5YR_S0101/ACS_13_5YR_S0101_with_ann.csv'
)
# ACS 2014: population by Place
load_acs_20l4 = _load_acs(
    '../../data/American_Community_Survey/ACS_14_5YR_S0101/ACS_14_5YR_S0101_with_ann.csv'
)
# ACS 2015: population by Place
load_acs_20l5 = _load_acs(
    '../../data/American_Community_Survey/ACS_15_5YR_S0101/ACS_15_5YR_S0101_with_ann.csv'
)

# --- Test data ---
# ACS 2016: population by Place
load_acs_20l6 = _load_acs(
    '../../data/American_Community_Survey/ACS_16_5YR_S0101/ACS_16_5YR_S0101_with_ann.csv'
)
# ACS 2017: population by Place
load_acs_20l7 = _load_acs(
    '../../data/American_Community_Survey/ACS_17_5YR_S0101/ACS_17_5YR_S0101_with_ann.csv'
)

'''find common places across Census and each train ACS'''
# Places listed in each Train ACS table (2011-2015).
# For 2011 the first row is skipped: the original author notes that row 0 holds
# the text 'Geography' rather than a Place.
acs11places = list(load_acs_20l1['GEO.display-label'][1:])
acs12places = list(load_acs_20l2['GEO.display-label'])
acs13places = list(load_acs_20l3['GEO.display-label'])
acs14places = list(load_acs_20l4['GEO.display-label'])
acs15places = list(load_acs_20l5['GEO.display-label'])

# Keep the 2011 Places that ALSO appear in every one of 2012-2015.
# The previous condition `place in acs12places and acs13places and ...` only
# tested membership in the 2012 list (the remaining operands were just truthy
# lists), so Places missing from 2013-2015 slipped through.
# Sets make each membership test O(1) instead of scanning ~30k-element lists.
_later_train_years = [
    set(acs12places),
    set(acs13places),
    set(acs14places),
    set(acs15places),
]
train_places = [
    place for place in acs11places
    if all(place in year_places for year_places in _later_train_years)
]

# Places listed in each Test ACS table (2016 & 2017).
# As for 2011, row 0 of the 2016 table ('Geography') is skipped.
acs16places = list(load_acs_20l6['GEO.display-label'][1:])
acs17places = list(load_acs_20l7['GEO.display-label'])

# Places present in both 2016 and 2017 (order follows the 2017 table).
_acs16_place_set = set(acs16places)
base_places = [place for place in acs17places if place in _acs16_place_set]

# Places the model can both train on {train_places} and be scored on
# {base_places}; order follows train_places.
_base_place_set = set(base_places)
pre_measureable_places = [
    place for place in train_places if place in _base_place_set
]

"""clean Census 1970-2010 df (Train)
"""
# Columns needed to build a 'GEO.display-label' value ("<place>, <state>") so
# Census rows can be paired with the ACS DataFrames.
for_geo_displays = ['PLACE', 'STATE']
to_geo_displays = load_census_place[for_geo_displays]

# Place names with the separator used in GEO.display-label
places_70_10 = [place + ', ' for place in to_geo_displays.PLACE]
# State paired with each Place (same row order)
states_70_10 = list(to_geo_displays.STATE)
# "<place>, <state>" label for every Census row
GEO_display_label = [
    place + state for place, state in zip(places_70_10, states_70_10)
]

# Census population columns, one per decade 1970-2010
place_cols_of_interest = ['AV0AA1970', 'AV0AA1980', 'AV0AA1990', 'AV0AA2000', 'AV0AA2010']
# Take an explicit copy: a column is added below, and writing into a frame that
# is still a selection of load_census_place is a chained assignment (pandas
# raises SettingWithCopyWarning, which this notebook silences globally).
pop_place_70_10_ = load_census_place[place_cols_of_interest].copy()

# attach the label column (row order is unchanged, so positions line up)
pop_place_70_10_['GEO.display-label'] = GEO_display_label

# Require at least 3 of the 5 census measurements: thresh counts non-null cells
# per row and the label column is always populated, hence thresh=4.
at_least_3_70_10_ = pop_place_70_10_.dropna(axis=0, thresh=4)
# forget Places whose 2000 measurement is 0
not_0_for_2000_ = at_least_3_70_10_.loc[at_least_3_70_10_.AV0AA2000 != 0]
# forget Places whose 2010 measurement is 0
pop_place_70_10_ = not_0_for_2000_.loc[not_0_for_2000_.AV0AA2010 != 0]

# Places that survived the Census filters
census_places = list(pop_place_70_10_['GEO.display-label'])
# Narrow the measurable Places to those with Census data as well; a set keeps
# each lookup O(1) instead of scanning the full census_places list.
_census_place_set = set(census_places)
measureable_places = [
    place for place in pre_measureable_places if place in _census_place_set
]

"""clean American Community Survey (ACS) 2011-2015 dataframes (Train)
"""
# The two ACS columns carried forward: Place label and 'HC01_EST_VC01'
columns = ['GEO.display-label', 'HC01_EST_VC01']

# Reduce each Train ACS table (2011-2015) to those two columns
acs_20l1, acs_20l2, acs_20l3, acs_20l4, acs_20l5 = (
    acs_table[columns]
    for acs_table in (
        load_acs_20l1,
        load_acs_20l2,
        load_acs_20l3,
        load_acs_20l4,
        load_acs_20l5,
    )
)


def _keep_measureable(frame):
    """Return the rows of `frame` whose Place label is in measureable_places."""
    is_measureable = frame['GEO.display-label'].isin(measureable_places)
    return frame.loc[is_measureable]


# Restrict every Train source to the measurable Places
census_place_populations = _keep_measureable(pop_place_70_10_)
acs_2011_place_populations = _keep_measureable(acs_20l1)
acs_2012_place_populations = _keep_measureable(acs_20l2)
acs_2013_place_populations = _keep_measureable(acs_20l3)
acs_2014_place_populations = _keep_measureable(acs_20l4)
acs_2015_place_populations = _keep_measureable(acs_20l5)

"""clean ACS 2016 & 2017 dataframes (Test)
    take a sample of n Places to score our model
"""
# 2016/2017 columns to measure against: Place label and 'HC01_EST_VC01'
test_col_of_i = ['GEO.display-label', 'HC01_EST_VC01']

# shrink ACS 2016 to the columns we measure against
testd_16_ = load_acs_20l6[test_col_of_i]
# ACS 2016 restricted to measureable_places
test_16_df_ = testd_16_.loc[testd_16_['GEO.display-label'].isin(measureable_places)]

# shrink ACS 2017 to the columns we measure against
testd_17_ = load_acs_20l7[test_col_of_i]
# ACS 2017 restricted to measureable_places.
# .copy() gives an independent frame: the population column is overwritten
# below, and assigning into a .loc selection is a chained assignment (pandas
# raises SettingWithCopyWarning, which this notebook silences globally).
test_17_df_ = testd_17_.loc[testd_17_['GEO.display-label'].isin(measureable_places)].copy()
# 2017 populations converted to floats
test_17_1000_pops = [float(population) for population in test_17_df_.HC01_EST_VC01]
test_17_df_['HC01_EST_VC01'] = test_17_1000_pops
# forget Places whose 2017 population is below 1,000
test_17_df_ = test_17_df_.loc[test_17_df_.HC01_EST_VC01 >= 1000]

# Draw the evaluation sample of 2,500 Places.
# NOTE(review): no random_state is passed, so every run scores a different
# sample; pass one if results need to be reproducible.
sample_one_hunnit = test_17_df_.sample(2500)
# Place labels of the sample, used to filter every other table
sample_places = list(sample_one_hunnit['GEO.display-label'])

"""adjust Train dataframes to sampled Places
"""
def _restrict_to_sample(frame):
    """Return the rows of `frame` whose Place label was drawn into the sample."""
    in_sample = frame['GEO.display-label'].isin(sample_places)
    return frame.loc[in_sample]


# Train tables narrowed to the sampled Places
_s_census_ = _restrict_to_sample(census_place_populations)
_s_acs_2011_ = _restrict_to_sample(acs_20l1)
_s_acs_2012_ = _restrict_to_sample(acs_20l2)
_s_acs_2013_ = _restrict_to_sample(acs_20l3)
_s_acs_2014_ = _restrict_to_sample(acs_20l4)
_s_acs_2015_ = _restrict_to_sample(acs_20l5)

# Test tables narrowed to the sampled Places, then indexed by Place label
# (the label column itself is kept alongside the new index)
test_16_df = _restrict_to_sample(test_16_df_)
test_16_df = test_16_df.set_index(test_16_df['GEO.display-label'])

test_17_df = _restrict_to_sample(test_17_df_)
test_17_df = test_17_df.set_index(test_17_df['GEO.display-label'])

"""forge combined Train/Test DataFrame 
    --ensure Place order remains unchanged
"""
def _indexed_by_place(frame):
    """Return a new frame indexed by the values of its 'GEO.display-label' column."""
    return frame.set_index(frame['GEO.display-label'])


def _population_as(frame, year):
    """Return the second column of `frame` as a Series named `year`."""
    return frame.iloc[:, 1].rename(year)


# Census: index by Place, keep only the decade columns, and label them by year
# (the year strings are parsed into datetimes later on)
s_census_ = _indexed_by_place(_s_census_)[
    ['AV0AA1970', 'AV0AA1980', 'AV0AA1990', 'AV0AA2000', 'AV0AA2010']
]
s_census_ = s_census_.rename(columns={
    'AV0AA1970': '1970',
    'AV0AA1980': '1980',
    'AV0AA1990': '1990',
    'AV0AA2000': '2000',
    'AV0AA2010': '2010',
})

# Train ACS: one Place-indexed population Series per year
s_acs_2011_ = _population_as(_indexed_by_place(_s_acs_2011_), '2011')
s_acs_2012_ = _population_as(_indexed_by_place(_s_acs_2012_), '2012')
s_acs_2013_ = _population_as(_indexed_by_place(_s_acs_2013_), '2013')
s_acs_2014_ = _population_as(_indexed_by_place(_s_acs_2014_), '2014')
s_acs_2015_ = _population_as(_indexed_by_place(_s_acs_2015_), '2015')

# Test ACS: already indexed by Place; keep just the population Series
test_16_df = _population_as(test_16_df, '2016')
test_17_df = _population_as(test_17_df, '2017')

# Join every year side by side on the Place index; missing measurements become
# 0 (i.e. an unmeasured population is assumed to be 0)
yearly_tables = [
    s_census_,
    s_acs_2011_,
    s_acs_2012_,
    s_acs_2013_,
    s_acs_2014_,
    s_acs_2015_,
    test_16_df,
    test_17_df,
]
combined_df = pd.concat(yearly_tables, axis=1).fillna(0)

# Years the model is fitted on
train_df = combined_df[
    ['1970', '1980', '1990', '2000', '2010', '2011', '2012', '2013', '2014', '2015']
]
# Years the model is scored on
test_df = combined_df[['2016', '2017']]

In [None]:
"""forecast 2016 and 2017 populations using model for each sample Place
"""
# full forecast table for every Place
out = []
# [2016 prediction, 2017 prediction] for every Place
train_preds = []

# One timestamp per training column. Year-only strings such as '2015' parse to
# the first day of that year (2015-01-01).
datetimes = pd.DataFrame(data=pd.to_datetime(pd.Series(data=train_df.columns)))

# fit one Prophet model per Place (row of train_df)
for i in range(len(train_df)):
    # this Place's populations, one row per training year
    df = train_df.iloc[i]
    # pair each population with its timestamp
    df = pd.concat([df.reset_index(), datetimes], axis=1)
    # Prophet expects the columns 'ds' (timestamp) and 'y' (value);
    # 'drop' is the leftover year label from reset_index
    df.columns = ['drop', 'y', 'ds']
    df = df[['ds', 'y']]

    # NOTE(review): every observation falls on Jan 1, so the yearly seasonal
    # component presumably cannot be identified from this data -- confirm that
    # yearly_seasonality=True is intended.
    place_prophet = fbprophet.Prophet(changepoint_prior_scale=0.15,
                                      daily_seasonality=False,
                                      weekly_seasonality=False,
                                      yearly_seasonality=True,
                                      n_changepoints=7)
    place_prophet.fit(df)

    # Forecast 2016-01-01 and 2017-01-01. The training points are year-START
    # timestamps, so the future frame must use 'YS' too. The previous
    # freq='Y' (year END) read the "2016" forecast at 2016-12-31, almost two
    # years after the last observation, and compared it with the 2016 actual.
    place_forecast = place_prophet.make_future_dataframe(periods=2, freq='YS')
    forecast = place_prophet.predict(place_forecast)

    # progress indicator
    if i % 25 == 0:
        print(i)
    out.append(forecast)
    train_preds.append([
        forecast.loc[forecast.ds == '2016-01-01'].yhat.values[0],
        forecast.loc[forecast.ds == '2017-01-01'].yhat.values[0]])

In [None]:
"""make Baseline predictions of 2016 and 2017 population on sample Places
"""
# [2016 prediction, 2017 prediction] for every Place
baseline_preds = []
# one baseline prediction per Place (row of train_df)
for j in range(len(train_df)):
    # this Place's populations, indexed by year label
    df = train_df.iloc[j]

    # Only the first and last ACS measurements are needed here.
    measure_11 = int(df['2011'])
    measure_15 = int(df['2015'])

    # Average yearly change across the ACS years. 2011 -> 2015 spans FOUR
    # yearly intervals; the previous divisor of 5 understated the yearly change
    # by 20%.
    change = (measure_15 - measure_11) / 4

    # extend the trend one year at a time
    p_16 = measure_15 + change
    p_17 = p_16 + change

    # diagnostic: report Places predicted to grow more than 10% in one year
    if p_16 > abs(measure_15 * 1.1):
        print(f"measure_15 = {measure_15}\np_16 = {p_16}\n")

    baseline_preds.append([p_16, p_17])

# Actual 2016 and 2017 populations for each sample Place, in train_df row order
test_16 = list(test_df['2016'])
test_17 = list(test_df['2017'])


In [None]:
"""2016
"""
# 2016 values as plain floats: model predictions, actuals, baseline predictions
train_preds_16 = [float(model_16) for model_16, _model_17 in train_preds]
test_16_ = [float(actual) for actual in test_16]
base_preds_16 = [float(base_16) for base_16, _base_17 in baseline_preds]

# root-mean-squared error of each predictor against the 2016 actuals
MODEL_rmse_exrate16 = sqrt(mean_squared_error(test_16_, train_preds_16))
BASE_rmse_exrate16 = sqrt(mean_squared_error(test_16_, base_preds_16))

# display both; results recorded by the original author:
# (32119.17325005084, 32635.422809490734)
# 91907.10440197852, 91928.04165272956 -- sample size 2,000
(MODEL_rmse_exrate16, BASE_rmse_exrate16)

In [None]:
"""2017
"""
# 2017 values as plain floats: model predictions, actuals, baseline predictions
train_preds_17 = [float(model_17) for _model_16, model_17 in train_preds]
test_17_ = [float(actual) for actual in test_17]
base_preds_17 = [float(base_17) for _base_16, base_17 in baseline_preds]

# root-mean-squared error of each predictor against the 2017 actuals
MODEL_rmse_exrate17 = sqrt(mean_squared_error(test_17_, train_preds_17))
BASE_rmse_exrate17 = sqrt(mean_squared_error(test_17_, base_preds_17))

# display both; results recorded by the original author:
# (31958.00988989089, 32717.02106434803)
# 92797.69436875812, 92834.60502207742 -- sample size 2,000
(MODEL_rmse_exrate17, BASE_rmse_exrate17)

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    Args:
        y_true: actual values (array-like of numbers).
        y_pred: predicted values, broadcastable against ``y_true``.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the observations
        whose actual value is non-zero. A percentage error is undefined for an
        actual value of 0, so those observations are skipped rather than
        turning the whole result into inf/nan. Returns ``nan`` when no
        observation has a non-zero actual value (including empty input).
    """
    y_true, y_pred = np.broadcast_arrays(
        np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    )
    # observations for which a percentage error is defined
    measurable = y_true != 0
    if not measurable.any():
        return float('nan')
    relative_errors = (y_true[measurable] - y_pred[measurable]) / y_true[measurable]
    return np.mean(np.abs(relative_errors)) * 100

In [None]:
# MAPE of the Prophet model's 2016 predictions against the 2016 actuals
mean_absolute_percentage_error(y_true=test_16_,y_pred=train_preds_16)

In [None]:
# MAPE of the baseline's 2016 predictions against the 2016 actuals
mean_absolute_percentage_error(y_true=test_16_,y_pred=base_preds_16)

In [None]:
# MAPE of the Prophet model's 2017 predictions against the 2017 actuals
mean_absolute_percentage_error(y_true=test_17_,y_pred=train_preds_17)

In [None]:
# MAPE of the baseline's 2017 predictions against the 2017 actuals
mean_absolute_percentage_error(y_true=test_17_,y_pred=base_preds_17)

In [None]:
def avg_percent_error(y_true, y_pred):
    """Return the average absolute percent error, reporting large misses.

    Every observation whose percent error exceeds 5 is printed (its position,
    then actual / predicted / percent error) to help spot outliers.

    Args:
        y_true: sequence of actual values.
        y_pred: sequence of predicted values, same length as ``y_true``.

    Returns:
        The mean of ``|actual - predicted| / actual * 100`` over all pairs.

    Raises:
        Exception: if the two sequences differ in length.
        ValueError: if the sequences are empty (no average exists).
    """
    if len(y_true) != len(y_pred):
        raise Exception(f"len(y_true) != len(y_pred)\n{len(y_true)} != {len(y_pred)}")
    if len(y_true) == 0:
        raise ValueError("cannot average percent error over zero observations")
    sum_percent_error = 0
    # pair values positionally instead of indexing both sequences by hand
    for position, (actual, predicted) in enumerate(zip(y_true, y_pred)):
        percent_error = (abs(actual - predicted) / actual) * 100
        if percent_error > 5:
            print(position)
            print(actual, predicted, percent_error)
            print('\n')
        sum_percent_error += percent_error
    # returned directly: the old local variable shadowed the function's own name
    return sum_percent_error / len(y_true)

In [None]:
'''"""basically nowhere are we predicting more than 20% change in population
    from 2015 (actual) to 2016 (model or baseline)
    so why is avg_percent_error so high?
    -- is the row order getting changed? do the model and baseline predictions keep their order while the actuals are reordered?
"""

count = 0
n=0
# go though each place in train_df
for j in range(len(train_df)):
    # extract DataFrame for that place
    df = train_df.iloc[j]
    # tag 2015 population
    measure_15 = int(df['2015'])
    # tag 2016 prediction
    model_16 = train_preds[j][0]
    if model_16 > abs(measure_15 * 1.2):
        #print(f"measure_15 = {measure_15}\np_16 = {model_16}\n")
        count+=1
    n+=1
print(count)

count = 0
nn=0
# go though each place in train_df
for j in range(len(train_df)):
    # extract DataFrame for that place
    df = train_df.iloc[j]
    # tag 2015 population
    measure_15 = int(df['2015'])
    # tag 2016 prediction
    base_16 = baseline_preds[j][0]
    if base_16 > abs(measure_15 * 1.2):
        #print(f"measure_15 = {measure_15}\np_16 = {model_16}\n")
        count+=1
    nn+=1
print(count)
print()
print(n,nn)'''
pass

In [None]:
"""import matplotlib.pyplot as plt

# high resolution 
%config InlineBackend.figure_format = 'retina'

# set fig
plt.figure(figsize=(7,4))

# scatter 2016 Predictions 
plt.scatter(x_axis_2016, y_axis_2016,alpha=1,label='Model 2016')
# scatter 2017 Predictions 
plt.scatter(x_axis_2017, y_axis_2017,alpha=0.5,label='Model 2017')
# title graph
plt.title(label='Prediction Residuals 2016 vs 2017',fontsize=20,fontname="Times New Roman Bold")
# x axis label 
plt.xlabel('Total Population of Place (Sample 5000 Places)', fontsize=13)
# y axis label
plt.ylabel('Residuals (Prediction - Actual)' , fontsize=13)
# display legend
plt.legend(loc='top right')

# convert to semilogx
plt.semilogx()
plt.xticks(rotation=35)

# display graph
plt.show()"""
pass

In [None]:
# Average each 2016 series over its OWN length; the previous divisor,
# len(sample_one_hunnit), is only correct while the sample and the evaluated
# lists happen to have the same number of rows.
avg_population_16 = int(sum(test_16_) / len(test_16_))
avg_estimate_16 = int(sum(train_preds_16) / len(train_preds_16))
avg_baseline_16 = int(sum(base_preds_16) / len(base_preds_16))
print(f"avg population for 2016: {avg_population_16}\navg  estimate  for 2016: {avg_estimate_16}\navg  baseline  for 2016: {avg_baseline_16}")

In [None]:
# Average each 2017 series over its OWN length; the previous divisor,
# len(sample_one_hunnit), is only correct while the sample and the evaluated
# lists happen to have the same number of rows.
avg_population_17 = int(sum(test_17_) / len(test_17_))
avg_estimate_17 = int(sum(train_preds_17) / len(train_preds_17))
avg_baseline_17 = int(sum(base_preds_17) / len(base_preds_17))
print(f"avg population for 2017: {avg_population_17}\navg  estimate  for 2017: {avg_estimate_17}\navg  baseline  for 2017: {avg_baseline_17}")

In [None]:
"""CHANGEPOINT PRIOR FORM 0.15 TO 0.1"""
'''forecast 2016 and 2017 populations using model for each sample Place'''
# full forecast table for every Place
out2 = []
# [2016 prediction, 2017 prediction] for every Place
train_preds2 = []
# One timestamp per training column; year-only strings such as '2015' parse to
# the first day of that year (2015-01-01).
datetimes = pd.DataFrame(data=pd.to_datetime(pd.Series(data=train_df.columns)))
# fit one Prophet model per Place (row of train_df)
for i in range(len(train_df)):
    # this Place's populations paired with their timestamps
    df = train_df.iloc[i]
    df = pd.concat([df.reset_index(), datetimes], axis=1)
    # Prophet expects 'ds' (timestamp) and 'y' (value); 'drop' is the leftover
    # year label from reset_index
    df.columns = ['drop', 'y', 'ds']
    df = df[['ds', 'y']]
    # NOTE(review): this cell's title announces a changepoint prior of 0.1, but
    # the value actually used is 10 -- confirm which one was intended.
    place_prophet = fbprophet.Prophet(changepoint_prior_scale=10)
    place_prophet.fit(df)
    # Forecast on year-START dates ('YS') so the future rows use the same
    # Jan 1 convention as the training timestamps. The previous freq='Y'
    # (year END) read the "2016" forecast at 2016-12-31, almost two years after
    # the last observation. The 21-period horizon is kept as it was.
    place_forecast = place_prophet.make_future_dataframe(periods=21, freq='YS')
    forecast = place_prophet.predict(place_forecast)
    out2.append(forecast)
    train_preds2.append([
        forecast.loc[forecast.ds == '2016-01-01'].yhat.values[0],
        forecast.loc[forecast.ds == '2017-01-01'].yhat.values[0]])

In [None]:
'''2016'''
# 2016 predictions of the train_preds2 run, as plain floats
train_preds_16_1 = [float(model_16) for model_16, _model_17 in train_preds2]

# root-mean-squared error against the 2016 actuals
_mse_16_1 = mean_squared_error(test_16_, train_preds_16_1)
MODEL_rmse_exrate16_1 = sqrt(_mse_16_1)

# display next to the baseline RMSE
(MODEL_rmse_exrate16_1, BASE_rmse_exrate16)

In [None]:
'''2017'''
# 2017 predictions of the train_preds2 run, as plain floats
train_preds_17_1 = [float(model_17) for _model_16, model_17 in train_preds2]

# root-mean-squared error against the 2017 actuals
_mse_17_1 = mean_squared_error(test_17_, train_preds_17_1)
MODEL_rmse_exrate17_1 = sqrt(_mse_17_1)

# display next to the baseline RMSE
(MODEL_rmse_exrate17_1, BASE_rmse_exrate17)

In [None]:
"""CHANGEPOINT PRIOR FORM 0.1 TO 0.05"""
'''forecast 2016 and 2017 populations using model for each sample Place'''
# full forecast table for every Place
out3 = []
# [2016 prediction, 2017 prediction] for every Place
train_preds3 = []
# One timestamp per training column; year-only strings such as '2015' parse to
# the first day of that year (2015-01-01).
datetimes = pd.DataFrame(data=pd.to_datetime(pd.Series(data=train_df.columns)))
# fit one Prophet model per Place (row of train_df)
for i in range(len(train_df)):
    # this Place's populations paired with their timestamps
    df = train_df.iloc[i]
    df = pd.concat([df.reset_index(), datetimes], axis=1)
    # Prophet expects 'ds' (timestamp) and 'y' (value); 'drop' is the leftover
    # year label from reset_index
    df.columns = ['drop', 'y', 'ds']
    df = df[['ds', 'y']]
    # changepoint prior under test in this run
    place_prophet = fbprophet.Prophet(changepoint_prior_scale=0.05)
    place_prophet.fit(df)
    # Forecast 2016-01-01 and 2017-01-01: year-START dates ('YS') match the
    # Jan 1 training timestamps. The previous freq='Y' (year END) read the
    # "2016" forecast at 2016-12-31, almost two years after the last
    # observation.
    place_forecast = place_prophet.make_future_dataframe(periods=2, freq='YS')
    forecast = place_prophet.predict(place_forecast)
    out3.append(forecast)
    train_preds3.append([
        forecast.loc[forecast.ds == '2016-01-01'].yhat.values[0],
        forecast.loc[forecast.ds == '2017-01-01'].yhat.values[0]])

In [None]:
'''2016'''
# 2016 predictions of the train_preds3 run, as plain floats
train_preds_16_2 = [float(model_16) for model_16, _model_17 in train_preds3]

# root-mean-squared error against the 2016 actuals
_mse_16_2 = mean_squared_error(test_16_, train_preds_16_2)
MODEL_rmse_exrate16_2 = sqrt(_mse_16_2)

# display next to the baseline RMSE
(MODEL_rmse_exrate16_2, BASE_rmse_exrate16)

In [None]:
'''2017'''
# 2017 predictions of the train_preds3 run, as plain floats.
# This previously read from train_preds2 (a copy-paste slip), so the score
# below silently repeated the previous experiment instead of measuring this one.
train_preds_17_2 = [float(pred[1]) for pred in train_preds3]

# RMSE of this run's 2017 predictions against the 2017 actuals
MODEL_rmse_exrate17_2 = sqrt(mean_squared_error(y_true=test_17_,y_pred=train_preds_17_2))

# display next to the baseline RMSE
MODEL_rmse_exrate17_2,BASE_rmse_exrate17

In [None]:
"""CHANGEPOINT PRIOR FORM 0.05 TO 0.01"""
'''forecast 2016 and 2017 populations using model for each sample Place'''
# full forecast table for every Place
out4 = []
# [2016 prediction, 2017 prediction] for every Place
train_preds4 = []
# One timestamp per training column; year-only strings such as '2015' parse to
# the first day of that year (2015-01-01).
datetimes = pd.DataFrame(data=pd.to_datetime(pd.Series(data=train_df.columns)))
# fit one Prophet model per Place (row of train_df)
for i in range(len(train_df)):
    # this Place's populations paired with their timestamps
    df = train_df.iloc[i]
    df = pd.concat([df.reset_index(), datetimes], axis=1)
    # Prophet expects 'ds' (timestamp) and 'y' (value); 'drop' is the leftover
    # year label from reset_index
    df.columns = ['drop', 'y', 'ds']
    df = df[['ds', 'y']]
    # changepoint prior under test in this run
    place_prophet = fbprophet.Prophet(changepoint_prior_scale=0.01)
    place_prophet.fit(df)
    # Forecast 2016-01-01 and 2017-01-01: year-START dates ('YS') match the
    # Jan 1 training timestamps. The previous freq='Y' (year END) read the
    # "2016" forecast at 2016-12-31, almost two years after the last
    # observation.
    place_forecast = place_prophet.make_future_dataframe(periods=2, freq='YS')
    forecast = place_prophet.predict(place_forecast)
    out4.append(forecast)
    train_preds4.append([
        forecast.loc[forecast.ds == '2016-01-01'].yhat.values[0],
        forecast.loc[forecast.ds == '2017-01-01'].yhat.values[0]])

In [None]:
'''2016'''
# 2016 predictions of the train_preds4 run, as plain floats
train_preds_16_3 = [float(model_16) for model_16, _model_17 in train_preds4]

# root-mean-squared error against the 2016 actuals
_mse_16_3 = mean_squared_error(test_16_, train_preds_16_3)
MODEL_rmse_exrate16_3 = sqrt(_mse_16_3)

# display next to the baseline RMSE
(MODEL_rmse_exrate16_3, BASE_rmse_exrate16)

In [None]:
'''2017'''
# 2017 predictions of the train_preds4 run, as plain floats
train_preds_17_3 = [float(model_17) for _model_16, model_17 in train_preds4]

# root-mean-squared error against the 2017 actuals
_mse_17_3 = mean_squared_error(test_17_, train_preds_17_3)
MODEL_rmse_exrate17_3 = sqrt(_mse_17_3)

# display next to the baseline RMSE
(MODEL_rmse_exrate17_3, BASE_rmse_exrate17)

In [None]:
"""make Baseline predictions of 2016 and 2017 population on sample Places"""
# [2016 prediction, 2017 prediction] for every Place
baseline_preds = []
# one weighted-trend prediction per Place (row of train_df)
for j in range(len(train_df)):
    # this Place's populations, indexed by year label
    df = train_df.iloc[j]

    # populations as ints, in chronological order
    census_counts = [int(df[year]) for year in ('1970', '1980', '1990', '2000', '2010')]
    acs_counts = [int(df[year]) for year in ('2011', '2012', '2013', '2014', '2015')]
    measure_00 = census_counts[3]
    measure_15 = acs_counts[-1]

    # average yearly change within each census decade
    # (1970-1980, 1980-1990, 1990-2000, 2000-2010)
    decade_changes = [
        (later - earlier) / 10
        for earlier, later in zip(census_counts, census_counts[1:])
    ]
    # year-over-year changes 2010-2011 through 2014-2015
    yearly_counts = census_counts[-1:] + acs_counts
    yearly_changes = [
        later - earlier
        for earlier, later in zip(yearly_counts, yearly_counts[1:])
    ]

    # mean change per source
    avg_census_change = np.mean(decade_changes)
    avg_acs_change = np.mean(yearly_changes)

    # blend the two means with weights 40/45 (census) and 5/45 (ACS)
    change = avg_census_change * (40/45) + avg_acs_change * (5/45)

    # extend the 2015 population one year at a time
    p_16 = measure_15 + change
    p_17 = p_16 + change

    print(measure_00, measure_15, p_16, p_17)

    baseline_preds.append([p_16, p_17])