In [None]:
import pandas as pd
import seaborn as sns
import fbprophet 
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
def to_numeric_but(dataframe,save_these_columns='none',e='coerce'):
    '''
    convert the columns of a dataframe to numeric dtypes, optionally
    leaving the leading columns untouched
        >> the frame is split by position into 2 pieces
            > kept columns | columns to convert
        >> and re-joined side by side after the conversion
    
    inputs:
        >> dataframe
            > dataframe to shift to numeric (never modified; a copy is edited)
        >> save_these_columns
            > number of leading columns to keep exactly as they are
                >> single number, not a range
                >> a negative number n keeps everything except the last |n| columns
            > 'none' or None keeps no columns (every column is converted)
        >> e
            > forwarded to pd.to_numeric as errors=e
                >> 'coerce' turns unparseable values into NaN
                >> 'raise' raises on the first unparseable value
                >> 'ignore' is deprecated in recent pandas releases
    output:
        >> new pd.DataFrame, original column order, made of
            > og columns you chose to save
            > remaining columns converted to numeric
    '''
    # work on a copy so the caller's frame is never touched
    frame = dataframe.copy()

    # both the legacy 'none' sentinel and None mean "convert everything";
    # None used to fall through to the split branch, where [:None] / [None:]
    # both select every column and the result came back with each column twice
    convert_all = save_these_columns is None or (
        isinstance(save_these_columns, str) and save_these_columns == 'none'
    )
    if convert_all:
        return frame.apply(pd.to_numeric, errors=e)

    # split by position (iloc) so duplicated column names cannot pull
    # extra columns into either piece
    kept = frame.iloc[:, :save_these_columns]
    converted = frame.iloc[:, save_these_columns:].apply(pd.to_numeric, errors=e)

    # re-join (kept | converted)
    return pd.concat([kept, converted], axis=1)


def geography_to_zipcode_ids_to_numeric(dataframe):
    '''
    return a copy of the dataframe whose three identifier columns are ints
        >> .Geography 
            > 'ZCTA5 00601' -> 601 (int of the last 5 characters)
        >> .Id 
            > '8600000US00601' -> 860000000601 (int of the text with 'US' removed)
        >> .Id2 
            > '00601' -> 601 (int of the value)
    every other column is carried over unchanged; the input is not modified
    '''
    # independent copy of the input
    source = dataframe.copy()

    # grab the three identifier columns up front
    geography_col, id_col, id2_col = source.Geography, source.Id, source.Id2

    # zip code = trailing 5 characters of the Geography label
    zipcodes = list(map(lambda label: int(label[-5:]), geography_col))
    # Id with the 'US' marker removed
    numeric_ids = list(map(lambda raw: int(raw.replace('US', '')), id_col))
    # Id2 as a plain int
    numeric_id2 = list(map(int, id2_col))

    # assign() returns a new frame with the three columns replaced in place
    return source.assign(Geography=zipcodes, Id=numeric_ids, Id2=numeric_id2)

def kmeans_by(dataframe,n_clusters=10,converted=False,random_state=None):
    '''
    cluster the rows of a dataframe with KMeans and return each row's label

    inputs:
        >> dataframe
            > dataframe whose rows are clustered (never modified)
        >> n_clusters 
            > default = 10
            > number of clusters for KMeans
        >> converted
            > default = False
            > assumes data is not ready for KMeans 
                >> .Geography / .Id / .Id2 are converted with
                   geography_to_zipcode_ids_to_numeric
                >> every column is then coerced with to_numeric_but
                >> the (rows, columns) size of the converted frame is printed
            > if True, assumes df is ready for KMeans and uses it as is
        >> random_state
            > default = None (unseeded, same behaviour as before)
            > forwarded to KMeans; pass an int to get repeatable labels
    output:
        > pd.DataFrame with 2 rows and one column per clustered row
            >> row 0 = index labels of the clustered rows
            >> row 1 = KMeans cluster label of each row
    '''
    if converted:
        # caller says the frame is already numeric; copy so the input stays untouched
        data = dataframe.copy()
    else:
        # convert the identifier columns ('Id', 'Id2', 'Geography');
        # the helper works on its own copy of the input
        _data = geography_to_zipcode_ids_to_numeric(dataframe=dataframe)

        # coerce every column to numeric (unparseable values become NaN)
        data = to_numeric_but(save_these_columns='none', dataframe=_data)
        print(len(data),len(data.columns))

    # KMeans cannot handle NaN, so missing values are replaced with 0
    t = data.fillna(0)

    # fit on the raw value matrix
    km = KMeans(n_clusters=n_clusters, random_state=random_state)
    km.fit(t.values)

    # 2-row frame: row 0 = original index labels, row 1 = cluster labels
    return pd.DataFrame([t.index, km.labels_])

In [None]:
# County-level total population, adjusted to 2010 controls
# (NHGIS extract nhgis0001, geog2010 county time series).
county_2010 = pd.read_csv(
    '../../data/NHGIS/nhgis0001_csv/nhgis0001_ts_geog2010_county.csv'
)
# Place-level total population (NHGIS extract nhgis0002, nominal places).
# The file is decoded as ISO-8859-1 instead of pandas' default UTF-8.
pop_by_place = pd.read_csv(
    '../../data/NHGIS/nhgis0002_csv/nhgis0002_ts_nominal_place.csv',
    encoding='ISO-8859-1',
)

In [None]:
# Table 1: (CL8) Total Population
#         CL8AA:       Persons: Total
#         CL8AAL:      Lower bound: Persons: Total
#         CL8AAU:      Upper bound: Persons: Total

 
# Context Fields 
#         NHGISCODE:   NHGIS Integrated Geographic Unit Code
#         GJOIN1970:   GIS Join Match Code, 1970
#         GJOIN1980:   GIS Join Match Code, 1980
#         GJOIN1990:   GIS Join Match Code, 1990
#         GJOIN2000:   GIS Join Match Code, 2000
#         GJOIN2010:   GIS Join Match Code, 2010
#         GJOIN2012:   GIS Join Match Code, 2012
#         STATE:       NHGIS Integrated State Name
#         STATEFP:     FIPS State Code
#         STATENH:     NHGIS Integrated State Code
#         PLACE:       NHGIS Integrated Place Name
#         PLACEA:      NHGIS Integrated Place Code
#         NAME1970:    Area Name, 1970
#         NAME1980:    Area Name, 1980
#         NAME1990:    Area Name, 1990
#         NAME2000:    Area Name, 2000
#         NAME2010:    Area Name, 2010
#         NAME2012:    Area Name, 2012
 
# Table 1: (AV0) Total Population
#     Time series AA: Persons: Total
#         AV0AA1970:   1970: Persons: Total
#         AV0AA1980:   1980: Persons: Total
#         AV0AA1990:   1990: Persons: Total
#         AV0AA2000:   2000: Persons: Total
#         AV0AA2010:   2010: Persons: Total
#         AV0AA125:    2008-2012: Persons: Total
#         AV0AA125M:   Margin of error: 2008-2012: Persons: Total

In [None]:
# Row of the place with the largest 2010 population (AV0AA2010).
top_row_label = pop_by_place['AV0AA2010'].idxmax()
highest_pop_2010 = pop_by_place.loc[top_row_label]
# Print the fields at positions 22 and 23 of that row.
# NOTE(review): presumably AV0AA2010 and AV0AA125, if the CSV columns follow
# the codebook order listed above — confirm.
for field_value in highest_pop_2010.iloc[22:24]:
    print(field_value)

- ***notes***: 
    - 2010: Persons: Total != 2008-2012: Persons: Total 

In [None]:
# Row count next to the number of distinct NHGIS codes;
# equal numbers mean every place code appears exactly once.
nhgis_codes = pop_by_place['NHGISCODE']
print(len(nhgis_codes), len(nhgis_codes.unique()))

In [None]:
# A third of the row count minus one, with its multiples, printed above
# the total row count and the number of distinct DATAYEAR values.
n_county_rows = len(county_2010.DATAYEAR)
_x = (n_county_rows / 3) - 1
n_data_years = len(county_2010.DATAYEAR.unique())
print(_x, _x*2, _x*3, '\n', n_county_rows, n_data_years)

In [None]:
# Inspect the row at position 9428.
# NOTE(review): presumably the last row of county_2010 — confirm against len(county_2010).
county_2010.iloc[9428]

In [None]:
'''Understanding STATEA'''
# Distinct STATEA codes among the first 3142 rows: how many, then which ones.
first_block_states = county_2010[:3142].STATEA.unique()
print(len(first_block_states), '\n', first_block_states)

In [None]:
# Copy of the county table indexed by DATAYEAR (the DATAYEAR column itself
# is kept), shown with only the join key, year, state/county codes and the
# three CL8 population columns.
idx = county_2010.DATAYEAR
q = county_2010.copy().set_index(idx)
cl8_view_columns = [
    'GISJOIN', 'DATAYEAR', 'STATEA',
    'COUNTYA', 'CL8AA', 'CL8AAL', 'CL8AAU',
]
q[cl8_view_columns]

In [None]:
# Split county_2010 by position into three consecutive blocks, keep only the
# columns that hold numeric data, and cluster each block on its own.
#
# The first block used to stop at 3142 while the second started at 3143, so
# the row at position 3142 landed in no block. The bounds below are contiguous.
# NOTE(review): assumes the table is three stacked blocks of 3143 rows each
# (one per DATAYEAR) — confirm against len(county_2010) and DATAYEAR.
block_bounds = [(None, 3143), (3143, 6286), (6286, None)]

# Coerce every column to numeric (text becomes NaN), then drop the columns
# that ended up entirely NaN, i.e. the purely textual ones.
x, y, z = (
    county_2010[start:stop]
    .apply(pd.to_numeric, errors='coerce')
    .dropna(axis=1, how='all')
    for start, stop in block_bounds
)

# converted=True: the blocks are already numeric, so kmeans_by skips its own
# conversion step. Each result has the row labels in row 0 and the cluster
# labels in row 1.
a, b, c = (
    kmeans_by(dataframe=block, n_clusters=31, converted=True)
    for block in (x, y, z)
)

In [None]:
# Row 0 of a kmeans_by result holds the index labels of the clustered rows
a_i = a.loc[0]

In [None]:
# Cluster result for the second block: row 0 = row labels, row 1 = cluster labels
b

In [None]:
# Cluster result for the third block: row 0 = row labels, row 1 = cluster labels
c

In [None]:
# Preview the first rows of the place-level population table
pop_by_place.head()

In [None]:
# (row count, number of distinct NHGIS codes) — equal values mean the codes are unique
len(pop_by_place.NHGISCODE),len(pop_by_place.NHGISCODE.unique())

In [None]:
# NHGIS place code of every row in pop_by_place
places = pop_by_place.NHGISCODE
# Table 1: (AV0) Total Population
#     Time series AA: Persons: Total
#         AV0AA1970:   1970: Persons: Total
#         AV0AA1980:   1980: Persons: Total
#         AV0AA1990:   1990: Persons: Total
#         AV0AA2000:   2000: Persons: Total
#         AV0AA2010:   2010: Persons: Total
#         AV0AA125:    2008-2012: Persons: Total
#         AV0AA125M:   Margin of error: 2008-2012: Persons: Total
# pop_by_place.loc(places[1])
# Show the code stored under index label 1 (the frame keeps read_csv's default index)
places[1]

In [None]:
# Scan the PLACE column and print the positional row number and name of
# every place whose name contains 'San Francisco'.
# (filter pattern kept for reference: df.loc[df['column_name'] == some_value],
#  e.g. pop_by_place.loc[pop_by_place.STATE=='Arkansas'])
for row_number, place_name in enumerate(pop_by_place.PLACE):
    if 'San Francisco' not in place_name:
        continue
    print(row_number, place_name)

In [None]:
# Single-row frames for two places, picked by positional row number and
# trimmed to the identifying columns plus the decennial population counts.
# NOTE(review): 18672 is presumably New York City and 3334 San Francisco —
# confirm against the PLACE column.
# Other row numbers noted during the search:
#   pleasanton 3200
#   bentonville 1549
population_columns = [
    'NHGISCODE', 'PLACE', 'STATE',
    'AV0AA1970', 'AV0AA1980', 'AV0AA1990', 'AV0AA2000', 'AV0AA2010',
]
nyc = pop_by_place.iloc[[18672]][population_columns]
sfo = pop_by_place.iloc[[3334]][population_columns]
sfo
# All-places variant, kept for reference:
# testa=pop_by_place[['NHGISCODE','PLACE','STATE','AV0AA1970','AV0AA1980', 'AV0AA1990', 'AV0AA2000','AV0AA2010']]
# only keep rows with at least 5 non-NaN values (aka 2 population measurements)
# testa.dropna(thresh=5)

In [None]:
# One-row frame (index label 1) with the same column names as pop_by_place:
# each AV0AA<year> column holds its census year as text and NHGISCODE holds
# a placeholder.
year_by_column = {
    'NHGISCODE': 'x',
    'AV0AA1970': '1970',
    'AV0AA1980': '1980',
    'AV0AA1990': '1990',
    "AV0AA2000": '2000',
    "AV0AA2010": '2010',
}
testb = pd.DataFrame(index=[1], data=year_by_column)

# Parse every column as a datetime; errors='coerce' turns the unparseable
# 'x' placeholder into NaT instead of raising.
testb = testb.apply(pd.to_datetime, errors='coerce')
testb

In [None]:
# Build the two-column frame Prophet expects from the year row (testb) and
# the San Francisco row (sfo):
#   1. stack the two rows (all-places variant: pd.concat([testb,testa],axis=0))
#   2. drop every column with a missing value (non-measured years, and the
#      text columns that are empty in the year row)
#   3. discard the old row labels so the rows are numbered 0 and 1
#   4. transpose, so each remaining column becomes a row
#   5. rename column 0 -> 'ds' (dates) and column 1 -> 'y' (population)
G01000124 = (
    pd.concat([testb, sfo], axis=0)
    .dropna(axis=1)
    .reset_index(drop=True)
    .T
    .rename(columns={0: 'ds', 1: 'y'})
)
G01000124

In [None]:
"""
e.g. 1-measure city
ValueError: Dataframe has less than 2 non-NaN rows."""
# Make the prophet model and fit on the data
# (changepoint_prior_scale=0.15 is handed to the Prophet constructor)
gm_prophet = fbprophet.Prophet(changepoint_prior_scale=0.15)
# G01000124 is the frame with the 'ds' and 'y' columns built in the previous cell
gm_prophet.fit(G01000124)

In [None]:
# Ask the fitted model for a frame of dates extended by 10 yearly periods
# (periods=10, freq='Y'), then predict over that frame.
future_years = gm_prophet.make_future_dataframe(periods=10, freq='Y')
gm_forecast = gm_prophet.predict(future_years)

In [None]:
# Display the prediction frame returned by gm_prophet.predict
gm_forecast

In [None]:
# Make the prophet model 
gm_prophet = fbprophet.Prophet(changepoint_prior_scale=0.15)

# fit model on our data ('ds' / 'y' frame built from the San Francisco row)
gm_prophet.fit(G01000124)

# Make a future dataframe extended by 10 yearly periods
gm_forecast = gm_prophet.make_future_dataframe(periods=10, freq='Y')

# Make predictions
gm_forecast = gm_prophet.predict(gm_forecast)

# identify change points of the model fitted above
# (this used to read from `tesla_prophet`, a name that is never defined in
#  this notebook, so the cell stopped here with a NameError)
city_changepoints = [str(date) for date in gm_prophet.changepoints]

gm_prophet.plot(gm_forecast, xlabel = 'Year', ylabel = 'Population')
plt.title('Population of San Francisco');

In [None]:
# This cell used to re-plot gm_forecast, which was fitted on the `sfo` row,
# under a "New York City" title. Build and fit a separate model on the `nyc`
# row so the figure matches its title.
# NOTE(review): `nyc` is presumably the New York City row — confirm against PLACE.

# 'ds' / 'y' frame for nyc: stack the year row on the nyc row, drop columns
# with missing values, renumber the rows 0 and 1, transpose, rename.
nyc_series = (
    pd.concat([testb, nyc], axis=0)
    .dropna(axis=1)
    .reset_index(drop=True)
    .T
    .rename(columns={0: 'ds', 1: 'y'})
)

# Same model settings and 10-yearly-period horizon as the San Francisco forecast
nyc_prophet = fbprophet.Prophet(changepoint_prior_scale=0.15)
nyc_prophet.fit(nyc_series)
nyc_forecast = nyc_prophet.predict(
    nyc_prophet.make_future_dataframe(periods=10, freq='Y')
)

nyc_prophet.plot(nyc_forecast, xlabel = 'Year', ylabel = 'Population')

plt.title('Population of New York City');