# Turn the DataFrames into attribute, adjacency, and label matrices

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import networkx as nx
import matplotlib.pyplot as plt
import pickle
import copy
from scipy.sparse import csr_matrix


In [2]:
# read files.
def _load_pickle(path):
    """Return the object unpickled from the file at `path`."""
    # NOTE: unpickling can run arbitrary code; only use on trusted project files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# stays table; later cells use its `user` and `GEOID` columns.
df = _load_pickle('../../data/02_intermediate/boston_stays.pickle')

# annual growth table; the label matrix Y is selected from it.
df_growth_16_18 = _load_pickle("../../data/02_intermediate/boston_annual_growth_2016_2018_3391cbg.pickle")

# 2016 socioeconomic table; the attribute matrix X is selected from it.
df_socio_2016 = _load_pickle("../../data/02_intermediate/boston_socioecon_2016_3391cbg.pickle")


In [3]:
# find overlapping GEO IDs.
# use only the GEOID (not GEOID_home) from the df.
# Keep the ids present both in the growth table's index and in the stays' GEOID column.
# sorted() makes the resulting list order deterministic instead of depending on
# set iteration order; the frames built from it are sorted by index afterwards anyway.
overlapping_geoid = sorted(set(df_growth_16_18.index).intersection(np.unique(df.GEOID)))
print(len(overlapping_geoid))

3102


# Place Graph (X, A, Y)

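Predict current growth: the labels Y are the 2016–2018 annual growth rates, predicted from the 2016 socioeconomic attributes X over the place graph A.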

# X

In [4]:
# Socioeconomic columns used as node attributes (the columns of X).
var_list = [
    'inc_per_capita',
    'property_value_median',
    'pop_total',
    'households',
    'race_white_ratio',
    'race_black_ratio',
    'age_median',
    'travel_driving_ratio',
    'edu_bachelor_ratio',
]

In [5]:
# Attribute matrix: rows are the overlapping GEOIDs, columns are the variables in var_list.
X = df_socio_2016.loc[overlapping_geoid, var_list]

In [6]:
# Sort rows by GEOID; people_place_m and Y are sorted the same way further down.
X = X.sort_index()

In [7]:
# Preview the attribute matrix.
X

Unnamed: 0_level_0,inc_per_capita,property_value_median,pop_total,households,race_white_ratio,race_black_ratio,age_median,travel_driving_ratio,edu_bachelor_ratio
full_bg_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
250092011001,46400.0,521300.0,544.0,259.0,1.000000,0.000000,52.8,0.728395,0.239669
250092021011,54513.0,464100.0,721.0,248.0,0.970874,0.000000,47.4,0.737931,0.334669
250092021012,48486.0,461900.0,518.0,202.0,0.967181,0.000000,39.9,0.836538,0.413408
250092021013,43408.0,391000.0,805.0,288.0,0.822360,0.045963,35.4,0.761261,0.250000
250092021021,35731.0,403800.0,1181.0,402.0,0.957663,0.029636,33.8,0.902357,0.204301
...,...,...,...,...,...,...,...,...,...
330170870001,25345.0,218500.0,1479.0,549.0,1.000000,0.000000,33.2,0.926868,0.100338
330170870002,24643.0,158700.0,1612.0,630.0,0.984491,0.000000,38.5,0.869505,0.127907
330170870003,28067.0,169300.0,1657.0,597.0,1.000000,0.000000,35.9,0.896261,0.098936
330170870004,20110.0,93200.0,1087.0,561.0,1.000000,0.000000,54.6,1.000000,0.179310


# A

- A: unweighted adjacency matrix.
- A_: weighted adjacency matrix.


In [8]:
# create the people place matrix.
# Count rows per (user, GEOID) pair; .count() yields one count per remaining column
# and .iloc[:, 0] keeps the first of them (non-null entries of that column).
people_place_vector = df.groupby(['user','GEOID']).count().iloc[:, 0]
# Pivot users into columns (rows = GEOID). Pairs that never occur come out as NaN
# and are replaced by 0. fillna is used instead of writing through `.values`,
# which is not guaranteed to be a writable view of the frame (it is read-only
# under pandas Copy-on-Write). The result stays float, as before.
people_place_m = people_place_vector.unstack(level = 'user').fillna(0)


In [9]:
# create the subset.
# Keep only the overlapping GEOID rows, then order them by index.
# imp! the sorted index is what lines the rows up with X.
people_place_m = people_place_m.loc[overlapping_geoid, :].sort_index()


In [10]:
# Preview the place-by-user count matrix (rows: GEOID, columns: user).
people_place_m

user,0000ff45f7f170db960e4e601167975f7559c5be147d69e75460123855ab0eab,000277100d5593fec35a151e228f6a485210a3fa87cda78d79c465bbebb6e71e,0002a8f1815bcef6bc1613175df0505bd295fb79ed6ace605b933f1a165c718d,00033a3bb5f07e32dc00a69de6c8b4f73d05e5d08d14a8d2b6d796a64f3dbb54,000343483445495a56d6d02cd1e1f5cda6e577b636c21c2f5922df46c0855919,000347ca57a60068d9b77636ccd8f0612ec971a84131b4f650996f8a255766a8,0004871e72d75626632bbce83040c515345c7df37a77658c3573eb29e496eed4,0005770f50f530c11219fa5585560153c72398204b08b9e3f005ae9a83fc41ed,0005f8ebe58f73f38b9d6869a3ee527a90aaa1d9b0fa35d4fdf9cf253b9e9b11,000657595de0bee53aaa139e25d227aab5828aa8ce0536bb7ee64290ab0ba192,...,fff76bea8927709f04bc449dce9d99cc8f939ff230675df60aee364396554dc7,fffac8080bf4f87e24bc97526135f9c3159113163bb534c65f929d88915df4d6,fffb8aafca44b83c382d92d9faf24b423f7b5e39026be566ee5f3d2c1ca50a77,fffb9b9b993e364fa4cac98956312a2153e150fc8a02c6471c913d2b31fdc835,fffb9dc73dff4a4f98eeb7c5b8ad68fbcdae5b5514c491fb9f4691abd6f7b910,fffbc8a9f385f78cd57b2e21a49d32882d44298254256a77ec6663bad36c3960,fffcb2376eb209b7b739b635fcec7e6ffb61615bcb7eed272bbb87674cf3bd2e,fffcb34b6656f9884cccdc336a4359e7aeade94dba6c1de6f7b34b7f22484c80,fffd27545d8286f0ea39bbae179c6954f1f87a91f108ccd8ebab8e7860a81969,fffe58b2fb58c5940b85ba55b07eb4d1d11b36185a4afb2f64a95d49a2025d6f
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
250092011001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
250092021011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
250092021012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
250092021013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
250092021021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330170870001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
330170870002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
330170870003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
330170870004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# create the weighted adjacency matrix. (incidence matrix)
# Sparse place-by-user counts multiplied by their transpose give a place-by-place
# matrix: entry (i, j) sums, over users, the product of that user's counts at i and j.
people_place_sparse = csr_matrix(people_place_m)
A_ = people_place_sparse.dot(people_place_sparse.transpose())


In [12]:
A_ # only ~45% of the entries are zero (5,290,846 of 3102 x 3102 are stored). Very dense

<3102x3102 sparse matrix of type '<class 'numpy.float64'>'
	with 5290846 stored elements in Compressed Sparse Row format>

In [13]:
# create the unweighted adjacency matrix.
# Take coordinates from the COO view of A_, whose row/col arrays are aligned
# one-to-one with its stored data. A_.nonzero() silently drops explicitly stored
# zeros, so positions computed from A_.data could point at the wrong coordinates.
A_coo = A_.tocoo()
nnz_inds = (A_coo.row, A_coo.col)
keep = np.where(A_coo.data > 0.0)[0]  # positions of strictly positive weights
n_keep = len(keep)
# (data, (row, col)) constructor: a 1 wherever A_ has a positive weight.
A = csr_matrix((np.ones(n_keep), (nnz_inds[0][keep], nnz_inds[1][keep])), shape = A_.shape)


In [14]:
# Dense view of the unweighted adjacency matrix, for inspection only.
A.toarray()

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 1.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 1., 0., ..., 1., 1., 1.]])

In [15]:
# Dense view of the weighted adjacency matrix, for inspection only.
A_.toarray()

array([[6.310190e+05, 8.487000e+03, 4.370000e+02, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00],
       [8.487000e+03, 1.252828e+06, 8.114000e+03, ..., 0.000000e+00,
        0.000000e+00, 1.000000e+00],
       [4.370000e+02, 8.114000e+03, 4.495000e+04, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00],
       ...,
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 1.666370e+06,
        1.263800e+04, 4.760000e+02],
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 1.263800e+04,
        2.744050e+05, 1.820000e+02],
       [0.000000e+00, 1.000000e+00, 0.000000e+00, ..., 4.760000e+02,
        1.820000e+02, 1.320890e+05]])

# Y

In [16]:
# List the growth columns available to choose labels from.
df_growth_16_18.columns

Index(['inc_per_capita_annual_growth', 'pop_total_annual_growth',
       'property_value_median_annual_growth', 'race_black_ratio_annual_growth',
       'race_white_ratio_annual_growth', 'vacancy_ratio_annual_growth'],
      dtype='object')

In [17]:
# Growth columns used as labels (the columns of Y).
# Note: this rebinds var_list, which earlier held the attribute columns for X.
var_list = [
    'inc_per_capita_annual_growth',
    'pop_total_annual_growth',
    'property_value_median_annual_growth',
]

# Label matrix: rows are the overlapping GEOIDs, columns are the growth variables above.
Y = df_growth_16_18.loc[overlapping_geoid, var_list]

In [18]:
# Sort rows by GEOID, the same ordering applied to X and people_place_m.
Y = Y.sort_index()

In [19]:
# Preview the label matrix.
Y

Unnamed: 0_level_0,inc_per_capita_annual_growth,pop_total_annual_growth,property_value_median_annual_growth
full_bg_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
250092011001,0.105,-0.094,0.062
250092021011,0.101,0.001,0.050
250092021012,0.078,0.013,0.028
250092021013,0.005,0.034,0.027
250092021021,0.125,-0.037,0.055
...,...,...,...
330170870001,0.094,-0.031,0.020
330170870002,-0.018,0.037,0.044
330170870003,0.161,-0.094,0.107
330170870004,0.084,0.084,0.030


In [20]:
# Save X, A, Y
# Destination path -> object; written in the listed order.
place_graph_outputs = {
    "../../data/03_processed/place_graph_X.pickle": X,
    "../../data/03_processed/place_graph_A.pickle": A,
    "../../data/03_processed/place_graph_weighted_A.pickle": A_,
    "../../data/03_processed/place_graph_Y.pickle": Y,
}

for out_path, out_obj in place_graph_outputs.items():
    with open(out_path, 'wb') as f:
        pickle.dump(out_obj, f)
