# Compare the graph parameters for the activities

- Compute: n_components, n_nodes, size of the giant component, clustering coefficient, etc.
- Just run iterations. city - output - (strong, weak insig, weak sig pos, weak sig neg).
- Need to go through some temporal perspectives: seasonality, trend, breaks, etc.

Q: Should I focus on the directed vs. undirected graph comparison???

In [52]:
import numpy as np
import pandas as pd
import geopandas as gpd
import networkx as nx
import matplotlib.pyplot as plt
import pickle
import copy
from scipy.sparse import csr_matrix
import time

In [2]:
from sklearn.preprocessing import normalize

In [3]:
import sys
sys.path.append("../")
import utils

In [4]:
import importlib
importlib.reload(utils)

<module 'utils' from '../utils.py'>

In [5]:
# Load the per-city stays data and the processed home-activity adjacency
# dictionaries (unweighted and weighted).
# NOTE: pickle.load can execute arbitrary code; only open files produced by this project.
def _load_pickle(path):
    """Return the object unpickled from the file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


df_boston = _load_pickle("../../data/02_intermediate/boston_stays.pickle")
df_miami = _load_pickle("../../data/02_intermediate/miami_stays.pickle")
df_chicago = _load_pickle("../../data/02_intermediate/chicago_stays.pickle")

A_home_activity_unweighted_dic = _load_pickle(
    "../../data/03_processed/A_home_activity_three_cities_unweighted_dic.pickle"
)
A_home_activity_weighted_dic = _load_pickle(
    "../../data/03_processed/A_home_activity_three_cities_weighted_dic.pickle"
)
    

In [6]:
# Load the spatial network dictionary (Boston, Miami, Chicago) from the processed data folder.
with open("../../data/03_processed/spatial_network_boston_miami_chicago_dic.pickle", mode='rb') as pickle_file:
    spatial_network_dic = pickle.load(pickle_file)
    

In [7]:
# Load the shapefile dictionary for the three cities from the intermediate data folder.
with open("../../data/02_intermediate/boston_miami_chicago_ct_shp_dic.pickle", mode='rb') as pickle_file:
    shp_dic = pickle.load(pickle_file)


In [8]:
# Load the stored lasso coefficients from the model-output folder.
with open("../../data/05_model_outputs/lasso_coefficients.pickle", mode='rb') as pickle_file:
    lasso_coef = pickle.load(pickle_file)
    

In [9]:
# check the coefficients: display the index labels of the entry stored under the
# nested keys 'boston' / 'inc_median_household_2018' / 'lasso (no socio-demographics)'
lasso_coef['boston']['inc_median_household_2018']['lasso (no socio-demographics)'].index


Index(['Latin American', 'Caribbean', 'Brazilian', 'Fried Chicken',
       'Laundromat', 'Food Stand', 'Tennis Court', 'Fishing Store', 'Football',
       'Science Museum', 'Ski Area', 'Peking Duck', 'Cupcakes', 'Skating Rink',
       'French'],
      dtype='object')

In [10]:
# check the mobility network: display the Boston adjacency table stored under
# threshold key 1.0 for the 'ATM' activity
A_home_activity_unweighted_dic['boston'][1.0]['ATM']


GEOID,25009211402,25017357500,25025101102,25017374200,25025020301,25017316102,25017351300,25017330200,25009260900,25017374700,...,25025060501,25017333100,25017333400,25017336300,25025110301,25021443101,25021400400,25017339400,25009268200,25009221700
GEOID_home,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25009201100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25009202101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25009202102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25009202200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25009203100,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33017085000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33017086000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33017087000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33017088000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Turn the directed adjacency matrices into undirected ones

In [36]:
def turn_asymmetric_directed_to_symmetric_undirected_adj(activity_dir_adj_df):
    """Turn an asymmetric directed adjacency matrix into a symmetric undirected one.

    The row and column labels of the input may differ, so the matrix is first
    expanded to a square matrix over the sorted union of both label sets, with
    absent entries counted as 0.0. Each undirected entry (i, j) is then the
    larger of the two directed entries (i, j) and (j, i).

    Parameters
    ----------
    activity_dir_adj_df : pd.DataFrame
        Directed adjacency matrix whose index and columns are node labels.

    Returns
    -------
    pd.DataFrame
        Square, symmetric float matrix whose index and columns are both the
        sorted union of the input's row and column labels.
    """
    union_list = sorted(set(activity_dir_adj_df.columns).union(activity_dir_adj_df.index))

    # reindex pads labels missing from either axis with 0.0; fillna makes any
    # pre-existing missing cell count as "no edge" as well.
    square_df = activity_dir_adj_df.reindex(index=union_list,
                                            columns=union_list,
                                            fill_value=0.0).fillna(0.0)
    square_values = square_df.values.astype(float)

    # element-wise max of the matrix and its transpose: an edge exists if it
    # exists in either direction.
    activity_undir_adj_df = pd.DataFrame(np.maximum(square_values.T, square_values),
                                         columns=union_list,
                                         index=union_list)
    return activity_undir_adj_df

# # test
# activity_dir_adj_df = A_home_activity_unweighted_dic['boston'][1.0]['ATM']
# activity_undir_adj_df = turn_asymmetric_directed_to_symmetric_undirected_adj(activity_dir_adj_df)
# print(np.sum(activity_undir_adj_df.values.T - activity_undir_adj_df.values)) # pass the test.


0.0


In [37]:
# For every city and threshold, convert each activity's directed adjacency
# matrix into its symmetric (undirected) counterpart.
A_home_activity_unweighted_sym_adj_dic = {}

for city in ['boston','chicago','miami']:
    print(city)
    sym_adj_by_threshold = {}
    for threshold in [1.0, 10.0]:
        print(threshold)
        directed_by_activity = A_home_activity_unweighted_dic[city][threshold]
        sym_adj_by_threshold[threshold] = {
            name: turn_asymmetric_directed_to_symmetric_undirected_adj(directed_adj)
            for name, directed_adj in directed_by_activity.items()
        }
    A_home_activity_unweighted_sym_adj_dic[city] = sym_adj_by_threshold
            
        

boston
1.0
10.0
chicago
1.0
10.0
miami
1.0
10.0


In [38]:
# Persist the symmetric adjacency dictionary (the resulting file is very large).
with open('../../data/03_processed/A_home_activity_three_cities_unweighted_sym_adj_dic.pickle', mode='wb') as pickle_file:
    pickle.dump(A_home_activity_unweighted_sym_adj_dic, pickle_file)
    

In [None]:
# Reload the symmetric adjacency dictionary written by the save cell.
with open('../../data/03_processed/A_home_activity_three_cities_unweighted_sym_adj_dic.pickle', mode='rb') as pickle_file:
    A_home_activity_unweighted_sym_adj_dic = pickle.load(pickle_file)


### Compute the graph parameters for activity categories

In [48]:
# List the activity names available in the symmetric adjacency dictionary.
# NOTE(review): `city` and `threshold` are not assigned in this cell; it uses
# whatever values an earlier cell left behind.
list(A_home_activity_unweighted_sym_adj_dic[city][threshold].keys())

['ATM',
 'Academic Building',
 'Accessories',
 'Acupuncturist',
 'Administrative Building',
 'Adult Boutique',
 'Adult Education Center',
 'Advertising Agency',
 'Afghan',
 'African',
 'Airport',
 'Airport Service',
 'Alternative Healer',
 'American',
 'Amphitheater',
 'Animal Shelter',
 'Antiques',
 'Apparel',
 'Apres Ski Bar',
 'Aquarium',
 'Arcade',
 'Arepas',
 'Argentinian',
 'Art Gallery',
 'Art Museum',
 'Art Studio',
 'Arts',
 'Arts & Crafts',
 'Arts & Entertainment',
 'Asian',
 'Assisted Living',
 'Astrologer',
 'Athletics & Sports',
 'Auditorium',
 'Australian',
 'Auto Dealer',
 'Auto Garage',
 'Auto Workshop',
 'Automotive',
 'B & B',
 'BBQ',
 'Baby Store',
 'Bagels',
 'Baggage Claim',
 'Bakery',
 'Ballroom',
 'Bank',
 'Bar',
 'Baseball',
 'Baseball Field',
 'Basketball',
 'Basketball Court',
 'Bath House',
 'Bathing Area',
 'Bay',
 'Beach',
 'Beach Bar',
 'Beer Bar',
 'Beer Garden',
 'Beer Store',
 'Belgian',
 'Big Box Store',
 'Bike',
 'Bike Shop',
 'Bike Trail',
 'Billiard

In [56]:
### Compute graph-level parameters of the undirected mobility network for
### every activity in every city (threshold fixed to 1.0).
beginning_time = time.time()
mobility_network_parameters_dic = {}
threshold = 1.0

for city in ['boston','chicago','miami']:
    print(city)
    mobility_network_parameters_dic[city]={}
    # enumerate over items() instead of rebuilding list(keys()) on every iteration.
    for idx, (activity_name, adj_m) in enumerate(A_home_activity_unweighted_sym_adj_dic[city][threshold].items()):
        current_time = time.time()
        elapse_time = current_time - beginning_time
        # Pad the progress line so a shorter message fully overwrites the
        # previous one after the carriage return.
        progress_msg = "{} {} {} minutes".format(idx, activity_name, elapse_time/60.0)
        print(progress_msg.ljust(120), end = '\r')

        # init the mobility graph
        G_mobility = nx.from_pandas_adjacency(adj_m)

        # computing the parameters
        # may need to add: degree distributions, clustering coeff distributions.
        n_nodes = nx.number_of_nodes(G_mobility)
        n_edges = nx.number_of_edges(G_mobility)
        density = nx.density(G_mobility)
        # components sorted from largest to smallest; one traversal also gives their count.
        Gcc = sorted(nx.connected_components(G_mobility), key=len, reverse=True)
        n_components = len(Gcc)
        sizes_of_components = [len(l) for l in Gcc]
        if Gcc:
            size_of_giant_component = len(Gcc[0])
            G0 = G_mobility.subgraph(Gcc[0]) # get the giant component.
            # unweighted (hop-count) distances; both calls are expensive on large components.
            diameter_giant_component = nx.diameter(G0)
            ave_distance_giant_component = nx.average_shortest_path_length(G0)
        else:
            # graph without nodes: there is no giant component to measure.
            size_of_giant_component = 0
            diameter_giant_component = None
            ave_distance_giant_component = None

        # save
        mobility_network_parameters_dic[city][activity_name] = {
            'n_components': n_components,
            'n_nodes': n_nodes,
            'n_edges': n_edges,
            'density': density,
            'sizes_of_components': sizes_of_components,
            'size_of_giant_component': size_of_giant_component,
            'diameter_giant_component': diameter_giant_component,
            'ave_distance_giant_component': ave_distance_giant_component,
        }

    # terminate the '\r' progress line so the next city name starts on a fresh line.
    print()
        


boston
chicago Exhibit 60.04456735054652 minutesssnutessutess
miamioo Exhibit 427.2834915200869 minutesesnutessnutess
629 Zoo Exhibit 505.625807038943 minutessesinutesnutes

In [57]:
# Persist the computed mobility-network parameters to the model-output folder.
with open('../../data/05_model_outputs/network_property_other_parameters.pickle', mode='wb') as pickle_file:
    pickle.dump(mobility_network_parameters_dic, pickle_file)
