# Cumulative R2

- Use the LASSO results to compute the cumulative R2.


In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import networkx as nx
import matplotlib.pyplot as plt
import pickle
import copy
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from sklearn.linear_model import LinearRegression

# regression tools
import statsmodels.api as sm
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler

In [3]:
# socio-economic tables, one entry per city (accessed below as socioecon_shp_dic[city])
with open("../../data/02_intermediate/socioecon_boston_miami_chicago_nyc_ct_shp_dic.pickle", 'rb') as socioecon_file:
    socioecon_shp_dic = pickle.load(socioecon_file)

# heterogeneous home-activity graphs, unweighted version
# (accessed below as A_home_unweighted_dic[city][threshold][activity])
with open("../../data/03_processed/A_home_activity_three_cities_unweighted_dic.pickle", 'rb') as unweighted_file:
    A_home_unweighted_dic = pickle.load(unweighted_file)

# weighted version of the same graphs
# NOTE(review): presumably the same nesting as the unweighted dict -- confirm
with open("../../data/03_processed/A_home_activity_three_cities_weighted_dic.pickle", 'rb') as weighted_file:
    A_home_weighted_dic = pickle.load(weighted_file)


In [4]:
# LASSO coefficients written by model_01_lasso_three_cities;
# accessed below as lasso_coeff_dic[city][output][model_name]
with open('../../data/05_model_outputs/lasso_coefficients.pickle', 'rb') as lasso_file:
    lasso_coeff_dic = pickle.load(lasso_file)


In [5]:
# Inspect the top-level keys of the LASSO output (city names plus 'total' in the recorded output).
lasso_coeff_dic.keys()

dict_keys(['boston', 'chicago', 'miami', 'total', 'nyc'])

In [6]:
# Peek at one coefficient table: Boston, median household income, under the
# 'lasso (no socio-demographics)' specification. Rows are variable names and the
# single 'value' column holds the signed coefficient.
lasso_coeff_dic['boston']['inc_median_household_2018']['lasso (no socio-demographics)']

Unnamed: 0,value
Latin American,-0.052719
Caribbean,-0.047442
Brazilian,-0.033213
Fried Chicken,-0.0261
Laundromat,-0.020124
Food Stand,0.022041
Tennis Court,0.022838
Fishing Store,0.024188
Football,0.024686
Science Museum,0.027285


In [9]:
# 
# Attach one column per activity type to each city's socio-economic table.
# Each column is the row-sum of that activity's unweighted home-activity matrix
# at the chosen threshold, aligned to the table on the index (left join, so every
# row of the socio-economic table is kept).
df_socioecon_shp_dic = {}
threshold = 1.0
for city in ['boston', 'miami', 'chicago']:
    print(city)
    df_socioecon_shp = socioecon_shp_dic[city]
    activity_graphs = A_home_unweighted_dic[city][threshold]
    for activity in activity_graphs.keys():
        activity_vector = pd.DataFrame(activity_graphs[activity].sum(axis = 1), columns = [activity])
        df_socioecon_shp = df_socioecon_shp.merge(activity_vector,
                                                  left_index = True,
                                                  right_index = True,
                                                  how = 'left')
    # Reassign rather than fillna(inplace=True): if a city has no activities the
    # name still points at socioecon_shp_dic[city], and an in-place fill would
    # silently modify the loaded input. Reassignment also keeps the cell re-runnable.
    # NOTE(review): this fills NaN in *every* column, so missing socio-economic
    # values (including the regression targets) also become 0.0 -- confirm intended.
    df_socioecon_shp = df_socioecon_shp.fillna(0.0)
    df_socioecon_shp_dic[city] = df_socioecon_shp
    

boston
miami
chicago


In [10]:
# start with an empty container; filled as cum_r2_dic[city][output] -> list of R2 values
cum_r2_dic = dict()

In [11]:
# Compute the cumulative R2 for 3*3 scenarios (three cities x three outputs).
# For each scenario the LASSO variables are ranked by absolute coefficient and
# added one at a time to a linear regression; the R2 after each addition is stored.
# TBD: separate the training/testing (the R2 below is scored on the fitting data).

max_n_inputs = 15  # at most this many top-ranked variables are added per scenario

for city in ['boston', 'chicago', 'miami']:
    cum_r2_dic[city] = {}
    df_socioecon_shp = df_socioecon_shp_dic[city]  # same table for every output
    for output in ['inc_median_household_2018', 'property_value_median_2018', 'rent_median_2018']:

        # Rank variables by |coefficient|. assign() returns a new frame, so the
        # signed coefficients stored in lasso_coeff_dic are left untouched.
        coeff_ = lasso_coeff_dic[city][output]['lasso (no socio-demographics)']
        coeff_ = coeff_.assign(value = coeff_['value'].abs())
        coeff_ = coeff_.sort_values('value', ascending = False)

        # Slicing caps the list at the number of available variables, so a
        # scenario with fewer than max_n_inputs coefficients no longer raises.
        input_list = list(coeff_.index[:max_n_inputs])

        y = df_socioecon_shp[output].values.reshape(-1, 1)

        reg_score_list = []
        for n_inputs in range(1, len(input_list) + 1):
            # LinearRegression fits its own intercept, so no constant column is needed.
            X = df_socioecon_shp[input_list[:n_inputs]]
            reg = LinearRegression().fit(X, y)
            reg_score_list.append(reg.score(X, y))

        cum_r2_dic[city][output] = reg_score_list


In [12]:
# Display the cumulative R2 lists for the nine city x output combinations;
# each list holds one R2 per number of variables included, in ranking order.
cum_r2_dic

{'boston': {'inc_median_household_2018': [0.14211897891010428,
   0.257154695169051,
   0.3503440606828798,
   0.3848676906949061,
   0.4181052930021547,
   0.43145946089114684,
   0.44237610036055386,
   0.45178515925614837,
   0.4632689046217142,
   0.4658832829644375,
   0.4735215832182694,
   0.48243496292415144,
   0.4925380188316869,
   0.4987029368003777,
   0.5035599415992749],
  'property_value_median_2018': [0.2580737197645969,
   0.32214736715826475,
   0.34181020824162633,
   0.4619978649705223,
   0.4835786008141826,
   0.49326140022362264,
   0.49842493458113935,
   0.5169637386674317,
   0.5195331711464424,
   0.5308909453522215,
   0.5328029118428035,
   0.5426243559861801,
   0.5467738753923446,
   0.5499253285880943,
   0.5523169018132092],
  'rent_median_2018': [0.11922834532498705,
   0.15203524449305394,
   0.16440286776656055,
   0.1695890034853429,
   0.17997309627659286,
   0.19307851718035307,
   0.20767370342770175,
   0.2263052560706158,
   0.2457539458445852

# Save

In [31]:
# write the nested {city: {output: [R2, ...]}} results to the model-outputs folder
results_path = '../../data/05_model_outputs/performance_cumulative_mobility_networks.pickle'
with open(results_path, 'wb') as results_file:
    pickle.dump(cum_r2_dic, results_file)