# Processing of the flux matrices
[Index](0-index.ipynb)

Flux matrices were computed using the SafeGraph [Social Distancing Metrics](https://docs.safegraph.com/docs/social-distancing-metrics) dataset and pooling the fluxes in $N=2^{10}$ communities as computed in [1-clustering](1-clustering.ipynb). The script used is shown [here](../code/compute_flux_matrices.py), although we do not provide the SafeGraph dataset here.

## Imports and global variables

In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# local code in ../code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import geopandas as gpd
import datetime
import scipy.stats as scs

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib.colors as mco
import matplotlib.gridspec as mgs
import matplotlib.ticker as ticker
import matplotlib.cm as cm
from matplotlib import animation
plt.rcParams['svg.fonttype'] = 'none'
from IPython.display import HTML

In [None]:
# Make the project's ../code directory importable, then import the local
# `show_image` helper used below to display the flux matrices.
sys.path.append('../code')
from functions import show_image

In [None]:
# Directory holding the HDF5 results file read below and the CSV export written
# later; stop immediately if it does not exist.
resdir = Path('../results/')
if not resdir.is_dir():
    raise ValueError('No results directory!')

In [None]:
# HDF5 store with the clustering results and the flux matrices. The compression
# settings are reused by every `pd.HDFStore` call in this notebook.
resfile = resdir / 'safegraph_analysis.hdf5'
complevel = 7
complib = 'zlib'

# Quick sanity check: report how many entries the store contains.
with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
    n_entries = len(store.keys())
    print(f"File {resfile.stem} has {n_entries} entries.")

In [None]:
# Date format of the flux-matrix key names; they are parsed with
# `datetime.strptime(pref, tfmt)` when the time series are built below.
tfmt = '%Y-%m-%d'

## Clusters

### Load clusters

In [None]:
# Read the cluster table from the results store; its length defines N, the
# size of the flux matrices.
key = "/clustering/clusters"
with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
    clusters = store.get(key)

N = len(clusters)
print(f"N = {N}")
clusters

In [None]:
# Load the table that assigns a cluster leaf to each row (indexed by CBG).
key = "/clustering/cbgs_clusters"
with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
    cbgs_labels = store[key]

# Lookup table: leaf id -> row label of that leaf in `clusters`.
# NOTE: this relies on the index of `clusters` being unnamed, so that
# `reset_index()` creates a column called 'index'.
df = clusters.copy().loc[:,'leaves'].reset_index().set_index('leaves')

# Translate every leaf into its matrix index in one vectorised lookup instead
# of a Python loop over all rows. Like the original element-wise `.at` access,
# `.loc` raises a KeyError if a leaf is missing from `clusters`.
cbgs_labels['index'] = df.loc[cbgs_labels['leaves'].to_numpy(), 'index'].to_numpy()

cbgs_labels

### Represent on a map

In [None]:
# Matrix indices and coordinates of the clusters as plain arrays for plotting.
indices = clusters.index.to_numpy()
XY = clusters[['X', 'Y']].to_numpy()
X, Y = XY[:, 0], XY[:, 1]

In [None]:
# One rainbow colour per cluster, ordered by matrix index.
cmap = cm.rainbow
norm = mco.Normalize(vmin=indices.min(), vmax=indices.max())

colors = cmap(norm(indices))

In [None]:
# Draw every cluster as a semi-transparent disc at its (X, Y) position,
# coloured by its matrix index.
fig, ax = plt.subplots(figsize=(4,3), dpi=300)
for x, y, color in zip(X, Y, colors):
    ax.add_patch(plt.Circle((x,y), 0.5, color=color, alpha=0.5, lw=0))

# Patches do not trigger autoscaling of the view, so set the limits by hand.
ax.set_xlim(np.min(X), np.max(X))
ax.set_ylim(np.min(Y), np.max(Y))
ax.set_aspect('equal')

# Bare map: no frame, no ticks.
for lab in ('left', 'right', 'bottom', 'top'):
    ax.spines[lab].set_visible(False)
ax.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)

# Colour bar linking colours to matrix indices, on its own axes to the right.
cax = fig.add_axes(rect=[0.98,0.1,0.02,0.7])
plt.colorbar(cm.ScalarMappable(norm=norm, cmap=cmap),
             cax=cax, label='Matrix index', extendfrac='auto')
plt.show()


## Process flux matrices

### List the entries

In [None]:
# List the flux matrices stored directly under '/fluxes'.
# `HDFStore.walk` yields (path, subgroups, leaves) tuples, starting with the
# group that was asked for, so only the first tuple is needed. Taking it
# explicitly avoids relying on a loop variable that is undefined when nothing
# is yielded and that would otherwise end up holding the leaves of the *last*
# walked group.
with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
    first = next(iter(store.walk('/fluxes')), None)

if first is None:
    raise ValueError("No '/fluxes' group in the results file!")
rt, grps, keys = first

# Key names are dates formatted as YYYY-MM-DD (see `tfmt`), so lexicographic
# order is chronological order. The time series built below rely on it.
prefs = sorted(keys)
if not prefs:
    raise ValueError("No flux matrices found under '/fluxes'!")
print("len(prefs) = {:d}".format(len(prefs)))

### Compute and show mean matrix

In [None]:
# Accumulate the first and second moments of the flux matrices in a single
# pass, then derive the element-wise mean and (population) variance.
N = len(clusters)
# np.float64 instead of np.float_: the alias was removed in NumPy 2.0.
fmat_mean = np.zeros((N,N), dtype=np.float64)
fmat_var = np.zeros((N,N), dtype=np.float64)

count = 0
# Open the store once for the whole loop rather than once per matrix.
with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
    for pref in prefs:
        key_f = "/fluxes/{:s}".format(pref)
        # Cast to float before squaring so integer-typed matrices cannot overflow.
        F = store[key_f].to_numpy().astype(np.float64)

        fmat_mean += F
        fmat_var += F**2
        count += 1

if count == 0:
    raise ValueError('No flux matrices to average!')

# mean = E[F], var = E[F^2] - E[F]^2
fmat_mean = fmat_mean / count
fmat_var = fmat_var / count - fmat_mean**2

In [None]:
# Display the mean flux matrix with the local `show_image` helper.
# NOTE(review): `log=True` / `mpl=True` presumably select a logarithmic colour
# scale and a matplotlib figure; confirm in ../code/functions.py.
fig = show_image(fmat_mean, downscale=None, log=True, mpl=True)

Check that $\sum\limits_{b \neq a} f_{ba} / M_a$ is small, where $M_a = f_{aa}$ is the diagonal entry of the mean flux matrix.

In [None]:
# Diagonal of the mean flux matrix and its inverse (left at 0 where the
# diagonal is 0, so those columns are ignored instead of dividing by zero).
Ms = fmat_mean.diagonal()
idx = Ms > 0.
Minvs = np.zeros(Ms.shape)
Minvs[idx] = 1./Ms[idx]

# Off-diagonal part of the mean flux matrix.
A = fmat_mean.copy()
np.fill_diagonal(A, 0.)

# A[a] = sum_{b != a} F[b, a] / F[a, a]: column sums of the off-diagonal
# fluxes relative to the diagonal entry of that column.
A = np.einsum('ji,i->i', A, Minvs)

# Root-mean-square over the valid columns. The previous version printed a mean
# square (no square root) normalised by N instead of the number of valid
# columns, which did not match the "std" label or the RMS of the next cell.
err = np.sqrt(np.mean(A[idx]**2)) if idx.any() else 0.
print("std = {:.6e}".format(err))

In [None]:
# A[a, b] = F[b, a] / F[b, b] - delta_ab: every flux relative to the diagonal
# entry of its own row in F, with the diagonal itself removed.
# Zero diagonal entries produce inf/nan here; they are expected and are zeroed
# right below, so the corresponding RuntimeWarnings are silenced.
with np.errstate(divide='ignore', invalid='ignore'):
    A = fmat_mean.T / np.diag(fmat_mean) - np.eye(N)
A[~np.isfinite(A)] = 0.

# Root-mean-square over the strictly positive relative fluxes only.
idx = A > 0.
M = np.sum(idx)
np.sqrt(np.sum(A[idx]**2/M))

$\| F - F^T\| / \|F\|$ is small so the mean flux matrix is almost symmetric.

In [None]:
# Frobenius norms of F and of its antisymmetric residual F - F^T, and their ratio.
f, err = (np.linalg.norm(m) for m in (fmat_mean, fmat_mean - fmat_mean.T))
err_rel = err/f
print(
    "||F|| = {:.6e}".format(f),
    "||F - F^T|| = {:.6e}".format(err),
    "||F - F^T||/||F|| = {:.6e}".format(err_rel),
)

We can also represent the element-wise variance of the fluxes around their mean.

In [None]:
# Display the element-wise variance of the flux matrices with the same helper
# and options as for the mean matrix.
fig = show_image(fmat_var, downscale=None, log=True, mpl=True)

Export mean flux matrix

In [None]:
# Directory for CSV exports, created on first use.
expdir = resdir / 'csv'
expdir.mkdir(exist_ok=True)

In [None]:
# Write the mean flux matrix to CSV, with rows and columns labelled by the
# cluster index.
fname = 'fluxes_mean.csv'
cluster_ids = clusters.index.to_numpy()
fmat_mean_df = pd.DataFrame(data=fmat_mean, index=cluster_ids, columns=cluster_ids)
fmat_mean_df.to_csv(expdir / fname)

### What is the noise distribution?

In [None]:
# Load every flux matrix into one array, with the list of stored matrices
# along the first axis.
mats = []
with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
    for pref in prefs:
        key_f = "/fluxes/{:s}".format(pref)
        mat = store[key_f].to_numpy()
        mats.append(mat)
# np.float64 instead of np.float_: the alias was removed in NumPy 2.0.
mats = np.array(mats, dtype=np.float64)
# A bare `mats.shape` in the middle of a cell displays nothing; print it instead.
print(f"mats.shape = {mats.shape}")

# Element-wise mean and (population) standard deviation over the first axis.
mat_mean = np.mean(mats, axis=0)
mat_std = np.std(mats, axis=0)

#### Multiplicative noise

In [None]:
# Samples of m_ij / <m_ij>, pooled over all matrices and sorted. Only entries
# that are non-zero both in the individual matrix and in the mean are kept.
idx = mat_mean > 0.
ratios = []
for mat_i in mats:
    keep = idx & (mat_i > 0.)
    ratios.append(mat_i[keep] / mat_mean[keep])
data = np.sort(np.concatenate(ratios))

In [None]:
# Histogram of ln(m_ij / mu_ij) with a fitted normal distribution on top.
nbins = 'doane'
# Fraction of the sorted samples trimmed in total (half from each tail); 0 keeps all.
q = 0.00
n = len(data)
n1 = int(0.5*q*n)
n2 = int((1. - 0.5*q)*n)
print(f"n1 = {n1}, n2 = {n2}")

# Take the logarithm once; it is needed for both the histogram and the fit.
log_data = np.log(data[n1:n2])

fig = plt.figure(figsize=(4,3), dpi=150)
ax = fig.gca()

hist, edges = np.histogram(log_data, bins=nbins, density=True)

print("nbins = ", len(edges)-1)
# Plot the density at the bin centres.
ax.plot(0.5*(edges[:-1]+edges[1:]), hist, '-o', lw=0.5, color='darkblue', ms=2)

# Normal fit, started from the sample mean and standard deviation.
m = np.mean(log_data)
s = np.std(log_data)
m,s = scs.norm.fit(log_data, loc=m, scale=s)
npts = 1000
# Dedicated names so the cluster coordinates X, Y defined earlier are not overwritten.
x_fit = np.linspace(edges[0], edges[-1], npts)
y_fit = 1./ np.sqrt(2.*np.pi*s**2)*np.exp(-0.5*(x_fit-m)**2/s**2)
# Backslashes are escaped explicitly: "\s" is an invalid escape sequence.
ax.plot(x_fit, y_fit, 'r--', lw=0.5, label="$\\mu = {:.2f}$\n$\\sigma = {:.2f}$".format(m, s))

ax.legend(loc='best', fontsize='medium', frameon=False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlabel(r"$\ln{(m_{ij}/\mu_{ij})}$", fontsize='large')
ax.set_ylabel("pdf", fontsize='medium')
fig.tight_layout()
plt.show()

#### Additive noise

In [None]:
# Samples of (m_ij - <m_ij>) / s_ij, pooled over all matrices and sorted. Only
# entries with a non-zero standard deviation and a non-zero value in the
# individual matrix are kept.
idx = mat_std > 0.
zscores = []
for mat_i in mats:
    keep = idx & (mat_i > 0.)
    zscores.append((mat_i[keep] - mat_mean[keep]) / mat_std[keep])
data = np.sort(np.concatenate(zscores))

In [None]:
# Histogram of the standardised residuals (m_ij - mu_ij) / sigma_ij.
nbins = 'doane'
# Fraction of the sorted samples trimmed in total (half from each tail); 0 keeps all.
q = 0.00
n = len(data)
n1 = int(0.5*q*n)
n2 = int((1. - 0.5*q)*n)
print(f"n1 = {n1}, n2 = {n2}")

fig = plt.figure(figsize=(4,3), dpi=150)
ax = fig.gca()

hist, edges = np.histogram(data[n1:n2], bins=nbins, density=True)

print("nbins = ", len(edges)-1)
# Plot the density at the bin centres.
ax.plot(0.5*(edges[:-1]+edges[1:]), hist, '-o', lw=0.5, color='darkblue', ms=2)

# No legend: nothing in this figure carries a label, so `ax.legend` would only
# emit a "no artists with labels" warning.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Raw string: "\s" is an invalid escape sequence in a normal string literal.
ax.set_xlabel(r"$(m_{ij} - \mu_{ij})/\sigma_{ij}$", fontsize='large')
ax.set_ylabel("pdf", fontsize='medium')
fig.tight_layout()
plt.show()

## Compute and show the total fluxes

In [None]:
# For every stored matrix compute:
#   - 'flux':          sum of the off-diagonal entries,
#   - 'mobile count':  trace (sum of the diagonal entries),
#   - 'relative flux': ratio of the two,
# and collect them in a DataFrame indexed by date.
ts = []
data = []
idump = 10  # print progress every `idump` matrices

# Open the store once for the whole loop rather than once per matrix.
with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
    for i, pref in enumerate(prefs):
        key_f = "/fluxes/{:s}".format(pref)

        if i % idump == 0:
            print(f"File {i+1} / {len(prefs)}", key_f)

        # The key name is the date of the matrix.
        t = datetime.datetime.strptime(pref, tfmt)

        if key_f not in store:
            print("skipping!")
            continue

        fmat = store[key_f].to_numpy().astype('float64')

        fsum = np.sum(fmat)
        ftrace = np.einsum('ii', fmat)
        foff = fsum - ftrace

        ts.append(t)
        data.append([foff, ftrace, float(foff)/ftrace])

ts = np.array(ts)
data = np.array(data)

df_flux_tot = pd.DataFrame(data=data, columns=['flux', 'mobile count', 'relative flux'], index=ts)

In [None]:
# Time series of the total off-diagonal flux, in millions.
fig, ax = plt.subplots(figsize=(4,3), dpi=150)

flux_millions = df_flux_tot['flux'].to_numpy()/1e6
ax.plot(df_flux_tot.index, flux_millions, 'o', ms=2)

plt.xticks(rotation=45)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.set_ylabel("total flux (M)", fontsize='medium')
fig.tight_layout()
plt.show()

### Analyze the mobile phone count

In [None]:
# Collect the diagonal of every flux matrix (one value per cluster) into a
# DataFrame with one row per date and one column per cluster.
idump = 10  # print progress every `idump` matrices
ts = []
data = []

# Open the store once for the whole loop rather than once per matrix.
with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
    for i, pref in enumerate(prefs):
        key_f = "/fluxes/{:s}".format(pref)

        if i % idump == 0:
            print(f"File {i+1} / {len(prefs)}", key_f)

        # The key name is the date of the matrix.
        t = datetime.datetime.strptime(pref, tfmt)

        if key_f not in store:
            print("skipping!")
            continue

        fmat = store[key_f].to_numpy().astype('float64')

        # `einsum('ii->i')` returns a *view* of the diagonal. Copy it so the
        # list only keeps N values per date instead of keeping every full
        # N x N matrix alive.
        pvec = np.einsum('ii->i', fmat).copy()

        ts.append(t)
        data.append(pvec)

df_pvec = pd.DataFrame(data=data, columns=clusters.index, index=ts)

Show that the cell phone count per community is approximately constant over time.

In [None]:
# One line per date (coloured by its position in time) showing the device
# count of every cluster, with the diagonal of the mean flux matrix on top.
indices = np.arange(len(df_pvec.index))
norm = mco.Normalize(vmin=np.min(indices), vmax=np.max(indices))
cmap = cm.rainbow
colors = cmap(norm(indices))

fig = plt.figure(figsize=(10,2), dpi=300)
ax = fig.gca()

for i in range(len(indices)):
    # Positional access: unlike `.loc` with the timestamp, this stays a single
    # row even if a date were duplicated in the index.
    ax.plot(df_pvec.iloc[i], '-', color=colors[i], lw=0.5, ms=0, alpha=0.1)

ax.plot(np.diag(fmat_mean), 'k--', lw=0.5)

for lab in 'right', 'top':
    ax.spines[lab].set_visible(False)
ax.tick_params(length=4)
ax.set_xlim(0.,None)
ax.set_ylim(1,None)
ax.set_xlabel('cluster index', fontsize='medium')
ax.set_ylabel('# devices', fontsize='medium')
ax.set_yscale('log')

# Colour bar: tick positions are row positions in `df_pvec`, relabelled with
# the corresponding dates.
cax = fig.add_axes(rect=[0.99,0.1,0.01,0.7])
cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap=cmap),
             cax=cax, extendfrac='auto')
ticks = cbar.get_ticks()
# The locator may return ticks outside the data range; drop them so they can
# neither raise an IndexError nor silently wrap around (negative positions).
ticks = ticks[(ticks >= 0) & (ticks <= len(df_pvec.index) - 1)]
labels = df_pvec.index[ticks.astype('int64')].strftime('%Y-%m-%d').tolist()
cbar.set_ticks(ticks)
cbar.set_ticklabels(labels)
plt.show()


Show that the cell phone count is a good proxy for actual population, by comparing the cell phone count with the Census Bureau reported population.

In [None]:
# Time-averaged device count per cluster, added to the cluster table as
# 'mobile_count' (aligned on the cluster index) and written back to the store.
s_pvec_mean = df_pvec.mean(axis=0)
clusters['mobile_count'] = s_pvec_mean

key = "/clustering/clusters"
with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
    store.put(key, clusters)

clusters

In [None]:
# List the clusters whose population or mean device count is exactly zero.
clusters.loc[(clusters['population'] == 0) | (clusters['mobile_count'] == 0)]

In [None]:
# Scatter the device count of every cluster against its 'population' value,
# one point cloud per date (coloured by its position in time):
#   left panel  - raw values,
#   right panel - both quantities normalised by their sum over clusters.
# A straight line through the origin is drawn on top of each panel.
df_pvecT = df_pvec.T

indices = np.arange(len(df_pvec.index))
norm = mco.Normalize(vmin=np.min(indices), vmax=np.max(indices))
cmap = cm.rainbow
colors = cmap(norm(indices))

ncol = 2
fig = plt.figure(facecolor='w', figsize=(8,3), dpi=300)
gs = mgs.GridSpec(1, ncol)
ax1 = fig.add_subplot(gs[0,0])
ax2 = fig.add_subplot(gs[0,1])

idx = clusters.index
X1 = clusters.loc[idx, 'population'].to_numpy()
X2 = X1 / np.sum(X1)

# Rows re-ordered like `clusters`; columns are then accessed by position so a
# duplicated date could not turn a column selection into a DataFrame.
pvec_by_cluster = df_pvecT.loc[idx]

matX1 = []
matX2 = []
matY1 = []
matY2 = []
for i in range(len(indices)):
    Y1 = pvec_by_cluster.iloc[:, i].to_numpy()
    Y2 = Y1 / np.sum(Y1)

    ax1.plot(X1, Y1, 'o', color=colors[i], lw=0, ms=2, alpha=0.1)
    ax2.plot(X2, Y2, 'o', color=colors[i], lw=0, ms=2, alpha=0.1)

    matX1.append(X1)
    matY1.append(Y1)
    matX2.append(X2)
    matY2.append(Y2)

# Pool all dates for the fits.
matX1 = np.ravel(matX1)
matY1 = np.ravel(matY1)
matX2 = np.ravel(matX2)
matY2 = np.ravel(matY2)

# Line through the origin with slope sum(Y) / sum(X) (ratio of the totals).
b1 = 0.
a1 = np.sum(matY1) / np.sum(matX1)
Xfit = np.array([0., np.max(X1)])
ax1.plot(Xfit, a1*Xfit +b1, 'k-', lw=1.)

b2 = 0.
a2 = np.sum(matY2) / np.sum(matX2)
Xfit = np.array([0., np.max(X2)])
ax2.plot(Xfit, a2*Xfit +b2, 'k-', lw=1.)

# Plot formatting.
# NOTE(review): here M_a labels the census population and P_a the device
# count, whereas the earlier column-sum check uses M_a for the diagonal of the
# flux matrix (device count) - confirm the intended notation.
ax1.set_xlabel("$M_a$", fontsize='medium')
ax1.set_ylabel("$P_a$", fontsize='medium')
# Raw strings: "\s" is an invalid escape sequence in a normal string literal.
ax2.set_xlabel(r"$M_a / \sum M_a$", fontsize='medium')
ax2.set_ylabel(r"$P_a / \sum P_a$", fontsize='medium')
for ax in ax1, ax2:
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.tick_params(bottom=True, left=True, labelbottom=True, labelleft=True)
    ax.tick_params(length=4)
    ax.set_xlim(0., None)
    ax.set_ylim(0., None)

# Leave room on the right for the colour bar.
gs.tight_layout(fig, rect=[0.,0.,0.95,1.])
cax = fig.add_axes(rect=[0.99,0.2,0.01,0.7])
cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap=cmap),
             cax=cax, extendfrac='auto')
ticks = cbar.get_ticks()
# The locator may return ticks outside the data range; drop them so they can
# neither raise an IndexError nor silently wrap around (negative positions).
ticks = ticks[(ticks >= 0) & (ticks <= len(df_pvec.index) - 1)]
labels = df_pvec.index[ticks.astype('int64')].strftime('%Y-%m-%d').tolist()
cbar.set_ticks(ticks)
cbar.set_ticklabels(labels)

plt.show()