# Spatial Coincidental Match Probability Exploration

In [1]:
import numpy as np
import pandas as pd
import os
import sys
import pickle as pkl
import matplotlib.pyplot as plt

sys.path.append('/Users/cmg/dev/spatial-assocr/kde')
from location_project import kde_2d  # adaptive bw KDE 
%load_ext autoreload
%autoreload 2 # auto reload changes to kde code for faster prototyping

%matplotlib inline
sys.setrecursionlimit(10000)

## Load & format filtered data

In [2]:
# Load the filtered point data from ../data and preview the first rows.
mpp_filename = "mpp_month0a_month1b_n50.csv"
filepath = os.path.join("..", "data", mpp_filename)
mpp = pd.read_csv(filepath)
mpp.head(n=5)

Unnamed: 0,id,m,lon,lat
0,11.0,a,-117.92895,33.61891
1,11.0,a,-117.67199,33.60002
2,11.0,a,-117.92895,33.61891
3,11.0,a,-117.67199,33.60002
4,11.0,a,-117.67199,33.60002


In [3]:
# Keep only the columns kde_2d expects, in the order [id, lon, lat]
# (the 'm' column is left out).
kde_columns = ['id', 'lon', 'lat']
data = mpp[kde_columns]
data.head()

Unnamed: 0,id,lon,lat
0,11.0,-117.92895,33.61891
1,11.0,-117.67199,33.60002
2,11.0,-117.92895,33.61891
3,11.0,-117.67199,33.60002
4,11.0,-117.67199,33.60002


In [4]:
# Pull the values out of the data frame as a 2-D numpy array,
# one row per observation: <np.array [[user_id, lon, lat], ... ] >
df = data.to_numpy()
df

array([[ 1.1000000e+01, -1.1792895e+02,  3.3618910e+01],
       [ 1.1000000e+01, -1.1767199e+02,  3.3600020e+01],
       [ 1.1000000e+01, -1.1792895e+02,  3.3618910e+01],
       ...,
       [ 1.7282000e+04, -1.1795213e+02,  3.3684750e+01],
       [ 1.7282000e+04, -1.1795213e+02,  3.3684750e+01],
       [ 1.7282000e+04, -1.1795197e+02,  3.3684750e+01]])

## Fit an adaptive bandwidth KDE to the data
Fit a single model to the entire sample (i.e., a population-level model).

In [5]:
# Per-point adaptive bandwidths for the population KDE.
# Learning them is expensive, so the result is cached on disk: load the
# cached array when it exists, otherwise learn the bandwidths once and save
# them so later runs (and Restart & Run All) can reuse the file.
filepath = os.path.join("..", "data", "bw_pop_k10_month0a_month1b_n50.npy")
if os.path.exists(filepath):
    bw_pop = np.load(filepath)
else:
    # Conversion factors handed to the bandwidth learner.
    # NOTE(review): the constants are named KM_TO_* but are passed as
    # lon_to_km / lat_to_km -- confirm the expected direction/units in kde_2d.
    KM_TO_LON = 0.010615
    KM_TO_LAT = 0.008989
    # columns 1:3 of df are the (lon, lat) coordinates of each point
    bw_pop = kde_2d._learn_nearest_neighbors_bandwidth(
        df[:, 1:3], k=10, lon_to_km=KM_TO_LON, lat_to_km=KM_TO_LAT
    )
    np.save(filepath, bw_pop)

In [6]:
# Assemble the input for the KDE class, one row per point:
# <np.array [[user_id, lon, lat, bw, weight], ... ] >
# Every point gets the same weight of 1.
n_points = len(df)
bw_column = np.reshape(bw_pop, (n_points, 1))
weight_column = np.ones((n_points, 1))
pop = np.concatenate((df, bw_column, weight_column), axis=1)

# build the population KDE from the assembled array
kde_pop = kde_2d.KDE(pop)

array([[ 1.10000000e+01, -1.17928950e+02,  3.36189100e+01,
         1.00000000e-03,  1.00000000e+00],
       [ 1.10000000e+01, -1.17671990e+02,  3.36000200e+01,
         1.00000000e-03,  1.00000000e+00],
       [ 1.10000000e+01, -1.17928950e+02,  3.36189100e+01,
         1.00000000e-03,  1.00000000e+00],
       ...,
       [ 1.72820000e+04, -1.17952130e+02,  3.36847500e+01,
         2.18804089e-03,  1.00000000e+00],
       [ 1.72820000e+04, -1.17952130e+02,  3.36847500e+01,
         2.18804089e-03,  1.00000000e+00],
       [ 1.72820000e+04, -1.17951970e+02,  3.36847500e+01,
         6.56412268e-03,  1.00000000e+00]])

In [8]:
# Evaluate the population log pdf at the first observation.
# The point is passed as < np.array [lon, lat] >, i.e. the first row of df
# without its leading user_id column.
first_point = df[0, 1:]
kde_pop.log_pdf(first_point)

18.806669724706385

In [11]:
# Draw a sample from the population KDE.
# The data array must be passed in because it's a nonparametric model.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# point presumably differs between runs -- confirm how kde_2d draws its
# randomness and seed it if reproducible output is wanted.
kde_pop.sample_from_kde(pop)

[-117.8870705441304, 33.69372231153212]