In [None]:
import numpy as np  # NumPy package for arrays, random number generation, etc
import matplotlib.pyplot as plt  # For plotting
from shapely.geometry import Point
import pandas as pd
from sklearn.neighbors import KDTree

from faker import random_point_processes as rpp

In [None]:
# Seed the global NumPy RNG so that Restart & Run All reproduces the same
# draws here and in every later cell that samples through np.random.
np.random.seed(0)

c = 1.2  # Zipf exponent (np.random.zipf requires a value > 1)
n = 100  # number of samples to draw
a = np.random.zipf(c, n)
# The distribution is heavy-tailed, so only values below 50 are histogrammed.
plt.hist(a[a < 50])

In [None]:
from scipy import special

# Overlay the theoretical Zipf pmf shape on a normalised histogram of the sample.
limit = 100
# `normed` is no longer accepted by recent Matplotlib releases; `density=True`
# is the supported way to normalise the histogram.
count, bins, ignored = plt.hist(a[a<limit], limit, density=True)
x = np.arange(1., limit)
# The Zipf pmf is k**(-c) / zeta(c). `zetac(c)` is zeta(c) - 1, so it is not
# the normalising constant; use the Riemann zeta function itself.
y = x**(-c) / special.zeta(c, 1)
# Both curves are rescaled by their maximum so only the shapes are compared.
plt.plot(x, y/max(y), linewidth=2, color='r')
print(a)
plt.show()

In [None]:
# Largest value in the Zipf sample.
a.max()

In [None]:
# Display the sampled Zipf values.
a

In [None]:
import logging
# Configure the root logger at INFO level.
# NOTE(review): presumably this surfaces log output from `rpp` — confirm.
logging.basicConfig(level=logging.INFO)

In [None]:
# Simulate a Thomas cluster process on a 1000 x 1000 window. The call returns
# seven values: point coordinates (xx, yy), the parent coordinates repeated
# per point, the parent coordinates themselves, and a centre id per point.
# NOTE(review): with these settings roughly 100 parents x 10000 daughters
# (~1e6 points) are expected, assuming the usual Thomas-process semantics —
# confirm against rpp.thomas_cluster_process.
xx, yy, xx_parent_repeated, yy_parent_repeated, xx_parent, yy_parent, centre_ids = rpp.thomas_cluster_process(
    xmin = 0,
    xmax = 1000,
    ymin = 0,
    ymax = 1000,
    lambda_parent = .0001, # density of parent Poisson point process
    lambda_daughter = 10000, # mean number of points in each cluster
    sigma = 15 # sigma for normal variables (ie random locations) of daughters
)
# Report how many centres and points ("households") were generated.
print(len(xx_parent), 'centres.', len(xx), 'households.')
# Scatter plot of all generated points with equal axis scaling.
plt.scatter(xx, yy, alpha=0.5, marker='.')
plt.xlabel("x")
plt.ylabel("y")
plt.axis('equal')

In [None]:
# Population size = number of generated points.
pop_size = len(xx)
pop_size

In [None]:
def minmax(array):
    """Rescale an array linearly onto the interval [0, 1].

    Parameters
    ----------
    array : numpy.ndarray
        Values to rescale; must be non-empty. Any dimensionality is accepted,
        and the minimum and maximum are taken over all elements.

    Returns
    -------
    numpy.ndarray
        ``(array - min) / (max - min)`` as floats, same shape as ``array``.
        A constant array (``max == min``) maps to all zeros instead of NaN.

    Raises
    ------
    AssertionError
        If ``array`` is not a ``numpy.ndarray``.
    ValueError
        If ``array`` is empty (raised by the NumPy reductions).
    """
    assert isinstance(array, np.ndarray)
    # Use the ndarray reductions once each instead of the Python builtins,
    # which iterate element by element and fail on multi-dimensional input.
    lo = array.min()
    hi = array.max()
    span = hi - lo
    if span == 0:
        # Avoid 0/0 -> NaN for constant input.
        return np.zeros(array.shape, dtype=float)
    return (array - lo) / span

In [None]:
# Build hidden (latent) spatial features for each generated point

In [None]:
# One shapely Point per generated location.
pop_locs = list(map(Point, xx, yy))

In [None]:
# One Point per distinct parent centre.
pop_centres = list(map(Point, xx_parent, yy_parent))
# One Point per entry of the repeated parent coordinates.
centres = list(map(Point, xx_parent_repeated, yy_parent_repeated))

In [None]:
# Euclidean distance from every point to its own parent centre, computed in a
# single vectorised call instead of one shapely `Point.distance` call per point.
# The repeated parent arrays are paired with xx/yy element by element, exactly
# as the original zip over `pop_locs` and `centres` did.
dist_to_centre = np.hypot(
    np.asarray(xx) - np.asarray(xx_parent_repeated),
    np.asarray(yy) - np.asarray(yy_parent_repeated),
)

In [None]:
# Stack coordinates column-wise into (n, 2) arrays for the KD-tree queries.
loc_array = np.vstack((xx, yy)).T
# Use the parent coordinates returned by thomas_cluster_process; the names
# `xxParent` / `yyParent` are never defined in this notebook (NameError).
centre_array = np.vstack((xx_parent, yy_parent)).T

In [None]:
# KD-tree over the parent centre coordinates.
tree = KDTree(centre_array)

In [None]:
# Distance from each point to its single nearest centre (k=1); the index
# array returned alongside the distances is discarded.
dist_closest_centre, _ = tree.query(loc_array, dualtree=True, k=1)
# Flatten to a 1-D array.
dist_closest_centre = dist_closest_centre.reshape(-1)
dist_closest_centre_mm = minmax(dist_closest_centre)
len(dist_closest_centre_mm)

In [None]:
# Distances from each point to its three nearest centres (k=3).
nearest_dist, _ = tree.query(loc_array, dualtree=True, k=3)
# Sum the three distances per point, then min-max scale the sums.
dist_closest_centres = nearest_dist.sum(axis=1)
dist_closest_centres_mm = minmax(dist_closest_centres)
len(dist_closest_centres_mm)

In [None]:
# Local density: number of points within radius 1 of each point.
# The population tree gets its own name so that `tree` stays bound to the
# centre KD-tree; rebinding `tree` here made re-running the centre-distance
# cells silently query the wrong tree.
loc_tree = KDTree(loc_array)
density = loc_tree.query_radius(loc_array, count_only=True, r=1)
density_mm = minmax(density)
len(density_mm)

In [None]:
# Min-max scaled copies of the coordinates, the distance to the parent
# centre, and the centre ids.
xx_mm = minmax(xx)
yy_mm = minmax(yy)
dist_to_centre_mm = minmax(dist_to_centre)
centre_ids_mm = minmax(centre_ids)

In [None]:
# Build observable attributes that depend on the hidden features and on each other

In [None]:
def get_gender(pman=.49, n=4):
    """Draw ``n`` binary gender codes.

    Parameters
    ----------
    pman : float
        Probability of drawing code 0; code 1 is drawn with ``1 - pman``.
        Presumably 0 encodes "man", going by the parameter name — confirm.
    n : int
        Number of codes to draw.

    Returns
    -------
    numpy.ndarray
        Array of length ``n`` containing 0s and 1s.
    """
    codes = [0, 1]
    weights = [pman, 1 - pman]
    return np.random.choice(codes, size=n, p=weights)

In [None]:
# One gender code per generated point.
genders = get_gender(n=pop_size)

In [None]:
def get_ethnicity(xx, yy, density):
    """Draw one ethnicity code (1-5) with location- and density-dependent weights.

    Five Poisson counts are drawn with rates built from the inputs, normalised
    to probabilities, and used to sample a single code.

    Parameters
    ----------
    xx, yy : float
        Coordinates of the point. The caller in this notebook passes min-max
        scaled values.
    density : float
        Density value for the point. The caller in this notebook passes the
        raw neighbour count.

    Returns
    -------
    int
        One of 1, 2, 3, 4, 5.

    Raises
    ------
    ValueError
        If any Poisson rate is negative (raised by ``np.random.poisson``).
    """
    # Rates in the fixed order of codes 1..5; the draw order is kept so the
    # random stream is consumed exactly as before.
    rates = (
        xx * 10,
        density * 10,
        density * xx * 10,
        ((xx - .5) * 10) ** 2,
        yy * 10,
    )
    weights = np.array([np.random.poisson(rate) for rate in rates], dtype=float)
    total = weights.sum()
    if total == 0:
        # Every draw came back 0: dividing would give NaN probabilities and
        # np.random.choice would raise. Fall back to a uniform choice.
        probs = np.full(len(weights), 1 / len(weights))
    else:
        probs = weights / total
    return np.random.choice([1, 2, 3, 4, 5], p=probs)
    

In [None]:
# One ethnicity code per point.
# NOTE(review): this passes the raw `density` counts together with the scaled
# xx_mm / yy_mm; `density_mm` may have been intended — confirm.
ethnicity = np.array([get_ethnicity(x,y,d) for x,y,d in zip(xx_mm, yy_mm, density)])

In [None]:
def get_hh_people(ethnicity, density):
    """Draw a household size (1-5) weighted by ethnicity code and density.

    The weight of size ``k`` is ``density * d_k + ethnicity * e_k`` with
    ``d = (50, 40, 30, 20, 10)`` and ``e = (2, 4, 6, 8, 10)``: the density
    coefficient falls and the ethnicity coefficient rises as the size grows.

    Parameters
    ----------
    ethnicity : int
        Ethnicity code of the household.
    density : float
        Density value of the household's location.

    Returns
    -------
    int
        One of 1, 2, 3, 4, 5.
    """
    sizes = [1, 2, 3, 4, 5]
    density_coefs = (50, 40, 30, 20, 10)
    ethnicity_coefs = (2, 4, 6, 8, 10)
    weights = np.array([
        density*d_coef + ethnicity*e_coef
        for d_coef, e_coef in zip(density_coefs, ethnicity_coefs)
    ])
    probs = weights/sum(weights)
    return np.random.choice(sizes, p=probs)

In [None]:
# One household size per point, driven by its ethnicity code and density.
# NOTE(review): the raw `density` counts are passed here, not `density_mm` — confirm.
hh_people = np.array([get_hh_people(e,d) for e,d in zip(ethnicity, density)])

In [None]:
def get_age(centre_id, density, dist_closest_centre):
    """Draw an age from a three-component (old / adult / child) mixture.

    One Poisson candidate age is drawn per component, then one candidate is
    picked. The product ``density * dist_closest_centre`` moves probability
    mass from the "old" component to the "child" component.

    Parameters
    ----------
    centre_id : float
        Centre id; shifts the adult mean up and the child mean down by
        ``12 * centre_id``. The caller in this notebook passes a scaled value.
    density : float
        Density value; the caller in this notebook passes a scaled value.
    dist_closest_centre : float
        Distance to the nearest centre; the caller in this notebook passes a
        scaled value.

    Returns
    -------
    int
        The selected candidate age.
    """
    mean_old = 70
    mean_adult = 30 + (12*centre_id)
    mean_child = 20 - (12*centre_id)
    # Candidates are drawn in the order old, adult, child.
    candidates = [
        int(np.random.poisson(mean))
        for mean in (mean_old, mean_adult, mean_child)
    ]
    shift = density*dist_closest_centre/5
    probs = (.3-shift, .3, .4+shift)
    return np.random.choice(candidates, p=probs)

In [None]:
# One age per point, driven by the min-max scaled centre id, density and
# nearest-centre distance.
age = np.array([get_age(c,d,dist) for c,d,dist in zip(centre_ids_mm, density_mm, dist_closest_centre_mm)])

In [None]:
def get_dist_pt(dist_closest_centre):
    """Return the square of a normal draw centred on ``1 - dist_closest_centre``.

    Parameters
    ----------
    dist_closest_centre : float
        Distance to the nearest centre.

    Returns
    -------
    float
        A non-negative value: the squared sample from a unit-variance normal
        with mean ``1 - dist_closest_centre``.
    """
    mean = 1 - dist_closest_centre
    sample = np.random.normal(mean)
    return sample ** 2

In [None]:
# One `dist_pt` value per point.
# NOTE(review): the raw `dist_closest_centre` is used here, not the scaled
# `dist_closest_centre_mm` — confirm this is intended.
dist_pt = np.array([get_dist_pt(d) for d in dist_closest_centre])

In [None]:
def get_income(dist_closest_centres):
    """Draw an income that tends to grow with the distance value.

    Parameters
    ----------
    dist_closest_centres : float
        Distance value; the caller in this notebook passes the min-max scaled
        sum of distances to the three nearest centres.

    Returns
    -------
    int
        ``(15 + 5 * k) * 1000`` where ``k`` is a Poisson draw with rate
        ``5 * (dist_closest_centres + 1)``.
    """
    rate = 5*(dist_closest_centres+1)
    steps = np.random.poisson(rate)
    thousands = 15 + steps*5
    return thousands*1000

In [None]:
# One income per point, driven by the scaled sum of distances to the three
# nearest centres.
income = np.array([get_income(d) for d in dist_closest_centres_mm])

In [None]:
# Min-max scaled income, used below as a probability factor for car ownership.
income_mm = minmax(income)

In [None]:
def get_car(ethnicity, income_mm, density_mm):
    """Draw a car-ownership flag.

    The ownership probability is ``income_mm * (1 - density_mm)``, halved
    when the ethnicity code is below 3.

    Parameters
    ----------
    ethnicity : int
        Ethnicity code.
    income_mm : float
        Income value; expected in [0, 1] so the result is a valid probability.
    density_mm : float
        Density value; expected in [0, 1].

    Returns
    -------
    int
        1 if the household has a car, 0 otherwise.
    """
    base = income_mm * (1 - density_mm)
    p_car = base / 2 if ethnicity < 3 else base
    probs = np.array([p_car, 1 - p_car])
    return np.random.choice([1, 0], p=probs)

In [None]:
# One car-ownership flag per point.
has_car = np.array([get_car(e, i, d) for e, i, d in zip(ethnicity, income_mm, density_mm)])

In [None]:
# Assemble the features and attributes into a DataFrame

In [None]:
# Column name -> per-point array. Raw (unscaled) x, y, centre id, distance
# and density are stored alongside the generated attributes.
data_dict =  {
        'x': xx,
        'y': yy,
        'centre_id': centre_ids,
        'dist_centre': dist_to_centre,
        'density': density,
        'gender': genders,
        'ethnicity': ethnicity,
        'hh_people': hh_people,
        'age': age,
        'dist_pt': dist_pt,
        'income': income,
        'has_car': has_car
    }

In [None]:
# Sanity check: print the length of every column so mismatches are visible
# before the DataFrame is built.
for k,v in data_dict.items():
    print(k, len(v))

In [None]:
# Build the synthetic population table, one row per point.
data = pd.DataFrame(data_dict)

In [None]:
# Preview the first rows.
data.head()

In [None]:
# Write the table to data.csv in the current working directory, without the index.
data.to_csv('data.csv', index=False)

In [9]:
from faker.tree import RegularBlock
import numpy as np

In [10]:
# Nine hand-picked toy points. This rebinds `xx` / `yy`, replacing the
# simulated coordinates from the first part of the notebook.
xx = np.array((2,1,2,3,6,1,9,1,7))
yy = np.array((0,1,1,3,3,4,1,9,9))

In [11]:
# (n, 2) array of (x, y) rows.
locs = np.stack((xx,yy), axis=1)

In [14]:
# Column vector of row indices 0..n-1, one per location.
ind = np.arange(len(locs)).reshape(-1, 1)

In [15]:
# Rows of (index, x, y). This rebinds `data`, which held the pandas
# DataFrame earlier in the notebook.
data = np.concatenate((ind,locs), axis=1)
data

array([[0, 2, 0],
       [1, 1, 1],
       [2, 2, 1],
       [3, 3, 3],
       [4, 6, 3],
       [5, 1, 4],
       [6, 9, 1],
       [7, 1, 9],
       [8, 7, 9]])

In [6]:
# Bounding box as [[minx, miny], [maxx, maxy]].
bbox = np.array([[0,0],[10,10]])

In [7]:
# Build the block tree over the toy points.
# NOTE(review): the third argument (4) is presumably a capacity or depth
# limit — confirm against faker.tree.RegularBlock.
grid = RegularBlock(bbox, data, 4)

9
[5. 5.]
5
[2.5 2.5]
3
[1.25 1.25]
[[0.  0. ]
 [2.5 2.5]]
1
[1.25 3.75]
[[0.  2.5]
 [2.5 5. ]]
1
[3.75 3.75]
[[2.5 2.5]
 [5.  5. ]]
0
[3.75 1.25]
[[2.5 0. ]
 [5.  2.5]]
1
[2.5 7.5]
[[ 0.  5.]
 [ 5. 10.]]
1
[7.5 7.5]
[[ 5.  5.]
 [10. 10.]]
2
[7.5 2.5]
[[ 5.  0.]
 [10.  5.]]


In [8]:
# Print the centre of every leaf block in traversal order.
for leaf in grid.traverse():
    if not leaf.leaf:
        continue
    print(leaf.centre)

[1.25 1.25]
[1.25 3.75]
[3.75 3.75]
[3.75 1.25]
[2.5 7.5]
[7.5 7.5]
[7.5 2.5]


In [None]:
# Manual walk-through of one subdivision step on the toy data.
bbox = np.array([[0,0],[10,10]])
print(bbox)
# Centre of the box = mean of its two corner rows.
centre = bbox.mean(axis=0)
print(centre)
# Boolean masks: is each point left of / below the centre?
# Columns 1 and 2 of `data` hold x and y; column 0 is the row index.
left = data[:, 1] < centre[0]
bottom = data[:, 2] < centre[1]
print(left)
print(bottom)
# bottom left
minx, miny, maxx, maxy = bbox[0, 0], bbox[0, 1], centre[0], centre[1]
# `bbox` is rebound to the bottom-left quadrant.
bbox = np.array([[minx, miny], [maxx, maxy]])
bbox

In [None]:
# A box whose lower corner is not at the origin, used to check the centre
# computation in the next cell.
bb = np.array([[2,-2],[10,10]])
bb

In [None]:
# Centre of `bb`: the column-wise mean of its two corner rows.
bb.mean(axis=0)