In [75]:
from multiprocessing import Process
import numpy as np
import pandas as pd
import tqdm
import os
from models import Node, Rec 

checkin_file = "../dataset/poi_info.txt"
df = pd.read_csv(checkin_file, sep='\t', header=None)
df.columns = ["id", "poi", "latitude", "longitude"]
print "total poi :", len(df)
poi2id = {}
id2poi = {}
for i in xrange(len(df)):
    poi2id[df['poi'][i]] = df['id'][i]
    id2poi[df['id'][i]] = df['poi'][i]
id2poi = id2poi.values()
id2pos = df.loc[:, ['latitude', 'longitude', 'poi']].set_index('poi').T.to_dict('list')

total poi : 13187


In [76]:
# Placeholder name for POIs that are not in the table; it takes the next free id.
unk = 'u*n*k'
# Guarded so that re-running this cell does not append a second placeholder
# (which would also change the id stored in poi2id).
if unk not in poi2id:
    poi2id[unk] = len(id2poi)
    id2poi.append(unk)

In [77]:
# Make sure the output directory exists; np.save does not create it.
if not os.path.isdir("./npy"):
    os.makedirs("./npy")
# poi2id is a dict and id2poi a list; np.save converts each to an array first.
np.save("./npy/poi2id.npy", poi2id)
np.save("./npy/id2poi.npy", id2poi)

In [78]:
print poi2id.get(unk)

13187


In [79]:
# build a tree of area
tree = Node(df['latitude'].min(), df['latitude'].max(),df['longitude'].max(), df['longitude'].min(), 0)
tree.build()
print "total node of tree :", Node.count
theta = Node.theta

total node of tree : 40955


In [80]:
def main(id2poi_batch):
    """Compute, for every POI in the batch, the tree routes its area touches.

    Each POI is given a square of side ``theta`` centred on its coordinates.
    The four corners of that square are looked up in the module-level ``tree``;
    every distinct route found is kept together with its left/right choice
    list and a weight proportional to how much the square overlaps the
    rectangle returned by ``tree.find_idx(route[0])``.

    Args:
        id2poi_batch: iterable of POI names; each must be a key of ``id2pos``.

    Returns:
        A tuple ``(id2route, id2lr, id2prob)`` of three lists, one entry per
        POI and in input order:
          * id2route[i] -- distinct routes, in corner order of first appearance
          * id2lr[i]    -- left/right choice list matching each route
          * id2prob[i]  -- overlap weights normalised to sum to 1

    Reads the globals ``tree``, ``theta``, ``id2pos``, ``Rec`` and ``tqdm``.
    """
    id2route = []
    id2lr = []
    id2prob = []
    half = 0.5 * theta

    # make route/left_right_choice/probability list of each poi
    for poi in tqdm.tqdm(id2poi_batch):
        lat, lon = id2pos[poi][0], id2pos[poi][1]

        # the four corners of the POI square, as (latitude, longitude)
        corners = [(lat - half, lon - half),
                   (lat - half, lon + half),
                   (lat + half, lon - half),
                   (lat + half, lon + half)]
        # the square itself; Rec takes (lon + half, lon - half, lat - half, lat + half)
        poi_area = Rec((lon + half, lon - half, lat - half, lat + half))

        # Look up each corner and drop repeated routes. The lr list is
        # de-duplicated together with its route so that route_set[i],
        # lr_set[i] and the i-th probability always describe the same leaf;
        # de-duplicating the two lists separately could leave them with
        # different lengths.
        route_set = []
        lr_set = []
        for corner in corners:
            route, lr = tree.find_route(corner)
            if route not in route_set:
                route_set.append(route)
                lr_set.append(lr)

        # Overlap of the POI square with each distinct leaf rectangle.
        # NOTE(review): route[0] is presumably the leaf index - confirm in Node.
        overlaps = [Rec(tree.find_idx(route[0])).overlap(poi_area)
                    for route in route_set]
        # A total overlap of zero would make this division produce nan/inf.
        probs = np.divide(overlaps, sum(overlaps))

        id2route.append(route_set)
        id2lr.append(lr_set)
        id2prob.append(probs)

    return id2route, id2lr, id2prob

In [81]:
if __name__ == '__main__':
    # The last entry of id2poi is skipped: main() looks every POI up in id2pos.
    known_pois = id2poi[:-1]
    id2route, id2lr, id2prob = main(known_pois)

100%|██████████| 13187/13187 [00:23<00:00, 551.16it/s]


In [85]:
max_path = len(id2route[0][0])
print max_path

13


In [86]:
# Pad every POI to exactly max_route_cnt routes and record how many were real.
pad = [0]*max_path
max_route_cnt = 4
id2route_cnt = []

for idx, routes in enumerate(tqdm.tqdm(id2route)):
    id2route_cnt.append(len(routes))

    # Fill up with all-zero routes; a non-positive count adds nothing.
    padded = list(routes) + [pad] * (max_route_cnt - len(routes))
    # Drop the first element of every route (main() hands route[0] to
    # tree.find_idx; only the rest is kept here).
    id2route[idx] = np.asarray([l[1:] for l in padded])

# Extra all-padding row appended after the real POIs. Sized by max_route_cnt
# rather than a literal four entries, so it stays rectangular if that changes.
id2route.append([pad[1:]] * max_route_cnt)
id2route_cnt.append(0)

100%|██████████| 13187/13187 [00:00<00:00, 81040.12it/s]


In [87]:
print np.asarray(id2route).shape
print np.asarray(id2route_cnt).shape

(13188, 4, 12)
(13188,)


In [89]:
# Pad every POI's left/right lists to max_route_cnt entries, in place.
pad = [0]*(max_path-1)

for lrs in tqdm.tqdm(id2lr):
    missing = max_route_cnt - len(lrs)
    if missing > 0:
        lrs.extend([pad] * missing)

# Extra all-padding row appended after the real POIs; sized by max_route_cnt
# instead of a literal four entries.
id2lr.append([pad] * max_route_cnt)

100%|██████████| 13187/13187 [00:00<00:00, 592942.69it/s]


In [90]:
print np.asarray(id2lr).shape

(13188, 4, 12)


In [91]:
# Pad every POI's probability vector with zeros up to max_route_cnt entries.
pad = 0

for idx, probs in enumerate(tqdm.tqdm(id2prob)):
    probs = list(probs)
    # A non-positive count adds nothing.
    probs.extend([pad] * (max_route_cnt - len(probs)))
    id2prob[idx] = probs

# Extra all-zero row appended after the real POIs; sized by max_route_cnt
# instead of a literal four entries.
id2prob.append([pad] * max_route_cnt)

100%|██████████| 13187/13187 [00:00<00:00, 162874.68it/s]


In [92]:
print np.asarray(id2prob).shape

(13188, 4)


In [94]:
# Write the three padded per-POI tables next to the lookups saved earlier.
for out_path, table in [("./npy/id2route.npy", id2route),
                        ("./npy/id2lr.npy", id2lr),
                        ("./npy/id2prob.npy", id2prob)]:
    np.save(out_path, table)

In [93]:
# Largest value stored anywhere in the padded route table.
np.asarray(id2route).max()

40683