In [1]:
import pandas as pd
import pathlib
import folium
import ast
from math import sin, ceil, floor
import numpy as np
from collections import defaultdict, Counter
from sklearn.neighbors import KDTree
import pickle

import data_utils as utils

"""
name::String
    ## bounding box
    minlon::Float64
    minlat::Float64
    maxlon::Float64
    maxlat::Float64
            
    minx::Float64
    miny::Float64
    maxx::Float64
    maxy::Float64
            
    xstep::Float64 # cell size
    ystep::Float64
            
    numx::Int
    numy::Int
    minfreq::Int # minfreq for cell
    maxvocab_size::Int
    k::Int
       
       
    ##### 아래는 make_vocab function에서 생성
    ## the number of hitting
    cellcount
    ## hot cells
    hotcell::Vector{Int}
    ## hot cell kdtree
    hotcell_kdtree : any
    ## map a hot cell into the vocabulary id
    hotcell2vocab::Dict{Int, Int}
    ## map the vocabulary id into its hot cell
    vocab2hotcell::Dict{Int, Int}
    ## vocab start
    vocab_start::Int
    ## vocabulary size
    vocab_size::Int
    ## whether indices have been built
    built::Bool
    """
class attribute 느낌.

In [2]:
data_dir = pathlib.PosixPath("data/")
# utils.porto2standardcsv(data_dir/"porto"/"train.csv", limit=1000, fname=data_dir/"porto"/"preprocessed_porto1000.csv")

trips = pd.read_csv(data_dir/"porto"/"preprocessed_porto1000.csv")
trips.head()

Unnamed: 0,lat,timestamps,tripid,lon
0,41.141412,1372636858,1372636858620000589,-8.618643
1,41.141376,1372636873,1372636858620000589,-8.618499
2,41.14251,1372636888,1372636858620000589,-8.620326
3,41.143815,1372636903,1372636858620000589,-8.622153
4,41.144373,1372636918,1372636858620000589,-8.623953


In [3]:
trips_len = defaultdict(int)
for idx in np.unique(trips.tripid):
    trip = trips.loc[trips["tripid"] == idx]
    trips_len[idx] = (trip.index[0], trip.shape[0]) # (s, len)

In [8]:
class SpatialRegion(object) : 
    
    def __init__(self, dataset_name, minlon, minlat, maxlon, maxlat, 
                 xstep, ystep, 
#                  numx, numy, 
                 minfreq=50, maxvocab_size=50000, knn_k=5, 
                 vocab_start=4,
#                  cellcount, hotcell, hotcell_kdtree,
#                  hotcell2vocab, vocab2hotcell, vocab_size, is_built
                ):
        self.dataset_name = dataset_name
        self.minfreq = minfreq
        self.maxvocab_size = maxvocab_size
        self.knn_k = knn_k
        self.vocab_start = vocab_start
        
        # compute minx, miny, maxx, maxy from the followings.
        self.minlon = minlon
        self.minlat = minlat
        self.maxlon = maxlon
        self.maxlat = maxlat
        
        self.minx, self.miny, self.maxx, self.maxy = None,None,None,None
        
        self.xstep, self.ystep = xstep, ystep
        self.numx, self.numy = None, None
        
        self.cellcount, self.hotcell, self.hotcell2vocab = None,None,None
        self.vocab2hotcell, self.vocab_size = None,None
        self.is_built = False
        self.hotcell_kdtree = None
        self.build_region()
                
    def build_region(self):
        self.minx, self.miny = utils.lonlat2meters(self.minlon,self.minlat)
        self.maxx, self.maxy = utils.lonlat2meters(self.maxlon,self.maxlat)
        self.numx = ceil(round(self.maxx-self.minx, ndigits=6) / self.xstep)
        self.numy = ceil(round(self.maxy-self.miny, ndigits=6) / self.ystep)
        
    def coord2cell(self, x,y):
        """
        mapping x,y to cell_id
        @param x,y : coordinate in meter metric
        """
        xoffset = floor(round(x - self.minx, ndigits=6) / self.xstep)
        yoffset = floor(round(y - self.miny, ndigits=6) / self.ystep)

        return yoffset * self.numx + xoffset

    def cell2coord(self, cell_id):
        yoffset = cell_id // self.numx
        xoffset = cell_id % self.numx
        y = self.miny + (yoffset + 0.5) * self.ystep #(cell_size)
        x = self.minx + (xoffset + 0.5) * self.xstep
        return x, y
    
    def gps2cell(self,lon,lat):
        """
        mapping lon, lat to cell_id through coord2cell()
        """
        x,y = utils.lonlat2meters(lon,lat)
        return self.coord2cell(x,y) # cell_id
    
    def cell2gps(self, cell_id):
        x, y = self.cell2coord(cell_id)
        return utils.meters2lonlat(x,y) # lon, lat
    
    def gps2offset(self, lon, lat):
        """
        mapping lon, lat to coord in the region(self instance)
        """
        x, y = utils.lonlat2meters(lon, lat)
        xoffset = round(x - self.minx, ndigits=6) / self.xstep
        yoffset = round(y - self.miny, ndigits=6) / self.ystep
        return xoffset, yoffset
    
    def is_inregion(self, lon, lat):
        """
        @param lon, lat
        """
        if (self.minlon <= lon < self.maxlon) and \
        (self.minlat <= lat < self.maxlat):
            return True
        else : 
            return False
    
    def make_vocab(self, trips, trips_len):
        """
        @param trips : pd.DataFrame that trips are concatenated
        @param trips_len : dict id:(start,len)
        """
        self.cellcount = defaultdict(int)
        
        num_out_region = 0
        for i, trip_id in enumerate(np.unique(trips["tripid"])):
            s,l = trips_len[trip_id]
            trip = np.array(trips.loc[s:s+l,["lon","lat"]]).transpose() # (2, traj_len)
#             print(trip.shape)
            for p in range(trip.shape[1]):
                lon, lat = trip[:,p]
                if self.is_inregion(lon, lat):
                    cell_id = self.gps2cell(lon,lat) # cell_id
                    self.cellcount[cell_id] += 1
                else : # not in region
                    num_out_region += 1
                    
            if i % 300 == 299 : 
                print("Processed {} trips".format(i+1))
                    
        max_num_hotcells = min(self.maxvocab_size, len(self.cellcount))
        topcellcount = sorted(self.cellcount.items(), 
                              key=lambda x : -x[1],)[:max_num_hotcells] # descending
        print("max_num_hotcells: {} \nmax_count of hotcells: {}".format(max_num_hotcells, topcellcount[0][-1]))
        
        # the biggest idx with minfreq
        minfreq_idx = np.argwhere(np.array(topcellcount)[:,1] == 50)[-1,0]
        self.hotcell = np.array([cell_id for cell_id, _ in np.array(topcellcount[:minfreq_idx+1])]) # (len_cell_ids)
        print("num of hotcell : {}".format(len(self.hotcell)))
        
        ## build the map between cell and vocab id
        self.hotcell2vocab = dict([(cell_id, i+self.vocab_start) for (i, cell_id) in enumerate(self.hotcell)])
        
        #region.vocab2hotcell = map(reverse, region.hotcell2vocab)
        self.vocab2hotcell = {vocab:cell for (cell,vocab) in self.hotcell2vocab.items()}
        
        ## vocabulary size
        self.vocab_size = self.vocab_start + len(self.hotcell)
        
        self.built = True
        ## build the hot cell kdtree to facilitate search
        
        # coord : (len_hotcells, 2: (x,y))
        coord = np.array(list(map(self.cell2coord, self.hotcell))) # self.hotcell : (len_hotcells)
        self.hotcell_kdtree = KDTree(coord)

    def knearest_hotcells(self, cell_ids, k):
        """
        @param cell_ids :: iterables
        return knearest_hotcells_id : (len_cells_ids, k), knndists : (len_cells_ids, k)
        """
        assert self.built == True
        coord = np.array(list(map(self.cell2coord, cell_ids))) # (len_hotcells, 2: (x,y))
        dists, indice = self.hotcell_kdtree.query(coord,k=k) # indice here is indice of self.hotcell
        return self.hotcell[indice], dists 
    
    def nearest_hotcell(self, cell_ids,):
        assert self.built == True
        cell_id,_ = self.knearest_hotcells(cell_ids, k=1)
        return cell_id # : (None, 1)
    
    def save_KNVocabs(self,):
        V, D = np.zeros((self.knn_k, self.vocab_size)), np.zeros((self.knn_k, self.vocab_size))
        for vocab in range(self.vocab_start):
            V[:, vocab] = vocab
            D[:, vocab] = 0.
            
        kcells, dists = spatialregion.knearest_hotcells(list(self.vocab2hotcell.values()), k=5) # (#hotcells, k)
        V[:, self.vocab_start:] = kcells.transpose()
        D[:, self.vocab_start:] = dists.transpose()
        
        pickle.dump({"V": V,"D": D}, 
                    open(data_dir/self.dataset_name/"{ds}KNVocabs_{cell_sz}.pkl".format(ds = self.dataset_name,
                                                                                        cell_sz = self.xstep
                                                                                       ), "wb"))
    def anycell2vocab(self, cell_id):
        """
        mapping a cell_id to vocab where the cell_id is not necessarily a hotcell
        if a cell_id is not one of hotcells, it is replaced with the nearest hotcell.
        """
        if cell_id in self.hotcell2vocab: # one of hotcells
            return self.hotcell2vocab[cell_id]
        else: # not a hotcell
            hotcell_id = self.nearest_hotcell(cell_id)
            return self.hotcell2vocab[hotcell_id]
    
    def gps2vocab(self, lon,lat):
        if self.is_inregion(lon, lat):
            cell_id = self.gps2cell(lon, lat)
            return self.anycell2vocab(cell_id) # vocab
        else : 
            return "UNK"
        
    def traj2seq(self, trip):
        """
        @param trip : (2, traj_len) ::nd.array
        """
        seq = []
        for p in range(trip.shape[1]):
            # gps
            lon, lat = trip[:, p]
            seq.append(self.gps2vocab(lon, lat))
            
        return seq
    
    def seq2traj(self,seq):
        trip = np.zeros((2,len(seq)),dtype=np.float32)
        for point in range(len(seq)):
            hotcell_id = self.vocab2hotcell.get(seq[point], -1)
            if hotcell_id == -1 : raise ValueError
            lon, lat = self.cell2gps(hotcell_id)
            trip[:, point] = np.array([lon, lat])
            
        return trip
    
    def tripmeta(self, trip):
        """
        @param trip = (2, traj_len) ::nd.array
        """
        mins, maxs = np.min(trip, axis=1), np.max(trip, axis=1)
        lon_centroid, lat_centroid = mins + (maxs-mins)/2
        xoffset, yoffset = self.gps2offset(lon_centroid, lat_centroid)
        return xoffset, yoffset
    
    def seqmeta(self, seq):
        trip = self.seq2traj(seq)
        return self.tripmeta(trip)
    
    def seq2str(self, seq):
        """
        @param seq : list of points
        """
        seq_str = " ".join(list(map(str, seq))) + "\n"
        return seq_str
        
    def createTrainVal(self, trjfile, datapath, injectnoise,
                        ntrain, nval, nsplit=5, min_length=20, max_length=100):
        """
        @param datapath :: pathlib.POSIX("datapath")
        """
        trainsrc = open(datapath/self.dataset_name/"train.src". "w")
        traintrg = open(datapath/self.dataset_name/"train.trg". "w")
        trainmta = open(datapath/self.dataset_name/"train.mta". "w")
        
        validsrc = open(datapath/self.dataset_name/"valid.src". "w")
        validtrg = open(datapath/self.dataset_name/"valid.trg". "w")
        validmta = open(datapath/self.dataset_name/"valid.mta". "w")
        
        for i in range(ntrain+nval):
            
            
            
            
        
        
        
    
    

    
            
            
            
        
            
        
        


SyntaxError: invalid syntax (<ipython-input-8-3b4c5fef59f1>, line 219)

In [15]:
" ".join(list(map(str, [5,6,7]))) + "\n"

'5 6 7\n'

In [11]:
" ".join([5,6,7])

TypeError: sequence item 2: expected str instance, int found

In [5]:
spatialregion = SpatialRegion(dataset_name="porto",
                              minlon=-8.735152, minlat=40.953673,
                              maxlon=-8.156309, maxlat=41.307945,
                              xstep=100, ystep=100,)

In [6]:
spatialregion.make_vocab(trips, trips_len)

Processed 300 trips
Processed 600 trips
Processed 900 trips
max_num_hotcells: 6572 
max_count of hotcells: 380
num of hotcell : 120


In [7]:
spatialregion.seq2traj([5,6,7])

array([[-8.606244, -8.670024, -8.607142],
       [41.14438 , 41.237667, 41.145733]], dtype=float32)

In [267]:
spatialregion.save_KNVocabs()

In [233]:
spatialregion.knearest_hotcells(spatialregion.hotcell[0:3], k=5)

(array([[269682, 269037, 270327, 271617, 271618],
        [181388, 181389, 182677, 183322, 183967],
        [270327, 269682, 271617, 269037, 271618]]),
 array([[  0.        , 100.        , 100.        , 300.        ,
         316.22776602],
        [  0.        , 100.        , 223.60679775, 316.22776602,
         412.31056256],
        [  0.        , 100.        , 200.        , 200.        ,
         223.60679775]]))

In [235]:
spatialregion.nearest_hotcells(spatialregion.hotcell[0:3],)

array([[269682],
       [181388],
       [270327]])

In [188]:
np.argwhere(np.array(sorted(spatialregion.cellcount.items(), key=lambda x : -x[1],))[:,1] == 50)

array([[116],
       [117],
       [118],
       [119]])

In [206]:
np.array(list(map(spatialregion.cell2coord, spatialregion.hotcell) ))
# spatialregion.hotcell

array([[-965142.67264185, 5047360.84148584],
       [-958042.67264185, 5033660.84148584],
       [-965142.67264185, 5047460.84148584],
       [-958142.67264185, 5033860.84148584],
       [-958542.67264185, 5033860.84148584],
       [-955742.67264185, 5034360.84148584],
       [-958342.67264185, 5036360.84148584],
       [-960442.67264185, 5035660.84148584],
       [-955742.67264185, 5034260.84148584],
       [-958542.67264185, 5034160.84148584],
       [-958342.67264185, 5034060.84148584],
       [-959542.67264185, 5034160.84148584],
       [-956242.67264185, 5036560.84148584],
       [-959842.67264185, 5034160.84148584],
       [-958442.67264185, 5035060.84148584],
       [-960242.67264185, 5034860.84148584],
       [-960942.67264185, 5034860.84148584],
       [-959342.67264185, 5033960.84148584],
       [-958342.67264185, 5034160.84148584],
       [-960442.67264185, 5036460.84148584],
       [-965142.67264185, 5047260.84148584],
       [-961742.67264185, 5036560.84148584],
       [-9

In [174]:
spatialregion.__dict__

{'dataset_name': 'porto',
 'minfreq': 50,
 'maxvocab_size': 50000,
 'knn_k': 5,
 'vocab_start': 4,
 'minlon': -8.735152,
 'minlat': 40.953673,
 'maxlon': -8.156309,
 'maxlat': 41.307945,
 'minx': -972392.6726418451,
 'miny': 5005510.841485839,
 'maxx': -907956.1646325944,
 'maxy': 5057870.140667773,
 'xstep': 100,
 'ystep': 100,
 'numx': 645,
 'numy': 524,
 'cellcount': defaultdict(int,
             {178158: 10,
              178800: 41,
              178799: 20,
              179444: 29,
              180090: 22,
              180736: 9,
              181381: 16,
              182027: 39,
              182028: 69,
              182673: 208,
              183318: 40,
              183320: 7,
              184611: 31,
              185257: 18,
              184615: 22,
              183971: 10,
              183325: 36,
              182679: 8,
              182033: 9,
              183319: 60,
              188518: 13,
              187873: 2,
              187225: 2,
              185