In [1]:
from tpm.data_model import *
from tpm.util.io import read_geolife
from tpm.util.dist import haversine_distance
from tpm.preprocessing import time_duplication_filter
from tpm.preprocessing import speed_filter_abs
import numpy as np
import pandas as pd
import folium
from datetime import timedelta

In [2]:
trajs = read_geolife('/mnt/hdd1/christian/data/geotracking/Geolife Trajectories 1.3/Data/013/Trajectory')


In [129]:
from tpm.util.visualization import visualize_trajectory
from tpm.util.visualization import visualize_start_end_trajectory

for i, traj in enumerate(trajs):
    
        print(i)
visualize_start_end_trajectory(trajs[2])

4
6
7
37
39
41
42
44
45
46
78
106
111
142


In [3]:
preprocessed = list()
for traj in trajs:
    traj_new = time_duplication_filter(traj)
    traj_new = speed_filter_abs(traj_new, 300, in_kmh=True)
    preprocessed.append(traj_new)


In [5]:
def staypoints_geolife(traj):
    time_thresh = 30*60
    dist_thresh = 200

    staypoints = list()
    i, i_max = 0, len(traj)
    while i < i_max:
        j = i+1
        token = 0
        while j < i_max:
            dist = haversine_distance(traj[i], traj[j])
            if dist > dist_thresh:
                delta_time = traj[j].datetime - traj[i].datetime
                if delta_time.total_seconds() > time_thresh:
                    mean_point = np.mean([[p.lat, p.lon] for p in traj[i:j+1]], axis=0)
                    arrival_time = traj[i].datetime
                    leave_time = traj[j].datetime
                    staypoints.append([mean_point, arrival_time, leave_time, i, j])
                    i = j
                    token = 1
                break
            j = j+1
        if not token == 1:
            i = i+1

    
    return staypoints

In [6]:
len(trajs)

144

In [7]:
def make_df(trajs):
    data = list()
    for traj in trajs:
        if haversine_distance(traj[0], traj[-1]) < 100:
            continue
            
        fp = traj[0]
        sps = staypoints_geolife(traj)
        lp = traj[-1]               
        
        if len(sps) > 1:
            data.append([fp.lat, fp.lon, fp.datetime, sps[0][0][0], sps[0][0][1], sps[0][1]])
            for i in range(1, len(sps)-1):
                data.append([sps[i][0][0], sps[i][0][1], sps[i][1], sps[i+1][0][0], sps[i+1][0][1], sps[i+1][2]])
            data.append([sps[-1][0][0], sps[-1][0][1], sps[-1][2], lp.lat, lp.lon, lp.datetime])
        else:
            data.append([fp.lat, fp.lon, fp.datetime, lp.lat, lp.lon, lp.datetime])
        
        
    df = pd.DataFrame(data, columns=['start_lat','start_lon','start_date','end_lat','end_lon','end_date'])
    df = df.set_index(pd.DatetimeIndex(df['start_date'])).sort_index()
    return df

In [8]:
df = make_df(preprocessed)
df


Unnamed: 0,start_lat,start_lon,start_date,end_lat,end_lon,end_date
2008-09-27 21:08:19,39.975651,116.329742,2008-09-27 21:08:19,39.959160,116.418816,2008-09-27 21:51:09
2008-09-28 08:38:05,39.967533,116.418808,2008-09-28 08:38:05,39.976669,116.331299,2008-09-28 09:13:43
2008-09-28 13:58:50,39.977264,116.308876,2008-09-28 13:58:50,39.977493,116.307999,2008-09-28 15:14:22
2008-09-28 15:14:22,39.977493,116.307999,2008-09-28 15:14:22,39.647343,118.161087,2008-09-28 18:50:26
2008-09-29 09:14:36,39.647022,118.161980,2008-09-29 09:14:36,39.644073,118.167107,2008-09-29 11:53:27
2008-09-29 18:02:06,39.649311,118.162544,2008-09-29 18:02:06,39.647366,118.160950,2008-09-29 22:56:42
2008-09-30 07:43:13,39.647526,118.164955,2008-09-30 07:43:13,39.648209,118.162315,2008-09-30 08:33:40
2008-09-30 10:33:52,39.644432,118.165726,2008-09-30 10:33:52,39.647057,118.161057,2008-09-30 10:49:10
2008-10-06 18:24:37,39.975548,116.330566,2008-10-06 18:24:37,39.974533,116.341972,2008-10-06 18:28:26
2008-10-06 18:28:28,39.975384,116.338562,2008-10-06 18:28:28,39.954704,116.423004,2008-10-06 21:08:01


In [9]:
from sklearn.cluster import dbscan
from sklearn.cluster import k_means
from sklearn.neighbors import DistanceMetric
from tpm.data_model import R
from collections import Counter

In [36]:
import math
import json

class Point:
    
    def __init__(self, latitude, longitude):
        
        self.latitude = latitude
        self.longitude = longitude
        self.cd = None              # core distance
        self.rd = None              # reachability distance
        self.processed = False      # has this point been processed?
        
    # --------------------------------------------------------------------------
    # calculate the distance between any two points on earth
    # --------------------------------------------------------------------------
    
    def distance(self, point):
        
        # convert coordinates to radians
        
        p1_lat = math.radians(self.latitude)
        p1_lon = math.radians(self.longitude) 
        p2_lat = math.radians(point.latitude) 
        p2_lon = math.radians(point.longitude)
        
        numerator = math.sqrt(
            math.pow(math.cos(p2_lat) * math.sin(p2_lon - p1_lon), 2) +
            math.pow(
                math.cos(p1_lat) * math.sin(p2_lat) -
                math.sin(p1_lat) * math.cos(p2_lat) *
                math.cos(p2_lon - p1_lon), 2))

        denominator = (
            math.sin(p1_lat) * math.sin(p2_lat) +
            math.cos(p1_lat) * math.cos(p2_lat) *
            math.cos(p2_lon - p1_lon))
        
        # convert distance from radians to meters
        # note: earth's radius ~ 6372800 meters
        
        return math.atan2(numerator, denominator) * 6372800
        
    # --------------------------------------------------------------------------
    # point as GeoJSON
    # --------------------------------------------------------------------------
        
    def to_geo_json_dict(self, properties=None):
        
        return {
            'type': 'Feature',
            'geometry': {
                'type': 'Point',
                'coordinates': [
                    self.longitude,
                    self.latitude,
                ]
            },
            'properties': properties,
        }
 
    def __repr__(self):
        return '(%f, %f)' % (self.latitude, self.longitude)
    
    def __eq__(self, other): 
        if self.latitude == other.latitude and self.longitude == other.longitude:
            return True
        return False

################################################################################
# CLUSTER
################################################################################

class Cluster:
    
    def __init__(self, points):
        
        self.points = points
        
    # --------------------------------------------------------------------------
    # calculate the centroid for the cluster
    # --------------------------------------------------------------------------

    def centroid(self):
        
        return Point(sum([p.latitude for p in self.points])/len(self.points),
            sum([p.longitude for p in self.points])/len(self.points))
            
    # --------------------------------------------------------------------------
    # calculate the region (centroid, bounding radius) for the cluster
    # --------------------------------------------------------------------------
    
    def region(self):
        
        centroid = self.centroid()
        radius = reduce(lambda r, p: max(r, p.distance(centroid)), self.points)
        return centroid, radius
        
    # --------------------------------------------------------------------------
    # cluster as GeoJSON
    # --------------------------------------------------------------------------
        
    def to_geo_json_dict(self, user_properties=None):
        
        center, radius = self.region()
        properties = { 'radius': radius }
        if user_properties: properties.update(user_properties)
        
        return {
            'type': 'Feature',
            'geometry': {
                'type': 'Point',
                'coordinates': [
                    center.longitude,
                    center.latitude,
                ]
            },
            'properties': properties,
        }

################################################################################
# OPTICS
################################################################################

class Optics:
    
    def __init__(self, points, max_radius, min_cluster_size):
        
        self.points = points
        self.max_radius = max_radius                # maximum radius to consider
        self.min_cluster_size = min_cluster_size    # minimum points in cluster
    
    # --------------------------------------------------------------------------
    # get ready for a clustering run
    # --------------------------------------------------------------------------
    
    def _setup(self):
        
        for p in self.points:
            p.rd = None
            p.processed = False
        self.unprocessed = [p for p in self.points]
        self.ordered = []

    # --------------------------------------------------------------------------
    # distance from a point to its nth neighbor (n = min_cluser_size)
    # --------------------------------------------------------------------------
    
    def _core_distance(self, point, neighbors):

        if point.cd is not None: return point.cd
        if len(neighbors) >= self.min_cluster_size - 1:
            sorted_neighbors = sorted([n.distance(point) for n in neighbors])
            point.cd = sorted_neighbors[self.min_cluster_size - 2]
            return point.cd
        
    # --------------------------------------------------------------------------
    # neighbors for a point within max_radius
    # --------------------------------------------------------------------------
    
    def _neighbors(self, point):
        
        return [p for p in self.points if p is not point and
            p.distance(point) <= self.max_radius]
            
    # --------------------------------------------------------------------------
    # mark a point as processed
    # --------------------------------------------------------------------------
        
    def _processed(self, point):
    
        point.processed = True
        self.unprocessed.remove(point)
        self.ordered.append(point)
    
    # --------------------------------------------------------------------------
    # update seeds if a smaller reachability distance is found
    # --------------------------------------------------------------------------

    def _update(self, neighbors, point, seeds):
        
        # for each of point's unprocessed neighbors n...

        for n in [n for n in neighbors if not n.processed]:
            
            # find new reachability distance new_rd
            # if rd is null, keep new_rd and add n to the seed list
            # otherwise if new_rd < old rd, update rd
            
            new_rd = max(point.cd, point.distance(n))
            if n.rd is None:
                n.rd = new_rd
                seeds.append(n)
            elif new_rd < n.rd:
                n.rd = new_rd
    
    # --------------------------------------------------------------------------
    # run the OPTICS algorithm
    # --------------------------------------------------------------------------

    def run(self):
        
        self._setup()
        
        # for each unprocessed point (p)...
        
        while self.unprocessed:
            point = self.unprocessed[0]
            
            # mark p as processed
            # find p's neighbors
            
            self._processed(point)
            point_neighbors = self._neighbors(point)

            # if p has a core_distance, i.e has min_cluster_size - 1 neighbors

            if self._core_distance(point, point_neighbors) is not None:
                
                # update reachability_distance for each unprocessed neighbor
                
                seeds = []
                self._update(point_neighbors, point, seeds)
                
                # as long as we have unprocessed neighbors...
                
                while(seeds):
                    
                    # find the neighbor n with smallest reachability distance
                    
                    seeds.sort(key=lambda n: n.rd)
                    n = seeds.pop(0)
                    
                    # mark n as processed
                    # find n's neighbors
                    
                    self._processed(n)
                    n_neighbors = self._neighbors(n)
                    
                    # if p has a core_distance...
                    
                    if self._core_distance(n, n_neighbors) is not None:
                        
                        # update reachability_distance for each of n's neighbors
                        
                        self._update(n_neighbors, n, seeds)
                        
        # when all points have been processed
        # return the ordered list

        return self.ordered
        
    # --------------------------------------------------------------------------
    
    def cluster(self, cluster_threshold):
        
        clusters = []
        separators = []

        for i in range(len(self.ordered)):
            this_i = i
            next_i = i + 1
            this_p = self.ordered[i]
            this_rd = this_p.rd if this_p.rd else float('infinity')
            
            # use an upper limit to separate the clusters
            
            if this_rd > cluster_threshold:
                separators.append(this_i)

        separators.append(len(self.ordered))

        for i in range(len(separators) - 1):
            start = separators[i]
            end = separators[i + 1]
            if end - start >= self.min_cluster_size:
                clusters.append(Cluster(self.ordered[start:end]))

        return clusters



In [55]:
from collections import Counter

In [132]:
# LOAD SOME POINTS
def cluster_into_spots(df):
    
    start_points = list()
    end_points = list()
    length = len(df)
    print(length)

    for i in range(length):
        
        start_points.append(Point(df['start_lat'].iloc[i], df['start_lon'].iloc[i]))
        end_points.append(Point(df['end_lat'].iloc[i], df['end_lon'].iloc[i]))
        
    points = start_points.copy()
    points.extend(end_points)

    optics = Optics(points, 200, 2) # 100m radius for neighbor consideration, cluster size >= 2 points
    optics.run()
    clusters = optics.cluster(150)   # 50m threshold for clustering
    
    
    start_clusters = list()
    end_clusters = list()
    
    for i, p in enumerate(start_points):
        assigned = False
        for j, cluster in enumerate(clusters):
            if p in cluster.points:
                start_clusters.append(j)
                assigned = True
                break
                
        if not assigned:
            start_clusters.append(-1)
        
    
    for i, p in enumerate(end_points):
        assigned = False
        for j, cluster in enumerate(clusters):
            if p in cluster.points:
                end_clusters.append(j)
                assigned = True
                break
        
        if not assigned:
            end_clusters.append(-1)
        
    
    
    
    for i, cid in enumerate(start_clusters):
        if cid == -1:
            point = Point(df['start_lat'].iloc[i], df['start_lon'].iloc[i])
            nearest_cluster = -1
            min_dist = float('infinity')
            for j in range(length):
                if i == j:
                    continue
                    
                sc = Point(df['start_lat'].iloc[j], df['start_lon'].iloc[j])
                #print(point.distance(sc))
                
        

    
    df['start_cluster'] = start_clusters
    print(Counter(start_clusters))
    df['end_cluster'] = end_clusters
    print(Counter(end_clusters))
    return df
            
        
    
df = cluster_into_spots(df)
df

counter = 0
for i in range(1,len(df)):
    ec = df['end_cluster'].iloc[i-1]
    sc = df['start_cluster'].iloc[i]
    if ec == sc:
        counter += 1
        
print(counter, len(df))

157
Counter({10: 40, -1: 35, 0: 21, 2: 20, 3: 5, 17: 5, 16: 4, 19: 4, 11: 3, 13: 3, 4: 2, 5: 2, 9: 2, 12: 2, 15: 2, 1: 1, 6: 1, 7: 1, 8: 1, 14: 1, 18: 1, 21: 1})
Counter({10: 48, 2: 43, -1: 25, 12: 9, 11: 6, 17: 6, 20: 5, 6: 3, 8: 2, 16: 2, 0: 1, 1: 1, 5: 1, 7: 1, 13: 1, 14: 1, 18: 1, 21: 1})
69 157


In [133]:
visualize_cluster(df)

In [90]:
def agglomerative_cluster_into_spots(df, min_cluster, max_cluster):
    start_points = list()
    end_points = list()
    length = len(df)

    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])

    points = np.radians(np.vstack([start_points, end_points]))

    sil_scores = list()
    for i in range(min_cluster, max_cluster):
        ac = AgglomerativeClustering(n_clusters=i)
        pred = ac.fit_predict(points)
        sil_score = silhouette_score(points, pred)
        sil_scores.append(sil_score)

    
    n_cluster = np.argmax(sil_scores) + min_cluster
    ac = AgglomerativeClustering(n_clusters=n_cluster)
    clusters = ac.fit_predict(points)
    
    
    start_clusters = list()
    end_clusters = list()
    
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i%length+length])

        

    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [90]:
def kmeans_cluster_into_spots(df, min_cluster, max_cluster):
    start_points = list()
    end_points = list()
    length = len(df)

    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])

    points = np.radians(np.vstack([start_points, end_points]))

    sil_scores = list()
    for i in range(min_cluster, max_cluster):
        ac = KMeans(n_clusters=i)
        pred = ac.fit_predict(points)
        sil_score = silhouette_score(points, pred)
        sil_scores.append(sil_score)

    
    n_cluster = np.argmax(sil_scores) + min_cluster
    ac = KMeans(n_clusters=n_cluster, n_jobs=-1)
    clusters = ac.fit_predict(points)
    
    
    start_clusters = list()
    end_clusters = list()
    
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i%length+length])

        

    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [113]:
def meanshift_cluster_into_spots(df):
    start_points = list()
    end_points = list()
    length = len(df)

    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])

    points = np.radians(np.vstack([start_points, end_points]))


    ac = MeanShift(n_jobs=-1)
    print('start clustering')
    clusters = ac.fit_predict(points)
    print('done clustering')
    
    start_clusters = list()
    end_clusters = list()
    
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i%length+length])

        

    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [84]:
n_cluster = np.argmax(sil_scores) +100
ac = AgglomerativeClustering(n_clusters=n_cluster)
pred = ac.fit_predict(points)
points_deg = np.rad2deg(points)
visualize_cluster(points_deg, pred)

In [85]:
Counter(pred)

Counter({0: 8,
         1: 14,
         2: 12,
         3: 18,
         4: 21,
         5: 623,
         6: 121,
         7: 257,
         8: 6,
         9: 61,
         10: 15,
         11: 7,
         12: 3,
         13: 14,
         14: 4,
         15: 18,
         16: 12,
         17: 9,
         18: 4,
         19: 46,
         20: 16,
         21: 3,
         22: 7,
         23: 16,
         24: 23,
         25: 74,
         26: 4,
         27: 5,
         28: 25,
         29: 4,
         30: 2,
         31: 3,
         32: 4,
         33: 64,
         34: 15,
         35: 6,
         36: 12,
         37: 2,
         38: 16,
         39: 26,
         40: 3,
         41: 29,
         42: 2,
         43: 3,
         44: 3,
         45: 6,
         46: 3,
         47: 14,
         48: 57,
         49: 3,
         50: 3,
         51: 2,
         52: 4,
         53: 4,
         54: 4,
         55: 1,
         56: 2,
         57: 1,
         58: 2,
         59: 9,
         60: 3,
     

In [16]:
from matplotlib.colors import cnames
def visualize_cluster(df):
    colors = [hexc for hexc in cnames.values()]
    map_lat, map_lon = df.iloc[0].start_lat, df.iloc[0].start_lon
    map_osm = folium.Map(location=[map_lat, map_lon])

    for i, row in df.iterrows():
        tup = (row['start_lat'], row['start_lon'])
        start_c = row['start_cluster']
        marker = folium.CircleMarker(tup, color=colors[start_c], fill_color=colors[start_c], radius=50, fill_opacity=1)
        map_osm.add_children(marker)
        
        tup = (row['end_lat'], row['end_lon'])
        end_c = row['end_cluster']
        marker = folium.CircleMarker(tup, color=colors[end_c], fill_color=colors[end_c], radius=50, fill_opacity=1)
        map_osm.add_children(marker)

    return map_osm

In [111]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import dbscan
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.neighbors import DistanceMetric

from tpm.util.dist import haversine_distance
from tpm.data_model import R

from collections import Counter

import numpy as np
import pandas as pd

In [39]:
import hdbscan
from collections import Counter
from sklearn.preprocessing import StandardScaler

In [51]:
def cluster_into_spots(df):
    start_points = list()
    end_points = list()
    length = len(df)

    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])

    points = np.radians(np.vstack([start_points, end_points]))

    haversine = DistanceMetric.get_metric('haversine')
    dist = haversine.pairwise(points) * R
    
    dist = StandardScaler().fit_transform(dist)
    
    clusterer = hdbscan.HDBSCAN(min_cluster_size=2, metric='precomputed')
    clusters = clusterer.fit_predict(dist)

    start_clusters = list()
    end_clusters = list()
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i % length + length])

    print(Counter(start_clusters))        
    
    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [52]:
df = cluster_into_spots(df)

counter = 0

for i in range(1,len(df)):
    ec = df['end_cluster'].iloc[i-1]
    sc = df['start_cluster'].iloc[i]
    if ec == sc:
        counter += 1
        
    
        
print(counter, len(df))

Counter({0: 142, 1: 14})
152 156


In [53]:
visualize_cluster(df)

In [91]:
df = agglomerative_cluster_into_spots(df,100,120)

counter = 0

for i in range(1,len(df)):
    ec = df['end_cluster'].iloc[i-1]
    sc = df['start_cluster'].iloc[i]
    if ec == sc:
        counter += 1
        
print(counter, len(df))

719 946


In [94]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [96]:
df = kmeans_cluster_into_spots(df,10,50)

counter = 0

for i in range(1,len(df)):
    ec = df['end_cluster'].iloc[i-1]
    sc = df['start_cluster'].iloc[i]
    if ec == sc:
        counter += 1
        
print(counter, len(df))

147 177


In [114]:
df = meanshift_cluster_into_spots(df)

counter = 0

for i in range(1,len(df)):
    ec = df['end_cluster'].iloc[i-1]
    sc = df['start_cluster'].iloc[i]
    if ec == sc:
        counter += 1
        
print(counter, len(df))

start clustering


Process ForkPoolWorker-104:
Process ForkPoolWorker-107:
  File "/home/christian/anaconda3/envs/mt/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Process ForkPoolWorker-108:
Process ForkPoolWorker-105:
Process ForkPoolWorker-100:
Process ForkPoolWorker-106:
Process ForkPoolWorker-101:
Process ForkPoolWorker-103:
Process ForkPoolWorker-98:
Process ForkPoolWorker-99:
Traceback (most recent call last):
Process ForkPoolWorker-102:
Traceback (most recent call last):
  File "/home/christian/anaconda3/envs/mt/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/christian/anaconda3/envs/mt/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/christian/anaconda3/envs/mt/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):


KeyboardInterrupt: 

In [122]:
visualize_cluster(df)

In [99]:
df

Unnamed: 0,start_lat,start_lon,start_date,end_lat,end_lon,end_date,start_cluster,end_cluster
2008-10-24 02:58:54,39.999844,116.326752,2008-10-24 02:58:54,40.007736,116.318764,2008-10-24 03:16:29,0,20
2008-10-24 11:02:27,40.007732,116.319717,2008-10-24 11:02:27,40.000404,116.327118,2008-10-24 11:08:42,20,0
2008-10-24 12:55:30,40.005718,116.325012,2008-10-24 12:55:30,40.007076,116.319946,2008-10-24 16:08:14,20,20
2008-10-24 14:15:50,40.007076,116.319946,2008-10-24 14:15:50,39.999592,116.326836,2008-10-24 18:28:23,20,0
2008-10-24 16:11:44,39.999592,116.326836,2008-10-24 16:11:44,39.993462,116.326736,2008-10-24 19:07:40,0,48
2008-10-24 18:31:08,39.993462,116.326736,2008-10-24 18:31:08,39.999798,116.327133,2008-10-24 20:08:23,48,0
2008-10-24 20:08:23,39.999798,116.327133,2008-10-24 20:08:23,40.000122,116.327438,2008-10-24 21:08:47,0,0
2008-10-25 04:29:54,40.000534,116.327370,2008-10-25 04:29:54,40.007759,116.319725,2008-10-25 04:42:39,0,20
2008-10-25 11:04:35,40.000717,116.321342,2008-10-25 11:04:35,40.003887,116.322250,2008-10-25 14:04:27,0,20
2008-10-25 12:44:46,40.003887,116.322250,2008-10-25 12:44:46,39.980862,116.324638,2008-10-25 19:42:23,20,51


In [12]:
def visualize_rows(rows):
    map_lat, map_lon = rows.iloc[0].start_lat, rows.iloc[0].start_lon
    map_osm = folium.Map(location=[map_lat, map_lon])

    for i, row in rows.iterrows():
        tup = (row['start_lat'], row['start_lon'])
        marker = folium.Marker(tup, icon=folium.Icon(color='green'), popup='{} {}'.format(i, row['start_cluster']))
        map_osm.add_children(marker)
        tup = (row['end_lat'], row['end_lon'])
        marker = folium.Marker(tup, icon=folium.Icon(color='red'), popup='{} {}'.format(i, row['end_cluster']))
        map_osm.add_children(marker)

    return map_osm

In [30]:
visualize_rows(df.iloc[:-1])

In [80]:
ts = df['end_date'].iloc[-1] - df['start_date'].iloc[0]
ts.days/7

22.571428571428573

In [98]:
duration = timedelta(days=7*15)
train_start = df['start_date'].iloc[0]
train_end = train_start + duration
train = df[train_start:train_end]
test = df[train_end:]

In [99]:
dates = list()
for i in test.index:
    if len(dates) > 1:
        if dates[-1] == (i.month, i.day):
            continue
    dates.append((i.month, i.day))

In [100]:
from masterthesis.models import BayesWeekdayEstimator
from masterthesis.models import FrequentistEstimator
import operator

In [123]:
bwe = FrequentistEstimator()

bwe = bwe.fit(train)

counter_no_prob = 0

for date in dates:
    month, day = date
    x = pd.DataFrame(data=[[49.475752, 8.482531]],index=pd.DatetimeIndex([pd.Timestamp("2008-{}-{} 19:45:21".format(month,day))]), columns=['lat', 'lon'])
    print(x)
    pred = bwe.predict_proba(x)
    
    sorted_pred = sorted(pred.items(), key=operator.itemgetter(1), reverse=True)
    
    for i, row in df.loc[(df.index.month==month) & (df.index.day==day)].iterrows():
        print((row['start_cluster'], row['end_cluster']))
        if (row['start_cluster'], row['end_cluster']) in pred.keys():
            print(pred[(row['start_cluster'], row['end_cluster'])])
            for i, s_pred in enumerate(sorted_pred):
                if s_pred[0] == (row['start_cluster'], row['end_cluster']):
                    print('Ranked:', i+1, 'of total', len(pred), 'predictions')
        else:
            print("no prob")
            counter_no_prob += 1
            
    
print('Number of no prob:', counter_no_prob)
print('Total predictions:', len(test))

                           lat       lon
2008-01-13 19:45:21  49.475752  8.482531
(1, 1)
no prob
                           lat       lon
2008-01-14 19:45:21  49.475752  8.482531
(1, 0)
0.03225806451612903
Ranked: 9 of total 32 predictions
(0, 1)
no prob
                           lat       lon
2008-01-15 19:45:21  49.475752  8.482531
(0, 12)
0.016129032258064516
Ranked: 14 of total 32 predictions
(1, 1)
no prob
                           lat       lon
2008-01-16 19:45:21  49.475752  8.482531
(1, 0)
0.03225806451612903
Ranked: 9 of total 32 predictions
(23, 1)
no prob
                           lat       lon
2008-01-17 19:45:21  49.475752  8.482531
(1, 1)
no prob
(1, 1)
no prob
                           lat       lon
2008-01-18 19:45:21  49.475752  8.482531
(1, 1)
no prob
                           lat       lon
2008-01-19 19:45:21  49.475752  8.482531
(1, 0)
0.03225806451612903
Ranked: 9 of total 32 predictions
(1, 1)
no prob
                           lat       lon
2008-01-20 19:45:

In [130]:
from masterthesis.preprocessing import DenseDepartureTimes
from masterthesis.models import BayesDepartureTimeEstimator
from sklearn.base import BaseEstimator
from sklearn.cluster import dbscan

In [182]:
class BayesDepartureTimeEstimator(BaseEstimator):
    def fit(self, X, y=None):
        self.data_ = X
        return self

    def partial_fit(self, X):
        # stack data to present data
        pass

    def predict_proba(self, x):
        length = len(self.data_)
        start_end = [(start, end) for start, end in zip(self.data_['start_cluster'], self.data_['end_cluster'])]
        priors = {str(k): v / length for k, v in Counter(start_end).items()}

        dayofweek = x.index.dayofweek
        
        p_ba = [row['start_time_cluster'] for index, row in self.data_.iterrows() if
                index.dayofweek == dayofweek]
        p_ba = {k: v / len(p_ba) for k, v in Counter(p_ba).items()}
        
        res = {key: priors[key.split('_')[0]] * p_ba[key] / (1 / 7) for key in p_ba}

        return res
    
        
    def resolve_start_time_cluster(self, stc):
        stc_df = self.data_[self.data_['start_time_cluster'] == stc]
        
        return min(stc_df.index.time), max(stc_df.index.time)

In [211]:
from datetime import datetime
from datetime import date
from datetime import time

In [213]:
def _time_to_degree(time):
    return ((time.hour + (time.minute + (time.second / 60)) / 60) / 24) * 360

def _time_distance(t1, t2):
    circumference = 2 * np.pi
    return (np.abs(_time_to_degree(t1) - _time_to_degree(t2))) * (circumference / 360)

_time_distance(time(6,44,22), time(7))

0.068213284932111679

In [219]:
ddf = DenseDepartureTimes(0.03)
train_ddt = ddf.fit_transform(train.copy())
bdte = BayesDepartureTimeEstimator()
bdte = bdte.fit(train_ddt)

counter_no_prob = 0

for i, row in test.iterrows():
    x = pd.DataFrame(data=[[row['start_lat'], row['start_lon']]], index=[i], columns=['lat', 'lon'])
    pred = bdte.predict_proba(x)
    sorted_pred = sorted(pred.items(), key=operator.itemgetter(1), reverse=True)
    for key in pred.keys():
        time_wa, time_wb = bdte.resolve_start_time_cluster(key)
        dummydate = date(1970, 1, 1)
        #time_wa = (datetime.combine(dummydate,time_wa)-timedelta(hours=1)).time()
        #time_wb = (datetime.combine(dummydate,time_wb)+timedelta(hours=1)).time()
        if len(x.between_time(time_wa, time_wb)) == 1:
            
            
            if str((row['start_cluster'], row['end_cluster'])) == key.split('_')[0]:
                print(x)
                print(key)
                print(time_wa, time_wb)
                print(row['start_cluster'], row['end_cluster'])
                for i, s_pred in enumerate(sorted_pred):
                    if s_pred[0] == key:
                        print('Ranked:', i+1, 'of total', len(pred), 'predictions')
                counter_no_prob += 1
        
    
print('Number of no prob:', counter_no_prob)
print('Total predictions:', len(test))

                           lat         lon
2009-05-24 18:45:57  39.999981  116.328804
(0, 0)_4
18:22:23 19:17:33
0 0
Ranked: 3 of total 84 predictions
                           lat         lon
2009-05-27 02:07:06  39.999573  116.326935
(0, 23)_7
01:54:39 02:47:27
0 23
Ranked: 2 of total 86 predictions
                           lat         lon
2009-05-31 02:39:50  39.999756  116.326828
(0, 23)_7
01:54:39 02:47:27
0 23
Ranked: 2 of total 84 predictions
                          lat         lon
2009-06-02 14:29:55  40.00779  116.321648
(23, 0)_1
13:51:54 14:43:36
23 0
Ranked: 5 of total 97 predictions
                           lat         lon
2009-06-02 18:52:16  40.001148  116.326561
(0, 0)_4
18:22:23 19:17:33
0 0
Ranked: 6 of total 97 predictions
                           lat         lon
2009-06-03 03:53:07  40.000717  116.327827
(0, 23)_4
03:10:31 03:54:37
0 23
Ranked: 5 of total 86 predictions
                          lat         lon
2009-06-05 19:09:03  40.00325  116.324341
(0, 

In [116]:
df_new = df.copy()
df_new[(df_new['start_cluster'] >= 0) & (df_new['end_cluster'] >=0)]

Unnamed: 0,start_lat,start_lon,start_date,end_lat,end_lon,end_date,start_cluster,end_cluster
2008-09-27 21:08:19,39.975651,116.329742,2008-09-27 21:08:19,39.959160,116.418816,2008-09-27 21:51:09,0,4
2008-09-28 08:38:05,39.967533,116.418808,2008-09-28 08:38:05,39.976456,116.331406,2008-09-28 09:12:55,1,0
2008-10-02 12:53:58,39.509480,118.664230,2008-10-02 12:53:58,39.509762,118.664429,2008-10-02 17:50:36,2,2
2008-10-07 08:23:59,39.960037,116.418610,2008-10-07 08:23:59,39.977654,116.331406,2008-10-07 09:19:04,3,0
2008-10-07 18:29:00,39.975445,116.330078,2008-10-07 18:29:00,39.955845,116.419044,2008-10-07 19:09:35,0,11
2008-10-08 18:18:28,39.975906,116.329582,2008-10-08 18:18:28,39.954632,116.423004,2008-10-08 19:07:16,0,5
2008-10-09 08:22:07,39.967499,116.418686,2008-10-09 08:22:07,39.977497,116.330711,2008-10-09 09:00:53,1,0
2008-10-10 18:37:59,39.975304,116.330765,2008-10-10 18:37:59,39.954796,116.422966,2008-10-10 19:37:49,0,5
2008-10-13 08:25:54,39.959026,116.418800,2008-10-13 08:25:54,39.976521,116.331627,2008-10-13 09:08:35,4,0
2008-10-14 08:20:07,39.959084,116.418655,2008-10-14 08:20:07,39.977467,116.330818,2008-10-14 09:03:11,4,0
