In [1]:
from tpm.data_model import *
from tpm.util.io import read_geolife
from tpm.util.dist import haversine_distance
from tpm.preprocessing import time_duplication_filter
from tpm.preprocessing import speed_filter_abs
import numpy as np
import pandas as pd
import folium
from datetime import timedelta

In [2]:
trajs = read_geolife('/mnt/hdd1/christian/data/geotracking/Geolife Trajectories 1.3/Data/002/Trajectory')


In [3]:
preprocessed = list()
for traj in trajs:
    traj_new = time_duplication_filter(traj)
    traj_new = speed_filter_abs(traj_new, 300, in_kmh=True)
    preprocessed.append(traj_new)


In [4]:
def staypoints_geolife(traj):
    time_thresh = 30*60
    dist_thresh = 250

    staypoints = list()
    i, i_max = 0, len(traj)
    while i < i_max:
        j = i+1
        token = 0
        while j < i_max:
            dist = haversine_distance(traj[i], traj[j])
            if dist > dist_thresh:
                delta_time = traj[j].datetime - traj[i].datetime
                if delta_time.total_seconds() > time_thresh:
                    mean_point = np.mean([[p.lat, p.lon] for p in traj[i:j+1]], axis=0)
                    arrival_time = traj[i].datetime
                    leave_time = traj[j].datetime
                    staypoints.append([mean_point, arrival_time, leave_time, i, j])
                    i = j
                    token = 1
                break
            j = j+1
        if not token == 1:
            i = i+1

    
    return staypoints

In [5]:
len(trajs)

175

In [6]:
def make_df(trajs):
    data = list()
    for traj in trajs:
        fp = traj[0]
        sps = staypoints_geolife(traj)
        lp = traj[-1]               
        
        if len(sps) > 1:
            data.append([fp.lat, fp.lon, fp.datetime, sps[0][0][0], sps[0][0][1], sps[0][1]])
            for i in range(1, len(sps)-1):
                data.append([sps[i][0][0], sps[i][0][1], sps[i][1], sps[i+1][0][0], sps[i+1][0][1], sps[i+1][2]])
            data.append([sps[-1][0][0], sps[-1][0][1], sps[-1][2], lp.lat, lp.lon, lp.datetime])
        else:
            data.append([fp.lat, fp.lon, fp.datetime, lp.lat, lp.lon, lp.datetime])
        
        
    df = pd.DataFrame(data, columns=['start_lat','start_lon','start_date','end_lat','end_lon','end_date'])
    df = df.set_index(pd.DatetimeIndex(df['start_date'])).sort_index()
    return df

In [7]:
df = make_df(preprocessed)
df


Unnamed: 0,start_lat,start_lon,start_date,end_lat,end_lon,end_date
2008-10-23 21:45:23,39.927937,116.338966,2008-10-23 21:45:23,39.926289,116.338585,2008-10-24 01:44:22
2008-10-24 09:08:05,39.926975,116.336418,2008-10-24 09:08:05,39.900803,116.386566,2008-10-24 09:51:30
2008-10-24 12:47:08,39.899078,116.380188,2008-10-24 12:47:08,39.900440,116.385948,2008-10-24 14:01:38
2008-10-24 13:18:29,39.900440,116.385948,2008-10-24 13:18:29,39.900612,116.386749,2008-10-24 14:31:56
2008-10-24 14:01:38,39.900612,116.386749,2008-10-24 14:01:38,39.900688,116.386703,2008-10-24 20:11:23
2008-10-24 14:32:00,39.900688,116.386703,2008-10-24 14:32:00,39.966335,116.321312,2008-10-24 22:09:19
2008-10-24 20:31:39,39.966335,116.321312,2008-10-24 20:31:39,39.926849,116.337265,2008-10-24 23:46:35
2008-10-24 23:11:12,39.926849,116.337265,2008-10-24 23:11:12,39.926193,116.337631,2008-10-25 00:20:18
2008-10-24 23:46:35,39.926193,116.337631,2008-10-24 23:46:35,39.926239,116.337364,2008-10-25 01:46:18
2008-10-25 00:20:30,39.926239,116.337364,2008-10-25 00:20:30,39.926228,116.338188,2008-10-25 02:27:56


In [8]:
from sklearn.cluster import dbscan
from sklearn.neighbors import DistanceMetric
from tpm.data_model import R
from collections import Counter

In [9]:
def haversine_distance(p1_lat, p1_lon, p2_lat, p2_lon):
    lat_rad1 = radians(p1_lat)
    lon_rad1 = radians(p1_lon)
    lat_rad2 = radians(p2_lat)
    lon_rad2 = radians(p2_lon)
    return 2*R * asin(sqrt(sin((lat_rad2-lat_rad1)/2)**2 + cos(lat_rad1)*cos(lat_rad2)*(sin((lon_rad2-lon_rad1)/2)**2)))

In [10]:
def cluster_into_spots(df, init_eps=150, levels=2, threshold=0.1):
    start_points = list()
    end_points = list()
    length = len(df)
    
    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])
    
    points = np.radians(np.vstack([start_points, end_points]))
    
    haversine = DistanceMetric.get_metric('haversine')
    dist = haversine.pairwise(points)*R

    clusters = dbscan(dist, metric='precomputed', min_samples=1,eps=init_eps)[1]
    clusters = np.array(clusters, dtype=np.object)
    
    
    for _ in range(levels):
        init_eps = init_eps*0.6
        counts = dict(Counter(clusters))
        for key in counts:
            if counts[key] > threshold*length:
                idxs = np.where(clusters == key)[0]
                dist = haversine.pairwise(points[idxs])*R
                inner_clusters = dbscan(dist, metric='precomputed', min_samples=1,eps=init_eps)[1]
                for i, idx in enumerate(idxs):
                    
                    clusters[idx] = "{}_{}".format(clusters[idx], inner_clusters[i])
    
    start_clusters = list()
    end_clusters = list()
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i%length+length])
    
    print(len(dict(Counter(clusters)).keys()))
    
    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [31]:
from sklearn.cluster import MeanShift
def meanshift_cluster_into_spots(df):
    start_points = list()
    end_points = list()
    length = len(df)

    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])

    points = np.radians(np.vstack([start_points, end_points]))


    ac = MeanShift(n_jobs=-1, min_bin_freq=3)
    print('start clustering')
    clusters = ac.fit_predict(points)
    print('done clustering')
    
    start_clusters = list()
    end_clusters = list()
    
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i%length+length])

        

    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [34]:
df = cluster_into_spots(df, init_eps=300, levels=0, threshold=0.5)

counter = 0

for i in range(1,len(df)):
    ec = df['end_cluster'].iloc[i-1]
    sc = df['start_cluster'].iloc[i]
    if ec == sc:
        counter += 1
        
print(counter, len(df))

72
227 329


In [33]:
df = meanshift_cluster_into_spots(df.iloc[0:2])

counter = 0

for i in range(1,len(df)):
    ec = df['end_cluster'].iloc[i-1]
    sc = df['start_cluster'].iloc[i]
    if ec == sc:
        counter += 1
        
print(counter, len(df))

start clustering


Process ForkPoolWorker-79:
Process ForkPoolWorker-81:
Process ForkPoolWorker-88:
Process ForkPoolWorker-77:
Process ForkPoolWorker-87:
Process ForkPoolWorker-85:
Process ForkPoolWorker-83:
Process ForkPoolWorker-86:
Traceback (most recent call last):
Process ForkPoolWorker-82:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-84:
Process ForkPoolWorker-80:
Process ForkPoolWorker-78:
Traceback (most recent call last):
  File "/home/christian/anaconda3/envs/mt/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/christian/anaconda3/envs/mt/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/christian/anaconda3/envs/mt/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
 

KeyboardInterrupt: 

In [12]:
def visualize_rows(rows):
    map_lat, map_lon = rows.iloc[0].start_lat, rows.iloc[0].start_lon
    map_osm = folium.Map(location=[map_lat, map_lon])

    for i, row in rows.iterrows():
        tup = (row['start_lat'], row['start_lon'])
        marker = folium.Marker(tup, icon=folium.Icon(color='green'), popup='{} {}'.format(i, row['start_cluster']))
        map_osm.add_children(marker)
        tup = (row['end_lat'], row['end_lon'])
        marker = folium.Marker(tup, icon=folium.Icon(color='red'), popup='{} {}'.format(i, row['end_cluster']))
        map_osm.add_children(marker)

    return map_osm

In [13]:
visualize_rows(df.iloc[:-1])

In [14]:
ts = df['end_date'].iloc[-1] - df['start_date'].iloc[0]
ts.days/7

21.285714285714285

In [21]:
duration = timedelta(days=7*17)
train_start = df['start_date'].iloc[0]
train_end = train_start + duration
train = df[train_start:train_end]
test = df[train_end:]

In [22]:
dates = list()
for i in test.index:
    if len(dates) > 1:
        if dates[-1] == (i.month, i.day):
            continue
    dates.append((i.month, i.day))

In [23]:
from masterthesis.models import BayesWeekdayEstimator
from masterthesis.models import FrequentistEstimator
import operator

In [26]:
bwe = BayesWeekdayEstimator()

bwe = bwe.fit(train)

counter_no_prob = 0

for date in dates:
    month, day = date
    x = pd.DataFrame(data=[[49.475752, 8.482531]],index=pd.DatetimeIndex([pd.Timestamp("2008-{}-{} 19:45:21".format(month,day))]), columns=['lat', 'lon'])
    print(x)
    pred = bwe.predict_proba(x)
    
    sorted_pred = sorted(pred.items(), key=operator.itemgetter(1), reverse=True)
    
    for i, row in df.loc[(df.index.month==month) & (df.index.day==day)].iterrows():
        print((row['start_cluster'], row['end_cluster']))
        if (row['start_cluster'], row['end_cluster']) in pred.keys():
            print(pred[(row['start_cluster'], row['end_cluster'])])
            for i, s_pred in enumerate(sorted_pred):
                if s_pred[0] == (row['start_cluster'], row['end_cluster']):
                    print('Ranked:', i+1, 'of total', len(pred), 'predictions')
        else:
            print("no prob")
            counter_no_prob += 1
            
    
print('Number of no prob:', counter_no_prob)
print('Total predictions:', len(test))

                           lat       lon
2008-02-20 19:45:21  49.475752  8.482531
(1, 0)
0.1967021172797346
Ranked: 1 of total 15 predictions
                           lat       lon
2008-02-21 19:45:21  49.475752  8.482531
(0, 47)
no prob
(48, 0)
no prob
                           lat       lon
2008-02-22 19:45:21  49.475752  8.482531
(0, 0)
0.14139590854392298
Ranked: 3 of total 16 predictions
(13, 16)
no prob
(16, 0)
0.0018050541516245486
Ranked: 6 of total 16 predictions
                           lat       lon
2008-02-23 19:45:21  49.475752  8.482531
(1, 0)
0.02205447981621267
Ranked: 2 of total 38 predictions
                           lat       lon
2008-02-24 19:45:21  49.475752  8.482531
(1, 0)
0.029585277802236503
Ranked: 3 of total 20 predictions
                           lat       lon
2008-02-25 19:45:21  49.475752  8.482531
(1, 0)
0.4182746172040334
Ranked: 1 of total 11 predictions
                           lat       lon
2008-02-26 19:45:21  49.475752  8.482531
(1, 1)
0.

In [35]:
from matplotlib.colors import cnames
def visualize_cluster(df):
    colors = [hexc for hexc in cnames.values()]
    map_lat, map_lon = df.iloc[0].start_lat, df.iloc[0].start_lon
    map_osm = folium.Map(location=[map_lat, map_lon])

    for i, row in df.iterrows():
        tup = (row['start_lat'], row['start_lon'])
        start_c = row['start_cluster']
        if not isinstance(start_c, int) and '_' in start_c:
            start_c = start_c.split('_')[1]
        start_c = int(start_c)
        marker = folium.CircleMarker(tup, color=colors[start_c], fill_color=colors[start_c], radius=50, fill_opacity=1)
        map_osm.add_children(marker)
        
        tup = (row['end_lat'], row['end_lon'])
        end_c = row['end_cluster']
        if not isinstance(end_c, int) and'_' in end_c:
            end_c = end_c.split('_')[1]
        end_c = int(end_c)
        marker = folium.CircleMarker(tup, color=colors[end_c], fill_color=colors[end_c], radius=50, fill_opacity=1)
        map_osm.add_children(marker)

    return map_osm

In [36]:
visualize_cluster(df)