# Detect and cluster stop places and calculate each cluster's medoid based on the original GeoLife data set

In [1]:
import pandas as pd

import yaml

# Read the project configuration; it supplies the input and output locations.
# NOTE(review): FullLoader can build more than plain scalars/lists/dicts;
# if conf.yaml only holds simple keys, yaml.safe_load would be sufficient.
with open("conf.yaml") as conf_file:
    conf = yaml.load(conf_file, Loader=yaml.FullLoader)

# Both paths are used as string prefixes (path + file name) throughout the
# notebook, so they are expected to end with a path separator.
data_path = conf["data_path"]
out_path = conf["out_path"]

# Load only the columns this notebook needs and parse the timestamps up front.
cols = ["date_time", "lat", "lon", "uid"]
df = pd.read_csv(
    data_path+"complete_with_tids.csv",
    usecols=cols,
    parse_dates=["date_time"],
)

df.head()

Unnamed: 0,date_time,lat,lon,uid
0,2008-10-23 02:53:04,39.984702,116.318417,0
1,2008-10-23 02:53:10,39.984683,116.31845,0
2,2008-10-23 02:53:15,39.984686,116.318417,0
3,2008-10-23 02:53:20,39.984688,116.318385,0
4,2008-10-23 02:53:25,39.984655,116.318263,0


## We're interested in a subset comprising only Beijing coordinates in a specific timeframe
- definition of lat min/max and lon min/max
- definition of start_time and end_time

In [2]:
# Bounding box (in degrees) used to keep only points in the Beijing area.
# Series.between is inclusive on both ends by default.
lat_min = 39.54
lat_max = 40.3
lon_min = 115.75
lon_max = 117.13
in_beijing = df['lat'].between(lat_min, lat_max) & df['lon'].between(lon_min, lon_max)
df = df[in_beijing]

# Time window: June through August 2008 (not only June).
# Both bounds are exclusive, and end_time stops at 23:59:00, so the last
# minute of August 31 is left out.
start_time = "2008-06-01 00:00:00"
end_time = "2008-08-31 23:59:00"

in_window = (df.date_time > start_time) & (df.date_time < end_time)
df = df[in_window]

df.head()

Unnamed: 0,date_time,lat,lon,uid
2101262,2008-06-17 09:44:44,39.97606,116.310953,10
2101263,2008-06-17 09:44:45,39.976016,116.310973,10
2101264,2008-06-17 09:44:46,39.975973,116.311003,10
2101265,2008-06-17 09:44:47,39.975938,116.311025,10
2101266,2008-06-17 09:44:48,39.975906,116.311038,10


## Now we convert df to a TrajDataFrame and set the uid column to 0 in order to calculate and cluster stops for all the trajectories, independently of the user who recorded each of them

In [3]:
import skmob
from skmob.preprocessing import detection

# Wrap the filtered frame as a scikit-mobility TrajDataFrame, telling it
# which of our columns hold latitude, longitude, timestamp and user id.
tdf = skmob.TrajDataFrame(
    df,
    latitude='lat',
    longitude='lon',
    datetime='date_time',
    user_id='uid',
)

# Give every point the same user id so that stops are detected over the
# whole data set instead of separately per user.
tdf["uid"] = 0

# Detect stops with the parameters below; leaving_time=True adds the
# leaving_datetime column shown in the output.
stdf = detection.stops(
    tdf,
    stop_radius_factor=0.5,
    minutes_for_a_stop=60.0,
    spatial_radius_km=0.5,
    leaving_time=True,
)
stdf.head()

Unnamed: 0,datetime,lat,lng,uid,leaving_datetime
0,2008-06-01 14:07:46,39.984939,116.352704,0,2008-06-01 15:11:14
1,2008-06-01 15:24:04,39.984915,116.352534,0,2008-06-01 23:03:27
2,2008-06-02 01:48:56,39.97619,116.330269,0,2008-06-02 03:46:06
3,2008-06-02 15:21:05,39.971958,116.324943,0,2008-06-02 19:04:58
4,2008-06-02 19:22:02,39.987358,116.451792,0,2008-06-02 22:56:50


Now we show the stopping points on a folium map

In [4]:
import folium

# Base map centred on a point inside the Beijing bounding box.
map_center = [39.9042, 116.4074]
stopping_points = folium.Map(location=map_center, tiles="Stamen Toner")

# Draw the detected stops and keep a standalone HTML copy in the output folder.
stdf.plot_stops(stopping_points)
stopping_points_html = out_path+"stopping_points.html"
stopping_points.save(stopping_points_html)

# Last expression: render the map inline.
stopping_points

Next we need to cluster all the stopping points. After doing that we plot the clustered points on a folium map.

In [5]:
from skmob.preprocessing import clustering

# Cluster the detected stops. With min_samples=1 a single stop is enough
# to form its own cluster.
cstdf = clustering.cluster(stdf, cluster_radius_km=0.5, min_samples=1)

# Plot the clustered stops on a fresh map and save it next to the other outputs.
clustered_stops = folium.Map(location=[39.9042, 116.4074], tiles="Stamen Toner")
cstdf.plot_stops(clustered_stops)
clustered_stops_html = out_path+"clustered_stopping_points.html"
clustered_stops.save(clustered_stops_html)

# Last expression: render the map inline.
clustered_stops

Now we calculate the medoid for each cluster and we put each of them in a dataframe, then we serialize it on disk for further usage.

In [6]:
import sklearn
import numpy

# `import sklearn` alone does not guarantee that the `sklearn.metrics`
# submodule is loaded, so import the function we need explicitly.
from sklearn.metrics import pairwise_distances

# Collect one single-row frame per cluster and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and growing a frame inside a loop is quadratic anyway.
medoid_rows = []

#for each cluster
for label in sorted(cstdf["cluster"].unique()):
    # reset_index() keeps the original row label in an "index" column and
    # gives the cluster a 0..n-1 index that lines up with argmin's position.
    cluster_i = cstdf[cstdf["cluster"] == label].reset_index()
    # NOTE: distances are Euclidean on raw (lat, lng) degrees, not metres.
    pairwise = pairwise_distances(cluster_i[["lat", "lng"]], metric='euclidean')
    # The medoid is the member with the smallest total distance to the others.
    medoid = numpy.argmin(pairwise.sum(axis=0))
    medoid_rows.append(cluster_i.loc[[medoid]])

#setting back the index
medoids = pd.concat(medoid_rows).set_index("index")

# Exactly one medoid per cluster (81 rows on the reference run).
assert len(medoids) == cstdf["cluster"].nunique(), "expected one medoid per cluster"
print(medoids.shape)


medoids_map = folium.Map(location=[39.9042, 116.4074], tiles="Stamen Toner")
#we plot the medoids on a folium map
medoids.plot_stops(medoids_map)
medoids_map.save(out_path+"medoids_map.html")

medoids_map

(81, 6)


The stops have been correctly clustered so we serialize the dataframe

In [7]:
# Quick look at the first medoids before persisting them.
print(medoids.head())

# The frame's index (named "index", the original stop row label) is written
# as the first CSV column.
medoids_csv = data_path+"medoids.csv"
medoids.to_csv(medoids_csv)

                 datetime        lat         lng  uid    leaving_datetime  \
index                                                                       
85    2008-06-22 01:53:33  39.975428  116.330495    0 2008-06-22 04:01:07   
59    2008-06-15 11:14:26  39.984828  116.303761    0 2008-06-15 12:50:11   
6     2008-06-03 05:22:48  39.984988  116.352642    0 2008-06-03 08:08:27   
300   2008-08-11 16:21:21  40.006256  116.320507    0 2008-08-11 23:34:28   
311   2008-08-13 19:02:24  40.030552  116.409093    0 2008-08-13 20:33:27   

       cluster  
index           
85           0  
59           1  
6            2  
300          3  
311          4  


## Adding more points to further enhance trajectory generation
In order to get more trajectories in the broader Beijing area, we try to pick random points to "augment" our medoids

In [8]:
import random as rand

#picking 100 random points inside our inner bounding box
points = 100

# Inner bounding box (in degrees) from which the extra points are drawn.
inner_lat_range = (39.6332, 40.1411)
inner_lon_range = (115.7657, 116.7435)

# Use a dedicated, seeded generator so that the augmented medoids are the
# same on every Restart & Run All instead of changing with each execution.
random_seed = 42
rng = rand.Random(random_seed)

# Each point is a (lat, lon) tuple; latitude is drawn first, then longitude.
rd_pts = [(rng.uniform(*inner_lat_range), rng.uniform(*inner_lon_range)) for _ in range(points)]

We visualize the points on a folium map to check everything's ok

In [9]:
import folium 

# Fresh map with the same centre and tiles as the other maps in the notebook.
pt_map = folium.Map(location=[39.9042, 116.4074], tiles="Stamen Toner")

# One default marker per random (lat, lon) pair.
for point_lat, point_lon in rd_pts:
    folium.Marker((point_lat, point_lon)).add_to(pt_map)

# Last expression: render the map inline.
pt_map

### Now we augment the medoid dataframe to have more starting/stopping points

In [10]:
# Random points as a two-column (lat, lon) frame.
points_df = pd.DataFrame(rd_pts, columns = ["lat", "lon"])
print(points_df.head())

# Reduce the medoids to their coordinates only: discard the index and every
# non-coordinate column, then rename "lng" to "lon" to match points_df.
meds_reset = (
    medoids
    .reset_index(drop=True)
    .drop(columns=["datetime", "leaving_datetime", "uid", "cluster"])
    .rename(columns={"lng": "lon"})
)
print(meds_reset.head())

# Make sure both frames have the same columns before they are concatenated;
# a mismatch would otherwise silently produce NaN-filled columns.
assert list(points_df.columns) == list(meds_reset.columns), (
    f"column mismatch: {list(points_df.columns)} vs {list(meds_reset.columns)}"
)

         lat         lon
0  39.856215  116.640477
1  39.652854  115.846842
2  39.944152  116.721681
3  39.952524  116.549498
4  39.832093  116.079627
         lat         lon
0  39.975428  116.330495
1  39.984828  116.303761
2  39.984988  116.352642
3  40.006256  116.320507
4  40.030552  116.409093


In [11]:
# Stack the random points on top of the medoid coordinates; ignore_index
# renumbers the rows 0..n-1 so the two sources do not share index labels.
frames_to_merge = [points_df, meds_reset]
augmented_medoids = pd.concat(frames_to_merge, ignore_index = True)
print(augmented_medoids)

           lat         lon
0    39.856215  116.640477
1    39.652854  115.846842
2    39.944152  116.721681
3    39.952524  116.549498
4    39.832093  116.079627
..         ...         ...
176  39.937655  116.366564
177  39.983241  116.358994
178  39.930710  116.461598
179  39.957360  116.282585
180  39.580983  116.021833

[181 rows x 2 columns]


### Finally, we serialize the augmented medoids file

In [12]:
# Persist the augmented set next to the input data. The default to_csv
# settings are kept, so the row index is written as the first (unnamed) column.
augmented_medoids_csv = data_path+"augmented_medoids.csv"
augmented_medoids.to_csv(augmented_medoids_csv)