# Detect and cluster stop places and calculate each cluster's medoid based on the original GeoLife data set

In [2]:
import pandas as pd

import yaml

# Read the notebook configuration (input and output locations) from conf.yaml.
# NOTE(review): yaml.load with FullLoader can build more than plain data types;
# if conf.yaml may ever come from an untrusted source, prefer yaml.safe_load.
with open("conf.yaml") as conf_file:
    conf = yaml.load(conf_file, Loader=yaml.FullLoader)

data_path = conf["data_path"]
out_path = conf["out_path"]

# Load only the columns used below and parse the timestamps while reading.
# The path is built by plain string concatenation, so data_path presumably
# ends with a path separator -- confirm against conf.yaml.
cols = ["date_time", "lat", "lon", "uid"]
df = pd.read_csv(
    data_path+"complete_with_tids.csv",
    usecols=cols,
    parse_dates=["date_time"],
)

df.head()

Unnamed: 0,date_time,lat,lon,uid
0,2009-01-03 01:21:34,39.974294,116.399741,135
1,2009-01-03 01:21:35,39.974292,116.399592,135
2,2009-01-03 01:21:36,39.974309,116.399523,135
3,2009-01-03 01:21:38,39.97432,116.399588,135
4,2009-01-03 01:21:39,39.974365,116.39973,135


## We're interested in a subset comprising only Beijing coordinates in a specific timeframe
- definition of lat min/max and lon min/max
- definition of start_time and end_time

In [3]:
#restricting to the Beijing area (between() includes both bounds)
lat_min = 39.54
lat_max = 40.3
lon_min = 115.75
lon_max = 117.13
inside_bbox = df['lat'].between(lat_min, lat_max) & df['lon'].between(lon_min, lon_max)

#restricting to June through August 2008; both comparisons are strict, so a
#point stamped exactly at start_time or at/after 23:59:00 on Aug 31 is dropped
start_time = "2008-06-01 00:00:00"
end_time = "2008-08-31 23:59:00"
inside_window = (df.date_time > start_time) & (df.date_time < end_time)

#keep only the rows that satisfy both the spatial and the temporal condition
df = df[inside_bbox & inside_window]

df.head()

Unnamed: 0,date_time,lat,lon,uid
107402,2008-08-16 07:47:56,39.930748,116.306143,104
107403,2008-08-16 07:47:57,39.930792,116.306167,104
107404,2008-08-16 07:48:00,39.931093,116.306342,104
107405,2008-08-16 07:48:05,39.93095,116.306313,104
107406,2008-08-16 07:48:10,39.930963,116.306383,104


## Now we convert df to a TrajDataFrame and set the uid column to 0 in order to detect and cluster stops across all the trajectories, independently of the user who recorded each of them

In [5]:
import skmob
from skmob.preprocessing import detection

#converting to tdf: tell skmob which of our columns hold the latitude,
#longitude, timestamp and user id
tdf = skmob.TrajDataFrame(
    df,
    latitude='lat',
    longitude='lon',
    datetime='date_time',
    user_id='uid',
)
tdf.head()  # not the last expression of the cell, so Jupyter does not display it

#setting uid to 0 on every row, so all points are attributed to one single user
tdf["uid"] = 0

#detecting stops; leaving_time=True presumably yields the "leaving_datetime"
#column that is dropped further down -- confirm against the skmob docs
stop_params = dict(
    stop_radius_factor=0.5,
    minutes_for_a_stop=60.0,
    spatial_radius_km=0.5,
    leaving_time=True,
)
stdf = detection.stops(tdf, **stop_params)
stdf.head()

KeyboardInterrupt: 

Now we show the stopping points on a folium map

In [None]:
import folium

# Base map for the detected stops, centred on [39.9042, 116.4074].
stopping_points = folium.Map(
    location=[39.9042, 116.4074],
    tiles="Stamen Toner",
)

# Draw the stops onto the map, then write a standalone HTML copy to out_path.
stdf.plot_stops(stopping_points)
stopping_points.save(out_path+"stopping_points.html")

# Last expression of the cell: renders the map inline.
stopping_points

Next we need to cluster all the stopping points. After doing that we plot the clustered points on a folium map.

In [None]:
from skmob.preprocessing import clustering

#clustering the stopping points
cstdf = clustering.cluster(stdf, cluster_radius_km=0.5, min_samples=1)

#base map for the clustered stops, same centre and tiles as the stops map
clustered_stops = folium.Map(
    location=[39.9042, 116.4074],
    tiles="Stamen Toner",
)

#printing on a folium map and saving a standalone HTML copy
cstdf.plot_stops(clustered_stops)
clustered_stops.save(out_path+"clustered_stopping_points.html")

clustered_stops

Now we calculate the medoid for each cluster and we put each of them in a dataframe, then we serialize it on disk for further usage.

In [None]:
# Import the submodule explicitly: a bare "import sklearn" does not guarantee
# that sklearn.metrics is loaded, which makes sklearn.metrics.* fail with an
# AttributeError on a fresh kernel.
import sklearn.metrics
import numpy

# Work on plain arrays and remember, for every cluster, the *position* of its
# medoid inside cstdf. Selecting all medoids at once with .iloc avoids
# DataFrame.append (removed in pandas 2.0, and quadratic when used in a loop)
# and keeps the rows in the same frame type as cstdf, which plot_stops needs.
cluster_labels = cstdf["cluster"].to_numpy()
stop_coords = cstdf[["lat", "lng"]].to_numpy()

# Only ids that actually occur are visited (ascending order), so a gap in the
# numbering cannot produce an empty distance matrix. Negative ids are skipped,
# matching the original range(0, max + 1) iteration.
cluster_ids = numpy.unique(cluster_labels)
cluster_ids = cluster_ids[cluster_ids >= 0]

medoid_positions = []
#for each cluster
for cluster_id in cluster_ids:
    member_positions = numpy.flatnonzero(cluster_labels == cluster_id)
    # NOTE(review): euclidean distance on raw lat/lng degrees is only an
    # approximation of the distance on the ground; kept as in the original.
    pairwise = sklearn.metrics.pairwise_distances(stop_coords[member_positions], metric='euclidean')
    # The medoid is the member with the smallest total distance to all the
    # other members of its cluster (first one wins on ties).
    medoid = numpy.argmin(pairwise.sum(axis=0))
    medoid_positions.append(member_positions[medoid])

# One row per cluster, carrying the original index labels of cstdf. The index
# is named "index" so the serialized CSV header matches earlier runs.
medoids = cstdf.iloc[medoid_positions].copy().rename_axis("index")

#we expect 81 rows
print(medoids.shape)


medoids_map = folium.Map(location=[39.9042, 116.4074], tiles="Stamen Toner")
#we plot the medoids on a folium map
medoids.plot_stops(medoids_map)
medoids_map.save(out_path+"medoids_map.html")

medoids_map

The stops have been correctly clustered so we serialize the dataframe

In [None]:
# Plain-text preview of the first medoids before they are written out.
medoids_preview = medoids.head()
print(medoids_preview)

# Write the medoids (index included) as CSV under data_path.
medoids_csv = data_path+"medoids_newrun.csv"
medoids.to_csv(medoids_csv)

## Adding more points to further enhance trajectory generation
In order to get more trajectories in the broader Beijing area, we try to pick random points to "augment" our medoids

In [None]:
import random as rand

# Fixed seed: the generated points are serialized further down, so a re-run of
# the notebook must regenerate exactly the same points.
RANDOM_SEED = 42

# Inner bounding box (decimal degrees) the random points are drawn from.
AUG_LAT_RANGE = (39.6332, 40.1411)
AUG_LON_RANGE = (115.7657, 116.7435)

#picking 100 random points inside our inner bounding box
points = 100

# A dedicated generator keeps the seeding local to this cell instead of
# resetting the global random state shared with every other library.
point_rng = rand.Random(RANDOM_SEED)
rd_pts = [
    (point_rng.uniform(*AUG_LAT_RANGE), point_rng.uniform(*AUG_LON_RANGE))
    for _ in range(points)
]

We visualize the points on a folium map to check everything's ok

In [None]:
import folium 

# Base map with the same centre and tiles as the other maps in this notebook.
pt_map = folium.Map(
    location=[39.9042, 116.4074],
    tiles="Stamen Toner",
)

# One marker per random (lat, lon) pair.
for lat_lon in rd_pts:
    marker = folium.Marker(lat_lon)
    marker.add_to(pt_map)

pt_map

### Now we augment the medoid dataframe to have more starting/stopping points

In [None]:
# Random points as a two-column frame using the lat/lon naming.
points_df = pd.DataFrame(rd_pts, columns=["lat", "lon"])
print(points_df.head())

# Reduce the medoids to bare coordinates: drop the old index and every
# non-coordinate column, then rename "lng" so both frames share "lat"/"lon".
meds_reset = (
    medoids
    .reset_index()
    .drop(columns=["index", "datetime", "leaving_datetime", "uid", "cluster"])
    .rename(columns={"lng": "lon"})
)
print(meds_reset.head())

#let's see if both have the same shape

In [None]:
# Stack the random points on top of the medoid coordinates and renumber the
# rows from 0 so the combined frame has a clean index.
frames_to_stack = [points_df, meds_reset]
augmented_medoids = pd.concat(frames_to_stack, ignore_index=True)
print(augmented_medoids)

### Finally, we serialize the augmented medoids file

In [None]:
# Write the augmented medoids (index included) as CSV under data_path.
augmented_csv = data_path+"augmented_medoids.csv"
augmented_medoids.to_csv(augmented_csv)