# Dataset augmentation by creating fake trajectories
By exploiting the previously written code (in the trajectory generation notebook), we synthetically augment the dataset in order to have more working data.
We begin by loading medoids and getting/serializing graphs on disk.

In [14]:
#imports cell

import pandas as pd
import matplotlib.pyplot as plt
import osmnx as ox 
from sklearn.neighbors import KDTree
import networkx as nx
import folium
import random
from geographiclib.geodesic import Geodesic
import math

base_path = "/Users/tommasocolella/Desktop/Tesi"

#in order to load medoids only once
medoids = pd.read_csv(base_path + "/datasets/medoids.csv")
print(medoids.head())

n_med = len(medoids)

   index             datetime        lat         lng  uid  \
0     85  2008-06-22 01:53:33  39.975428  116.330495    0   
1     59  2008-06-15 11:14:26  39.984828  116.303761    0   
2      6  2008-06-03 05:22:48  39.984988  116.352642    0   
3    300  2008-08-11 16:21:21  40.006256  116.320507    0   
4    311  2008-08-13 19:02:24  40.030552  116.409093    0   

      leaving_datetime  cluster  
0  2008-06-22 04:01:07        0  
1  2008-06-15 12:50:11        1  
2  2008-06-03 08:08:27        2  
3  2008-08-11 23:34:28        3  
4  2008-08-13 20:33:27        4  


We download graphs for each network type and serialize them on disk in order to save us a lot of time during the trajectory generation.

In [54]:
%%time
#get graphs from place and serialize them on disk

#get the graphs
D = ox.graph_from_place('Beijing, China', which_result=2, network_type='drive')
B = ox.graph_from_place('Beijing, China', which_result=2, network_type='bike')
W = ox.graph_from_place('Beijing, China', which_result=2, network_type='walk')

CPU times: user 11min 46s, sys: 18.2 s, total: 12min 4s
Wall time: 20min 23s


In [55]:
#serialize them on disk
ox.save_graphml(D, base_path+"/datasets/drive_graph.graphml")
ox.save_graphml(B, base_path+"/datasets/bike_graph.graphml")
ox.save_graphml(W, base_path+"/datasets/walk_graph.graphml")

print("Serialized graphs on disk")

Serialized graphs on disk


We define the interpolation and the faker functions to be called for each generation.

In [66]:
#interpolates the dataframe
def interpolator(gdf, uid):
    
    cols = ["datetime", "lat", "lon", "uid", "tid"]
    traj_df = pd.DataFrame(columns = cols)
    
    #meters for interpolation
    k = 5
    geod = Geodesic.WGS84

    for i in range(len(gdf) - 1):

        l = geod.InverseLine(gdf.iloc[i].y, gdf.iloc[i].x, gdf.iloc[i+1].y, gdf.iloc[i+1].x)
        ds = k; n = int(math.ceil(l.s13 / ds))
        for i in range(n + 1):
            #if i == 0:
                #print( "distance latitude longitude azimuth")
            s = min(ds * i, l.s13)
            g = l.Position(s, Geodesic.STANDARD | Geodesic.LONG_UNROLL)
            lat = g["lat2"]
            lon = g["lon2"]
            
            new_row = pd.DataFrame([[0, lat, lon, uid]], columns = cols)
            traj_df = traj_df.append(new_row, ignore_index = True)
            
            #print(traj_df.head())
            
    
    return traj_df
    



#faker function
def faker(profile, n_users):
    #traj faker helper fun
    
    cols = ["datetime", "lat", "lon", "uid","tid"]
    fake_trajs = pd.DataFrame(columns = cols)
    #uid to recognize different types of fake users
    fake_uid = 0
    
    if (profile == "drive"):
        G = ox.load_graphml(base_path+"/datasets/drive_graph.graphml")
        fake_uid = 1000
        
    elif (profile == "bike"):
        G = ox.load_graphml(base_path+"/datasets/bike_graph.graphml")
        fake_uid = 2000

    elif (profile == "walk"):
        G = ox.load_graphml(base_path+"/datasets/walk_graph.graphml")
        fake_uid = 3000

    else:
        print("not a valid profile!")
        return None
    
    print("Loaded {:s} graph from disk".format(profile))
    
    
    gdf_nodes, gdf_edges = ox.graph_to_gdfs(G)
    #print(gdf_nodes)
    tree = KDTree(gdf_nodes[['y', 'x']], metric='euclidean')
    
    n_trajs = 0
    errs = 0
    
    for i in range(0, n_users):
        for j in range(5, random.randint(10, 15)):

            #sample 2 indexes
            picks = random.sample(range(0, n_med), 2)
            med_a = picks[0]
            med_b = picks[1]

            #get lat and lng for medoids
            med_a = (medoids.iloc[med_a].lat, medoids.iloc[med_a].lng)
            med_b = (medoids.iloc[med_b].lat, medoids.iloc[med_b].lng)

            #get the nearest points in the gdf
            med_a_idx = tree.query([med_a], k=1, return_distance=False)[0]
            med_b_idx = tree.query([med_b], k=1, return_distance=False)[0]

            closest_node_to_a = gdf_nodes.iloc[med_a_idx].index.values[0]
            closest_node_to_b = gdf_nodes.iloc[med_b_idx].index.values[0]  

            #calculate the shortest path
            try:
                path = nx.shortest_path(G, 
                             closest_node_to_a,
                             closest_node_to_b,
                             weight='length')
                n_trajs += 1

            #happens when there's not path between two points    
            except nx.NetworkXNoPath:
                errs += 1
                
                
            #print(path)
            gdf = gdf_nodes.loc[path]
            #print("Gdf number {:d}".format(n_trajs))
            #print(gdf.head())
            
            traj = interpolator(gdf, fake_uid)
            traj["tid"] = n_trajs
            
            
            fake_trajs = fake_trajs.append(traj, ignore_index = True)
            
            
            #print route for checking purposes
            """fig, ax = ox.plot_graph_route(G, path, fig_height=10, 
                                  fig_width=10, 
                                  show=False, close=False, 
                                  edge_color='black',
                                  orig_dest_node_color='green',
                                  route_color='green')
            plt.show()"""
            
        #on to another user!
        fake_uid += 1


    print("generated {:d} trajectories for {:d} users with a {:s} profile. {:d} errors generated"
          .format(n_trajs, n_users, profile, errs))
    
    print(fake_trajs.head())
    
    return fake_trajs



In [67]:
%%time
#trajs = faker("drive", 2)
#faker("bike", 1)
trajs = faker("walk", 3)

Loaded walk graph from disk
generated 25 trajectories for 3 users with a walk profile. 0 errors generated
  datetime        lat         lon   uid tid
0        0  39.954742  116.493452  3000   1
1        0  39.954785  116.493436  3000   1
2        0  39.954829  116.493420  3000   1
3        0  39.954872  116.493404  3000   1
4        0  39.954915  116.493389  3000   1
CPU times: user 5min 12s, sys: 6.83 s, total: 5min 18s
Wall time: 5min 36s


Let's see if the trajectory generation worked correctly

In [73]:
import skmob

print(trajs)

tdf = skmob.TrajDataFrame(trajs[(trajs["uid"] == 3001)], longitude = "lon")
print(tdf)


tdf.plot_trajectory(zoom=12, weight=3, opacity=0.9, tiles='Stamen Toner')

      datetime        lat         lon   uid tid
0            0  39.954742  116.493452  3000   1
1            0  39.954785  116.493436  3000   1
2            0  39.954829  116.493420  3000   1
3            0  39.954872  116.493404  3000   1
4            0  39.954915  116.493389  3000   1
...        ...        ...         ...   ...  ..
80136        0  39.999904  116.327328  3002  25
80137        0  39.999893  116.327271  3002  25
80138        0  39.999881  116.327214  3002  25
80139        0  39.999870  116.327158  3002  25
80140        0  39.999865  116.327136  3002  25

[80141 rows x 5 columns]
        datetime        lat         lng   uid tid
27783 1970-01-01  39.955764  116.397064  3001  11
27784 1970-01-01  39.955800  116.397028  3001  11
27785 1970-01-01  39.955835  116.396992  3001  11
27786 1970-01-01  39.955871  116.396956  3001  11
27787 1970-01-01  39.955906  116.396921  3001  11
...          ...        ...         ...   ...  ..
54804 1970-01-01  39.897382  116.315096  3001  1

  dtime = pd.datetime.strftime(dtime, '%Y/%m/%d %H:%M')
  dtime = pd.datetime.strftime(dtime, '%Y/%m/%d %H:%M')


Now we put timedeltas on each trajectory

In [162]:
from geopy.distance import geodesic

traj_copy = trajs.copy(deep = True)
traj_copy["timedelta"] = 0.0

#selecting each traj by uid and tid
for uid in range(traj_copy.uid.min(), traj_copy.uid.max() + 1):
    user = traj_copy[traj_copy["uid"] == uid]
    for tid in range(user.tid.min(), user.tid.max() + 1):
        traj = user[user["tid"] == tid]
        #print(traj.index.min())
        #print(traj.index.max())
        #print(traj)
        for i in range(1, traj.index.max() - traj.index.min() + 1):
            try : 
                #print(traj.iloc[i])
                dist = geodesic((traj.iloc[i-1].lat, traj.iloc[i].lon), \
                                           (traj.iloc[i].lat, traj.iloc[i].lon))

                tdelta = (dist.meters)/1.38
                traj.iat[i, 5] =  tdelta
            except IndexError:
                print(i)
                #print(traj.iloc[i-1])
                #print(traj.iloc[i])
        print(traj)

            #print(traj)


     datetime        lat         lon   uid tid  timedelta
0           0  39.954742  116.493452  3000   1   0.000000
1           0  39.954785  116.493436  3000   1   3.486784
2           0  39.954829  116.493420  3000   1   3.486784
3           0  39.954872  116.493404  3000   1   3.486783
4           0  39.954915  116.493389  3000   1   3.486783
...       ...        ...         ...   ...  ..        ...
1210        0  39.987508  116.451194  3000   1   0.646145
1211        0  39.987500  116.451136  3000   1   0.646147
1212        0  39.987492  116.451079  3000   1   0.646149
1213        0  39.987484  116.451021  3000   1   0.646152
1214        0  39.987480  116.450990  3000   1   0.350102

[1215 rows x 6 columns]
     datetime        lat         lon   uid tid  timedelta
1215        0  39.986056  116.377711  3000   2   0.000000
1216        0  39.986055  116.377652  3000   2   0.134722
1217        0  39.986053  116.377594  3000   2   0.134724
1218        0  39.986051  116.377535  3000   2 