# Geodata Attack - Using GPS

In [1]:
import skmob
from sklearn.cluster import DBSCAN
from skmob.utils import constants
from skmob.utils import utils

from geopy.distance import distance
import pandas as pd
import numpy as np
import csv
import folium

## Preprocessing GPS Data

### Load data from file

In [2]:
# Input file locations, relative to the notebook's working directory.
# `gps` is the file read by formatGPS below.
gps = 'privamov/privamov-gps.csv'
# Same naming pattern as the files written by sample_by_id (here for user '1').
sub_gps= 'subsamples/privamov_gps_id_1.csv'
# NOTE(review): gsm and wifi are not read anywhere in the visible part of the notebook.
gsm = 'privamov/privamov-gsm.csv'
wifi = 'privamov/privamov-wifi.csv'

Load only a subset of `privamov-gps.csv`: `SKIP_LINES` sets how many rows are skipped between two kept rows, and `MAX_LINES` caps how far into the file the reader goes.

In [5]:
def formatGPS(file, MAX_LINES = 10**7, SKIP_LINES = 50):
    """Read a tab-separated GPS dump and return a down-sampled DataFrame.

    One row is kept after every ``SKIP_LINES`` skipped rows (with
    ``SKIP_LINES = 0`` every row is kept). Reading stops as soon as a kept
    row has a 0-based index greater than ``MAX_LINES``; that row is still
    part of the result.

    Parameters
    ----------
    file : str or path-like
        Path of the file to read. Fields are separated by tabs and quoted
        with ``'``. Column 0 is used as the ID, column 1 as the timestamp,
        column 2 as the longitude and column 3 as the latitude.
    MAX_LINES : int
        Row index after which reading stops (checked on kept rows only).
    SKIP_LINES : int
        Number of rows skipped between two kept rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Horodate``, ``Latitude`` and ``Longitude``. The
        values are the raw strings read from the file (no type conversion).
    """
    ids = []
    horodate = []
    lat = []
    long = []
    skipped = 0  # rows skipped since the last kept row
    # newline='' is what the csv module expects for files handed to csv.reader.
    with open(file, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='\t', quotechar="'")
        for i, row in enumerate(csv_reader):
            if skipped != SKIP_LINES:
                skipped += 1
                continue
            skipped = 0
            # Blank or truncated lines yield rows with fewer than 4 fields;
            # ignore them instead of failing with an IndexError.
            if len(row) >= 4:
                ids.append(row[0])
                horodate.append(row[1])
                lat.append(row[3])
                long.append(row[2])
            if i > MAX_LINES:
                break
    d = {'ID': ids, 'Horodate': horodate, 'Latitude': lat, 'Longitude': long}
    return pd.DataFrame(data=d)

### Build DataFrame & Quick Visualization

In [6]:
# Sample the GPS file with formatGPS's default MAX_LINES / SKIP_LINES values.
df_gps = formatGPS(gps)
df_gps.head()

Unnamed: 0,ID,Horodate,Latitude,Longitude
0,1,2014-10-04 08:41:07.161,43.409445,3.68725
1,1,2014-10-04 08:41:32.177,43.4094416666667,3.68725
2,1,2014-10-04 08:41:58.161,43.4093783333333,3.687345
3,1,2014-10-04 08:42:23.22,43.4092866666667,3.68749666666667
4,1,2014-10-04 08:42:50.157,43.4093366666667,3.68738833333333


Aggregating number of points by IDs

In [7]:
def get_unique_id(df):
    """Return the set of distinct values found in the ``ID`` column of ``df``."""
    return set(df['ID'])

In [6]:
# The IDs are strings, so the order is lexicographic ('10' sorts before '2').
unique_id = sorted(get_unique_id(df_gps))
print(unique_id)

['1', '10', '107', '11', '12', '13', '14', '15', '16', '17', '19', '2', '20', '21', '22', '23', '24', '25', '26', '32', '37', '42', '44', '5', '6', '7', '8', '9', '98']


In [9]:
# Number of sampled GPS points per ID, shown from the largest count down.
agg = df_gps.groupby('ID').size().reset_index(name='counts')
agg.sort_values(by='counts',ascending=False).head()

Unnamed: 0,ID,counts
0,1,53966
11,2,43786
24,6,26408
3,11,12685
26,8,10342


## Sampling the Data

```
def sample_by_id(file, ident='1'):
    """Copy the rows of one ID from a tab-separated file into its own file.

    Every row of ``file`` whose first field equals ``ident`` is written,
    unchanged, to ``subsamples/privamov_gps_id_<ident>.csv`` (tab-separated).
    The ``subsamples`` directory is created when it does not exist yet.

    Parameters
    ----------
    file : str or path-like
        Path of the tab-separated input file.
    ident : str
        Value compared with the first field of each row. The fields are read
        as strings, so this must be a string too.
    """
    import os  # local import: only needed to create the output directory

    out_path = 'subsamples/privamov_gps_id_{}.csv'.format(ident)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(file, 'r', newline='') as i, open(out_path, 'w', newline='') as o:
        r = csv.reader(i, delimiter='\t')
        w = csv.writer(o, delimiter='\t')
        # csv.reader returns an empty list for a blank line; `row and ...`
        # skips those instead of failing on row[0].
        w.writerows(row for row in r if row and row[0] == ident)
```

```
# Writes the rows of ID '2' to subsamples/privamov_gps_id_2.csv.
sample_by_id(gps,'2')
```

### Build SKMOB Object & Visualize trajectories

In [8]:
# Wrap the raw frame in a TrajDataFrame, telling skmob which of our columns hold
# the latitude, longitude, timestamp and user ID.
tdf = skmob.TrajDataFrame(df_gps, latitude='Latitude', longitude='Longitude', datetime='Horodate', user_id='ID')
tdf.head()

  uid                datetime        lat       lng
0   1 2014-10-04 08:41:07.161  43.409445  3.687250
1   1 2014-10-04 08:41:32.177  43.409442  3.687250
2   1 2014-10-04 08:41:58.161  43.409378  3.687345
3   1 2014-10-04 08:42:23.220  43.409287  3.687497
4   1 2014-10-04 08:42:50.157  43.409337  3.687388


In [10]:
# Size of the dataset: distinct uid values and total number of points.
print("number of users:\t", len(tdf.uid.unique()))
print("number of records:\t", len(tdf))

number of users:	 29
number of records:	 196079


## Filtering

Filter out all points whose speed from the previous point is higher than `max_speed_kmh` km/h.

In [13]:
from skmob.preprocessing import filtering

In [14]:
# Speed filter with a 400 km/h threshold; the result is kept separately from tdf.
ftdf = filtering.filter(tdf, max_speed_kmh=400.)

Very few points have been filtered out (521 of 196,079).

In [16]:
# Compare the number of points before and after the speed filter.
print('Points of the raw trajectory: %s'%len(tdf))
print('Points of the filtered trajectory: %s'%len(ftdf))

Points of the raw trajectory: 196079
Points of the filtered trajectory: 195558


## Compression

Reduce the number of points of the trajectory, preserving the structure.

Merge together all points that are closer than `spatial_radius_km`=0.2 kilometers from each other.

In [28]:
from skmob.preprocessing import compression

In [29]:
# Compress the filtered trajectory with a 0.2 km spatial radius and preview the first rows.
ctdf = compression.compress(ftdf, spatial_radius_km=0.2)
ctdf[:4]

Unnamed: 0,uid,datetime,lat,lng
0,1,2014-10-04 08:41:07.161,43.409325,3.687525
1,1,2014-10-08 07:14:28.223,45.771197,4.869158
2,1,2014-10-08 07:16:48.104,45.775895,4.870882
3,1,2014-10-08 07:19:47.124,45.776451,4.873694


The compressed trajectory has only a small fraction of the points of the filtered trajectory (less than 10%)

In [30]:
# Compare the number of points before and after compression.
print('Points of the filtered trajectory: %s'%len(ftdf))
print('Points of the compressed trajectory: %s'%len(ctdf))

Points of the filtered trajectory: 195558
Points of the compressed trajectory: 11118


## Stop detection

Identify locations where the user spent at least `minutes_for_a_stop` minutes within a distance of `spatial_radius_km` $\times$ `stop_radius_factor` from a given point.

A new column `leaving_datetime` is added, indicating the time when the user departs from the stop.

In [32]:
from skmob.preprocessing import detection

In [33]:
# Detect stops on the compressed trajectory; leaving_time=True asks for the
# `leaving_datetime` column described above.
stdf = detection.stops(
    ctdf,
    stop_radius_factor=1,
    minutes_for_a_stop=30,
    spatial_radius_km=0.5,
    leaving_time=True,
)
stdf[:4]

Unnamed: 0,uid,datetime,lat,lng,leaving_datetime
0,1,2014-10-04 08:41:07.161,43.409325,3.687525,2014-10-08 07:14:28.223000
1,1,2014-10-08 07:36:47.122,45.78681,4.879812,2014-10-08 16:02:53.131000
2,1,2014-10-08 16:30:31.105,45.77101,4.869932,2014-10-09 07:16:44.987000
3,1,2014-10-09 07:44:45.971,45.788008,4.879843,2014-10-09 10:21:12.970000


**Visualise the compressed trajectory and the stops**

Click on the stop markers to see a pop up with:

* User ID
* Coordinates of the stop (click to see the location on Google maps)
* Arrival time
* Departure time

In [35]:
# Draw the compressed trajectory first, then add the stop markers to the same map object.
map_f = ctdf.plot_trajectory(max_points=1, hex_color=-1, start_end_markers=False)
stdf.plot_stops(map_f=map_f, hex_color=-1)

## Clustering stops

Stops are clustered by spatial proximity using DBSCAN.

A new column `cluster` is added to the stops TrajDataFrame; its value is the ID of the cluster each stop belongs to.

Cluster IDs are integers (0, 1, ...): 0 is the most visited cluster, 1 is the second most visited, ...

In [36]:
from skmob.preprocessing import clustering

In [37]:
# Cluster the stops with a 0.2 km radius and preview the first rows.
# NOTE(review): with min_samples=1, presumably every stop is assigned to a cluster
# (no DBSCAN noise points) — confirm against the skmob documentation.
cstdf = clustering.cluster(stdf, cluster_radius_km=0.2, min_samples=1)
cstdf[:4]

Unnamed: 0,uid,datetime,lat,lng,leaving_datetime,cluster
0,1,2014-10-04 08:41:07.161,43.409325,3.687525,2014-10-08 07:14:28.223,15
1,1,2014-10-08 07:36:47.122,45.78681,4.879812,2014-10-08 16:02:53.131,0
2,1,2014-10-08 16:30:31.105,45.77101,4.869932,2014-10-09 07:16:44.987,1
3,1,2014-10-09 07:44:45.971,45.788008,4.879843,2014-10-09 10:21:12.970,0


**Visualise the compressed trajectory and stops: stops in the same clusters have the same color.**

Click on the stop markers to see a pop up with:

* User ID
* Cluster ID
* Coordinates of the stop (click to see the location on Google maps)
* Arrival time
* Departure time

In [None]:
# Trajectory (black line) and clustered stops of user '1'.
# Each frame is filtered with a mask built from that same frame, so the
# selection does not depend on cstdf and stdf sharing the same index.
map_f = ctdf[ctdf['uid'] == '1'].plot_trajectory(
    max_points=1, hex_color='#000000', start_end_markers=False)
cstdf[cstdf['uid'] == '1'].plot_stops(map_f=map_f)

**Visualise a user's diary**

In [None]:
# Diary of user '1' between the two timestamps below.
# NOTE(review): the time part '030000' has no separators; presumably it is parsed
# as 03:00:00 — confirm, or spell it out as '03:00:00'.
user = '1'
start_datetime = pd.to_datetime('2014-10-04 030000')
end_datetime = pd.to_datetime('2014-12-20 030000')
ax = cstdf.plot_diary(user, start_datetime=start_datetime, end_datetime=end_datetime)

In [None]:
# One representative point per (uid, cluster): the mean position of its stops.
df_points = cstdf.groupby(['uid', 'cluster']).agg(
    {
         'lat':"mean",    # mean latitude of the stops in the group
         'lng':"mean"     # mean longitude of the stops in the group
    }
)
locations = df_points[['lat', 'lng']]
# Plain [lat, lng] pairs, in the row order of df_points.
locationlist = locations.values.tolist()

In [None]:
from folium.plugins import MarkerCluster

# Centre the map on the mean of the cluster centroids. A hard-coded centre of
# (38.9, -77.05) is far from the coordinates shown in the outputs above
# (lat ≈ 43–46, lng ≈ 3.7–4.9), so the markers would open off-screen.
map_center = locations.mean().tolist()  # [mean lat, mean lng]
map2 = folium.Map(location=map_center, zoom_start=6)

marker_cluster = MarkerCluster().add_to(map2)

# One marker per (uid, cluster) centroid.
for lat_lng in locationlist:
    folium.Marker(lat_lng).add_to(marker_cluster)
map2

In [None]:
# Same diary for user '1', restricted to a one-week window (2014-10-04 to 2014-10-11).
# NOTE(review): the time part '030000' has no separators; presumably it is parsed
# as 03:00:00 — confirm, or spell it out as '03:00:00'.
user = '1'
start_datetime = pd.to_datetime('2014-10-04 030000')
end_datetime = pd.to_datetime('2014-10-11 030000')
ax = cstdf.plot_diary(user, start_datetime=start_datetime, end_datetime=end_datetime)

## Split trajectory into daily trajectories

In [None]:
import skmob
from skmob.utils import utils
from skmob.preprocessing import filtering, compression, detection, clustering

In [None]:
# Re-run all the preprocessing operations in one cell

# Full preprocessing chain with tighter stop/cluster settings than the cells above.
# This rebinds the notebook-level names tdf, ftdf, ctdf, stdf and cstdf.
tdf = skmob.TrajDataFrame(
    df_gps,
    latitude='Latitude',
    longitude='Longitude',
    datetime='Horodate',
    user_id='ID',
)
ftdf = filtering.filter(tdf, max_speed_kmh=400.)
ctdf = compression.compress(ftdf, spatial_radius_km=0.2)
stdf = detection.stops(
    ctdf,
    stop_radius_factor=0.5,
    minutes_for_a_stop=20.0,
    spatial_radius_km=0.2,
    leaving_time=True,
)
cstdf = clustering.cluster(stdf, cluster_radius_km=0.1, min_samples=1)

In [None]:
# Display the clustered stops currently bound to cstdf.
cstdf

In [None]:
# Split the clustered stops of user '1' into time-based groups and preview the first three.
# NOTE(review): presumably one group per day, with days starting at 03:00 because of
# the 3-hour offset — confirm against skmob's group_df_by_time.
groups = utils.group_df_by_time(cstdf[cstdf['uid'] == '1'], 
                        offset_value=3, offset_unit='hours', add_starting_location=True)
groups[:3]

In [None]:
from skmob.measures.individual import home_location

In [None]:
# Home location estimated by skmob from the unfiltered TrajDataFrame.
hl_df = home_location(tdf)
print(hl_df.head())

In [None]:
def _work_location_individual(traj, start_day='08:00', end_day='18:00'):
    """Return the most frequent (lat, lng) of one trajectory during working hours.

    The points whose time of day lies between ``start_day`` and ``end_day``
    are grouped by coordinate pair, and the pair with the highest count is
    returned. When no point falls inside that window, the whole trajectory is
    used instead.

    Parameters
    ----------
    traj : DataFrame-like
        Trajectory with datetime, latitude and longitude columns named as in
        ``skmob.utils.constants``.
    start_day, end_day : str
        Bounds of the time-of-day window, passed to ``between_time``.

    Returns
    -------
    tuple
        ``(lat, lng)`` of the most frequent coordinate pair.
    """
    in_window = traj.set_index(pd.DatetimeIndex(traj.datetime)).between_time(start_day, end_day)
    # Fall back to every point when nothing was recorded inside the window.
    visits = in_window if len(in_window) != 0 else traj
    per_coordinate = visits.groupby([constants.LATITUDE, constants.LONGITUDE]).count()
    ranked = per_coordinate.sort_values(by=constants.DATETIME, ascending=False)
    # The index label of the top row is the (lat, lng) pair itself.
    lat, lng = ranked.iloc[0].name
    return (lat, lng)

In [None]:
def work_location(traj, start_day='08:00', end_day='18:00', show_progress=True):
    """Compute the work location of each user of a trajectory frame.

    The work location is the most frequent (lat, lng) pair between
    ``start_day`` and ``end_day``, as computed by ``_work_location_individual``.

    Parameters
    ----------
    traj : DataFrame-like
        Trajectories, with column names as in ``skmob.utils.constants``.
    start_day, end_day : str
        Bounds of the time-of-day window.
    show_progress : bool
        Use ``progress_apply`` (progress bar) for the per-user computation
        when it is available; otherwise a plain ``apply`` is used.

    Returns
    -------
    pandas.DataFrame
        Latitude and longitude of the work location: a single row when
        ``traj`` has no uid column, otherwise one row per uid (with the uid
        as a column).
    """
    # if 'uid' column is not present in the TrajDataFrame
    if constants.UID not in traj.columns:
        return pd.DataFrame([_work_location_individual(traj, start_day=start_day, end_day=end_day)], columns=[constants.LATITUDE, constants.LONGITUDE])

    def _locate(user_traj):
        return _work_location_individual(user_traj, start_day=start_day, end_day=end_day)

    grouped = traj.groupby(constants.UID)
    # progress_apply only exists once tqdm.pandas() has been registered; fall
    # back to apply so the function also works when it has not been.
    if show_progress and hasattr(grouped, 'progress_apply'):
        df = grouped.progress_apply(_locate)
    else:
        df = grouped.apply(_locate)
    # Expand the (lat, lng) tuples into two columns and give them their names.
    return df.apply(pd.Series).reset_index().rename(columns={0: constants.LATITUDE, 1: constants.LONGITUDE})

In [None]:
# Work location per user with the default 08:00-18:00 window.
wl_df = work_location(tdf)
print(wl_df.head())

**Visualise the diaries of each sub-trajectory**

In [None]:
# Use a dedicated loop variable: naming it `tdf` would overwrite the
# notebook-level TrajDataFrame with the last sub-trajectory.
for daily_tdf in groups[:3]:
    user = daily_tdf['uid'].iloc[0]
    ax = daily_tdf.plot_diary(user)

## Routing

Using OSMnx and networkx (taken from https://medium.com/@bobhaffner/osmnx-intro-and-routing-1fd744ba23d8).

In [None]:
import osmnx as ox
import folium

In [None]:
# Time window of one trip: from the departure of stop `start_i` to the arrival
# at the next row of cstdf.
# NOTE(review): assumes rows start_i and start_i + 1 belong to the same user — confirm.
start_i = 0
dt_start = cstdf.iloc[start_i]['leaving_datetime']
dt_end = cstdf.iloc[start_i + 1]['datetime']
user_id = cstdf.iloc[start_i]['uid']

dt_start, dt_end

In [None]:
# Points of that user from the filtered (uncompressed) trajectory that fall inside
# the [dt_start, dt_end] window, plotted on a map.
tdf1 = ftdf[(ftdf['datetime'] >= dt_start) & \
            (ftdf['datetime'] <= dt_end) & \
            (ftdf['uid'] == user_id)]
tdf1.plot_trajectory()

## Privacy Risk Assessment

* Simulate privacy attacks and assess risk with a worst-case scenario framework

* First, we import the necessary modules

In [None]:
from skmob.privacy import attacks
from skmob.core.trajectorydataframe import TrajDataFrame
from skmob.utils import constants

In [None]:
# Re-run all the preprocessing operations in one cell
# Full preprocessing chain with coarser settings (300 km/h, 0.5 km compression,
# 120-minute stops, 1 km clusters) for the risk assessment below.
# This rebinds the notebook-level names tdf, ftdf, ctdf, stdf and cstdf.
tdf = skmob.TrajDataFrame(
    df_gps,
    latitude='Latitude',
    longitude='Longitude',
    datetime='Horodate',
    user_id='ID',
)
ftdf = filtering.filter(tdf, max_speed_kmh=300.)
ctdf = compression.compress(ftdf, spatial_radius_km=0.5)
stdf = detection.stops(
    ctdf,
    stop_radius_factor=2,
    minutes_for_a_stop=120.0,
    spatial_radius_km=1.0,
    leaving_time=True,
)
cstdf = clustering.cluster(stdf, cluster_radius_km=1.0, min_samples=1)

In [None]:
# Display the clustered stops currently bound to cstdf.
cstdf

In [None]:
# Location attack configured with knowledge_length=2.
# NOTE(review): presumably the adversary knows 2 of the victim's locations — confirm
# against the skmob.privacy documentation.
at = attacks.LocationAttack(knowledge_length=2)

In [None]:
# Run the attack's risk assessment on the clustered stops.
at.assess_risk(cstdf)