# Geodata Attack - Using GPS

In [1]:
import skmob
from sklearn.cluster import DBSCAN
from skmob.utils import constants
from skmob.utils import utils

from geopy.distance import distance
import pandas as pd
import numpy as np
import csv
import folium

## Preprocessing GPS Data

### Load data from file

In [2]:
# Relative path of the GPS trace file that formatGPS() parses as tab-separated text.
preprocessed_file = 'privamov/pre_processed.csv'

Load only a subset of the data from `privamov-gps.csv`, using `MAX_LINES` (bound on how much of the file is read) and `SKIP_LINES` (number of rows skipped between two kept rows).

In [3]:
def formatGPS(file, MAX_LINES = 10**7, SKIP_LINES = 50):
    """Load a sub-sample of a tab-separated GPS trace file into a DataFrame.

    Parameters
    ----------
    file : str or path-like
        Path of the tab-separated file. Column 0 is read as the user ID,
        column 1 as the timestamp, column 2 as the longitude and column 3 as
        the latitude.
    MAX_LINES : int
        Maximum number of lines read from the file, whether they end up
        kept or skipped.
    SKIP_LINES : int
        Number of lines skipped before each kept line; 0 keeps every line.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID', 'Horodate', 'Latitude' and 'Longitude'. Every value is
        the raw string read from the file (no numeric or date conversion).
    """
    ids = []
    horodate = []
    lat = []
    lng = []
    skipped = 0
    # newline='' lets the csv module do its own line-ending handling.
    with open(file, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='\t', quotechar="'")
        for line_no, row in enumerate(csv_reader):
            # Enforce the cap on every line, before anything is appended, so
            # it is exact rather than overshooting by a sampling period.
            if line_no >= MAX_LINES:
                break
            if skipped != SKIP_LINES:
                skipped += 1
                continue
            skipped = 0
            ids.append(row[0])
            horodate.append(row[1])
            # The file stores longitude before latitude.
            lng.append(row[2])
            lat.append(row[3])
    return pd.DataFrame(
        data={'ID': ids, 'Horodate': horodate, 'Latitude': lat, 'Longitude': lng}
    )

### Build DataFrame & Quick Visualization

In [4]:
# SKIP_LINES=0 keeps every row of the file (no sub-sampling); MAX_LINES stays
# at its default value.
df_gps = formatGPS(preprocessed_file, SKIP_LINES=0)
# Preview of the first rows.
df_gps.head()

Unnamed: 0,ID,Horodate,Latitude,Longitude
0,1,2014-10-04 08:40:42.085,43.4094833333333,3.68737333333333
1,1,2014-10-04 08:41:32.161,43.4094416666667,3.68725
2,1,2014-10-04 08:42:22.161,43.4092883333333,3.68749333333333
3,1,2014-10-04 08:43:13.161,43.4094033333333,3.68728166666667
4,1,2014-10-04 08:44:05.161,43.4093516666667,3.68739166666667


Aggregating number of points by IDs

In [5]:
def get_unique_id(df, column='ID'):
    """Return the set of distinct values found in one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the identifiers.
    column : str
        Name of the column to read; defaults to 'ID'.

    Returns
    -------
    set
        The distinct values of ``df[column]``.
    """
    return set(df[column])

In [6]:
# Sorted list of the user IDs present in the trace. The IDs are strings, so
# the order is lexicographic ('103' comes before '11').
unique_id = sorted(get_unique_id(df_gps))
print(unique_id)

['1', '103', '105', '107', '109', '11', '110', '112', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '39', '4', '40', '41', '42', '43', '44', '46', '48', '49', '50', '51', '52', '53', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '75', '77', '78', '79', '8', '80', '81', '82', '83', '85', '86', '87', '89', '9', '93', '94', '95', '99']


In [7]:
# Number of GPS records per user ID, stored in a 'counts' column.
agg = df_gps.groupby('ID').size().reset_index(name='counts')
# Show the users with the most records first.
agg.sort_values(by='counts',ascending=False).head()

Unnamed: 0,ID,counts
61,69,77729
43,50,64019
34,4,60402
37,42,42211
51,59,41863


### Build SKMOB Object & Visualize trajectories

In [8]:
# Wrap the raw frame in a scikit-mobility TrajDataFrame, telling it which
# columns hold the latitude, longitude, timestamp and user id.
tdf = skmob.TrajDataFrame(df_gps, latitude='Latitude', longitude='Longitude', datetime='Horodate', user_id='ID')
tdf.head()

Unnamed: 0,uid,datetime,lat,lng
0,1,2014-10-04 08:40:42.085,43.409483,3.687373
1,1,2014-10-04 08:41:32.161,43.409442,3.68725
2,1,2014-10-04 08:42:22.161,43.409288,3.687493
3,1,2014-10-04 08:43:13.161,43.409403,3.687282
4,1,2014-10-04 08:44:05.161,43.409352,3.687392


In [9]:
# Size of the dataset: distinct user ids and total number of GPS records.
print("number of users:\t", len(tdf.uid.unique()))
print("number of records:\t", len(tdf))

number of users:	 85
number of records:	 1380589


## Filtering

Filter out every point whose speed, computed from the previous point, is higher than `max_speed_kmh` (400 km/h here).

In [10]:
from skmob.preprocessing import filtering

In [None]:
# Speed filter with a 400 km/h threshold; the result is kept in a new frame so
# that the raw `tdf` stays available for comparison.
ftdf = filtering.filter(tdf, max_speed_kmh=400.)

Very few points have been filtered.

In [None]:
# Compare record counts before and after the speed filter.
print('Points of the raw trajectory: %s'%len(tdf))
print('Points of the filtered trajectory: %s'%len(ftdf))

## Compression

Reduce the number of points of the trajectory, preserving the structure.

Merge together all points that are closer than `spatial_radius_km`=0.2 kilometers from each other.

In [None]:
from skmob.preprocessing import compression

In [None]:
# Compress the filtered trajectories with a 0.2 km spatial radius.
ctdf = compression.compress(ftdf, spatial_radius_km=0.2)
# Preview of the first four compressed points.
ctdf[:4]

The compressed trajectory has only a small fraction of the points of the filtered trajectory (less than 10%)

In [None]:
# Compare record counts before and after compression.
print('Points of the filtered trajectory: %s'%len(ftdf))
print('Points of the compressed trajectory: %s'%len(ctdf))

## Stop detection

Identify locations where the user spent at least `minutes_for_a_stop` minutes within a distance of `spatial_radius_km` $\times$ `stop_radius_factor` from a given point.

A new column `leaving_datetime` is added, indicating the time when the user departs from the stop.

In [None]:
from skmob.preprocessing import detection

In [None]:
# Stop detection on the compressed trajectories: 30 minutes minimum stay,
# 0.5 km radius scaled by a factor of 1. leaving_time=True requests the
# `leaving_datetime` column.
stdf = detection.stops(
    ctdf,
    stop_radius_factor=1,
    minutes_for_a_stop=30,
    spatial_radius_km=0.5,
    leaving_time=True,
)
stdf[:4]

Click on the stop markers to see a pop up with:

* User ID
* Coordinates of the stop (click to see the location on Google maps)
* Arrival time
* Departure time

In [None]:
# Draw the compressed trajectories on a map, then overlay the detected stops
# on that same map object (passed through map_f).
map_f = ctdf.plot_trajectory(max_points=1, hex_color=-1, start_end_markers=False)
stdf.plot_stops(map_f=map_f, hex_color=-1)

### ST_DBSCAN

In [None]:
# NOTE(review): this rebinds the name `DBSCAN`, which the first cell bound to
# sklearn.cluster.DBSCAN. Presumably a project-local DBSCAN.py that provides
# ST_DBSCAN — confirm, and consider importing it under a distinct alias.
import DBSCAN

In [None]:
# Cluster the detected stops. The cells below read a 'cluster' column from the
# result and discard label -1 (presumably noise — confirm against ST_DBSCAN).
df = DBSCAN.ST_DBSCAN(stdf)
df.head()

In [None]:
# Inspect the stops assigned to cluster 14.
df[df['cluster']==14]

In [None]:
# Drop the rows labelled -1 and collect the remaining distinct cluster labels.
# Note that `df` is overwritten: from here on it only holds clustered stops.
df = df[df["cluster"]!=-1]
clusters = set(df["cluster"])

In [None]:
# Build one summary row per cluster:
#   [cluster label, mean lat, mean lng, datetime of the first stop in the
#    cluster, leaving_datetime of that same stop].
# Iterate over the labels that actually occur instead of assuming they are
# exactly 1..len(clusters): with any other labelling the selection for a
# missing label is empty and `.iloc[0]` raises.
meanData = []
for i in sorted(clusters):
    members = df[df["cluster"]==i]  # filter once per cluster
    toAdd = list(np.mean(members[["lat", "lng"]].to_numpy(), axis=0))
    toAdd.append(members["datetime"].iloc[0])
    toAdd.append(members["leaving_datetime"].iloc[0])
    toAdd.insert(0, i)
    meanData.append(toAdd)
# 2-D array with one row per cluster, sliced column by column further down.
meanData = np.array(meanData)

In [None]:
# Turn the per-cluster summary array into a frame; the cluster label is stored
# in the 'uid' column. Columns are taken from meanData in positional order.
dataset = pd.DataFrame({
    column: meanData[:, position]
    for position, column in enumerate(("uid", "lat", "lng", "datetime", "leaving_datetime"))
})

In [None]:
# Wrap the cluster summary in a TrajDataFrame.
# NOTE(review): `dataset` already has columns named 'lat', 'lng' and
# 'datetime', and the integers 1, 2, 3 match their positions — confirm how
# TrajDataFrame interprets integer column arguments for a DataFrame input.
dd = skmob.TrajDataFrame(dataset, latitude=1, longitude=2, datetime=3)
# Display the number of clusters.
len(clusters)

In [None]:
# Draw the cluster summary points on a single map (every call receives the
# same map_f).
# NOTE(review): the first plot_stops call already receives all of dd, so the
# sliced calls below presumably re-add markers for rows 10..56. The slice
# bounds are hard-coded (up to row 57) — confirm whether they are needed and
# whether they match len(dd).
map_f = dd.plot_trajectory(max_points=1, hex_color=-1, start_end_markers=False)
dd.plot_stops(map_f=map_f, hex_color=-1)
dd[10:20].plot_stops(map_f=map_f, hex_color=-1)
dd[20:30].plot_stops(map_f=map_f, hex_color=-1)
dd[30:40].plot_stops(map_f=map_f, hex_color=-1)
dd[40:50].plot_stops(map_f=map_f, hex_color=-1)
dd[50:57].plot_stops(map_f=map_f, hex_color=-1)

In [None]:
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="OT6-Attack")
# The public Nominatim service expects at most one request per second, so
# throttle the calls. swallow_exceptions=False keeps geocoding errors visible
# instead of silently turning them into None results.
reverse_geocode = RateLimiter(geolocator.reverse, min_delay_seconds=1, swallow_exceptions=False)

# Three parallel lists, one entry per looked-up cluster:
#   places  - first key of the returned address dict
#   name    - value stored under that first key
#   address - full display name of the location
places = []
name = []
address = []
# Only the clusters whose id is in 50 .. len(clusters)-1 are looked up here.
for i in range(50, len(clusters)):
    test = dataset[dataset["uid"]==i][["lat", "lng"]]
    test = test.to_numpy()
    lat, lng = test[0]
    coord = "{}, {}".format(lat, lng)
    location = reverse_geocode(coord)
    if location is None:
        # reverse() returns None when nothing is found; append placeholders
        # so the three lists stay aligned with the cluster ids.
        places.append(None)
        name.append(None)
        address.append(None)
        continue
    address_parts = location.raw["address"]
    places.append(list(address_parts.keys())[0])
    name.append(list(address_parts.values())[0])
    address.append(location.raw["display_name"])

In [None]:
# Dump the three reverse-geocoding result lists.
print("places : {}\nname : {}\naddress : {}".format(places, name, address))

In [None]:
def get_unique_id(df):
    """Collect the distinct values of the 'uid' column of ``df`` into a set.

    Note: this rebinds ``get_unique_id``, which an earlier cell defined for
    the 'ID' column.
    """
    return {ident for ident in df['uid']}
# Distinct user ids among the clustered stops (`df` no longer contains the
# rows labelled -1 at this point).
a = get_unique_id(df)

In [None]:
# 33 x 33 zero matrix; further down only its length is used, as a loop bound.
# NOTE(review): 33 is presumably the number of distinct users in `a` — confirm
# and derive it from len(a) instead of hard-coding it.
liste = np.zeros((33, 33))

In [None]:
# For every cluster id in 1..len(clusters), the array of user ids ('uid') of
# the stops in that cluster. A user appears once per stop.
mapCluster = {
    cluster_id: df[df["cluster"] == cluster_id]["uid"].to_numpy()
    for cluster_id in range(1, len(clusters) + 1)
}

In [None]:
from collections import Counter

# Freeze the user ids into a list so that they can be addressed by position.
a = list(a)

# For each user, gather the users met in a shared cluster. Within a cluster
# every stop is paired with every other stop (by position), so repeated stops
# count several times and a user's own other stops are included.
mapId = {user: [] for user in a}
for members in mapCluster.values():
    for position, user in enumerate(members):
        for other_position, other_user in enumerate(members):
            if position != other_position:
                mapId[user].append(other_user)

# Collapse each list into per-user co-occurrence counts.
mapId = {user: Counter(met) for user, met in mapId.items()}

In [None]:
# 33 x 33 co-occurrence matrix: entry [row][col] is the number of times user
# a[col] was counted as sharing a cluster with user a[row]; the diagonal is -1.
aliste = [
    [-1 if row == col else mapId[a[row]][a[col]] for col in range(33)]
    for row in range(33)
]

# Print the matrix, one row per line, values separated by spaces.
for row_values in aliste:
    for col in range(len(liste)):
        print(row_values[col], end=' ')
    print()

In [None]:
# Keep only the clustered stops of users '59' and '72'.
newDf = df[df['uid'].isin(['59','72'])]

In [None]:
# Inspect the stops of these two users that fall in cluster 15.
newDf[newDf["cluster"] == 15]

In [None]:
# Remove cluster 15 from the selection before plotting.
newDf = newDf[newDf["cluster"] != 15]

In [None]:
# Map of the remaining stops of users '59' and '72' (map_f is rebound here).
map_f = newDf.plot_trajectory(max_points=1, hex_color=-1, start_end_markers=False)
newDf.plot_stops(map_f=map_f, hex_color=-1)

In [None]:
# NOTE(review): the same import and geolocator construction already appear in
# the reverse-geocoding cell above; this cell only repeats them.
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="OT6-Attack")

In [None]:
# Reverse-geocode one hard-coded "lat, lng" pair and show the raw response.
location = geolocator.reverse("48.8709, 2.779")
location.raw

In [None]:
from skmob.measures.individual import home_location
# Home location computed from the unfiltered trajectories (`tdf`); the result
# is indexed by the 'uid' column below.
hl_df = home_location(tdf)
# Row for user '59'.
print(hl_df[hl_df["uid"]=="59"])

In [None]:
# Row for user '72'.
print(hl_df[hl_df["uid"]=="72"])

In [None]:
# Home locations of users '59' and '72'. Take an explicit copy: assigning new
# columns on a filtered slice of hl_df triggers pandas' SettingWithCopyWarning.
house_geo = hl_df[hl_df["uid"].isin(["59","72"])].copy()
# Timestamps borrowed from the first row of dd — presumably placeholders so
# that the frame has the time columns the plotting helpers expect (TODO confirm).
house_geo['datetime'] = dd['datetime'].iloc[0]
house_geo['leaving_datetime'] = dd['leaving_datetime'].iloc[0]
house_geo = skmob.TrajDataFrame(house_geo, latitude='lat', longitude='lng', datetime='datetime', user_id='uid')
# Plot the two home locations as stop markers.
map_f = house_geo.plot_trajectory(max_points=1, hex_color=-1, start_end_markers=False)
house_geo.plot_stops(map_f=map_f, hex_color=-1)

In [None]:
# Same analysis for users '50' and '42': keep their clustered stops, then
# remove cluster 41 from the selection.
anotherDf = df[df['uid'].isin(['50','42'])]
anotherDf = anotherDf[anotherDf["cluster"] != 41]

In [None]:
# Map of the remaining stops of users '50' and '42' (map_f is rebound here).
map_f = anotherDf.plot_trajectory(max_points=1, hex_color=-1, start_end_markers=False)
anotherDf.plot_stops(map_f=map_f, hex_color=-1)