In [1]:
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

import sys
sys.path.append('../')

# Load data

## Load data - Demonstration

In [2]:
df = pd.read_csv('../data/drive-me-not/trace1.csv')
# benign_flight.info()

#filter out the anchor points
df = df[df['Anchor_Number'] == 0]
df

Unnamed: 0,GPS_lat,GPS_long,Network_lat,Network_long,Time,Anchor_Number,Type,Registered,CID,LAC,MCC,MNC,dBm,level
0,25.328347,51.425060,0.0,0.0,1561278474631,0,GSM,True,23733,9301,427,1,-45,4
14,25.328347,51.425060,0.0,0.0,1561278474768,0,GSM,True,23733,9301,427,1,-45,4
28,25.328347,51.425060,0.0,0.0,1561278474932,0,GSM,True,23733,9301,427,1,-45,4
42,25.328347,51.425060,0.0,0.0,1561278475082,0,GSM,True,23733,9301,427,1,-45,4
56,25.328347,51.425060,0.0,0.0,1561278475263,0,GSM,True,23733,9301,427,1,-45,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34038,25.331563,51.421738,0.0,0.0,1561280145472,0,GSM,True,24882,9301,427,1,-57,4
34050,25.331563,51.421738,0.0,0.0,1561280155660,0,GSM,True,14792,150,427,2,-37,4
34057,25.331563,51.421738,0.0,0.0,1561280155829,0,GSM,True,24882,9301,427,1,-53,4
34070,25.331563,51.421738,0.0,0.0,1561280156156,0,GSM,True,24882,9301,427,1,-53,4


In [3]:
import pandasql as ps

# Filter out the rows whose change of position is not reflected in the coordinates
stmt = """SELECT * 
FROM df
WHERE Time in (
    SELECT min(Time) 
    FROM df
    GROUP BY GPS_lat, GPS_long 
    )
"""

df = ps.sqldf(stmt, locals())

In [4]:
from utils.preprocessing import zero_one_normalization

# compute velocity
df['vx'] = df.GPS_long.diff() / df.Time.diff()
df['vy'] = df.GPS_lat.diff() / df.Time.diff()
df.dropna(inplace=True)

# compute acceleration
df['ax'] = df.vx.diff() / df.Time.diff()
df['ay'] = df.vy.diff() / df.Time.diff()
df.dropna(inplace=True)

# 0-1 normalization

for col in ['vx', 'vy', 'ax', 'ay']:
    df[col] = zero_one_normalization(df[col])

selected_attributes = ['GPS_lat', 'GPS_long', 'Time', 'vx', 'vy', 'ax', 'ay', 'dBm']
df = df[selected_attributes]
df

Unnamed: 0,GPS_lat,GPS_long,Time,vx,vy,ax,ay,dBm
2,25.328313,51.425080,1561278477990,0.587335,0.702030,0.601616,0.809438,-45
3,25.328287,51.425072,1561278478996,0.587604,0.699785,0.602825,0.809960,-45
4,25.328255,51.425058,1561278480021,0.586167,0.698232,0.602545,0.810076,-45
5,25.328198,51.425045,1561278481009,0.586019,0.688861,0.602755,0.808792,-47
6,25.328133,51.425027,1561278481964,0.584285,0.685068,0.602475,0.809680,-47
...,...,...,...,...,...,...,...,...
528,25.331425,51.421340,1561280133158,0.607548,0.741344,0.602661,0.810040,-59
529,25.331608,51.421440,1561280138810,0.595506,0.720661,0.602422,0.809730,-59
530,25.331632,51.421472,1561280138960,0.654338,0.764265,0.668577,0.857162,-59
531,25.331563,51.421738,1561280144675,0.604317,0.704939,0.601312,0.808647,-57


In [7]:
from utils.visualization import plot_trace

plot_trace(df, mode="velocity")

## Pipelining the dataloader

In [2]:
from utils.preprocessing import data_preprocessing

traces = ['../data/drive-me-not/trace'+ str(i) + '.csv' for i in range(1, 9)]

traces_df = []
for trace in traces:
    traces_df.append(data_preprocessing(trace))
    
for df in traces_df:
    print(df.shape)

(531, 6)
(564, 6)
(316, 6)
(463, 6)
(297, 6)
(393, 6)
(396, 6)
(318, 6)


# PCA preprocessing

In [3]:
# pca
from utils.preprocessing import pca_transform, add_traces

N_COMPONENTS = 3

pca_dfs = [add_traces(df=pca_transform(df, n_components=N_COMPONENTS), num=i+1) for i, df in enumerate(traces_df)]

pca_dfs = pd.concat(pca_dfs)
pca_dfs['trace'] = pca_dfs['trace'].astype(int).astype(str)
pca_dfs.reset_index(drop=True, inplace=True)
pca_dfs

Unnamed: 0,pca-one,pca-two,pca-three,trace
0,0.410902,0.305377,0.299825,1
1,0.411804,0.306539,0.298783,1
2,0.411490,0.308644,0.298339,1
3,0.414135,0.316684,0.293899,1
4,0.414111,0.320337,0.291218,1
...,...,...,...,...
3273,0.799903,1.000000,1.000000,8
3274,0.397145,0.466808,0.116324,8
3275,0.408979,0.491982,0.214291,8
3276,0.395380,0.512113,0.242318,8


In [4]:
from utils.visualization import plot_pca

plot_pca(pca_dfs, n_components=N_COMPONENTS)

# One-class Classification

## OCSVM

In [314]:
from sklearn.svm import OneClassSVM
from sklearn.model_selection import GridSearchCV
import math

def optimize_OneClassSVM(X, n):
    
    print('Searching for optimal hyperparameters...')
    nu = np.linspace(start=1e-5, stop=1e-2, num=n)
    gamma = np.linspace(start=1e-6, stop=1e-3, num=n)
    opt_diff = 1.0
    opt_nu = None
    opt_gamma = None
    
    for i in range(len(nu)):
        for j in range(len(gamma)):
            classifier = OneClassSVM(kernel="rbf", nu=nu[i], gamma=gamma[j])
            classifier.fit(X)
            label = classifier.predict(X)
            
            p = 1 - float(sum(label == 1.0)) / len(label)
            
            diff = math.fabs(p - nu[i]) # difference between the predicted and expected error rate
            
            if diff < opt_diff: # update the optimal hyperparameters
                opt_diff = diff
                opt_nu = nu[i]
                opt_gamma = gamma[j]
                
    print("Found: nu = %d, gamma = %f" % (opt_nu, opt_gamma))
    return opt_nu, opt_gamma

df_train = pca_dfs[['pca-one', 'pca-two', 'pca-three']]
nu_opt, gamma_opt = optimize_OneClassSVM(df_train, 20)



Searching for optimal hyperparameters...
Found: nu = 0, gamma = 0.000264


## Local Outlier Factor (LOF)

# Autoencoder