Using Data Augmentation in Continuous Authentication on Smartphones  
https://ieeexplore.ieee.org/document/8398208

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import json
import os
from glob import glob
import pickle
import time
pd.set_option('display.max_columns', None)
import scipy as sp
from scipy.fftpack import fft
from scipy.signal import periodogram
import math
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
warnings.filterwarnings('once')

In [3]:
sys.path = ['../../../applications/scikitfeature'] + sys.path

In [4]:
from skfeature.function.similarity_based import fisher_score

<br>

#### Params

In [5]:
params = {}

#### Build window for each user

```
Among all 100 users, there were two users whose data were manually discarded due to extremely abnormal values. Among the remaining 98 users, to ensure that users have the same amount of data, we select the first 100 min of data for each user with a 6-s window size. 
```


In [6]:
df_activity_info = pd.read_csv('../../../mobile-authentication/data/hmog_pp/activity_info.csv')
# sort activity by 'subjectid' and 'starttime' in ascending order
df_activity_info = df_activity_info.sort_values(by=['id', 'starttime'], ascending=True)
df_activity_info.tail()

Unnamed: 0,id,subjectid,session_number,taskid,contentid,gesture_scenario,starttime,endtime,activity_millisec,activity_sec,reading,writing,map,touch_cnt,accele_cnt,gyro_cnt,magnet_cnt
6158,998757242000001,998757,24,23,2,1,1399503673242,1399503705549,32307,32.307,0,0,1,2112,3226,3222,3222
6159,998757243000001,998757,24,23,3,1,1399503705560,1399503730783,25223,25.223,0,0,1,1309,2346,2262,2269
6160,998757244000001,998757,24,23,4,1,1399503730813,1399503770545,39732,39.732,0,0,1,1584,3871,3827,3827
6161,998757245000001,998757,24,23,5,1,1399503770554,1399503813868,43314,43.314,0,0,1,1699,4315,4312,4312
6162,998757246000001,998757,24,23,6,1,1399503813893,1399503841765,27872,27.872,0,0,1,1059,2788,2787,2787


In [7]:
params['subjectid'] = [int(x) for x in df_activity_info.subjectid.unique()]
print('subjectid', len(params['subjectid']))

subjectid 100


<br>
Select 100 min sample per each user 
<br>
window size = 6 seconds
<br>
512 samples for each time window

In [8]:
timelimit = 100 * 60 * 1000 # 100 min in milliseconds
acc_window_list = []
gyro_window_list = []
activityid_list = []
subjectid_list = []

for i in range(len(params['subjectid'])):
    start_time = time.time()
    subjectid = params['subjectid'][i]
    subjectid_df = df_activity_info[df_activity_info.subjectid == subjectid]
    time_sum = 0

    for j in np.arange(len(subjectid_df)):
        activityid_df = subjectid_df.iloc[j]
        time_sum += int(activityid_df.activity_millisec)
        activityid = int(activityid_df.id)

        # 6 sec window segmentation
        df = pd.read_csv('../../../mobile-authentication/data/hmog_pp/merged_activity/{}.csv'.format(activityid))
        cursor = 0
        while cursor < len(df):
            window_df = df[df.millisec.isin(range(cursor,cursor+6000))]
            acc_arr = np.array(window_df[['accele_x', 'accele_y', 'accele_z']].dropna())
            gyro_arr = np.array(window_df[['gyro_x', 'gyro_y', 'gyro_z']].dropna())
            # only use data containing 512 or more samples
            if (len(acc_arr) >= 512) & (len(gyro_arr) >= 512):
                acc_window_list.append(acc_arr[:512]) # 512 sample only
                gyro_window_list.append(gyro_arr[:512])
                activityid_list.append(activityid)
                subjectid_list.append(subjectid)
            cursor += 6000

        if time_sum > timelimit:
            last_activityid = activityid_df.id
            last_endtime = int(activityid_df.endtime - (time_sum - timelimit))
            break   
    end_time = time.time()
    print('[{}] subjectid:{} record:{} {:.2f}'.format(i, subjectid, len(acc_window_list), end_time-start_time))
acc_window_list = np.array(acc_window_list)
gyro_window_list = np.array(gyro_window_list)
activityid_list = np.array(activityid_list)
subjectid_list = np.array(subjectid_list)
print('acc_window_list', acc_window_list.shape)
print('gyro_window_list', gyro_window_list.shape)
print('activityid_list', activityid_list.shape)
print('subjectid_list', subjectid_list.shape)
np.save('../../../mobile-authentication/data/hmog_pp/target/sensorauth/acc_window_list.npy', acc_window_list)
np.save('../../../mobile-authentication/data/hmog_pp/target/sensorauth/gyro_window_list.npy', gyro_window_list)
np.save('../../../mobile-authentication/data/hmog_pp/target/sensorauth/activityid_list.npy', activityid_list)
np.save('../../../mobile-authentication/data/hmog_pp/target/sensorauth/subjectid_list.npy', subjectid_list)

[0] subjectid:100669 record:1001 37.27
[1] subjectid:151985 record:1986 34.74
[2] subjectid:171538 record:2974 30.24
[3] subjectid:180679 record:3950 35.14
[4] subjectid:186676 record:4939 30.92
[5] subjectid:201848 record:5973 38.97
[6] subjectid:207696 record:6983 31.68
[7] subjectid:218719 record:8026 43.02
[8] subjectid:219303 record:8895 33.34
[9] subjectid:220962 record:9864 50.04
[10] subjectid:240168 record:10848 40.22
[11] subjectid:248252 record:11838 40.51
[12] subjectid:256487 record:12829 41.49
[13] subjectid:257279 record:13842 38.34
[14] subjectid:261313 record:14838 30.62
[15] subjectid:264325 record:15853 39.94
[16] subjectid:277905 record:16868 32.64
[17] subjectid:278135 record:17863 38.32
[18] subjectid:326223 record:18838 31.80
[19] subjectid:336172 record:19812 36.61
[20] subjectid:342329 record:20789 28.98
[21] subjectid:352716 record:21772 35.66
[22] subjectid:366286 record:22743 32.85
[23] subjectid:368258 record:23761 39.27
[24] subjectid:389015 record:24777 3

<br>
Extract features 
<br>
magnitude = L2(x, y, z)
<br>
Time + Frequency domain

In [9]:
acc_window_mag_list = np.sqrt(np.sum((acc_window_list ** 2), axis=-1))
gyro_window_mag_list = np.sqrt(np.sum((gyro_window_list ** 2), axis=-1))
print('acc_window_mag_list', acc_window_mag_list.shape)
print('gyro_window_mag_list', gyro_window_mag_list.shape)
np.save('../../../mobile-authentication/data/hmog_pp/target/sensorauth/acc_window_mag_list.npy', acc_window_mag_list)
np.save('../../../mobile-authentication/data/hmog_pp/target/sensorauth/gyro_window_mag_list.npy', gyro_window_mag_list)

acc_window_mag_list (98587, 512)
gyro_window_mag_list (98587, 512)


* Time Domain  
`mean`: mean of the magnitudes of sensor readings  
`median`: median of the magnitudes of sensor readings  
`std`: standard deviation of the magnitudes of sensor readings  
`max`: maximum value of the magnitudes of sensor readings  
`min`: minimum value of the magnitudes of sensor readings  
`range`: difference between the maximum value and the minimum value of the magnitudes of sensor readings  
`kurtosis`: width of peak of the magnitudes of sensor readings  
`skewness`: orientation of peak of the magnitudes of sensor readings  
`quantiles_25`: 25% quantiles of magnitudes of sensor readings  
`quantiles_50`: 50% quantiles of magnitudes of sensor readings  
`quantiles_75`: 75% quantiles of magnitudes of sensor readings

In [10]:
acc_feature_dict = {}
acc_feature_dict['mean'] = np.mean(acc_window_mag_list, axis=1)
acc_feature_dict['median'] = np.median(acc_window_mag_list, axis=1)
acc_feature_dict['std'] = np.std(acc_window_mag_list, axis=1)
acc_feature_dict['max'] = np.max(acc_window_mag_list, axis=1)
acc_feature_dict['min'] = np.min(acc_window_mag_list, axis=1)
acc_feature_dict['range'] = acc_feature_dict['max'] - acc_feature_dict['min']
acc_feature_dict['kurtosis'] = sp.stats.kurtosis(acc_window_mag_list, axis=1)
acc_feature_dict['skewness'] = sp.stats.skew(acc_window_mag_list, axis=1)
acc_feature_dict['quantiles_25'] = np.quantile(acc_window_mag_list, 0.25, axis=1)
acc_feature_dict['quantiles_50'] = np.quantile(acc_window_mag_list, 0.5, axis=1)
acc_feature_dict['quantiles_75'] = np.quantile(acc_window_mag_list, 0.75, axis=1)

In [11]:
gyro_feature_dict = {}
gyro_feature_dict['mean'] = np.mean(gyro_window_mag_list, axis=1)
gyro_feature_dict['median'] = np.median(gyro_window_mag_list, axis=1)
gyro_feature_dict['std'] = np.std(gyro_window_mag_list, axis=1)
gyro_feature_dict['max'] = np.max(gyro_window_mag_list, axis=1)
gyro_feature_dict['min'] = np.min(gyro_window_mag_list, axis=1)
gyro_feature_dict['range'] = gyro_feature_dict['max'] - gyro_feature_dict['min']
gyro_feature_dict['kurtosis'] = sp.stats.kurtosis(gyro_window_mag_list, axis=1)
gyro_feature_dict['skewness'] = sp.stats.skew(gyro_window_mag_list, axis=1)
gyro_feature_dict['quantiles_25'] = np.quantile(gyro_window_mag_list, 0.25, axis=1)
gyro_feature_dict['quantiles_50'] = np.quantile(gyro_window_mag_list, 0.5, axis=1)
gyro_feature_dict['quantiles_75'] = np.quantile(gyro_window_mag_list, 0.75, axis=1)

* Frequency Domain  
`energy`: intensity of the magnitudes of sensor readings  
`entropy`: dispersion of spectral distribution of the magnitudes of sensor readings  
`peak1`: amplitude of the first highest peak of the magnitudes of sensor readings  
`peak2f`: frequency of the second highest peak of the manitudes of sensor readings  
`peak2`: amplitude of the second highest peak of the manitudes of sensor readings

In [12]:
acc_feature_dict['energy'] = []
acc_feature_dict['entropy'] = []
acc_feature_dict['peak1'] = []
acc_feature_dict['peak2f'] = []
acc_feature_dict['peak2'] = []
start_time = time.time()

for i in range(len(acc_window_mag_list)):
    Fs = 100                     # Sampling frequency
    T = 1/Fs                     # Sample interval time
    te= 5.2                     # End of time
    t = np.arange(0, te, T)      # Time vector
    y = acc_window_mag_list[i]
    y.shape
    # Calculate FFT ....................
    n=len(y)        # Length of signal
    NFFT=n      # ?? NFFT=2^nextpow2(length(y))  ??
    k=np.arange(NFFT)
    f0=k*Fs/NFFT    # double sides frequency range
    f0=f0[range(math.trunc(NFFT/2))]        # single sied frequency range

    Y=np.fft.fft(y)/NFFT        # fft computing and normaliation
    Y=Y[range(math.trunc(NFFT/2))]          # single sied frequency range
    amplitude_Hz = 2*abs(Y)
    freq_res = pd.DataFrame(np.array([f0, amplitude_Hz]).T, columns=['freq', 'Hz'])
    freq_res = freq_res.sort_values(by='Hz', ascending=False)
    # spectral enrtopy (https://raphaelvallat.com/entropy/build/html/_modules/entropy/entropy.html#spectral_entropy)
    _, psd = periodogram(y, Fs)
    psd_norm = np.divide(psd, psd.sum())
    entropy = -np.multiply(psd_norm, np.log2(psd_norm)).sum()
    
    acc_feature_dict['energy'].append(sum(np.power(np.abs(Y),2))*n*T)
    acc_feature_dict['entropy'].append(entropy)
    acc_feature_dict['peak1'].append(freq_res.iloc[0].Hz)
    acc_feature_dict['peak2f'].append(freq_res.iloc[1].freq)
    acc_feature_dict['peak2'].append(freq_res.iloc[1].Hz)
    
acc_feature_dict['energy'] = np.array(acc_feature_dict['energy'])
acc_feature_dict['entropy'] = np.array(acc_feature_dict['entropy'])
acc_feature_dict['peak1'] = np.array(acc_feature_dict['peak1'])
acc_feature_dict['peak2f'] = np.array(acc_feature_dict['peak2f'])
acc_feature_dict['peak2'] = np.array(acc_feature_dict['peak2'])
end_time = time.time()
print(i, round((end_time - start_time)/60, 2), 'min')



98586 3.34 min


In [13]:
gyro_feature_dict['energy'] = []
gyro_feature_dict['entropy'] = []
gyro_feature_dict['peak1'] = []
gyro_feature_dict['peak2f'] = []
gyro_feature_dict['peak2'] = []
start_time = time.time()

for i in range(len(gyro_window_mag_list)):
    Fs = 100                     # Sampling frequency
    T = 1/Fs                     # Sample interval time
    te= 5.2                     # End of time
    t = np.arange(0, te, T)      # Time vector
    y = gyro_window_mag_list[i]
    y.shape
    # Calculate FFT ....................
    n=len(y)        # Length of signal
    NFFT=n      # ?? NFFT=2^nextpow2(length(y))  ??
    k=np.arange(NFFT)
    f0=k*Fs/NFFT    # double sides frequency range
    f0=f0[range(math.trunc(NFFT/2))]        # single sied frequency range

    Y=np.fft.fft(y)/NFFT        # fft computing and normaliation
    Y=Y[range(math.trunc(NFFT/2))]          # single sied frequency range
    amplitude_Hz = 2*abs(Y)
    freq_res = pd.DataFrame(np.array([f0, amplitude_Hz]).T, columns=['freq', 'Hz'])
    freq_res = freq_res.sort_values(by='Hz', ascending=False)
    # spectral enrtopy (https://raphaelvallat.com/entropy/build/html/_modules/entropy/entropy.html#spectral_entropy)
    _, psd = periodogram(y, Fs)
    psd_norm = np.divide(psd, psd.sum())
    entropy = -np.multiply(psd_norm, np.log2(psd_norm)).sum()
    
    gyro_feature_dict['energy'].append(sum(np.power(np.abs(Y),2))*n*T)
    gyro_feature_dict['entropy'].append(entropy)
    gyro_feature_dict['peak1'].append(freq_res.iloc[0].Hz)
    gyro_feature_dict['peak2f'].append(freq_res.iloc[1].freq)
    gyro_feature_dict['peak2'].append(freq_res.iloc[1].Hz)
    
gyro_feature_dict['energy'] = np.array(gyro_feature_dict['energy'])
gyro_feature_dict['entropy'] = np.array(gyro_feature_dict['entropy'])
gyro_feature_dict['peak1'] = np.array(gyro_feature_dict['peak1'])
gyro_feature_dict['peak2f'] = np.array(gyro_feature_dict['peak2f'])
gyro_feature_dict['peak2'] = np.array(gyro_feature_dict['peak2'])
end_time = time.time()
print(i, round((end_time - start_time)/60, 2), 'min')



98586 3.34 min


<br>
save feature dict

In [14]:
pickle.dump(acc_feature_dict, open('../../../mobile-authentication/data/hmog_pp/target/sensorauth/acc_feature_dict.pkl', 'wb'))
pickle.dump(gyro_feature_dict, open('../../../mobile-authentication/data/hmog_pp/target/sensorauth/gyro_feature_dict.pkl', 'wb'))

  """Entry point for launching an IPython kernel.
  


<br>

#### build dataset df

In [15]:
activityid_list = np.load('../../../mobile-authentication/data/hmog_pp/target/sensorauth/activityid_list.npy')
subjectid_list = np.load('../../../mobile-authentication/data/hmog_pp/target/sensorauth/subjectid_list.npy')
acc_feature_dict = pickle.load(open('../../../mobile-authentication/data/hmog_pp/target/sensorauth/acc_feature_dict.pkl', 'rb'))
gyro_feature_dict = pickle.load(open('../../../mobile-authentication/data/hmog_pp/target/sensorauth/gyro_feature_dict.pkl', 'rb'))

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [16]:
df_data = pd.DataFrame(subjectid_list, columns=['subjectid'])
subjectid_dict = {x:i for i,x in enumerate(params['subjectid'])}
df_data['label'] = [subjectid_dict[x] for x in df_data['subjectid']]
acc_cols = ['mean', 'median', 'std', 'max', 'min', 'range', 'kurtosis', 'skewness',
            'quantiles_25', 'quantiles_50', 'quantiles_75', 'energy', 'entropy', 'peak1', 'peak2f', 'peak2']
gyro_cols = ['mean', 'median', 'std', 'max', 'min', 'range', 'kurtosis', 'skewness',
            'quantiles_25', 'quantiles_50', 'quantiles_75', 'energy', 'entropy', 'peak1', 'peak2f', 'peak2']

for col in acc_cols[:11]:
    df_data['acc_t_'+col] = acc_feature_dict[col]
for col in acc_cols[11:]:
    df_data['acc_f_'+col] = acc_feature_dict[col]
for col in gyro_cols[:11]:
    df_data['gyro_t_'+col] = gyro_feature_dict[col]
for col in gyro_cols[11:]:
    df_data['gyro_f_'+col] = gyro_feature_dict[col]

In [17]:
print(df_data.shape)
df_data.to_csv('../../../mobile-authentication/data/hmog_pp/target/sensorauth/data.csv', index=False)
df_data.tail()

(98587, 34)


Unnamed: 0,subjectid,label,acc_t_mean,acc_t_median,acc_t_std,acc_t_max,acc_t_min,acc_t_range,acc_t_kurtosis,acc_t_skewness,acc_t_quantiles_25,acc_t_quantiles_50,acc_t_quantiles_75,acc_f_energy,acc_f_entropy,acc_f_peak1,acc_f_peak2f,acc_f_peak2,gyro_t_mean,gyro_t_median,gyro_t_std,gyro_t_max,gyro_t_min,gyro_t_range,gyro_t_kurtosis,gyro_t_skewness,gyro_t_quantiles_25,gyro_t_quantiles_50,gyro_t_quantiles_75,gyro_f_energy,gyro_f_entropy,gyro_f_peak1,gyro_f_peak2f,gyro_f_peak2
98582,998757,99,10.033684,10.146326,0.656079,11.643142,7.120305,4.522836,0.258244,-0.380204,9.530538,10.146326,10.447362,516.556881,4.51894,20.067367,1.171875,0.565463,0.420649,0.354958,0.238959,1.17604,0.026391,1.149649,0.8803,1.078294,0.261461,0.354958,0.534094,1.052142,4.840975,0.841299,0.390625,0.121211
98583,998757,99,10.034959,10.033893,0.632435,11.658115,7.849859,3.808256,0.145641,-0.169806,9.599736,10.033893,10.444976,516.61001,5.280049,20.069918,1.171875,0.385769,0.464794,0.433832,0.232352,1.577818,0.073399,1.504419,2.559486,1.166789,0.291921,0.433832,0.605509,1.244301,4.819591,0.929589,0.195312,0.131289
98584,998757,99,9.982186,10.030775,1.045393,14.704916,6.794541,7.910376,3.330088,0.729173,9.390713,10.030775,10.501706,512.975188,4.970194,19.964373,1.171875,0.669656,0.877039,0.73978,0.622908,3.466881,0.126377,3.340504,3.95329,1.872905,0.462043,0.73978,1.038594,4.931603,3.847863,1.754077,1.171875,0.368487
98585,998757,99,9.88247,10.010674,0.848183,12.161006,7.790751,4.370255,-0.569274,-0.24521,9.319937,10.010674,10.524536,501.877342,4.634573,19.76494,1.171875,0.49694,0.327692,0.305621,0.131579,0.899636,0.111517,0.78812,1.681638,1.097356,0.228629,0.305621,0.393619,0.594116,5.216942,0.655384,1.367188,0.078143
98586,998757,99,9.863207,9.923339,0.732925,11.888627,7.854968,4.033659,-0.116119,-0.33065,9.349914,9.923339,10.373766,499.463344,4.600916,19.726413,1.171875,0.474175,0.303166,0.276176,0.128324,0.701072,0.059985,0.641087,0.14543,0.778632,0.207795,0.276176,0.378909,0.512732,4.724558,0.606331,0.195312,0.08258


In [18]:
df_data = pd.read_csv('../../../mobile-authentication/data/hmog_pp/target/sensorauth/data.csv')

In [19]:
X = np.array(df_data.iloc[:,2:])
scaler = StandardScaler()# standard scale
X = scaler.fit_transform(X)
y = df_data.label.values    # label
n_samples, n_features = X.shape    # number of samples and number of features

# split data into 10 folds
kf = KFold(n_splits=20, shuffle=True)
kf.get_n_splits(X)

# perform evaluation on classification task
num_fea = 20    # number of selected features
clf = svm.LinearSVC()    # linear SVM

score_list = []
num = 0
for train, test in kf.split(X):
    num += 1
    start_time = time.time()
    # obtain the score of each feature on the training set
    score = fisher_score.fisher_score(X[test], y[test])
    score = np.nan_to_num(score, 0)
    score_list.append(score)
    end_time = time.time()
    print('[{}]: {:.2f} sec'.format(num, end_time-start_time))

  D_prime[D_prime < 1e-12] = 10000


[1]: 7.54 sec


  D_prime[D_prime < 1e-12] = 10000


[2]: 7.23 sec


  D_prime[D_prime < 1e-12] = 10000


[3]: 7.37 sec


  D_prime[D_prime < 1e-12] = 10000


[4]: 7.12 sec


  D_prime[D_prime < 1e-12] = 10000


[5]: 7.20 sec


  D_prime[D_prime < 1e-12] = 10000


[6]: 7.20 sec


  D_prime[D_prime < 1e-12] = 10000


[7]: 7.16 sec


  D_prime[D_prime < 1e-12] = 10000


[8]: 7.23 sec


  D_prime[D_prime < 1e-12] = 10000


[9]: 7.20 sec


  D_prime[D_prime < 1e-12] = 10000


[10]: 7.11 sec


  D_prime[D_prime < 1e-12] = 10000


[11]: 7.10 sec


  D_prime[D_prime < 1e-12] = 10000


[12]: 7.32 sec


  D_prime[D_prime < 1e-12] = 10000


[13]: 7.19 sec


  D_prime[D_prime < 1e-12] = 10000


[14]: 7.22 sec


  D_prime[D_prime < 1e-12] = 10000


[15]: 7.20 sec


  D_prime[D_prime < 1e-12] = 10000


[16]: 7.22 sec


  D_prime[D_prime < 1e-12] = 10000


[17]: 7.19 sec


  D_prime[D_prime < 1e-12] = 10000


[18]: 7.15 sec


  D_prime[D_prime < 1e-12] = 10000


[19]: 7.23 sec
[20]: 7.14 sec


  D_prime[D_prime < 1e-12] = 10000


In [20]:
sum_score = np.sum(np.array(score_list), axis=0)
fisher_score_df = pd.DataFrame(np.array([df_data.columns[2:].values, sum_score]).T, columns=['feature', 'score'])
fisher_score_df['rate'] =  fisher_score_df.score / fisher_score_df.score.sum()
fisher_score_df = fisher_score_df.sort_values(by='rate', ascending=False)
fisher_score_df['cum_rate'] = fisher_score_df.rate.cumsum()

In [21]:
valid_features = fisher_score_df[fisher_score_df.cum_rate < 0.91].feature.values
print('valid_features', len(valid_features))
fisher_score_df[fisher_score_df.cum_rate < 0.91]

valid_features 16


Unnamed: 0,feature,score,rate,cum_rate
1,acc_t_median,42.572,0.121511,0.121511
9,acc_t_quantiles_50,42.572,0.121511,0.243021
0,acc_t_mean,42.5228,0.12137,0.364392
13,acc_f_peak1,42.5228,0.12137,0.485762
11,acc_f_energy,41.8186,0.11936,0.605122
8,acc_t_quantiles_25,29.4108,0.0839456,0.689068
10,acc_t_quantiles_75,28.9059,0.0825043,0.771572
4,acc_t_min,7.04716,0.0201143,0.791686
3,acc_t_max,6.53679,0.0186576,0.810344
15,acc_f_peak2,6.06999,0.0173252,0.827669


<br>

#### One-Class SVM score

In [22]:
from sklearn.svm import OneClassSVM
from sklearn.utils import shuffle

<br>
train data size = 500

In [23]:
np.random.seed(0)
for train_cnt in [100, 200, 300, 400, 500]:
    print('train data size = {}'.format(train_cnt))
    avg_score_dict = {}
    for i in range(len(params['subjectid'])):
        start_time = time.time()
        avg_score_dict[i] = {'accuracy':[], 'far':[], 'frr':[], 'hter':[]}
        for j in range(20):
            target_df = df_data[df_data.label == i][valid_features]
            target_df = shuffle(target_df)
            x_train = np.array(target_df.iloc[:train_cnt,:])
            x_test = np.array(target_df.iloc[train_cnt:,:])
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train)
            x_test = scaler.transform(x_test)

            intruder_df = df_data[df_data.label != i][valid_features]
            intruder_df = shuffle(intruder_df)
            x_intruder = np.array(intruder_df)
            x_intruder = scaler.transform(x_intruder)

            # split test data 10 folds
            x_test_splits = []
            x_intruder_splits = []
            cursor_list = np.arange(0, len(x_test), len(x_test) // 10)
            for start, end in zip(cursor_list[:-1], cursor_list[1:]):
                x_test_splits.append(x_test[start:end])
                x_intruder_splits.append(x_intruder[start:end])

            # train SVM
            svm = OneClassSVM(gamma=0.05, nu=0.01).fit(x_train)
            accuracy_list = []
            far_list = []
            frr_list = []
            hter_list = []
            for test, intruder in zip(x_test_splits, x_intruder_splits):
                x_valid = np.vstack([test, intruder])
                y_true = np.array([1]*len(test) + [-1]*len(intruder))
                y_pred = np.array(svm.predict(x_valid))

                # score (-1 is positive)
                tp = np.sum((y_true == -1) & (y_pred == -1))
                fp = np.sum((y_true == 1) & (y_pred == -1))
                tn = np.sum((y_true == 1) & (y_pred == 1))
                fn = np.sum((y_true == -1) & (y_pred == 1))
                accuracy = np.sum((tp+tn)/(tp+fp+tn+fn)); accuracy_list.append(accuracy);
                if (tp + fp) != 0:
                    far = fp / (tp + fp); far_list.append(far);
                else:
                    far_list.append(0.);
                if (tn + fn) != 0:
                    frr = fn / (tn + fn); frr_list.append(frr);
                else:
                    frr_list.append(0.)
                hter = (far + frr) / 2; hter_list.append(hter);

            # update average 10-fold score
            avg_score_dict[i]['accuracy'].append(np.mean(accuracy_list))
            avg_score_dict[i]['far'].append(np.mean(far_list))
            avg_score_dict[i]['frr'].append(np.mean(frr_list))
            avg_score_dict[i]['hter'].append(np.mean(hter_list))

        end_time = time.time()
        print('[{}] {} accracy:{:.2f} FAR:{:.2f} FRR:{:.2f} HTER:{:.2f}  {:.2f} sec'.format(
            i, params['subjectid'][i], np.mean(avg_score_dict[i]['accuracy'])*100, np.mean(avg_score_dict[i]['far'][-1])*100,
            np.mean(avg_score_dict[i]['frr'][-1])*100, np.mean(avg_score_dict[i]['hter'][-1])*100, end_time - start_time))
    print('')

    # save average score dict
    pickle.dump(avg_score_dict, open('../../../mobile-authentication/log/target/sensorauth/avg_score_dict_dsize{}.pkl'.format(train_cnt), 'wb'))

train data size = 100
[0] 100669 accracy:82.89 FAR:20.46 FRR:18.08 HTER:19.27  0.93 sec
[1] 151985 accracy:82.29 FAR:15.63 FRR:22.61 HTER:19.12  0.90 sec
[2] 171538 accracy:75.76 FAR:23.42 FRR:28.75 HTER:26.09  0.90 sec
[3] 180679 accracy:79.47 FAR:17.12 FRR:21.91 HTER:19.51  0.90 sec
[4] 186676 accracy:80.08 FAR:15.69 FRR:21.04 HTER:18.36  0.90 sec
[5] 201848 accracy:65.15 FAR:20.80 FRR:36.31 HTER:28.55  0.90 sec
[6] 207696 accracy:86.11 FAR:12.24 FRR:10.04 HTER:11.14  0.97 sec
[7] 218719 accracy:86.37 FAR:13.36 FRR:16.12 HTER:14.74  0.88 sec
[8] 219303 accracy:85.83 FAR:13.39 FRR:12.86 HTER:13.13  0.88 sec
[9] 220962 accracy:81.94 FAR:16.16 FRR:19.09 HTER:17.63  0.89 sec
[10] 240168 accracy:86.00 FAR:19.31 FRR:13.81 HTER:16.56  0.88 sec
[11] 248252 accracy:62.47 FAR:31.72 FRR:40.03 HTER:35.87  0.87 sec
[12] 256487 accracy:89.23 FAR:11.81 FRR:4.86 HTER:8.34  0.89 sec
[13] 257279 accracy:84.26 FAR:17.46 FRR:17.78 HTER:17.62  0.93 sec
[14] 261313 accracy:79.32 FAR:17.93 FRR:25.36 HTER:2



[0] 100669 accracy:84.93 FAR:9.66 FRR:18.95 HTER:14.31  0.89 sec
[1] 151985 accracy:82.59 FAR:10.75 FRR:23.21 HTER:16.98  0.88 sec
[2] 171538 accracy:77.35 FAR:10.82 FRR:27.26 HTER:19.04  0.89 sec
[3] 180679 accracy:80.53 FAR:7.79 FRR:26.40 HTER:17.10  0.88 sec
[4] 186676 accracy:80.93 FAR:16.50 FRR:23.55 HTER:20.02  0.89 sec
[5] 201848 accracy:64.53 FAR:17.94 FRR:38.10 HTER:28.02  0.87 sec
[6] 207696 accracy:87.63 FAR:10.85 FRR:12.30 HTER:11.57  0.90 sec
[7] 218719 accracy:85.97 FAR:8.17 FRR:17.06 HTER:12.62  0.87 sec
[8] 219303 accracy:87.36 FAR:9.48 FRR:16.05 HTER:12.77  0.88 sec
[9] 220962 accracy:83.84 FAR:9.81 FRR:18.94 HTER:14.38  0.88 sec
[10] 240168 accracy:87.13 FAR:11.81 FRR:13.36 HTER:12.59  0.90 sec
[11] 248252 accracy:62.23 FAR:18.60 FRR:40.68 HTER:29.64  0.87 sec
[12] 256487 accracy:91.68 FAR:10.84 FRR:5.67 HTER:8.25  0.91 sec
[13] 257279 accracy:84.90 FAR:8.82 FRR:19.68 HTER:14.25  0.91 sec
[14] 261313 accracy:77.88 FAR:7.89 FRR:28.48 HTER:18.18  0.90 sec
[15] 264325 ac



[0] 100669 accracy:85.41 FAR:10.42 FRR:21.42 HTER:15.92  0.91 sec
[1] 151985 accracy:82.71 FAR:8.12 FRR:21.61 HTER:14.87  0.89 sec
[2] 171538 accracy:77.36 FAR:6.77 FRR:32.19 HTER:19.48  0.89 sec
[3] 180679 accracy:80.38 FAR:7.42 FRR:24.23 HTER:15.83  0.89 sec
[4] 186676 accracy:81.12 FAR:10.30 FRR:22.86 HTER:16.58  0.87 sec
[5] 201848 accracy:64.64 FAR:16.19 FRR:41.59 HTER:28.89  0.88 sec
[6] 207696 accracy:88.03 FAR:11.47 FRR:10.97 HTER:11.22  0.90 sec
[7] 218719 accracy:83.53 FAR:8.83 FRR:26.21 HTER:17.52  0.87 sec
[8] 219303 accracy:87.46 FAR:8.49 FRR:14.49 HTER:11.49  0.88 sec
[9] 220962 accracy:83.61 FAR:5.48 FRR:23.14 HTER:14.31  0.88 sec
[10] 240168 accracy:87.38 FAR:12.06 FRR:15.68 HTER:13.87  0.88 sec
[11] 248252 accracy:61.57 FAR:22.09 FRR:44.03 HTER:33.06  0.88 sec
[12] 256487 accracy:93.19 FAR:8.22 FRR:3.79 HTER:6.00  0.90 sec
[13] 257279 accracy:85.33 FAR:7.44 FRR:20.01 HTER:13.73  0.89 sec
[14] 261313 accracy:75.70 FAR:6.78 FRR:33.92 HTER:20.35  0.87 sec
[15] 264325 accr



[0] 100669 accracy:85.58 FAR:7.84 FRR:19.05 HTER:13.44  0.91 sec
[1] 151985 accracy:82.43 FAR:9.05 FRR:22.65 HTER:15.85  0.89 sec
[2] 171538 accracy:78.41 FAR:6.61 FRR:28.99 HTER:17.80  0.90 sec
[3] 180679 accracy:80.67 FAR:9.06 FRR:27.56 HTER:18.31  0.90 sec
[4] 186676 accracy:80.87 FAR:7.18 FRR:24.91 HTER:16.05  0.90 sec
[5] 201848 accracy:64.77 FAR:14.56 FRR:41.64 HTER:28.10  0.90 sec
[6] 207696 accracy:88.24 FAR:10.64 FRR:16.85 HTER:13.75  0.90 sec
[7] 218719 accracy:83.03 FAR:9.60 FRR:31.28 HTER:20.44  0.90 sec
[8] 219303 accracy:88.10 FAR:10.76 FRR:15.35 HTER:13.05  0.89 sec
[9] 220962 accracy:83.72 FAR:7.18 FRR:23.86 HTER:15.52  0.90 sec
[10] 240168 accracy:88.01 FAR:7.11 FRR:16.95 HTER:12.03  0.90 sec
[11] 248252 accracy:60.61 FAR:16.87 FRR:42.52 HTER:29.69  0.89 sec
[12] 256487 accracy:93.71 FAR:8.14 FRR:5.03 HTER:6.59  0.92 sec
[13] 257279 accracy:85.51 FAR:9.09 FRR:21.55 HTER:15.32  0.91 sec
[14] 261313 accracy:74.80 FAR:6.69 FRR:30.51 HTER:18.60  0.89 sec
[15] 264325 accrac



[0] 100669 accracy:85.53 FAR:9.47 FRR:18.38 HTER:13.92  0.91 sec
[1] 151985 accracy:82.66 FAR:8.95 FRR:21.24 HTER:15.09  0.89 sec
[2] 171538 accracy:78.21 FAR:7.00 FRR:28.56 HTER:17.78  0.88 sec
[3] 180679 accracy:80.19 FAR:9.79 FRR:23.82 HTER:16.81  0.90 sec
[4] 186676 accracy:81.21 FAR:8.24 FRR:26.23 HTER:17.24  0.89 sec
[5] 201848 accracy:64.21 FAR:12.49 FRR:41.40 HTER:26.95  0.90 sec
[6] 207696 accracy:88.34 FAR:7.10 FRR:11.82 HTER:9.46  0.91 sec
[7] 218719 accracy:83.17 FAR:7.03 FRR:21.02 HTER:14.03  0.88 sec
[8] 219303 accracy:87.72 FAR:5.67 FRR:13.99 HTER:9.83  0.89 sec
[9] 220962 accracy:83.95 FAR:2.42 FRR:22.64 HTER:12.53  0.94 sec
[10] 240168 accracy:88.20 FAR:3.69 FRR:16.97 HTER:10.33  0.90 sec
[11] 248252 accracy:60.29 FAR:17.86 FRR:42.84 HTER:30.35  0.89 sec
[12] 256487 accracy:94.15 FAR:9.23 FRR:3.51 HTER:6.37  0.92 sec
[13] 257279 accracy:85.46 FAR:4.88 FRR:21.53 HTER:13.21  0.91 sec
[14] 261313 accracy:74.60 FAR:6.12 FRR:29.36 HTER:17.74  0.88 sec
[15] 264325 accracy:52



In [24]:
for train_cnt in [100, 200, 300, 400, 500]:
    print('[train data size] = {}'.format(train_cnt))
    avg_score_dict = pickle.load(open('../../../mobile-authentication/log/target/sensorauth/avg_score_dict_dsize{}.pkl'.format(train_cnt), 'rb'))
    accuracy_log = []
    far_log = []
    frr_log = []
    hter_log = []
    for i in range(len(params['subjectid'])):
        accuracy_log += list(avg_score_dict[i]['accuracy'])
        far_log += list(avg_score_dict[i]['far'])
        frr_log += list(avg_score_dict[i]['frr'])
        hter_log += list(avg_score_dict[i]['hter'])
    print('Accuracy: mean-{:.2f}% median-{:.2f}% (std-{:.2f}%)'.format(np.mean(accuracy_log)*100, np.median(accuracy_log)*100, np.std(accuracy_log)*100))
    print('FAR: mean-mean-{:.2f}% median-{:.2f}% (std-{:.2f}%)'.format(np.mean(far_log)*100, np.median(far_log)*100, np.std(far_log)*100))
    print('FRR: mean-{:.2f}% median-{:.2f}% (std-{:.2f}%)'.format(np.mean(frr_log)*100, np.median(frr_log)*100, np.std(frr_log)*100))
    print('HTER: mean-{:.2f}% median-{:.2f}% (std-{:.2f}%)'.format(np.mean(hter_log)*100, np.median(hter_log)*100, np.std(hter_log)*100))
    print('')

[train data size] = 100
Accuracy: mean-77.51% median-79.37% (std-9.61%)
FAR: mean-mean-18.74% median-17.40% (std-6.24%)
FRR: mean-23.67% median-23.17% (std-12.29%)
HTER: mean-21.20% median-19.87% (std-8.63%)

[train data size] = 200
Accuracy: mean-78.50% median-80.55% (std-10.44%)
FAR: mean-mean-13.67% median-12.09% (std-5.76%)
FRR: mean-24.73% median-24.37% (std-12.27%)
HTER: mean-19.20% median-17.94% (std-8.41%)

[train data size] = 300
Accuracy: mean-78.79% median-80.92% (std-10.81%)
FAR: mean-mean-11.23% median-9.81% (std-5.32%)
FRR: mean-25.23% median-24.89% (std-12.27%)
HTER: mean-18.23% median-17.04% (std-8.21%)

[train data size] = 400
Accuracy: mean-78.81% median-80.96% (std-11.10%)
FAR: mean-mean-9.73% median-8.35% (std-5.06%)
FRR: mean-25.61% median-25.31% (std-12.32%)
HTER: mean-17.67% median-16.54% (std-8.08%)

[train data size] = 500
Accuracy: mean-78.81% median-80.98% (std-11.26%)
FAR: mean-mean-8.74% median-7.37% (std-4.95%)
FRR: mean-25.87% median-25.52% (std-12.30%)
H

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
