In [3]:
import numpy as np
import pandas as pd
import glob
import os.path
import datetime
import os

def read_plt(plt_file):
    points = pd.read_csv(plt_file, skiprows=6, header=None,
                         parse_dates=[[5, 6]], infer_datetime_format=True)

    # for clarity rename columns
    points.rename(inplace=True, columns={'5_6': 'time', 0: 'lat', 1: 'lon', 3: 'alt'})

    # remove unused columns
    points.drop(inplace=True, columns=[2, 4])

    return points

mode_names = ['walk', 'bike', 'bus', 'car', 'subway','train', 'airplane', 'boat', 'run', 'motorcycle', 'taxi']
mode_ids = {s : i + 1 for i, s in enumerate(mode_names)}

def read_labels(labels_file):
    labels = pd.read_csv(labels_file, skiprows=1, header=None,
                         parse_dates=[[0, 1], [2, 3]],
                         infer_datetime_format=True, delim_whitespace=True)

    # for clarity rename columns
    labels.columns = ['start_time', 'end_time', 'label']

    # replace 'label' column with integer encoding
    labels['label'] = [mode_ids[i] for i in labels['label']]

    return labels

def apply_labels(points, labels):
    indices = labels['start_time'].searchsorted(points['time'], side='right') - 1
    no_label = (indices < 0) | (points['time'].values >= labels['end_time'].iloc[indices].values)
    points['label'] = labels['label'].iloc[indices].values
    points['label'][no_label] = 0

def read_user(user_folder):
    labels = None

    plt_files = glob.glob(os.path.join(user_folder, 'Trajectory', '*.plt'))
    df = pd.concat([read_plt(f) for f in plt_files])

    labels_file = os.path.join(user_folder, 'labels.txt')
    if os.path.exists(labels_file):
        labels = read_labels(labels_file)
        apply_labels(df, labels)
    else:
        df['label'] = 0

    return df

def read_all_users(folder):
    subfolders = os.listdir(folder)
    dfs = []
    for i, sf in enumerate(subfolders):
        print('[%d/%d] processing user %s' % (i + 1, len(subfolders), sf))
        df = read_user(os.path.join(folder,sf))
        df['user'] = int(sf)
        dfs.append(df)
    return pd.concat(dfs)

In [4]:
df = read_all_users('Data')
df

[1/182] processing user 000
[2/182] processing user 001
[3/182] processing user 002
[4/182] processing user 003
[5/182] processing user 004
[6/182] processing user 005
[7/182] processing user 006
[8/182] processing user 007
[9/182] processing user 008
[10/182] processing user 009
[11/182] processing user 010


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  points['label'][no_label] = 0


[12/182] processing user 011
[13/182] processing user 012
[14/182] processing user 013
[15/182] processing user 014
[16/182] processing user 015
[17/182] processing user 016
[18/182] processing user 017
[19/182] processing user 018
[20/182] processing user 019
[21/182] processing user 020
[22/182] processing user 021
[23/182] processing user 022
[24/182] processing user 023
[25/182] processing user 024
[26/182] processing user 025
[27/182] processing user 026
[28/182] processing user 027
[29/182] processing user 028
[30/182] processing user 029
[31/182] processing user 030
[32/182] processing user 031
[33/182] processing user 032
[34/182] processing user 033
[35/182] processing user 034
[36/182] processing user 035
[37/182] processing user 036
[38/182] processing user 037
[39/182] processing user 038
[40/182] processing user 039
[41/182] processing user 040
[42/182] processing user 041
[43/182] processing user 042
[44/182] processing user 043
[45/182] processing user 044
[46/182] proce

Unnamed: 0,time,lat,lon,alt,label,user
0,2008-10-23 02:53:04,39.984702,116.318417,492.000000,0,0
1,2008-10-23 02:53:10,39.984683,116.318450,492.000000,0,0
2,2008-10-23 02:53:15,39.984686,116.318417,492.000000,0,0
3,2008-10-23 02:53:20,39.984688,116.318385,492.000000,0,0
4,2008-10-23 02:53:25,39.984655,116.318263,492.000000,0,0
...,...,...,...,...,...,...
17,2008-03-14 03:39:56,40.914867,111.710500,3802.493438,0,181
18,2008-03-14 03:41:17,40.914267,111.710333,3795.931759,0,181
19,2008-03-14 03:43:02,40.912467,111.710667,3795.931759,0,181
20,2008-03-14 03:43:28,40.911517,111.711317,3779.527559,0,181


In [67]:
#### only for testing: selecting 4 users that bring a variety of labels
sp = df[df['user'].isin([1,80,100,150])].reset_index()
sp['label'].unique()

array([ 0,  3,  1, 11,  6,  2], dtype=int64)

In [50]:
# from geopy.distance import geodesic
# from geographiclib.geodesic import Geodesic
# import datetime
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler(feature_range=(-90,90))
# scaled = scaler.fit_transform(df['lat'].values.reshape(-1,1))
# df['lat'] = pd.DataFrame(scaled)
# df

In [51]:
from geopy.distance import geodesic
from geographiclib.geodesic import Geodesic
import datetime
from sklearn.preprocessing import MinMaxScaler

def get_distance(df):
    df['distance'] = 0
    for idx, row in df.iterrows():
        if idx != 0 and df.at[idx-1,'user']==df.at[idx,'user']:
            coords_1 = (df.at[idx-1,'lat'], df.at[idx-1,'lon'])
            coords_2 = (df.at[idx,'lat'], df.at[idx,'lon'])
            df.at[idx,'distance'] = geodesic(coords_1, coords_2).m
    print('........................Finished dist......................')
    return df

def get_time(df):
    df['delta_time'] = 0
    for idx, row in df.iterrows():
        if idx != 0 and df.at[idx-1,'user']==df.at[idx,'user']:
            start = df.at[idx-1,'time']
            end = df.at[idx,'time']
            df.at[idx,'delta_time'] = pd.Timedelta((end - start)).total_seconds()
    print('........................Finished time......................')
    return df

def get_speed(df):
    df['speed'] = 0
    for idx, row in df.iterrows():
        if idx != 0 and df.at[idx-1,'user']==df.at[idx,'user']:
            if df.at[idx,'distance'] != 0 and df.at[idx,'delta_time'] != 0:
                df.at[idx,'speed'] = df.at[idx,'distance']/df.at[idx,'delta_time']
    print('........................Finished speed......................')
    return df

def get_acc(df):
    df['acceleration'] = 0
    for idx, row in df.iterrows():
        if idx != 0 and df.at[idx-1,'user']==df.at[idx,'user']:
            if df.at[idx,'speed'] != 0 and df.at[idx,'delta_time'] != 0:
                df.at[idx,'acceleration'] = df.at[idx,'speed']/df.at[idx,'delta_time']
    print('........................Finished acc......................')
    return df

'''
A bearing is the direction you're facing, measured clockwise as an angle from true north on a compass. 
This can also be called a heading. In this system, north is 0° , east is 90° , south is 180° , and west is 270°.
'''
def get_bearing(df):
    df['bearing'] = 0
    for idx, row in df.iterrows():
        if idx != 0 and df.at[idx-1,'user']==df.at[idx,'user']:
            lat1, long1, lat2, long2 = df.at[idx-1,'lat'], df.at[idx-1,'lon'], df.at[idx,'lat'], df.at[idx,'lon']
            df.at[idx, 'bearing'] = Geodesic.WGS84.Inverse(lat1, long1, lat2, long2)['azi1']
    print('........................Finished bearing......................')
    return df

def get_features(df):
    df = get_distance(df) 
    df = get_time(df)
    df = get_speed(df)
    df = get_acc(df)
    df = get_bearing(df)
    return df

get_features(sp)
sp

........................Finished dist......................
........................Finished time......................
........................Finished speed......................
........................Finished acc......................
........................Finished bearing......................


Unnamed: 0,index,time,lat,lon,alt,label,user,distance,delta_time,speed,acceleration,bearing
0,0,2008-10-23 05:53:05,35.573783,116.319236,492.000000,0,1,0,0,0,0,0
1,1,2008-10-23 05:53:06,35.563060,116.319322,492.000000,0,1,1189,1,1189,1189,179
2,2,2008-10-23 05:53:11,35.564753,116.319402,492.000000,0,1,187,5,37,7,2
3,3,2008-10-23 05:53:16,35.565882,116.319389,492.000000,0,1,125,5,25,5,0
4,4,2008-10-23 05:53:21,35.547257,116.319422,491.000000,0,1,2066,5,413,82,179
...,...,...,...,...,...,...,...,...,...,...,...,...
118364,69,2007-08-10 10:51:55,35.453006,116.330100,226.377953,0,150,313,13,24,1,-178
118365,70,2007-08-10 11:05:10,35.418579,116.330233,226.377953,0,150,3819,795,4,0,179
118366,71,2007-08-10 11:06:00,35.419144,116.330533,226.377953,0,150,68,50,1,0,23
118367,72,2007-08-10 11:06:21,35.421401,116.330717,226.377953,0,150,251,21,11,0,3


In [1]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

X = sp[['lat','lon','alt','user','distance','delta_time','speed','acceleration', 'bearing']]

le = preprocessing.LabelEncoder()
y = le.fit_transform(sp[['label']].values.ravel())
X,y

NameError: name 'sp' is not defined

In [64]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

cvs = [3,5,7,10]
for c in cvs:
    xgboost = XGBClassifier(use_label_encoder=False,objective='multi:softmax',eval_metric='mlogloss')
    scores = cross_val_score(xgboost, X, y, cv=c)
    print('cv=',c,'->',scores)

cv= 3 -> [0.99662924 0.99723743 0.7707573 ]
cv= 5 -> [0.99598716 0.99784574 0.99708541 0.9980147  0.62083386]
cv= 7 -> [0.99568303 0.9975754  0.99763454 0.99698403 0.99798936 0.99881727
 0.46649713]
cv= 10 -> [0.99636732 0.99831038 0.9982259  0.9982259  0.99763454 0.99721213
 0.9982259  0.99805694 0.9978035  0.23496114]


In [54]:
X.shape, y.shape

((118369, 9), (118369,))

In [65]:
from sklearn.feature_selection import SelectKBest

vals = [2,3,5,7,'all']
for v in vals:
    X_new = SelectKBest(k=v).fit_transform(X, y)
    scores_new = cross_val_score(xgboost, X_new, y)
    print('v=',v,'->',scores_new)

v= 2 -> [0.99577596 0.99805694 0.99776126 0.99611388 0.61103367]
v= 3 -> [0.99564924 0.9980147  0.99746557 0.99670525 0.61107591]
v= 5 -> [0.99615612 0.9980147  0.99678973 0.9978035  0.61162506]
v= 7 -> [0.99598716 0.99776126 0.99708541 0.99793022 0.61500444]
v= all -> [0.99598716 0.99784574 0.99708541 0.9980147  0.62083386]
