## Combinação de variáveis para identificação de meios de transporte

A maior dificuldade dessa tarefa é diferenciar carros de ônibus e de táxis, tanto que em vários trabalhos essas categorias são unificadas em uma só. A partir da identificação de stay-points surgiu a ideia de relacionar as informações de quantidades de paradas e tempo de parada para facilitar nessa diferenciação, junto com as métricas de Fisher e de Jensen-Shannon.

Para a identificação dos pontos de parada será usada a biblioteca movingpandas. O tempo de cada parada pode ser obtido com a função get_stop_time_ranges() da classe TrajectoryStopDetector dessa biblioteca. Também pode ser possível incluir a variável tempo_entre_paradas.

In [89]:
import pandas as pd
import os
from datetime import datetime
import numpy as np
import geopandas as gpd
import glob
from pyproj import CRS
from shapely.geometry import Point
from datetime import timedelta
import movingpandas as mpd
import matplotlib.pyplot as plt
import haversine as hs
from haversine import Unit
import shapely
import warnings
import uuid
import math
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 

In [95]:
''' 
When reading all users, don't forget to check for the labels.txt file
'''
def read_users(folder):
    subfolders = os.listdir(folder)
    dfs = []
    
    # df in which each row is a single trajectory
    transport_df = pd.DataFrame(columns=['Start Time','End Time','Transportation Mode'])
    
    # df of all GPS points recorded
    gps_points_df = pd.DataFrame(columns=['Identifier', 'Timestamp', 'Latitude', 'Longitude', 'Altitude', 'Label'])
    
    for i, sf in enumerate(subfolders):
        user_folder = os.path.join(folder,sf)
        labels_file = os.path.join(user_folder, 'labels.txt')
        if os.path.exists(labels_file):
            print('Processing user %s ---------------------------' % (sf))
            
            # single user df in which each row is a single trajectory
            u_transport_df = pd.read_csv(labels_file, sep="\t")
            u_transport_df['Identifier'] = 0
            
            # single user df of all GPS points recorded
            u_gps_points_df = pd.DataFrame(columns=['Identifier', 'Timestamp', 'Latitude', 'Longitude', 'Altitude', 'Label'])
                
            plt_files = os.path.join(user_folder, 'Trajectory')
            
            # searching for the corresponding trajectory for present .plt file based on timestamp
            for index, row in u_transport_df.iterrows():
                start = datetime.strptime(row['Start Time'],'%Y/%m/%d %H:%M:%S')
                end = datetime.strptime(row['End Time'],'%Y/%m/%d %H:%M:%S')

                #creating a random identifier for each trajectory so we can link them to the GPS points
                identifier = uuid.uuid1()
                u_transport_df.loc[index, 'Identifier'] = identifier

                # iterating through .plt files in Trajectory folder
                for filename in os.listdir(plt_files):
                    dt_object = datetime.strptime(filename[:-4], '%Y%m%d%H%M%S')
                
                    if dt_object >= start and dt_object <= end:
                        # turns .plt file into dataframe so it can be added to u_gps_points_df
                        file_df = pd.read_csv(os.path.join(plt_files, filename), skiprows=6, header=None,
                                 parse_dates=[[5, 6]], infer_datetime_format=True)
                        file_df.rename(inplace=True, columns={'5_6': 'Timestamp', 0: 'Latitude', 1: 'Longitude', 3: 'Altitude'})
                        file_df.drop(inplace=True, columns=[2, 4])

                        # receives identifier to indicate which trajectory it belongs to
                        file_df['Identifier'] = identifier

                        file_df['Label'] = u_transport_df.loc[index, 'Transportation Mode']

                        u_gps_points_df = pd.concat((u_gps_points_df,file_df), axis=0)
                        
            transport_df = pd.concat((transport_df,u_transport_df), axis=0)
            gps_points_df = pd.concat((gps_points_df,u_gps_points_df), axis=0)
    
    return transport_df,gps_points_df

start = datetime.now()
transport_df,gps_points_df = read_users('../Data')

end = datetime.now()
print('Tempo gasto:',end-start)

Processing user 010 ---------------------------
--------------Done
Processing user 020 ---------------------------
--------------Done
Processing user 021 ---------------------------
--------------Done
Processing user 052 ---------------------------
--------------Done
Processing user 053 ---------------------------
--------------Done
Processing user 056 ---------------------------
--------------Done
Processing user 058 ---------------------------
--------------Done
Processing user 059 ---------------------------
--------------Done
Processing user 060 ---------------------------
--------------Done
Processing user 062 ---------------------------
--------------Done
Processing user 064 ---------------------------
--------------Done
Processing user 065 ---------------------------
--------------Done
Processing user 067 ---------------------------
--------------Done
Processing user 068 ---------------------------
--------------Done
Processing user 069 ---------------------------
--------------

In [96]:
transport_df

Unnamed: 0,Start Time,End Time,Transportation Mode
0,2007/06/26 11:32:29,2007/06/26 11:40:29,bus
1,2008/03/28 14:52:54,2008/03/28 15:59:59,train
2,2008/03/28 16:00:00,2008/03/28 22:02:00,train
3,2008/03/29 01:27:50,2008/03/29 15:59:59,train
4,2008/03/29 16:00:00,2008/03/30 15:59:59,train
...,...,...,...
314,2008/11/17 06:59:58,2008/11/17 07:06:16,bus
315,2008/11/17 07:06:16,2008/11/17 07:14:32,walk
316,2008/11/29 01:58:05,2008/11/29 02:01:39,bus
317,2008/11/29 02:01:39,2008/11/29 02:07:57,walk


In [97]:
gps_points_df 

Unnamed: 0,time,lat,lon,alt,identifier,label
0,2008-03-28 16:00:01,39.50293,116.714948,-777,2,train
1,2008-03-28 16:01:00,39.497045,116.726137,-777,2,train
2,2008-03-28 16:01:59,39.489695,116.740047,-777,2,train
3,2008-03-28 16:02:59,39.481438,116.755648,-777,2,train
4,2008-03-28 16:03:58,39.472748,116.770972,-777,2,train
...,...,...,...,...,...,...
4704,2008-11-29 08:15:52,40.007802,116.319362,84,316,bus
4705,2008-11-29 08:15:54,40.00778,116.31936,88,316,bus
4706,2008-11-29 08:15:56,40.007756,116.319362,92,316,bus
4707,2008-11-29 08:15:58,40.00774,116.319361,97,316,bus


In [98]:
from sklearn import preprocessing

gps_points_df_scaled = gps_points_df.copy()

values = gps_points_df_scaled['Latitude'].values.reshape(-1, 1)
min_max_scaler = preprocessing.MinMaxScaler((-90,90))
scaled = min_max_scaler.fit_transform(values)
gps_points_df_scaled['Latitude'] = pd.DataFrame(scaled)

gps_points_df_scaled['geometry'] = [Point(lon, lat) for lon, lat in 
                                 zip(gps_points_df_scaled['Longitude'].to_list(), gps_points_df_scaled['Latitude'].to_list())]
gps_points_df_scaled

Unnamed: 0,time,lat,lon,alt,identifier,label,geometry
0,2008-03-28 16:00:01,-79.983301,116.714948,-777,2,train,POINT (116.714948 -79.98330116950432)
1,2008-03-28 16:01:00,-79.986075,116.726137,-777,2,train,POINT (116.726137 -79.98607481050473)
2,2008-03-28 16:01:59,-79.989539,116.740047,-777,2,train,POINT (116.740047 -79.98953891608724)
3,2008-03-28 16:02:59,-79.993430,116.755648,-777,2,train,POINT (116.755648 -79.9934304970117)
4,2008-03-28 16:03:58,-79.997526,116.770972,-777,2,train,POINT (116.770972 -79.99752615381605)
...,...,...,...,...,...,...,...
4704,2008-11-29 08:15:52,-79.818221,116.319362,84,316,bus,POINT (116.319362 -79.81822122102406)
4705,2008-11-29 08:15:54,-79.818241,116.31936,88,316,bus,POINT (116.31936 -79.81824148721998)
4706,2008-11-29 08:15:56,-79.818250,116.319362,92,316,bus,POINT (116.319362 -79.81825044205074)
4707,2008-11-29 08:15:58,-79.818193,116.319361,97,316,bus,POINT (116.319361 -79.81819294261113)


In [99]:
start = datetime.now()

# Creating a Geodataframe. Be aware it is CRS 4326 WGS84
# A GeoDataFrame object is a pandas.DataFrame that has a column with geometry
# It has one GeoSeries column that holds a special status. 
# This GeoSeries is referred to as the GeoDataFrame’s “geometry”. 
# A GeoSeries is essentially a vector where each entry in the vector is a set 
# of shapes corresponding to one observation.
geodata = gpd.GeoDataFrame(gps_points_df_scaled, crs = CRS.from_epsg('4326'))
geodata = geodata.set_index('Timestamp')

geo = datetime.now()
print('Tempo gasto criando geodata:',geo-start)

# Create a Trajectory Collection with Movingpandas
# data(list[Trajectory] or GeoDataFrame or DataFrame)–List of Trajectory 
#     objects or a GeoDataFrame with trajectory IDs, point geometry column 
#     and timestamp index
# traj_id_col(string)–Name of the GeoDataFrame column containing trajectory IDs
traj_collection = mpd.TrajectoryCollection(geodata, 'Identifier')

traj = datetime.now()

print('Tempo gasto criando TrajectoryCollection:',traj-geo)

Tempo gasto criando geodata: 0:03:02.475532
Tempo gasto criando TrajectoryCollection: 0:23:02.323453


In [100]:
start = datetime.now()

# Detects stops in a trajectory. A stop is detected if the movement stays 
# within an area of specified size for at least the specified duration.
# Define parameters in Hours and Search radius in meters
Hours = .01
SearchRadio = 500
stops = mpd.TrajectoryStopDetector(traj_collection).get_stop_segments(min_duration=timedelta(hours=Hours), max_diameter=SearchRadio)

end = datetime.now()
print('Tempo gasto detectando stop points:',end-start)

stops

Tempo gasto detectando stop points: 1:58:13.984433


TrajectoryCollection with 47244 trajectories

In [102]:
start = datetime.now()

# Create a new Geodataframe and define geometry column
stops_start = gpd.GeoDataFrame(columns = ['geometry'])
stops_start = stops_start.set_geometry('geometry')

# Add the ID of each stop track and define it as index
stops_start['stop_id'] = [track.id for track in stops.trajectories]
stops_start = stops_start.set_index('stop_id')

# Iteration over the Stop Trajectories
for stoptrack in stops.trajectories:

    # add stop duration in hours
    stops_start.at[stoptrack.id,'duration_s'] = stoptrack.get_duration().total_seconds()

    # add length
    stops_start.at[stoptrack.id, 'length_m'] = stoptrack.get_length()

    # add traj id
    stops_start.at[stoptrack.id, 'identifier'] = stoptrack.id.split('_')[0]

    # add datetime
    stops_start.at[stoptrack.id, 'datetime']  = pd.to_datetime(stoptrack.id.split('_')[1]).tz_localize(None)

    # geometry with start point
    stops_start.at[stoptrack.id, 'geometry'] = stoptrack.get_start_location()
        
# Reset indexes
stops_start = stops_start.reset_index(drop=True)
geodata = geodata.reset_index(drop=True)

end = datetime.now()
print('Tempo gasto criando stops df:',end-start)

stops_start

Tempo gasto criando stops df: 0:10:48.942936


Unnamed: 0,geometry,duration_s,length_m,identifier,datetime
0,POINT (116.33040 -79.98330),72.0,309.707009,0,2007-04-12 10:23:25
1,POINT (116.33033 -79.98330),873.0,309.721220,0,2007-04-12 13:46:21
2,POINT (116.33082 -79.98330),59.0,309.728663,0,2007-04-13 15:23:30
3,POINT (116.32755 -80.00660),66.0,483.823859,0,2007-04-13 15:25:57
4,POINT (116.37362 -80.03642),78.0,415.891259,0,2007-04-13 15:32:49
...,...,...,...,...,...
47239,POINT (116.34552 -79.99521),303.0,961.656269,3151,2009-12-24 11:36:25
47240,POINT (116.37082 -79.99628),139.0,501.855553,3151,2009-12-24 11:41:30
47241,POINT (116.39669 -79.99693),131.0,499.115602,3151,2009-12-24 11:43:51
47242,POINT (116.42274 -79.99704),333.0,508.550589,3151,2009-12-24 11:46:04


In [120]:
def get_features(stops_start_features):
    ids = [int(i) for i in stops_start_features['identifier'].unique()]
    for i in ids:
        single_trajectory_df = gps_points_df[gps_points_df['identifier']==i]
        dist = [0]
        lat = single_trajectory_df['lat'].values
        lon = single_trajectory_df['lon'].values
        for p in range(1,len(lat)):
            coords_1 = (lat[p-1],lon[p-1])
            coords_2 = (lat[p],lon[p])
            dist.append(hs.haversine(coords_1,coords_2,unit=Unit.METERS))
        
        timestamps = single_trajectory_df['time'].values
        delta_time = [0]
        for p in range(1,len(lat)):
            start = timestamps[p-1]
            end = timestamps[p]
            delta_time.append(pd.Timedelta((end - start)).total_seconds())

        speed = [0]
        for p in range(1,len(lat)):
            if delta_time[p] > 0:
                speed.append(dist[p]/delta_time[p])
            else:
                speed.append(0)
        
        acc = [0]
        for p in range(1,len(lat)):
            if delta_time[p] > 0:
                acc.append(speed[p]/delta_time[p])
            else:
                acc.append(0)
        
        avg_dist = np.mean(dist)
        avg_speed = np.mean(speed)
        avg_acc = np.mean(acc)
                
        stops_start_features.loc[stops_start_features['identifier']==i,['avg_dist']] = avg_dist
        stops_start_features.loc[stops_start_features['identifier']==i,['avg_speed']] = avg_speed
        stops_start_features.loc[stops_start_features['identifier']==i,['avg_acc']] = avg_acc
    
    return stops_start_features
        
start = datetime.now()

stops_start_features = stops_start.copy()
get_features(stops_start_features)

end = datetime.now()
print('Tempo gasto:',end-start)

stops_start_features

Tempo gasto: 0:06:15.749697


Unnamed: 0,geometry,duration_s,length_m,identifier,datetime,avg_dist,avg_speed,avg_acc
0,POINT (116.33040 -79.98330),72.0,309.707009,0,2007-04-12 10:23:25,,,
1,POINT (116.33033 -79.98330),873.0,309.721220,0,2007-04-12 13:46:21,,,
2,POINT (116.33082 -79.98330),59.0,309.728663,0,2007-04-13 15:23:30,,,
3,POINT (116.32755 -80.00660),66.0,483.823859,0,2007-04-13 15:25:57,,,
4,POINT (116.37362 -80.03642),78.0,415.891259,0,2007-04-13 15:32:49,,,
...,...,...,...,...,...,...,...,...
47239,POINT (116.34552 -79.99521),303.0,961.656269,3151,2009-12-24 11:36:25,,,
47240,POINT (116.37082 -79.99628),139.0,501.855553,3151,2009-12-24 11:41:30,,,
47241,POINT (116.39669 -79.99693),131.0,499.115602,3151,2009-12-24 11:43:51,,,
47242,POINT (116.42274 -79.99704),333.0,508.550589,3151,2009-12-24 11:46:04,,,


In [112]:
# def get_features(df):
#     identifiers = df['identifier'].values
    
#     # getting distances between points
#     dist = []
#     lat = df['lat'].values
#     lon = df['lon'].values
#     for i in range(len(identifiers)):
#         if i != 0 and identifiers[i-1]==identifiers[i]:
#             coords_1 = (lat[i-1],lon[i-1])
#             coords_2 = (lat[i],lon[i])
#             dist.append(hs.haversine(coords_1,coords_2,unit=Unit.METERS))
#         else:
#             dist.append(0)
#     df['distance'] = dist
#     print('........................Finished distance......................')
    
#     # getting time passed between points
#     timestamps = df['time'].values
#     delta_time = []
#     for i in range(len(identifiers)):
#         if i != 0 and identifiers[i-1]==identifiers[i]:
#             start = timestamps[i-1]
#             end = timestamps[i]
#             delta_time.append(pd.Timedelta((end - start)).total_seconds())
#         else:
#             delta_time.append(0)
#     df['delta_time'] = delta_time
#     print('........................Finished time......................')
    
#     # getting speed of vehicles between points
#     speeds = []
#     for i in range(len(identifiers)):
#         if i != 0 and identifiers[i-1]==identifiers[i] and dist[i] != 0 and delta_time[i] != 0:
#             speeds.append(dist[i]/delta_time[i])
#         else:
#             speeds.append(0)
#     df['speed'] = speeds
#     print('........................Finished speed......................')
    
#     # getting acceleration of vehicles between points
#     accs = []
#     for i in range(len(identifiers)):
#         if i != 0 and identifiers[i-1]==identifiers[i] and speeds[i] != 0 and delta_time[i] != 0:
#             accs.append(speeds[i]/delta_time[i])
#         else:
#             accs.append(0)
#     df['acceleration'] = accs
#     print('........................Finished acceleration......................')
    
#     return df

# start = datetime.now()
# get_features(selected_id_df)
# end = datetime.now()
# print('Tempo gasto:',end-start)
# selected_id_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['distance'] = dist


........................Finished distance......................


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['delta_time'] = delta_time


........................Finished time......................


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['speed'] = speeds


........................Finished speed......................
........................Finished acceleration......................
Tempo gasto: 0:00:55.262193


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['acceleration'] = accs


Unnamed: 0,time,lat,lon,alt,identifier,distance,delta_time,speed,acceleration
0,2008-03-28 16:00:01,39.50293,116.714948,-777,2,0.000000,0.0,0.000000,0.000000
1,2008-03-28 16:01:00,39.497045,116.726137,-777,2,1161.837779,59.0,19.692166,0.333766
2,2008-03-28 16:01:59,39.489695,116.740047,-777,2,1446.596976,59.0,24.518593,0.415569
3,2008-03-28 16:02:59,39.481438,116.755648,-777,2,1623.427027,60.0,27.057117,0.450952
4,2008-03-28 16:03:58,39.472748,116.770972,-777,2,1632.047195,59.0,27.661817,0.468844
...,...,...,...,...,...,...,...,...,...
4704,2008-11-29 08:15:52,40.007802,116.319362,84,316,2.891072,2.0,1.445536,0.722768
4705,2008-11-29 08:15:54,40.00778,116.31936,88,316,2.452215,2.0,1.226108,0.613054
4706,2008-11-29 08:15:56,40.007756,116.319362,92,316,2.674113,2.0,1.337056,0.668528
4707,2008-11-29 08:15:58,40.00774,116.319361,97,316,1.781159,2.0,0.890579,0.445290


In [115]:
# start = datetime.now()

# stops_start['avg_speed'] = 0
# stops_start['avg_dist'] = 0
# stops_start['avg_acc'] = 0
# for i in ids:
#     speed_list = selected_id_df['speed'][selected_id_df['identifier']==i].to_list()
#     avg_speed = sum(speed_list)/len(speed_list)
#     stops_start['avg_speed'][stops_start['identifier']==i] = avg_speed
    
#     dist_list = selected_id_df['distance'][selected_id_df['identifier']==i].to_list()
#     avg_dist = sum(dist_list)/len(dist_list)
#     stops_start['avg_dist'][stops_start['identifier']==i] = avg_dist
    
#     acc_list = selected_id_df['acceleration'][selected_id_df['identifier']==i].to_list()
#     avg_acc = sum(acc_list)/len(acc_list)
#     stops_start['avg_acc'][stops_start['identifier']==i] = avg_acc

# end = datetime.now()
# print('Tempo gasto:',end-start)

# stops_start

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stops_start['avg_speed'][stops_start['identifier']==i] = avg_speed
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stops_start['avg_dist'][stops_start['identifier']==i] = avg_dist
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stops_start['avg_acc'][stops_start['identifier']==i] = avg_acc


Tempo gasto: 0:23:08.049625


Unnamed: 0,geometry,duration_s,length_m,identifier,datetime,avg_speed,avg_dist,avg_acc
0,POINT (116.33040 -79.98330),72.0,309.707009,0,2007-04-12 10:23:25,0,0,0
1,POINT (116.33033 -79.98330),873.0,309.721220,0,2007-04-12 13:46:21,0,0,0
2,POINT (116.33082 -79.98330),59.0,309.728663,0,2007-04-13 15:23:30,0,0,0
3,POINT (116.32755 -80.00660),66.0,483.823859,0,2007-04-13 15:25:57,0,0,0
4,POINT (116.37362 -80.03642),78.0,415.891259,0,2007-04-13 15:32:49,0,0,0
...,...,...,...,...,...,...,...,...
47239,POINT (116.34552 -79.99521),303.0,961.656269,3151,2009-12-24 11:36:25,0,0,0
47240,POINT (116.37082 -79.99628),139.0,501.855553,3151,2009-12-24 11:41:30,0,0,0
47241,POINT (116.39669 -79.99693),131.0,499.115602,3151,2009-12-24 11:43:51,0,0,0
47242,POINT (116.42274 -79.99704),333.0,508.550589,3151,2009-12-24 11:46:04,0,0,0


In [82]:
# # Average features per trajectory

# start = datetime.now()

# trajs = stops_start['identifier'].unique()
# avg_distance = [0]
# avg_speed = []
# for t in trajs:
#     single_trajectory = gps_points_df[gps_points_df['identifier']==int(t)]
#     lat = single_trajectory['lat'].tolist()
#     lon = single_trajectory['lon'].tolist()
#     for i in range(1,len(lat)):
#         coords_1 = (lat[i-1],lon[i-1])
#         coords_2 = (lat[i],lon[i])
#         avg_distance.append(hs.haversine(coords_1,coords_2,unit=Unit.METERS))
    
# end = datetime.now()
# print('Tempo gasto:',end-start)

# avg_distance

NameError: name 'hs' is not defined

In [75]:
from sklearn.model_selection import train_test_split

X = stops_start_features[['duration_s','length_m','avg_dist','avg_speed','avg_acc']]
y = [transport_df.iloc[int(i)]['Transportation Mode'] for i in stops_start_features['identifier']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [76]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

start = datetime.now()

depths = [2,3,5]
for d in depths:
    clf = DecisionTreeClassifier(max_depth=d)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = f1_score(y_test, y_pred, average='micro')
    print(score)
end = datetime.now()
print('Tempo gasto:',end-start)

0.3060641337707694
0.30458249550216954
0.31484813207746853
Tempo gasto: 0:00:00.192195
