## Combinação de variáveis para identificação de meios de transporte

A maior dificuldade dessa tarefa é diferenciar carros de ônibus e de táxis, tanto que em vários trabalhos essas categorias são unificadas em uma só. A partir da identificação de stay-points surgiu a ideia de relacionar as informações de quantidades de paradas e tempo de parada para facilitar nessa diferenciação, junto com as métricas de Fisher e de Jensen-Shannon.

Para a identificação dos pontos de parada será usada a biblioteca movingpandas. O tempo de cada parada pode ser obtido com a função get_stop_time_ranges() da classe TrajectoryStopDetector dessa biblioteca. Também pode ser possível incluir a variável tempo_entre_paradas.

In [26]:
import pandas as pd
import os
from datetime import datetime
import numpy as np
import geopandas as gpd
import glob
from pyproj import CRS
from shapely.geometry import Point
from datetime import timedelta
import movingpandas as mpd
import matplotlib.pyplot as plt
import shapely
import warnings
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 

In [27]:
''' 
When reading all users, don't forget to check for the labels.txt file
'''
def read_users(folder):
    subfolders = os.listdir(folder)
    dfs = []
    transport_df = pd.DataFrame(columns=['Start Time','End Time','Transportation Mode'])
    gps_points_df = pd.DataFrame(columns=['time', 'lat', 'lon', 'alt', 'identifier'])
    for i, sf in enumerate(subfolders):
        user_folder = os.path.join(folder,sf)
        labels_file = os.path.join(user_folder, 'labels.txt')
        if os.path.exists(labels_file):
            print('Processing user %s ---------------------------' % (sf))
            u_transport_df = pd.read_csv(labels_file, sep="\t")
            u_gps_points_df = pd.DataFrame(columns=['time', 'lat', 'lon', 'alt', 'identifier'])
            plt_files = os.path.join(user_folder, 'Trajectory')
            for filename in os.listdir(plt_files):
                dt_object = datetime.strptime(filename[:-4], '%Y%m%d%H%M%S')
                for index, row in u_transport_df.iterrows():
                    start = datetime.strptime(row['Start Time'],'%Y/%m/%d %H:%M:%S')
                    end = datetime.strptime(row['End Time'],'%Y/%m/%d %H:%M:%S')
                    if dt_object >= start and dt_object <= end:
                        file_df = pd.read_csv(os.path.join(plt_files, filename), skiprows=6, header=None,
                                 parse_dates=[[5, 6]], infer_datetime_format=True)
                        file_df.rename(inplace=True, columns={'5_6': 'time', 0: 'lat', 1: 'lon', 3: 'alt'})
                        file_df.drop(inplace=True, columns=[2, 4])
                        file_df['identifier'] = index
                        u_gps_points_df = pd.concat((u_gps_points_df,file_df), axis=0)
            print('--------------Done')
            transport_df = pd.concat((transport_df,u_transport_df), axis=0)
            gps_points_df = pd.concat((gps_points_df,u_gps_points_df), axis=0)
    
    return transport_df,gps_points_df

start = datetime.now()
transport_df,gps_points_df = read_users('../Data')

end = datetime.now()
print('Tempo gasto:',end-start)

Processing user 010 ---------------------------
--------------Done
Processing user 020 ---------------------------
--------------Done
Processing user 021 ---------------------------
--------------Done
Processing user 052 ---------------------------
--------------Done
Processing user 053 ---------------------------
--------------Done
Processing user 056 ---------------------------
--------------Done
Processing user 058 ---------------------------
--------------Done
Processing user 059 ---------------------------
--------------Done
Processing user 060 ---------------------------
--------------Done
Processing user 062 ---------------------------
--------------Done
Processing user 064 ---------------------------
--------------Done
Processing user 065 ---------------------------
--------------Done
Processing user 067 ---------------------------
--------------Done
Processing user 068 ---------------------------
--------------Done
Processing user 069 ---------------------------
--------------

In [28]:
transport_df

Unnamed: 0,Start Time,End Time,Transportation Mode
0,2007/06/26 11:32:29,2007/06/26 11:40:29,bus
1,2008/03/28 14:52:54,2008/03/28 15:59:59,train
2,2008/03/28 16:00:00,2008/03/28 22:02:00,train
3,2008/03/29 01:27:50,2008/03/29 15:59:59,train
4,2008/03/29 16:00:00,2008/03/30 15:59:59,train
...,...,...,...
314,2008/11/17 06:59:58,2008/11/17 07:06:16,bus
315,2008/11/17 07:06:16,2008/11/17 07:14:32,walk
316,2008/11/29 01:58:05,2008/11/29 02:01:39,bus
317,2008/11/29 02:01:39,2008/11/29 02:07:57,walk


In [29]:
gps_points_df 

Unnamed: 0,time,lat,lon,alt,identifier
0,2008-03-28 16:00:01,39.50293,116.714948,-777,2
1,2008-03-28 16:01:00,39.497045,116.726137,-777,2
2,2008-03-28 16:01:59,39.489695,116.740047,-777,2
3,2008-03-28 16:02:59,39.481438,116.755648,-777,2
4,2008-03-28 16:03:58,39.472748,116.770972,-777,2
...,...,...,...,...,...
4704,2008-11-29 08:15:52,40.007802,116.319362,84,316
4705,2008-11-29 08:15:54,40.00778,116.31936,88,316
4706,2008-11-29 08:15:56,40.007756,116.319362,92,316
4707,2008-11-29 08:15:58,40.00774,116.319361,97,316


In [30]:
from sklearn import preprocessing

gps_points_df_scaled = gps_points_df.copy()

values = gps_points_df_scaled['lat'].values.reshape(-1, 1)
min_max_scaler = preprocessing.MinMaxScaler((-90,90))
scaled = min_max_scaler.fit_transform(values)
gps_points_df_scaled['lat'] = pd.DataFrame(scaled)

gps_points_df_scaled['geometry'] = [Point(lon, lat) for lon, lat in 
                                 zip(gps_points_df_scaled['lon'].to_list(), gps_points_df_scaled['lat'].to_list())]
gps_points_df_scaled

Unnamed: 0,time,lat,lon,alt,identifier,geometry
0,2008-03-28 16:00:01,-79.983301,116.714948,-777,2,POINT (116.714948 -79.98330116950432)
1,2008-03-28 16:01:00,-79.986075,116.726137,-777,2,POINT (116.726137 -79.98607481050473)
2,2008-03-28 16:01:59,-79.989539,116.740047,-777,2,POINT (116.740047 -79.98953891608724)
3,2008-03-28 16:02:59,-79.993430,116.755648,-777,2,POINT (116.755648 -79.9934304970117)
4,2008-03-28 16:03:58,-79.997526,116.770972,-777,2,POINT (116.770972 -79.99752615381605)
...,...,...,...,...,...,...
4704,2008-11-29 08:15:52,-79.818221,116.319362,84,316,POINT (116.319362 -79.81822122102406)
4705,2008-11-29 08:15:54,-79.818241,116.31936,88,316,POINT (116.31936 -79.81824148721998)
4706,2008-11-29 08:15:56,-79.818250,116.319362,92,316,POINT (116.319362 -79.81825044205074)
4707,2008-11-29 08:15:58,-79.818193,116.319361,97,316,POINT (116.319361 -79.81819294261113)


In [31]:
start = datetime.now()

# Creating a Geodataframe. Be aware it is CRS 4326 WGS84
# A GeoDataFrame object is a pandas.DataFrame that has a column with geometry
# It has one GeoSeries column that holds a special status. 
# This GeoSeries is referred to as the GeoDataFrame’s “geometry”. 
# A GeoSeries is essentially a vector where each entry in the vector is a set 
# of shapes corresponding to one observation.
geodata = gpd.GeoDataFrame(gps_points_df_scaled, crs = CRS.from_epsg('4326'))
geodata = geodata.set_index('time')

geo = datetime.now()
print('Tempo gasto criando geodata:',geo-start)

# Create a Trajectory Collection with Movingpandas
# data(list[Trajectory] or GeoDataFrame or DataFrame)–List of Trajectory 
#     objects or a GeoDataFrame with trajectory IDs, point geometry column 
#     and timestamp index
# traj_id_col(string)–Name of the GeoDataFrame column containing trajectory IDs
traj_collection = mpd.TrajectoryCollection(geodata, 'identifier')

traj = datetime.now()

print('Tempo gasto criando TrajectoryCollection:',traj-geo)

Tempo gasto criando geodata: 0:00:48.014124
Tempo gasto criando TrajectoryCollection: 0:17:47.488565


In [32]:
start = datetime.now()

# Detects stops in a trajectory. A stop is detected if the movement stays 
# within an area of specified size for at least the specified duration.
# Define parameters in Hours and Search radius in meters
Hours = .01
SearchRadio = 500
stops = mpd.TrajectoryStopDetector(traj_collection).get_stop_segments(min_duration=timedelta(hours=Hours), max_diameter=SearchRadio)

end = datetime.now()
print('Tempo gasto detectando stop points:',end-start)

stops

Tempo gasto detectando stop points: 1:26:05.289650


TrajectoryCollection with 47244 trajectories

In [33]:
start = datetime.now()

# Create a new Geodataframe and define geometry column
stops_start = gpd.GeoDataFrame(columns = ['geometry'])
stops_start = stops_start.set_geometry('geometry')

# Add the ID of each stop track and define it as index
stops_start['stop_id'] = [track.id for track in stops.trajectories]
stops_start = stops_start.set_index('stop_id')

# Iteration over the Stop Trajectories
for stoptrack in stops.trajectories:

    # add stop duration in hours
    stops_start.at[stoptrack.id,'duration_s'] = stoptrack.get_duration().total_seconds()

    # add length
    stops_start.at[stoptrack.id, 'length_m'] = stoptrack.get_length()

    # add traj id
    stops_start.at[stoptrack.id, 'identifier'] = stoptrack.id.split('_')[0]

    # add datetime
    stops_start.at[stoptrack.id, 'datetime'] = pd.to_datetime(stoptrack.id.split('_')[1]).tz_localize(None)

    # geometry with start point
    stops_start.at[stoptrack.id, 'geometry'] = stoptrack.get_start_location()
    
# Reset indexes
stops_start = stops_start.reset_index(drop=True)
geodata = geodata.reset_index(drop=True)

end = datetime.now()
print('Tempo gasto criando stops df:',end-start)

stops_start

Tempo gasto criando stops df: 0:08:30.395972


Unnamed: 0,geometry,duration_s,length_m,identifier,datetime
0,POINT (116.33040 -79.98330),72.0,309.707009,0,2007-04-12 10:23:25
1,POINT (116.33033 -79.98330),873.0,309.721220,0,2007-04-12 13:46:21
2,POINT (116.33082 -79.98330),59.0,309.728663,0,2007-04-13 15:23:30
3,POINT (116.32755 -80.00660),66.0,483.823859,0,2007-04-13 15:25:57
4,POINT (116.37362 -80.03642),78.0,415.891259,0,2007-04-13 15:32:49
...,...,...,...,...,...
47239,POINT (116.34552 -79.99521),303.0,961.656269,3151,2009-12-24 11:36:25
47240,POINT (116.37082 -79.99628),139.0,501.855553,3151,2009-12-24 11:41:30
47241,POINT (116.39669 -79.99693),131.0,499.115602,3151,2009-12-24 11:43:51
47242,POINT (116.42274 -79.99704),333.0,508.550589,3151,2009-12-24 11:46:04


In [74]:
from sklearn.model_selection import train_test_split

X = stops_start[['duration_s','length_m','time_diff]]
y = [transport_df.iloc[int(i)]['Transportation Mode'] for i in stops_start['identifier']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

KeyError: "['time_diff'] not in index"

In [73]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

start = datetime.now()

depths = [2,3,5]
for d in depths:
    clf = DecisionTreeClassifier(max_depth=d)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = f1_score(y_test, y_pred, average='micro')
    print(score)
end = datetime.now()
print('Tempo gasto:',end-start)

0.31326066250396867
0.31749391469996824
0.3221504921155678
Tempo gasto: 0:00:00.210803


## Arquivo enviado pela Isadora

In [None]:
# import math


# def it_features(histogram, features, normalized = False):
    
#     it = []
    
#     if "permutation_entropy" in features:
#         pe = permutation_entropy(histogram, normalized)
#         it.append(pe)
        
#     if "statistical_complexity" in features:
#         if "permutation_entropy" in features:
#             sc = statistical_complexity(histogram, entropy = pe)
#         else:
#             sc = statistical_complexity(histogram)
            
#         it.append(sc)
            
#     if "fisher_information" in features:
#         fi = fisher_information(histogram)
#         it.append(fi)

#     return it



# def permutation_entropy(dist, normalized = False):

#     pe = 0

#     for p in dist:    
#         if p > 1e-30:        
#             pe -= (p*math.log(p))

#     if (normalized == True):
#         pe = pe/math.log(len(dist))

#     return pe


# def statistical_complexity(dist, entropy = None):

#     if entropy is None:
#         entropy = permutation_entropy(dist, normalized = True)

#     # the length of the probabilities, 
#     n = len(dist)

#     # the reference distribution (uniform)
#     P_u = [1/n]*n

#     # the Jensen-shannon divergence
#     pe_pu = permutation_entropy(P_u)/2
#     pe_op = permutation_entropy(dist)/2

#     p1 = [(x + y)/2 for x, y in zip(dist, P_u)]
#     pe_op_pu = permutation_entropy(p1)

#     JS = pe_op_pu - pe_op - pe_pu

#     # the statistical complexity
#     # math.log is ln
#     p2 = (((n+1)/n) * math.log(n + 1) - 2*math.log(2*n) + math.log(n))
#     Q_0 = -2*(1/p2)
#     Q = Q_0 * JS
#     C = Q*entropy

#     return C


# def fisher_information(dist):

#     n = len(dist)

#     if dist[0] == 1 or dist[n - 1] == 1:
#         F_0 = 1

#     else:
#         F_0 = 1/2


#     aux0 = [math.sqrt(dist[i]) for i in range(1, n)]
#     aux1 = [math.sqrt(dist[i]) for i in range(0, n - 1)]
#     aux = [(x - y)**2 for x,y in zip(aux0, aux1)]
#     F = F_0 * sum(aux)

#     return F