In [2]:
import pandas as pd
from haversine import haversine as hvs, Unit
import numpy as np

In [5]:
# USO DE HAVERSINE
# lyon = (-13.5549784, -71.8559785) # (lat, lon)
# stop = (-71.8573001, -13.5548123)

# print(stop)
# hvs(lyon, stop)

# -------------------
# t1,t2,t3
# t2 = t1 + (t1-t3)/2


# Analisis de Datos

### Busstop ida

In [3]:
# leer dataset busstop
df_stops_ida = pd.read_csv('busstops_ida.csv')

# diseño del df: creacion de latitude_longitude
df_stops_ida['latitude'] = df_stops_ida.apply(lambda row: (row['latitude'], row['longitude']), axis=1)
df_stops_ida.drop(['longitude', 'number'], inplace=True, axis=1)
df_stops_ida = df_stops_ida.rename(columns={'latitude':'latitude_longitude'}, inplace=False)

# df_stops_ida.info()
df_stops_ida

Unnamed: 0,id,name,navigation,latitude_longitude,radio
0,0,inicio,159,"(-13.5549784, -71.856025)",30
1,36,primer stop,304,"(-13.5548123, -71.8573001)",18
2,37,segundo stop,281,"(-13.5532944, -71.8619442)",13
3,38,Ccollana,283,"(-13.552296, -71.867299)",9
4,39,Puente Huaccoto,283,"(-13.551592, -71.870211)",10
...,...,...,...,...,...
79,114,Segundo,221,"(-13.547528, -71.985248)",10
80,115,Reservorio,225,"(-13.54966, -71.988003)",10
81,116,Cuarto,230,"(-13.550408, -71.98888)",10
82,117,Quinto,203,"(-13.550841, -71.989448)",10


### Crear links

In [4]:
def generateLinks(row):
    row['stop'] = df_linkref.loc[row.name+1, 'id']
    row['linkref'] = str(row['id']) + ':' + str(df_linkref.loc[row.name+1, 'id'])
    return row

df_linkref = df_stops_ida.copy()
df_linkref = df_linkref.head(df_linkref.shape[0]-1).apply(generateLinks, axis=1)
df_linkref = df_linkref.drop(df_linkref.columns[:5], axis=1)
df_linkref = df_linkref.rename(columns={'id':'init_stop', 'stop':'end_stop'}, inplace=False)
df_linkref['end_stop'] = df_linkref['end_stop'].values.astype(str)
df_linkref

Unnamed: 0,end_stop,linkref
0,36,0:36
1,37,36:37
2,38,37:38
3,39,38:39
4,40,39:40
...,...,...
78,114,113:114
79,115,114:115
80,116,115:116
81,117,116:117


### Monitoring

In [5]:
# leer dataset monitoring
data = pd.read_csv('monitoring.csv')

# diseño del df: creacion data_time y latitude_longitude
data['date'] += ' ' + data['time']
data['latitude'] = data.apply(lambda row: (row['latitude'], row['longitude']), axis=1)
data.drop(['time', 'longitude'], inplace=True, axis=1)
data = data.rename(columns={'date':'date_time', 'latitude':'latitude_longitude', 'vehicle_id_id':'vehicle_id'}, inplace=False)
data = data.drop_duplicates(subset=['latitude_longitude']) # (65155, 7) (64702, 7)
data = data.sort_values(by='date_time')

# convertir de object a datatime
data['date_time'] = pd.to_datetime(data['date_time'], format='%Y-%m-%d %H:%M:%S')
print(data.shape)
data 

(64702, 7)


Unnamed: 0,id,date_time,lap,latitude_longitude,velocity,navigation,vehicle_id
4824,21937142,2022-07-14 00:01:31,0,"(-13.5537836, -71.8619456)",5,6,SJ37
4978,21937155,2022-07-14 00:03:44,0,"(-13.5540421, -71.8620274)",0,194,SJ37
5039,21937160,2022-07-14 00:05:11,0,"(-13.5383785, -71.9810924)",0,225,SJ39
5061,21937162,2022-07-14 00:05:32,0,"(-13.5538999, -71.8619277)",1,155,SJ37
5081,21937164,2022-07-14 00:05:59,0,"(-13.5476402, -71.8805191)",0,33,SJ07
...,...,...,...,...,...,...,...
62360,22142519,2022-07-14 23:46:49,5,"(-13.5510421, -71.8767183)",0,180,SJ04
62405,22142539,2022-07-14 23:51:05,5,"(-13.5539454, -71.8619251)",2,208,SJ37
62443,22142555,2022-07-14 23:54:43,3,"(-13.5513956, -71.9902558)",0,284,SJ40
62481,22142569,2022-07-14 23:56:00,5,"(-13.5539692, -71.8619186)",0,25,SJ37


### Analisis General

In [6]:
# cantidad de datos: laps y vehicles
vehicles = data['vehicle_id'].unique()
print('# laps: ', len(data['lap'].unique()), '->',data['lap'].unique())
print('# vehicles: ', len(vehicles), '->' ,vehicles)

# laps:  7 -> [0 5 1 6 2 3 4]
# vehicles:  37 -> ['SJ37' 'SJ39' 'SJ07' 'SJ35' 'SJ25' 'SJ13' 'SJ04' 'SJ34' 'SJ20' 'SJ36'
 'SJ26' 'SJ12' 'SJ30' 'SJ10' 'SJ09' 'SJ03' 'SJ24' 'SJ16' 'SJ21' 'SJ27'
 'SJ29' 'SJ28' 'SJ31' 'SJ41' 'SJ42' 'SJ08' 'SJ38' 'SJ02' 'SJ01' 'SJ05'
 'SJ14' 'SJ18' 'SJ23' 'SJ19' 'SJ11' 'SJ40' 'SJ06']


In [7]:
# datos por cada vehiculo
veh_rows = {}
for veh in vehicles: 
    veh_rows[veh] = len(data[data['vehicle_id'] == veh])

print('# Max de datos: ', max(veh_rows.values()))
print('# Min de datos: ', min(veh_rows.values()))
veh_rows.items()

# Max de datos:  2607
# Min de datos:  3


dict_items([('SJ37', 2561), ('SJ39', 809), ('SJ07', 1862), ('SJ35', 1994), ('SJ25', 55), ('SJ13', 1884), ('SJ04', 2356), ('SJ34', 2325), ('SJ20', 2376), ('SJ36', 1985), ('SJ26', 3), ('SJ12', 1756), ('SJ30', 2203), ('SJ10', 2194), ('SJ09', 934), ('SJ03', 1759), ('SJ24', 1346), ('SJ16', 2156), ('SJ21', 1027), ('SJ27', 2607), ('SJ29', 1391), ('SJ28', 1848), ('SJ31', 2191), ('SJ41', 2245), ('SJ42', 1571), ('SJ08', 1366), ('SJ38', 2310), ('SJ02', 2416), ('SJ01', 1978), ('SJ05', 2324), ('SJ14', 2078), ('SJ18', 2371), ('SJ23', 1761), ('SJ19', 2110), ('SJ11', 1319), ('SJ40', 1227), ('SJ06', 4)])

### Funciones

In [9]:
# funcion para verificar si el punto paso por un stop
def isStop(row, stops):
    r_nav = 45
    r_err = 50  # meters
    stops = stops[abs(stops['navigation'] - row['navigation']) <= r_nav]
    stops['dis'] = stops['latitude_longitude'].apply(lambda stop: round(hvs(row['latitude_longitude'], stop, unit=Unit.METERS), 2))
    stops = stops[stops['dis'] <= stops['radio'] + r_err]

    # print(stops)
    # print(row)
    # print('-'*50)
    if len(stops) != 0:
        row['stop'] = str(stops.loc[stops['dis'].idxmin(), 'id'])
        row['dis'] = float(stops.loc[stops['dis'].idxmin(), 'dis'])
    else:
        row['stop'] = np.nan
        row['dis'] = 0
        
    return row

# funcion para calcular el tiempo de un punto con su antecesor en seg
def time_travel(row, df):
    if row.name == 0:
        return 0
    else:
        return (df.loc[row.name, 'date_time'] - df.loc[row.name-1, 'date_time']).total_seconds()


### crear samples

In [16]:
df_his = pd.DataFrame(columns=['vehicle', 'lap', 'total_Stopbus' ,'total_NaN', 'Percent_StopPoints'])   
nStopbus = 84

for veh in vehicles:
    data_veh = data[data['vehicle_id'] == veh].reset_index(drop=True).copy()
    laps = data_veh['lap'].unique()[1:-1]

    print(veh, laps)
    for lap in laps:
        data_veh_lap = data_veh[data_veh['lap'] == lap].reset_index(drop=True)

        # verificar puntos por stop
        data_veh_lap_stop = data_veh_lap.apply(lambda row: isStop(row, df_stops_ida), axis=1)
        data_veh_lap_stop = data_veh_lap_stop[data_veh_lap_stop['stop'].notna()]
        data_veh_lap_stop = data_veh_lap_stop.sort_values('dis').drop_duplicates('stop').sort_index().reset_index(drop=True)

        nNaN = nStopbus - len(data_veh_lap_stop)
        # print(lap, nNaN)
        if nNaN <= nStopbus*0.75:
            # calcular time
            data_veh_lap_stop['time_travel'] = data_veh_lap_stop.apply(lambda row: time_travel(row, data_veh_lap_stop), axis=1)

            # unir con linkref
            df_out = df_linkref.join(data_veh_lap_stop.set_index('stop'), on='end_stop')
        
        df_his.loc[df_his.shape[0]] = [veh, lap, nStopbus, nNaN, 1-nNaN/nStopbus]

SJ37 [1]
0      3
62     2
111    2
109    2
103    2
      ..
47     1
46     1
82     1
83     1
78     1
Name: stop, Length: 70, dtype: int64


In [14]:
df_his

Unnamed: 0,vehicle,lap,total_Stopbus,total_NaN,Percent_StopPoints
0,SJ37,1,84,14,0.833333
