In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
# Scooters data path
DATA_PATH = "./data/ks.csv"
scooters = pd.read_csv(DATA_PATH, parse_dates={"time":["gps_date","gps_t"]}).drop(["lat", "lon"], axis=1)
display(scooters)

Unnamed: 0,time,ride_id,wheel
0,2021-05-17 00:01:25.577,0,6.1
1,2021-05-17 00:01:28.324,0,9.2
2,2021-05-17 00:01:32.948,0,19.2
3,2021-05-17 00:01:34.226,0,22.2
4,2021-05-17 00:01:35.525,0,23.1
...,...,...,...
11243049,2021-05-31 01:05:53.033,22502,14.7
11243050,2021-05-31 01:05:54.430,22502,5.2
11243051,2021-05-31 01:06:00.934,22502,0.0
11243052,2021-05-31 01:06:15.765,22502,0.0


In [11]:
def calc_avrg_time(df):
    idxs = set(df.ride_id)
    res = 0
    for ride_id in tqdm(range(0, 22000, 22)):
        data = df[df.ride_id == ride_id].reset_index()
        time_data = data["time"]
        N = len(data)
        avg = 0
        for i in range(N-1):
            dt = (data.time[i+1] - data.time[i]).total_seconds()
            avg += dt
        res += avg / N
    res /= 1000
    
    return res

In [12]:
calc_avrg_time(scooters)

100%|██████████████████████████████████████| 1000/1000 [02:22<00:00,  7.02it/s]


4.623131758615718

In [35]:
def get_raw_trip_data(df, ride_id):
    data = df[df.ride_id == ride_id].reset_index()
    N = len(data)
    
    data["speed"] = data.wheel.div(3.6)
    
    accel = [0.0]
    dt_arr = [0.0]
    a = 0.0
    for i in range(N-1):
        dv = data.speed[i+1] - data.speed[i]
        dt = (data.time[i+1] - data.time[i]).total_seconds()
        dt_arr.append(dt)
        # print(f"{i}.{dv} / {dt} = {dv/dt}")
        if dt > 1.0:
            a = dv / dt
        accel.append(a)
    data["accel"] = accel
    data["dt"] = dt_arr
    
    data.drop(labels=["time", "wheel"], axis=1, inplace=True)
    return data

In [48]:
def get_trips_data(df):
    col_names = ["average_speed", "peak_speed", "average_acceleration", "peak_acceleration",
                 "rapid_overclock", "hard_braking"]
    trips_data = pd.DataFrame(columns=col_names)
    idxs = set(df.ride_id)
    for trip_id in tqdm(idxs):
        raw_data = get_raw_trip_data(df, trip_id)
        
        average_speed = raw_data.speed.mean()
        peak_speed = raw_data.speed.max()
        average_acceleration = raw_data.accel.mean()
        peak_acceleration = raw_data.accel.max()
        # New features
        rapid_overclock = len(raw_data[(raw_data.accel > 3.43) & (raw_data.dt < 5)])
        hard_braking = len(raw_data[(raw_data.accel < -4.42) & (raw_data.dt < 5)])
        
        info = [average_speed, peak_speed, average_acceleration, peak_acceleration,
                rapid_overclock, hard_braking]
        trips_data.loc[trip_id] = info
    
    return trips_data

In [49]:
X = get_trips_data(scooters)
X

100%|██████████████████████████████████| 22503/22503 [1:08:10<00:00,  5.50it/s]


Unnamed: 0,average_speed,peak_speed,average_acceleration,peak_acceleration,rapid_overclock,hard_braking
0,6.443223,7.388889,-0.011087,0.977465,0.0,0.0
1,5.237365,7.500000,-0.007952,2.199546,0.0,0.0
2,4.968280,7.083333,-0.030921,1.382306,0.0,0.0
3,6.049740,7.500000,-0.029479,1.229105,0.0,3.0
4,6.428615,7.555556,-0.062866,0.950728,0.0,0.0
...,...,...,...,...,...,...
22498,3.907132,6.750000,0.003167,1.248171,0.0,0.0
22499,4.830391,7.611111,-0.052442,2.365561,0.0,1.0
22500,5.389947,7.250000,-0.014757,0.748727,0.0,0.0
22501,4.175000,6.722222,-0.012502,0.781178,0.0,0.0


**rapid_overclock** — резкое ускорение  
**hard_braking** — резкое торможение

In [50]:
X.to_csv("./data/new_scooters_trips_info.csv")

Случайная выборка поездок (для демонстрации прототипа)

In [163]:
y = scooters.loc[scooters.ride_id % 197 == 0]
len(set(y.ride_id))
display(y)
y.to_csv("test_user_rides.csv")

Unnamed: 0,ride_id,gps_date,gps_t,lat,lon,wheel
0,0,2021-05-17,00:01:25.577,55.702469,37.506378,6.1
1,0,2021-05-17,00:01:28.324,55.702454,37.506348,9.2
2,0,2021-05-17,00:01:32.948,55.702408,37.506248,19.2
3,0,2021-05-17,00:01:34.226,55.702362,37.506145,22.2
4,0,2021-05-17,00:01:35.525,55.702328,37.506081,23.1
...,...,...,...,...,...,...
11222820,22458,2021-05-30,21:09:41.111,55.757854,37.645981,9.8
11222821,22458,2021-05-30,21:09:43.991,55.757835,37.646133,8.8
11222822,22458,2021-05-30,21:09:47.561,55.757824,37.646244,7.0
11222823,22458,2021-05-30,21:09:51.444,55.757832,37.646374,1.8


In [32]:
x = get_raw_trip_data(scooters, 1)
len(x[(x.accel > -4.42) & (x.dt < 5)])

1234