## Загрузка данных GPS

Парсим папки, формируем общий датасет данных GPS

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import folium
from haversine import haversine
from os import listdir
from tqdm import tqdm
from snippets import *

%matplotlib inline

In [2]:
# Root folder holding every dataset used in this notebook (relative path).
DATA_PATH = "../../data"

# Sub-folders parsed below; each one is loaded and tagged with its own
# "fraud" value (0 for GOOD_PATH, 1 for FRAUD_PATH, -1 for UNKNOWN_PATH).
GOOD_PATH = f"{DATA_PATH}/data_good_right_dates"
FRAUD_PATH = f"{DATA_PATH}/drivers_with_gps_and_sl_problems"
UNKNOWN_PATH = f"{DATA_PATH}/unlabeled"

# Last expression of the cell: show what is available under the data root.
listdir(DATA_PATH)

['data_good_right_dates',
 'drivers_stats.csv',
 'gps_data.csv',
 'unlabeled',
 'gps_data_raw.csv',
 'good_data',
 'points_stats.csv',
 'accel_data.csv',
 'accel_data_raw.csv',
 'drivers_with_gps_and_sl_problems',
 'gps_stats.csv',
 'rides_of_drivers.csv',
 'accel_stats.csv']

## Парсинг файлов GPS

In [3]:
def load_gps_data_from_folders(path, check_order=False):
    """Load every ``<path>/<driver_id>/track.csv`` into a single GPS DataFrame.

    Only sub-folders whose name is an (optionally negative) ASCII integer are
    treated as driver folders; anything else under ``path`` is ignored.

    Parameters
    ----------
    path : str
        Folder that contains one sub-folder per driver.
    check_order : bool, default False
        When True, compare for every non-empty track the row order obtained by
        sorting on "gps_time" with the one obtained by sorting on "time", and
        display three sets of driver ids: empty tracks, tracks whose orders
        differ, and tracks whose orders match.

    Returns
    -------
    pandas.DataFrame
        Rows of all non-empty tracks stacked on top of each other. The columns
        "time", "lat", "lon", "gps_time", "driver_hash", "fraud" come first
        (any extra CSV columns follow). "driver_hash" is int64 and holds the
        folder name as a number; "fraud" is left empty (NaN) for the caller to
        fill. Each track keeps its own row index, so index labels repeat.
    """
    base_columns = ["time", "lat", "lon", "gps_time", "driver_hash", "fraud"]

    def is_driver_folder(name):
        digits = name[1:] if name.startswith("-") else name
        return digits.isascii() and digits.isdigit()

    # Keep the original folder names: converting to int and back would lose
    # leading zeros and point at a folder that does not exist.
    folder_names = [name for name in listdir(path) if is_driver_folder(name)]

    frames, all_ids, different_order, empty_data = [], [], [], []
    for name in tqdm(folder_names):
        driver_id = int(name)
        all_ids.append(driver_id)
        # Columns 0 and 3 of track.csv are parsed as timestamps.
        df = pd.read_csv(f"{path}/{name}/track.csv", parse_dates=[0, 3])
        if df.shape[0] == 0:
            empty_data.append(driver_id)
            continue
        df["driver_hash"] = driver_id
        if check_order:
            # A stable sort keeps rows with equal timestamps in file order, so
            # ties cannot be reported as a spurious difference.
            by_gps = df["gps_time"].sort_values(kind="mergesort").index
            by_time = df["time"].sort_values(kind="mergesort").index
            if not by_gps.equals(by_time):
                different_order.append(driver_id)
        frames.append(df)

    # Concatenate once instead of growing the frame inside the loop, which
    # re-copies all accumulated rows on every iteration.
    if frames:
        result = pd.concat(frames, axis=0)
    else:
        result = pd.DataFrame(columns=base_columns)
    for column in base_columns:
        if column not in result.columns:
            result[column] = np.nan
    extra_columns = [c for c in result.columns if c not in base_columns]
    result = result[base_columns + extra_columns]

    if check_order:
        display("With empty data", set(empty_data))
        display("GPS and time order different", set(different_order))
        # Empty tracks have no order to compare, so they are listed only above.
        display(
            "GPS and time order same",
            set(all_ids) - set(different_order) - set(empty_data),
        )
    result["driver_hash"] = result["driver_hash"].astype(np.int64)
    return result

## Данные без фрода

In [4]:
# Tracks from the "good" folder; every row is tagged with fraud = 0.
df_gps_good = load_gps_data_from_folders(GOOD_PATH).assign(fraud=0)
describe(df_gps_good)

100%|██████████| 23/23 [00:02<00:00, 11.37it/s]


Unnamed: 0,dtypes,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],958574,2017-09-09 15:40:48.800993536,2015-03-01 03:00:02,2016-01-18 21:05:52.500000,2017-08-07 22:13:06.500000,2019-01-21 22:16:56.250000128,2020-04-13 22:47:15,
lat,float64,958574,55.734,55.3926,55.6604,55.7435,55.8066,56.024,0.10941
lon,float64,958574,37.6157,36.8129,37.4999,37.613,37.7306,38.1199,0.164338
gps_time,datetime64[ns],958574,2017-09-09 15:40:43.680295936,2015-03-01 03:00:02,2016-01-18 21:05:52.500000,2017-08-07 22:13:06.500000,2019-01-21 22:16:56.250000128,2020-04-13 22:47:15,
driver_hash,int64,958574,8.88151e+17,-8.85545e+18,-2.32611e+18,2.5462e+17,4.80524e+18,8.89355e+18,4.7696e+18
fraud,int64,958574,0,0,0,0,0,0,0.0


(958574, 6)

## Данные с фродом

In [5]:
# Tracks from the fraud folder; every row is tagged with fraud = 1.
df_gps_fraud = load_gps_data_from_folders(FRAUD_PATH).assign(fraud=1)
describe(df_gps_fraud)

100%|██████████| 14/14 [00:00<00:00, 65.58it/s]


Unnamed: 0,dtypes,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],107535,2021-05-10 07:18:06.560756736,2020-12-22 06:35:55,2021-02-11 18:05:14,2021-04-14 19:41:17,2021-07-31 01:47:13,2021-08-20 12:43:05,
lat,float64,107535,55.7452,42.98,55.7361,55.7564,55.7752,56.3264,0.143642
lon,float64,107535,37.6533,37.2836,37.5676,37.6698,37.7251,132.415,0.87641
gps_time,datetime64[ns],107535,2021-05-10 07:18:02.733910016,2020-12-22 06:26:02,2021-02-11 18:05:13.500000,2021-04-14 19:41:17,2021-07-31 01:47:13,2021-08-20 12:26:47,
driver_hash,int64,107535,-2.73693e+18,-8.91541e+18,-6.7159e+18,-3.96276e+18,2.02531e+18,8.56974e+18,4.11427e+18
fraud,int64,107535,1,1,1,1,1,1,0.0


(107535, 6)

## Неразмеченные данные

In [6]:
# Tracks from the unlabeled folder; every row is tagged with fraud = -1.
df_gps_unknown = load_gps_data_from_folders(UNKNOWN_PATH).assign(fraud=-1)
describe(df_gps_unknown)

100%|██████████| 280/280 [00:52<00:00,  5.38it/s]


Unnamed: 0,dtypes,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],3984183.0,2021-04-04 04:41:20.900185600,2020-10-25 03:59:58,2021-01-14 07:38:30.500000,2021-04-06 17:14:19,2021-06-17 06:29:28,2021-08-28 02:47:58,
lat,float64,3984180.0,55.7452,48.4742,55.6677,55.7506,55.8301,82.4343,0.183564
lon,float64,3984180.0,37.5975,26.1013,37.4997,37.58,37.6899,135.46,0.420532
gps_time,datetime64[ns],3984183.0,2021-04-04 04:41:19.051149824,2020-10-25 03:59:56,2021-01-14 07:38:30.500000,2021-04-06 17:14:19,2021-06-17 06:29:28,2021-08-28 02:47:58,
driver_hash,int64,3984180.0,2.44972e+17,-9.21858e+18,-4.29668e+18,-3.2897e+15,4.51957e+18,9.20647e+18,5.3914e+18
fraud,int64,3984180.0,-1,-1,-1,-1,-1,-1,0.0


(3984183, 6)

## Общий датасет GPS

In [7]:
# Stack the three labelled subsets row-wise (good, fraud, unlabeled, in that
# order); each subset keeps its own index labels.
labelled_parts = [df_gps_good, df_gps_fraud, df_gps_unknown]
df_gps = pd.concat(labelled_parts)
df_gps

Unnamed: 0,time,lat,lon,gps_time,driver_hash,fraud
0,2015-03-29 05:03:40,55.644084,37.716370,2015-03-29 04:57:03,-4294745953159109974,0
1,2015-03-29 05:03:40,55.644084,37.716370,2015-03-29 04:57:04,-4294745953159109974,0
2,2015-03-29 05:03:40,55.644082,37.716389,2015-03-29 05:03:39,-4294745953159109974,0
3,2015-03-29 06:23:24,55.644082,37.716389,2015-03-29 06:23:23,-4294745953159109974,0
4,2015-03-29 07:28:52,55.644131,37.716376,2015-03-29 07:28:51,-4294745953159109974,0
...,...,...,...,...,...,...
11157,2021-02-05 18:32:06,55.935577,37.517716,2021-02-05 18:32:06,-2035413153418187612,-1
11158,2021-02-05 18:32:06,55.935580,37.517710,2021-02-05 18:32:06,-2035413153418187612,-1
11159,2021-02-05 18:32:09,55.935572,37.517723,2021-02-05 18:32:09,-2035413153418187612,-1
11160,2021-02-05 18:32:10,55.935572,37.517724,2021-02-05 18:32:10,-2035413153418187612,-1


In [8]:
# Persist the combined GPS table under the data root; the row index is not written.
df_gps.to_csv(f"{DATA_PATH}/gps_data_raw.csv", index=False)
# Bare string as the last expression so the cell output marks completion.
'Done'

'Done'

## Обзор датасета GPS

In [9]:
# Summary of the combined dataset via the `describe` helper, which is not
# defined in this notebook (presumably it comes from `snippets` — confirm).
describe(df_gps)

Unnamed: 0,dtypes,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],5050292.0,2020-07-31 17:37:31.378776832,2015-03-01 03:00:02,2020-11-20 07:07:20.750000128,2021-03-05 11:16:20,2021-06-03 21:19:25.750000128,2021-08-28 02:47:58,
lat,float64,5050290.0,55.743,42.98,55.6674,55.7496,55.8242,82.4343,0.171211
lon,float64,5050290.0,37.6021,26.1013,37.5009,37.5848,37.7032,135.46,0.401377
gps_time,datetime64[ns],5050292.0,2020-07-31 17:37:28.866648576,2015-03-01 03:00:02,2020-11-20 07:07:20.750000128,2021-03-05 11:16:20,2021-06-03 21:19:25.750000128,2021-08-28 02:47:58,
driver_hash,int64,5050290.0,3.03558e+17,-9.21858e+18,-4.06958e+18,1.02237e+17,4.79299e+18,9.20647e+18,5.27957e+18
fraud,int64,5050290.0,-0.767609,-1,-1,-1,-1,1,0.470076


(5050292, 6)

In [10]:
# Pairwise correlation of the numeric columns only. Selecting them explicitly
# keeps the datetime columns ("time", "gps_time") out of the computation on
# every pandas version; recent releases no longer drop them silently.
df_gps.select_dtypes(include="number").corr()

Unnamed: 0,lat,lon,driver_hash,fraud
lat,1.0,0.486209,0.015026,-0.020133
lon,0.486209,1.0,0.021444,0.025214
driver_hash,0.015026,0.021444,1.0,-0.007463
fraud,-0.020133,0.025214,-0.007463,1.0


## Замечания

- Данные имеются для всех водителей
- Сортировка по "gps_time" и "time" отличается

## Примеры отрисовки маршрутов

In [11]:
# Label of the driver to draw, one of [None, -1, 0, 1].
FORCE_FRAUD = None

# Pick a driver; FORCE_FRAUD is forwarded so that changing the constant above
# actually takes effect (it was previously defined but never used).
driver_gps, driver_hash, driver_fraud = sample_driver(df_gps, FORCE_FRAUD)

# Centre the map on the mean coordinate of the selected driver's points.
center = (driver_gps["lat"].mean(), driver_gps["lon"].mean())
m = folium.Map(location=center, zoom_start=10, legend=True)

# Route with the points ordered by "gps_time"
driver_gps = driver_gps.sort_values(by="gps_time").reset_index(drop=True)
map_driver_points(m, driver_gps, driver_hash, label="gps_time", color="random")

# Route with the points ordered by "time"
driver_gps = driver_gps.sort_values(by="time").reset_index(drop=True)
map_driver_points(m, driver_gps, driver_hash, label="time", color="random")

# Layer switcher so the two routes can be toggled independently.
folium.map.LayerControl('topleft', collapsed= False).add_to(m)
m

In [12]:
# [None, -1, 0, 1]
FORCE_FRAUD = None

# Centre the map on the mean coordinate of the whole dataset.
center = (df_gps["lat"].mean(), df_gps["lon"].mean())
m = folium.Map(location=center, zoom_start=10, legend=True)

# One driver per label, drawn in this order:
# -1 (unlabeled), 0 (no fraud), 1 (fraud).
for fraud_label in (-1, 0, 1):
    driver_gps, driver_hash, driver_fraud = sample_driver(df_gps, fraud_label)
    map_driver_points(m, driver_gps, driver_hash, label=driver_fraud)

layer_control = folium.map.LayerControl('topleft', collapsed=False)
layer_control.add_to(m)
m