## Загрузка данных акселерометра

Обходим папки водителей, читаем из каждой файл `accelerometer.csv` и формируем общий датасет данных акселерометра.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import folium
from haversine import haversine
from os import listdir
from tqdm import tqdm
from snippets import *

%matplotlib inline

In [2]:
# Root folder with the project data, given relative to this notebook.
DATA_PATH = "../../data"

# One sub-folder per data subset; later cells label rows loaded from them
# with fraud = 0 (GOOD_PATH), 1 (FRAUD_PATH) and -1 (UNKNOWN_PATH).
GOOD_PATH = f"{DATA_PATH}/data_good_right_dates"
FRAUD_PATH = f"{DATA_PATH}/drivers_with_gps_and_sl_problems"
UNKNOWN_PATH = f"{DATA_PATH}/unlabeled"

# Last expression of the cell: show what the data folder contains.
listdir(DATA_PATH)

['data_good_right_dates',
 'drivers_stats.csv',
 'gps_data.csv',
 'unlabeled',
 'gps_data_raw.csv',
 'good_data',
 'points_stats.csv',
 'accel_data.csv',
 'accel_data_raw.csv',
 'drivers_with_gps_and_sl_problems',
 'gps_stats.csv',
 'rides_of_drivers.csv',
 'accel_stats.csv']

## Парсинг файлов акселерометра

In [3]:
def load_accel_data_from_folders(path, check_order=False):
    """Load every ``accelerometer.csv`` found under ``path`` into one DataFrame.

    Each sub-folder of ``path`` whose name is a (possibly negative) integer is
    read as ``{path}/{folder}/accelerometer.csv``; the integer value of the
    folder name is stored in the ``driver_hash`` column of its rows. Entries
    with any other name are ignored.

    Parameters
    ----------
    path : str
        Directory containing the integer-named sub-folders.
    check_order : bool, default False
        When True, display the set of ``driver_hash`` values whose CSV file
        contains no rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``time, x, y, z, lat, lon, driver_hash, fraud`` in that order,
        followed by any additional columns present in the CSV files.
        ``time`` is made timezone-naive and shifted by +3 hours
        (presumably UTC -> Moscow time; confirm against the data source).
        ``fraud`` is left empty for the caller to fill in. Row labels are the
        per-file labels, so index values repeat across drivers.
    """
    columns = ["time", "x", "y", "z", "lat", "lon", "driver_hash", "fraud"]

    def is_int_name(name):
        # Accept "123" and "-123"; everything else is not a driver folder.
        return name.isdigit() or (name.startswith("-") and name[1:].isdigit())

    folder_names = [name for name in listdir(path) if is_int_name(name)]

    frames = []
    empty_data = []
    for folder in tqdm(folder_names):
        driver_hash = np.int64(int(folder))
        # Build the path from the original folder name rather than from the
        # parsed integer, so names such as "007" still resolve on disk.
        df = pd.read_csv(f"{path}/{folder}/accelerometer.csv", parse_dates=[0])
        if df.shape[0] > 0:
            df["time"] = df["time"].dt.tz_convert(None) + pd.DateOffset(hours=3)
            df["driver_hash"] = driver_hash
            frames.append(df)
        else:
            empty_data.append(driver_hash)

    if check_order:
        display("With empty data", set(empty_data))

    if frames:
        # A single concat instead of growing the frame inside the loop, which
        # re-copied all previously loaded rows on every iteration.
        result = pd.concat(frames, axis=0)
        extra = [col for col in result.columns if col not in columns]
        result = result.reindex(columns=columns + extra)
    else:
        result = pd.DataFrame([], columns=columns)

    result["driver_hash"] = result["driver_hash"].astype(np.int64)
    return result

## Данные без фрода

In [4]:
# Load the fraud-free subset, label it as class 0 and apply the date fix
# for the good data: every timestamp is shifted back by 24 hours.
df_accel_good = load_accel_data_from_folders(GOOD_PATH).assign(
    fraud=0,
    time=lambda frame: frame["time"] + pd.DateOffset(hours=-24),
)
describe(df_accel_good)

100%|██████████| 23/23 [00:02<00:00,  9.39it/s]


Unnamed: 0,dtypes,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],229709,2017-09-27 15:26:15.470369280,2015-02-28 05:46:42.832594,2016-01-18 15:17:03.967033088,2017-09-11 08:23:17.829850880,2019-05-18 23:20:39.848606976,2020-04-14 18:07:27.919088,
x,float64,229709,-0.0839609,-19.8078,-0.478541,0.0957681,0.735042,57.0202,1.98267
y,float64,229709,8.11527,-37.0974,7.95354,8.97347,9.51246,23.3913,2.64659
z,float64,229709,3.78649,-65.5062,2.22084,3.40934,5.18501,28.9555,2.46283
lat,float64,229709,55.7377,55.3163,55.6746,55.7453,55.7977,56.178,0.100379
lon,float64,229709,37.6042,36.813,37.4961,37.5936,37.7091,38.4464,0.157964
driver_hash,int64,229709,9.55708e+17,-8.85545e+18,-3.75461e+18,2.69343e+17,4.9574e+18,8.89355e+18,4.96158e+18
fraud,int64,229709,0,0,0,0,0,0,0.0


(229709, 8)

## Данные с фродом

In [5]:
# Load the fraud subset and label every row as class 1.
df_accel_fraud = load_accel_data_from_folders(FRAUD_PATH).assign(fraud=1)
describe(df_accel_fraud)

100%|██████████| 14/14 [00:01<00:00, 13.03it/s]


Unnamed: 0,dtypes,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],105816,2021-05-26 12:35:30.161651200,2020-12-21 09:25:48.474028,2021-03-18 21:17:44.451609856,2021-06-22 17:28:10.526172672,2021-07-29 12:03:39.636103680,2021-08-22 14:45:29.417931,
x,float64,105816,0.176285,-71.1054,-0.872088,0,1.053,65.1529,3.89279
y,float64,105816,3.94255,-74.7916,0.062192,5.04781,8.66703,33.5116,5.63456
z,float64,105816,5.23198,-55.1792,2.22182,5.51863,8.772,76.4756,4.14453
lat,float64,105816,55.7553,55.4259,55.733,55.7566,55.793,55.8995,0.0640573
lon,float64,105816,37.6374,37.2782,37.5725,37.6546,37.7327,37.9712,0.127567
driver_hash,int64,105816,-2.79601e+18,-8.91541e+18,-6.7159e+18,-3.96276e+18,1.29354e+18,8.56974e+18,4.1981e+18
fraud,int64,105816,1,1,1,1,1,1,0.0


(105816, 8)

## Неразмеченные данные

In [6]:
# Load the unlabeled subset and mark every row with the placeholder class -1.
df_accel_unknown = load_accel_data_from_folders(UNKNOWN_PATH).assign(fraud=-1)
describe(df_accel_unknown)

100%|██████████| 280/280 [00:35<00:00,  7.89it/s]


Unnamed: 0,dtypes,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],1837464.0,2021-04-04 21:19:12.671973376,2020-10-24 08:45:27.981150,2021-01-13 08:30:16.014356224,2021-04-13 19:06:15.511302656,2021-06-15 15:39:44.727341568,2021-08-29 00:03:16.287344,
x,float64,1837460.0,0.436646,-48.2001,-0.345962,0.201113,0.783502,36.4469,2.28938
y,float64,1837460.0,7.8135,-49.1889,7.776,8.885,9.51231,55.2294,3.18575
z,float64,1837460.0,3.75819,-31.7222,1.96325,3.69126,5.24825,72.5276,2.63675
lat,float64,1837460.0,55.7491,55.3592,55.673,55.7511,55.8301,82.4343,0.179542
lon,float64,1837460.0,37.5986,36.3143,37.4993,37.5836,37.6791,108.856,0.403864
driver_hash,int64,1837460.0,-9.16892e+16,-9.21858e+18,-4.87201e+18,-4.43966e+17,4.42743e+18,9.20647e+18,5.40781e+18
fraud,int64,1837460.0,-1,-1,-1,-1,-1,-1,0.0


(1837464, 8)

## Общий датасет акселерометра

In [7]:
# Stack the good, fraud and unlabeled subsets row-wise into one frame.
# Row labels are kept as they are, so index values repeat in the result.
df_accel = pd.concat([df_accel_good, df_accel_fraud, df_accel_unknown])
df_accel

Unnamed: 0,time,x,y,z,lat,lon,driver_hash,fraud
0,2015-03-28 07:21:13.142944,1.753877,1.067479,11.864562,55.647224,37.722467,-4294745953159109974,0
1,2015-03-28 07:21:13.144762,-1.561805,3.973245,9.135306,55.647241,37.721796,-4294745953159109974,0
2,2015-03-28 07:32:49.210339,0.240943,8.118824,5.867700,55.651617,37.726127,-4294745953159109974,0
3,2015-03-28 07:32:49.208345,-0.959165,8.450642,2.407102,55.651772,37.724092,-4294745953159109974,0
4,2015-03-28 07:32:49.209669,1.390380,7.843664,6.525100,55.651603,37.725644,-4294745953159109974,0
...,...,...,...,...,...,...,...,...
9922,2021-02-06 16:55:59.060570,-1.195419,8.938995,3.889862,55.792084,37.596221,-2035413153418187612,-1
9923,2021-02-06 16:55:59.071583,-1.344467,8.027405,5.188721,55.791302,37.599634,-2035413153418187612,-1
9924,2021-02-06 16:55:59.069208,-0.986526,8.467331,4.461472,55.791834,37.599514,-2035413153418187612,-1
9925,2021-02-06 16:56:59.937713,-1.055954,8.147705,4.295075,55.790986,37.601701,-2035413153418187612,-1


In [8]:
# Persist the combined accelerometer data; the row index is not written.
df_accel.to_csv(f"{DATA_PATH}/accel_data_raw.csv", index=False)
# Bare string as the last expression: shown as the cell output after the write.
'Done'

'Done'

## Обзор датасета акселерометра

In [9]:
# Summary table for the combined dataset. `describe` is not defined in this
# notebook (presumably provided by `from snippets import *` — TODO confirm).
describe(df_accel)

Unnamed: 0,dtypes,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],2172989.0,2020-11-22 12:54:59.277889792,2015-02-28 05:46:42.832594,2020-12-18 06:53:34.205120,2021-03-18 16:54:00.549220096,2021-06-12 11:02:18.664983040,2021-08-29 00:03:16.287344,
x,float64,2172990.0,0.368934,-71.1054,-0.371101,0.184359,0.785298,65.1529,2.36914
y,float64,2172990.0,7.6569,-74.7916,7.63271,8.85376,9.50019,55.2294,3.4034
z,float64,2172990.0,3.83295,-65.5062,2.01402,3.69992,5.35365,76.4756,2.73072
lat,float64,2172990.0,55.7482,55.3163,55.6768,55.7512,55.8214,82.4343,0.16893
lon,float64,2172990.0,37.601,36.3143,37.5016,37.5863,37.686,108.856,0.376061
driver_hash,int64,2172990.0,-1.12657e+17,-9.21858e+18,-4.47912e+18,-4.14168e+17,4.43147e+18,9.20647e+18,5.3536e+18
fraud,int64,2172990.0,-0.796897,-1,-1,-1,-1,1,0.509161


(2172989, 8)

In [10]:
# Pairwise correlation of the numeric columns only. The datetime `time` column
# is excluded explicitly instead of relying on `corr()` dropping it, because
# the default of `numeric_only` differs between pandas versions.
df_accel.select_dtypes(include="number").corr()

Unnamed: 0,x,y,z,lat,lon,driver_hash,fraud
x,1.0,-0.169534,-0.092387,0.008605,0.018356,0.000835,-0.055243
y,-0.169534,1.0,-0.351446,-0.102737,-0.007301,0.043622,-0.180794
z,-0.092387,-0.351446,1.0,0.04676,-0.017801,-0.029218,0.094466
lat,0.008605,-0.102737,0.04676,1.0,0.639248,0.049226,-0.004809
lon,0.018356,-0.007301,-0.017801,0.639248,1.0,0.026716,0.020254
driver_hash,0.000835,0.043622,-0.029218,0.049226,0.026716,1.0,-0.054442
fraud,-0.055243,-0.180794,0.094466,-0.004809,0.020254,-0.054442,1.0


## Замечания

- Данные имеются не для всех водителей (пустые: -2521732935212074026, -1096547806415121851, 955896487322552843)
- Точки не отсортированы по времени

## Примеры отрисовки маршрутов

In [11]:
# Allowed values: [None, -1, 0, 1]
# NOTE(review): FORCE_FRAUD is assigned but never read in this cell; the
# sample_driver call below passes a literal -1 instead. Presumably it was meant
# to be passed as the second argument — confirm against sample_driver.
FORCE_FRAUD = None

# Pick a random driver
driver_accel, driver_hash, driver_fraud = sample_driver(df_accel, -1)

# Centre the map on the mean coordinates of the selected driver's points.
center = (driver_accel["lat"].mean(), driver_accel["lon"].mean())
m = folium.Map(location=center, zoom_start=10, legend=True)

# Draw the points in their original order
map_driver_points(m, driver_accel, driver_hash, label=driver_fraud, color="random")

# Draw the same points ordered by time
driver_accel = driver_accel.sort_values(by="time")
map_driver_points(m, driver_accel, driver_hash, label=driver_fraud, color="random")

# Add the layer switcher, then display the map as the cell output.
folium.map.LayerControl('topleft', collapsed= False).add_to(m)
m

In [12]:
# Allowed values: [None, -1, 0, 1]
# NOTE(review): FORCE_FRAUD is assigned but never read in this cell; each
# sample_driver call below passes its own literal label.
FORCE_FRAUD = None

# Rebinds df_accel to a copy sorted by driver and then by time.
df_accel = df_accel.sort_values(by=["driver_hash", "time"])
# Centre the map on the mean coordinates of the whole dataset.
center = (df_accel["lat"].mean(), df_accel["lon"].mean())
m = folium.Map(location=center, zoom_start=10, legend=True)

# Driver with unlabeled data
driver_accel, driver_hash, driver_fraud = sample_driver(df_accel, -1)
map_driver_points(m, driver_accel, driver_hash, label=driver_fraud)

# Driver without fraud
driver_accel, driver_hash, driver_fraud = sample_driver(df_accel, 0)
map_driver_points(m, driver_accel, driver_hash, label=driver_fraud)

# Driver with fraud
driver_accel, driver_hash, driver_fraud = sample_driver(df_accel, 1)
map_driver_points(m, driver_accel, driver_hash, label=driver_fraud)

# Add the layer switcher, then display the map as the cell output.
folium.map.LayerControl('topleft', collapsed= False).add_to(m)
m