## Препроцессинг данных

__Папка с данными на gdrive__

https://drive.google.com/drive/folders/1FeUwrVfxp09xAifiMngwtfrRZ8nhtj8T?usp=sharing



In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import folium
from matplotlib import pyplot as plt
from matplotlib import colors
from os import listdir
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from snippets import *

%matplotlib inline

In [51]:
# Root directory (relative path) that holds the prepared datasets.
DATA_PATH = "../../data"

# CSV inputs used below: GPS fixes and accelerometer samples.
GPS_PATH = DATA_PATH + "/gps_data.csv"
ACCEL_PATH = DATA_PATH + "/accel_data.csv"

# Show what is available in the data directory.
listdir(DATA_PATH)

['data_good_right_dates',
 'drivers_stats.csv',
 'gps_data.csv',
 'unlabeled',
 'gps_data_raw.csv',
 'good_data',
 'points_stats.csv',
 'accel_data.csv',
 'accel_data_raw.csv',
 'drivers_with_gps_and_sl_problems',
 'gps_stats.csv',
 'rides_of_drivers.csv',
 'accel_stats.csv']

## Датасет GPS

In [52]:
# Load the GPS dataset; the columns at positions 0 and 3 are parsed as
# datetimes (they show up as `time` and `gps_time` in the summary table).
df_gps = pd.read_csv(
    filepath_or_buffer=GPS_PATH,
    parse_dates=[0, 3],
)
describe(df_gps)

Unnamed: 0,dtypes,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],5050292.0,2020-07-31 17:37:31.378774272,2015-03-01 03:00:02,2020-11-20 07:07:20.750000128,2021-03-05 11:16:20,2021-06-03 21:19:25.750000128,2021-08-28 02:47:58,
lat,float64,5050290.0,55.743,42.98,55.6674,55.7496,55.8242,82.4343,0.171211
lon,float64,5050290.0,37.6021,26.1013,37.5009,37.5848,37.7032,135.46,0.401377
gps_time,datetime64[ns],5050292.0,2020-07-31 17:37:28.866647296,2015-03-01 03:00:02,2020-11-20 07:07:20.750000128,2021-03-05 11:16:20,2021-06-03 21:19:25.750000128,2021-08-28 02:47:58,
driver_hash,int64,5050290.0,3.03558e+17,-9.21858e+18,-4.06958e+18,1.02237e+17,4.79299e+18,9.20647e+18,5.27957e+18
fraud,int64,5050290.0,-0.767609,-1,-1,-1,-1,1,0.470076
gps_delta,float64,5050290.0,6.85008,0,2,3,3,3600,84.5366
gps_distance,float64,5050290.0,19.588,0,0,5,25,10000,180.183
gps_speed,float64,5050290.0,5.64977,0,0,2,9.66667,100,8.39405
gps_accel,float64,5050290.0,-0.0740194,-100,-0.111111,0,0.111111,100,6.11186


(5050292, 18)

## Датасет акселерометра

In [53]:
# Load the accelerometer dataset; only the first column (`time` in the
# summary table) is parsed as a datetime.
df_accel = pd.read_csv(
    filepath_or_buffer=ACCEL_PATH,
    parse_dates=[0],
)
describe(df_accel)

Unnamed: 0,dtypes,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],2172989.0,2020-11-22 12:54:59.282576896,2015-02-28 05:46:43,2020-12-18 06:53:34,2021-03-18 16:54:01,2021-06-12 11:02:19,2021-08-29 00:03:16,
x,float64,2172990.0,0.368934,-71.1054,-0.371101,0.184359,0.785298,65.1529,2.36914
y,float64,2172990.0,7.6569,-74.7916,7.63271,8.85376,9.50019,55.2294,3.4034
z,float64,2172990.0,3.83295,-65.5062,2.01402,3.69992,5.35365,76.4756,2.73072
lat,float64,2172990.0,55.7482,55.3163,55.6768,55.7512,55.8214,82.4343,0.16893
lon,float64,2172990.0,37.601,36.3143,37.5016,37.5863,37.686,108.856,0.376061
driver_hash,int64,2172990.0,-1.12657e+17,-9.21858e+18,-4.47912e+18,-4.14168e+17,4.43147e+18,9.20647e+18,5.3536e+18
fraud,int64,2172990.0,-0.796897,-1,-1,-1,-1,1,0.509161
ac_delta,float64,2172990.0,11.2106,0,0,0,0,3600,118.59
ac_distance,float64,2172990.0,185.61,0,11,77,230,10000,399.524


(2172989, 12)

In [54]:
# Value forwarded to sample_driver; the original note lists the options as
# None, -1, 0 or 1.
FORCE_FRAUD = None

# Pick a random driver, then take that driver's accelerometer rows in
# chronological order with a fresh 0..n-1 index.
driver_gps, driver_hash, driver_fraud = sample_driver(df_gps, FORCE_FRAUD)
driver_accel = (
    df_accel.loc[df_accel["driver_hash"] == driver_hash]
    .sort_values(by="time")
    .reset_index(drop=True)
)

# Centre the map on the driver's mean GPS position.
center = (driver_gps["lat"].mean(), driver_gps["lon"].mean())
m = folium.Map(location=center, zoom_start=10, legend=True)

# Route according to the accelerometer data.
map_driver_points(m, driver_accel, driver_hash, label="accel", color="blue")

# Route according to the GPS data, ordered by the GPS timestamp.
driver_gps = driver_gps.sort_values(by="gps_time")
driver_gps = driver_gps.reset_index(drop=True)
map_driver_points(m, driver_gps, driver_hash, label="gps")

# Always-expanded layer switcher in the top-left corner.
layer_control = folium.LayerControl(position="topleft", collapsed=False)
layer_control.add_to(m)
m

In [55]:
# Largest allowed gap between a GPS fix and the accelerometer sample paired
# with it.
MERGE_TOLERANCE = "600s"

# For every GPS row attach the accelerometer row whose `time` is nearest to
# `gps_time` within the tolerance. Rows with exactly equal timestamps are not
# paired (allow_exact_matches=False). Clashing accelerometer column names get
# the "_r" suffix.
df_merged = pd.merge_asof(
    driver_gps,
    driver_accel,
    left_on="gps_time",
    right_on="time",
    suffixes=("", "_r"),
    tolerance=pd.Timedelta(MERGE_TOLERANCE),
    allow_exact_matches=False,
    direction="nearest",
)

# A row is kept only if it has at least as many non-null values as the GPS and
# accelerometer frames have columns in total.
required_non_null = df_gps.shape[1] + df_accel.shape[1]

# New names for the accelerometer-side columns; keys that are absent from the
# frame are simply ignored by rename.
accel_column_names = {
    "time_r": "accel_time", "lat_r": "accel_lat", "lon_r": "accel_lon",
    "time_diff_r": "accel_time_diff", "distance_r": "accel_distance",
    "speed_r": "accel_speed", "route_by_time_r": "route_by_accel",
    "x": "accel_x", "y": "accel_y", "z": "accel_z",
}

df_matched = df_merged.dropna(thresh=required_non_null).copy()
df_matched = df_matched.drop(columns=["driver_hash_r", "fraud_r"])
df_matched = df_matched.rename(columns=accel_column_names)

# Report how many GPS rows found an accelerometer partner.
matched_rows, merged_rows = len(df_matched), len(df_merged)
display(f"{driver_hash} Matched: {matched_rows} of {merged_rows}")
if matched_rows:
    describe(df_matched)

'-4294745953159109974 Matched: 38204 of 40827'

Unnamed: 0,dtypes,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],38204,2015-03-30 04:21:13.728510208,2015-03-29 07:46:16,2015-03-29 16:00:39.249999872,2015-03-30 07:53:31.500000,2015-03-30 17:34:01.750000128,2015-03-31 01:29:45,
lat,float64,38204,55.7178,55.6217,55.6748,55.7114,55.7563,55.8274,0.0493196
lon,float64,38204,37.7646,37.6009,37.7173,37.7567,37.8185,37.9408,0.0727329
gps_time,datetime64[ns],38204,2015-03-30 04:21:04.057847296,2015-03-29 07:46:16,2015-03-29 16:00:39.249999872,2015-03-30 07:53:31.500000,2015-03-30 17:34:01.750000128,2015-03-31 01:29:44,
driver_hash,int64,38204,-4.29475e+18,-4.29475e+18,-4.29475e+18,-4.29475e+18,-4.29475e+18,-4.29475e+18,0.0
fraud,int64,38204,0,0,0,0,0,0,0.0
gps_delta,float64,38204,2.90127,0,2,3,3,2541,16.6057
gps_distance,float64,38204,17.1482,0,0,11,28,3020,30.9247
gps_speed,float64,38204,6.53073,0,0,4.5,11.3333,100,7.29097
gps_accel,float64,38204,-0.0468527,-100,-0.25,0,0.25,100,4.81522


(38204, 28)

In [56]:
# Gaps between consecutive matched GPS timestamps. The first row has no
# predecessor, so its NaT is replaced with the mean gap before the gaps are
# bucketed to 5 minutes and counted.
gps_time_diff = df_matched["gps_time"].diff()
gps_time_diff = gps_time_diff.fillna(gps_time_diff.mean())
display(gps_time_diff.round("5min").value_counts())

# Same summary for the matched accelerometer timestamps. The filler is the
# mean gap, exactly as for GPS; the previous code used the mean of the
# *second* difference (diff of diff), which is not a typical gap.
accel_time_diff = df_matched["accel_time"].diff()
accel_time_diff = accel_time_diff.fillna(accel_time_diff.mean())
display(accel_time_diff.round("5min").value_counts())

0 days 00:00:00    38183
0 days 00:05:00        7
0 days 00:10:00        6
0 days 00:20:00        2
0 days 00:25:00        2
0 days 00:40:00        1
0 days 08:25:00        1
0 days 01:25:00        1
0 days 00:15:00        1
Name: gps_time, dtype: int64

0 days 00:00:00      38140
0 days 00:05:00         33
0 days 00:10:00          8
0 days 00:15:00          7
0 days 00:20:00          3
-1 days +23:55:00        3
0 days 00:30:00          2
0 days 00:35:00          2
0 days 00:55:00          1
0 days 08:40:00          1
0 days 00:40:00          1
0 days 00:25:00          1
0 days 00:45:00          1
0 days 01:40:00          1
Name: accel_time, dtype: int64

In [57]:
# Track id = running sum, in whole seconds, of the GPS gaps rounded to
# 10 minutes. Short gaps round to zero, so the id only changes after a long
# pause between fixes.
# total_seconds() is required here: `.dt.seconds` is only the seconds
# component of a timedelta (0..86399) and wraps around once the accumulated
# gaps exceed one day, which would make unrelated tracks share an id.
# A gap that is still NaT (e.g. a single-row frame, where no mean gap exists)
# counts as zero so the integer cast cannot fail.
df_matched["track"] = (
    gps_time_diff.round("10min")
    .fillna(pd.Timedelta(0))
    .cumsum()
    .dt.total_seconds()
    .astype("int64")
)
number_of_tracks = df_matched["track"].nunique()
display(f"Number of tracks: {number_of_tracks}")

# Keep the identifying columns plus the selected GPS / accelerometer columns.
# NOTE(review): cols_gps and cols_accel are not defined in the visible cells —
# presumably they come from `snippets` or an earlier cell; confirm.
df_matched = df_matched[["driver_hash", "fraud", "track"] + cols_gps[2:] + cols_accel[2:]].copy()
describe(df_matched)

'Number of tracks: 18'

Unnamed: 0,dtypes,count,mean,min,25%,50%,75%,max,std
driver_hash,int64,38204,-4.29475e+18,-4.29475e+18,-4.29475e+18,-4.29475e+18,-4.29475e+18,-4.29475e+18,0.0
fraud,int64,38204,0,0,0,0,0,0,0.0
track,int64,38204,24499.3,0,4800,37200,47400,49200,20680.7
time,datetime64[ns],38204,2015-03-30 04:21:13.728510208,2015-03-29 07:46:16,2015-03-29 16:00:39.249999872,2015-03-30 07:53:31.500000,2015-03-30 17:34:01.750000128,2015-03-31 01:29:45,
lat,float64,38204,55.7178,55.6217,55.6748,55.7114,55.7563,55.8274,0.0493196
lon,float64,38204,37.7646,37.6009,37.7173,37.7567,37.8185,37.9408,0.0727329
gps_time,datetime64[ns],38204,2015-03-30 04:21:04.057847296,2015-03-29 07:46:16,2015-03-29 16:00:39.249999872,2015-03-30 07:53:31.500000,2015-03-30 17:34:01.750000128,2015-03-31 01:29:44,
accel_time,datetime64[ns],38204,2015-03-30 04:20:56.333839616,2015-03-29 07:47:07,2015-03-29 16:00:29,2015-03-30 07:52:58,2015-03-30 17:33:54,2015-03-31 01:20:38,
accel_lat,float64,38204,55.7181,55.6222,55.6762,55.7122,55.7565,55.8271,0.0494421
accel_lon,float64,38204,37.7643,37.6009,37.7189,37.7545,37.8174,37.9406,0.0725094


(38204, 13)

In [58]:
# Show the matched frame (the notebook renders the cell's last expression).
df_matched

Unnamed: 0,driver_hash,fraud,track,time,lat,lon,gps_time,accel_time,accel_lat,accel_lon,accel_x,accel_y,accel_z
9,-4294745953159109974,0,0,2015-03-29 07:46:16,55.647351,37.721282,2015-03-29 07:46:16,2015-03-29 07:47:07,55.647154,37.721708,0.980665,7.768428,5.956458
10,-4294745953159109974,0,0,2015-03-29 07:46:19,55.647500,37.720908,2015-03-29 07:46:18,2015-03-29 07:47:07,55.647154,37.721708,0.980665,7.768428,5.956458
11,-4294745953159109974,0,0,2015-03-29 07:46:19,55.647500,37.720908,2015-03-29 07:46:19,2015-03-29 07:47:07,55.647154,37.721708,0.980665,7.768428,5.956458
12,-4294745953159109974,0,0,2015-03-29 07:46:20,55.647158,37.721549,2015-03-29 07:46:20,2015-03-29 07:47:07,55.647154,37.721708,0.980665,7.768428,5.956458
13,-4294745953159109974,0,0,2015-03-29 07:46:23,55.647155,37.721691,2015-03-29 07:46:23,2015-03-29 07:47:07,55.647154,37.721708,0.980665,7.768428,5.956458
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40633,-4294745953159109974,0,49200,2015-03-31 01:29:20,55.694439,37.712870,2015-03-31 01:29:19,2015-03-31 01:20:38,55.691712,37.663064,-6.692665,-11.508574,-1.331827
40634,-4294745953159109974,0,49200,2015-03-31 01:29:23,55.694192,37.713091,2015-03-31 01:29:22,2015-03-31 01:20:38,55.691712,37.663064,-6.692665,-11.508574,-1.331827
40635,-4294745953159109974,0,49200,2015-03-31 01:29:26,55.693961,37.713417,2015-03-31 01:29:25,2015-03-31 01:20:38,55.691712,37.663064,-6.692665,-11.508574,-1.331827
40636,-4294745953159109974,0,49200,2015-03-31 01:29:42,55.692796,37.715105,2015-03-31 01:29:42,2015-03-31 01:20:38,55.691712,37.663064,-6.692665,-11.508574,-1.331827


In [59]:
# Centre the map on the mean matched GPS position.
center = (df_matched["lat"].mean(), df_matched["lon"].mean())
m = folium.Map(location=center, zoom_start=10, legend=True)

# Colour of the accelerometer line is derived from the driver's fraud value:
# 1 -> red, 0 -> green, -1 -> yellow.
color = driver_accel.head(1)["fraud"].replace({1:"red", 0:"green", -1:"yellow"}).values[0]

# One toggleable layer per track. groupby(sort=False) visits the tracks in
# order of first appearance and splits the frame once, instead of building the
# same boolean mask twice per track.
for track, track_rows in df_matched.groupby("track", sort=False):
    fg = folium.FeatureGroup(name=f"track{track}")
    # GPS coordinates: wide blue line (this line was previously labelled "accel").
    folium.PolyLine(track_rows[["lat", "lon"]], color="blue", weight=8, label="gps", opacity=0.8).add_to(fg)
    # Coordinates from the matched accelerometer rows: narrower line drawn on top.
    folium.PolyLine(track_rows[["accel_lat", "accel_lon"]], color=color, weight=6, label="tracks", opacity=0.8).add_to(fg)
    fg.add_to(m)

folium.map.LayerControl('topleft', collapsed=False).add_to(m)
m

### Вывод — такая разбивка на поездки выглядит вполне правдоподобной.