Папка на GDrive:

https://drive.google.com/drive/folders/1FeUwrVfxp09xAifiMngwtfrRZ8nhtj8T?usp=sharing


In [1]:
from os import listdir

# Switch to True when running in Google Colab with the shared Drive folder.
USE_GDRIVE = False


if USE_GDRIVE:
    from google.colab import drive
    drive.mount("/home/GDrive")
    DATA_PATH = "/home/GDrive/MyDrive/made_2021_fraud_project/data"
else:
    DATA_PATH = "../../data"

# Build the input paths from DATA_PATH so they follow the USE_GDRIVE switch
# instead of always pointing at the local "../../data" directory.
GPS_PATH = f"{DATA_PATH}/gps_data.csv"
ACCEL_PATH = f"{DATA_PATH}/accel_data.csv"

# Show what is available in the data directory.
listdir(DATA_PATH)

['drivers_stats.csv',
 'gps_data.csv',
 'unlabeled',
 'good_data',
 'points_stats.csv',
 'all_gps.csv',
 'all_accel.csv',
 'accel_data.csv',
 'drivers_with_gps_and_sl_problems',
 'gps_stats.csv',
 'rides_of_drivers.csv',
 'accel_stats.csv']

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import folium
from haversine import haversine
from tqdm import tqdm

%matplotlib inline

In [3]:
def describe(df):
    """Display a per-column summary of ``df`` followed by its shape.

    The summary table has one row per column: the column dtype (in a column
    labelled ``0``) next to the transposed ``df.describe()`` statistics.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. Both the summary table and ``df.shape`` are shown via ``display``.
    """
    try:
        # pandas 1.1-1.5: ask for datetime columns to be summarised numerically.
        summary = df.describe(datetime_is_numeric=True)
    except TypeError:
        # The keyword is not accepted by every pandas version (it was removed
        # in pandas 2.0, where datetimes are always summarised numerically).
        summary = df.describe()
    display(pd.concat((df.dtypes, summary.T), axis=1))
    display(df.shape)

In [4]:
# Load the GPS table; columns 0 and 3 are parsed as datetimes
# (they show up as `time` and `gps_time` in the summary below).
df_gps = pd.read_csv(
    GPS_PATH,
    parse_dates=[0, 3],
)
describe(df_gps)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],5050292.0,2021-04-06 10:23:00.788006400,2020-10-25 03:59:58,2021-01-16 06:40:41.750000128,2021-04-07 17:01:10.500000,2021-06-22 15:23:15.499999744,2021-08-28 02:47:58,
lat,float64,5050290.0,55.743,42.98,55.6674,55.7496,55.8242,82.4343,0.171211
lon,float64,5050290.0,37.6021,26.1013,37.5009,37.5848,37.7032,135.46,0.401377
gps_time,datetime64[ns],5050292.0,2021-04-06 10:22:58.275877632,2020-10-25 03:59:56,2021-01-16 06:40:38.749999872,2021-04-07 17:01:10.500000,2021-06-22 15:23:15.499999744,2021-08-28 02:47:58,
driver_hash,int64,5050290.0,3.03558e+17,-9.21858e+18,-4.06958e+18,1.02237e+17,4.79299e+18,9.20647e+18,5.27957e+18
fraud,int64,5050290.0,-0.767609,-1,-1,-1,-1,1,0.470076


(5050292, 6)

In [5]:
# Load the accelerometer table; column 0 is parsed as a datetime
# (it shows up as `time` in the summary below).
df_accel = pd.read_csv(
    ACCEL_PATH,
    parse_dates=[0],
)
describe(df_accel)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],2172989.0,2020-11-22 15:27:12.714645504,2015-03-01 05:46:42.832594,2020-12-18 06:53:34.205120,2021-03-18 16:54:00.549220096,2021-06-12 11:02:18.664983040,2021-08-29 00:03:16.287344,
x,float64,2172990.0,0.368934,-71.1054,-0.371101,0.184359,0.785298,65.1529,2.36914
y,float64,2172990.0,7.6569,-74.7916,7.63271,8.85376,9.50019,55.2294,3.4034
z,float64,2172990.0,3.83295,-65.5062,2.01402,3.69992,5.35365,76.4756,2.73072
lat,float64,2172990.0,55.7482,55.3163,55.6768,55.7512,55.8214,82.4343,0.16893
lon,float64,2172990.0,37.601,36.3143,37.5016,37.5863,37.686,108.856,0.376061
driver_hash,int64,2172990.0,-1.12657e+17,-9.21858e+18,-4.47912e+18,-4.14168e+17,4.43147e+18,9.20647e+18,5.3536e+18
fraud,int64,2172990.0,-0.796897,-1,-1,-1,-1,1,0.509161


(2172989, 8)

In [6]:
# Number of distinct driver hashes in each source.
# dropna=False counts a missing hash as its own value, like len(unique()).
num_gps_drivers = df_gps["driver_hash"].nunique(dropna=False)
num_accel_drivers = df_accel["driver_hash"].nunique(dropna=False)

display(f"Unique gps drivers: {num_gps_drivers}")
display(f"Unique accel drivers: {num_accel_drivers}")

'Unique gps drivers: 317'

'Unique accel drivers: 314'

In [7]:
def map_driver(driver_hash, df_gps=df_gps, df_accel=df_accel):
    """Build a folium map with the GPS and accelerometer tracks of one driver.

    The GPS track is drawn in blue. The accelerometer track is coloured by the
    driver's ``fraud`` label: 1 -> red, 0 -> green, -1 -> yellow (any other
    label -> gray).

    Note: the ``df_gps`` / ``df_accel`` defaults are bound when this cell is
    executed, so re-run the cell after reloading the data frames.

    Args:
        driver_hash: value to look up in the ``driver_hash`` column.
        df_gps: frame with ``driver_hash``, ``lat``, ``lon``, ``fraud`` columns.
        df_accel: frame with ``driver_hash``, ``lat``, ``lon``, ``fraud`` columns.

    Returns:
        folium.Map with one feature group per available source and a layer
        control.

    Raises:
        IndexError: if the driver has no rows in either frame.
    """
    driver_gps = df_gps[df_gps["driver_hash"] == driver_hash]
    driver_accel = df_accel[df_accel["driver_hash"] == driver_hash]

    # Use GPS rows for the label and map centre when they exist; otherwise fall
    # back to accelerometer rows so accel-only drivers can still be drawn.
    reference = driver_gps if driver_gps.shape[0] > 0 else driver_accel
    if reference.shape[0] == 0:
        raise IndexError(f"No GPS or accelerometer rows for driver {driver_hash}")

    fraud_colors = {1: "red", 0: "green", -1: "yellow"}
    color = fraud_colors.get(reference["fraud"].iloc[0], "gray")
    center = (reference["lat"].mean(), reference["lon"].mean())

    m = folium.Map(location=center, zoom_start=10, legend=True)
    if driver_gps.shape[0] > 0:
        fg = folium.FeatureGroup(name="gps").add_to(m)
        folium.PolyLine(driver_gps[["lat", "lon"]], color='blue', weight=8, label="tracks", opacity=0.8).add_to(fg)
    if driver_accel.shape[0] > 0:
        fg = folium.FeatureGroup(name="accel").add_to(m)
        folium.PolyLine(driver_accel[["lat", "lon"]], color=color, weight=6, label="accel", opacity=0.8).add_to(fg)
    folium.map.LayerControl('topleft', collapsed= False).add_to(m)
    return m

In [8]:
# Restrict the random pick to drivers with this fraud label.
# Allowed values: 1, 0, -1, or None for no restriction.
FORCE_FRAUD = None

if FORCE_FRAUD is None:
    driver_pool = df_gps["driver_hash"]
else:
    driver_pool = df_gps.loc[df_gps["fraud"] == FORCE_FRAUD, "driver_hash"]
cur_driver = np.random.choice(driver_pool.unique())

# Example of a driver with a large number of matched points:
# cur_driver = -6646726793274239750

display(f"Current Driver: {cur_driver}")
map_driver(cur_driver)

'Current Driver: -7796157997828470055'

In [9]:
# Per-driver min / max / count of every GPS column, in first-seen driver order.
gps_agg = df_gps.groupby(by=["driver_hash"], sort=False).agg(["min", "max", "count"])
gps_agg.columns = ["_".join(c) for c in gps_agg.columns]

# Select columns by name rather than by position:
#   fraud_min   -> "fraud"      (the smallest label seen for the driver)
#   fraud_count -> "gps_points" (rows with a non-null fraud value)
# fraud_max and every other per-column count are not needed.
gps_agg = gps_agg.rename(columns={"fraud_min": "fraud", "fraud_count": "gps_points"})
df_gps_stats = gps_agg.drop(
    columns=[c for c in gps_agg.columns if c.endswith("_count") or c == "fraud_max"]
)

df_gps_stats.reset_index().to_csv(f"{DATA_PATH}/gps_stats.csv", index=False)

display(df_gps_stats)
display(df_gps_stats.shape)

Unnamed: 0_level_0,time_min,time_max,lat_min,lat_max,lon_min,lon_max,gps_time_min,gps_time_max,fraud,gps_points
driver_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2025312357143911724,2021-04-13 11:11:07,2021-04-14 21:51:11,55.653855,55.790612,37.642853,37.786800,2021-04-13 11:11:07,2021-04-14 21:51:11,1,16948
1293535473153840726,2021-06-23 06:00:57,2021-06-23 15:10:17,55.687417,55.699644,37.607736,37.668832,2021-06-23 06:00:50,2021-06-23 15:09:24,1,17
-8729475955540025841,2021-04-30 11:26:15,2021-05-01 19:41:53,55.803774,55.850894,37.322341,37.382181,2021-04-30 11:26:15,2021-05-01 19:41:53,1,3087
-7745607963253656189,2021-07-30 04:09:24,2021-08-01 02:08:49,55.364459,55.847576,37.509411,37.843447,2021-07-30 04:09:24,2021-08-01 02:08:49,1,15994
-2132251359044308970,2021-07-29 03:03:02,2021-07-31 02:52:34,55.786957,55.829857,37.680754,37.843200,2021-07-29 03:03:01,2021-07-31 02:52:34,1,7749
...,...,...,...,...,...,...,...,...,...,...
4519569983208779248,2020-11-19 06:27:52,2020-11-21 02:48:38,55.662534,55.910985,37.447272,37.866314,2020-11-19 06:27:52,2020-11-21 02:48:38,-1,29755
7799079856941240657,2021-06-01 03:38:03,2021-06-02 11:47:41,54.619275,55.925171,37.283649,39.772490,2021-06-01 03:38:03,2021-06-02 11:47:12,-1,7870
1962059670016140595,2021-01-28 03:05:30,2021-01-30 00:08:06,55.559225,55.894053,37.410392,37.774597,2021-01-28 03:05:30,2021-01-30 00:08:06,-1,37764
-4059047758645465466,2021-06-26 06:14:35,2021-06-27 22:51:56,55.545654,55.713234,37.283642,37.863487,2021-06-26 06:14:34,2021-06-27 22:51:56,-1,1572


(317, 10)

In [10]:
# Per-driver min / max / count of every accelerometer column, in first-seen driver order.
accel_agg = df_accel.groupby(by=["driver_hash"], sort=False).agg(["min", "max", "count"])
accel_agg.columns = ["_".join(c) for c in accel_agg.columns]

# Select columns by name rather than by position:
#   fraud_min   -> "fraud"        (the smallest label seen for the driver)
#   fraud_count -> "accel_points" (rows with a non-null fraud value)
# fraud_max and every other per-column count are not needed.
accel_agg = accel_agg.rename(columns={"fraud_min": "fraud", "fraud_count": "accel_points"})
df_accel_stats = accel_agg.drop(
    columns=[c for c in accel_agg.columns if c.endswith("_count") or c == "fraud_max"]
)

df_accel_stats.reset_index().to_csv(f"{DATA_PATH}/accel_stats.csv", index=False)

display(df_accel_stats)
display(df_accel_stats.shape)

Unnamed: 0_level_0,time_min,time_max,x_min,x_max,y_min,y_max,z_min,z_max,lat_min,lat_max,lon_min,lon_max,fraud,accel_points
driver_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2025312357143911724,2021-04-12 11:18:26.068266,2021-04-15 20:43:34.194544,-26.952726,35.752613,-32.134975,24.559721,-26.265590,23.418884,55.727027,55.789206,37.656848,37.806067,1,9997
1293535473153840726,2021-06-22 11:12:10.079821,2021-06-22 23:18:43.417921,-26.903645,55.207897,-46.914383,27.238832,-17.652449,39.628826,55.635698,55.771440,37.555674,37.804711,1,9675
-8729475955540025841,2021-04-29 10:39:00.680583,2021-05-02 14:38:42.437378,-21.925700,32.680298,-28.881600,18.352398,-38.914400,19.795599,55.827616,55.857243,37.282913,37.422036,1,2521
-7745607963253656189,2021-07-29 20:00:16.923285,2021-07-31 20:34:28.280919,-14.538000,9.807000,-13.024000,11.550000,-9.807000,21.280000,55.499130,55.847576,37.515374,37.924558,1,9471
-2132251359044308970,2021-07-28 09:25:29.166741,2021-07-31 19:18:16.795047,-27.464117,31.546068,-42.290356,14.871704,-13.049031,38.862152,55.786957,55.827484,37.671778,37.829731,1,9997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4519569983208779248,2020-11-18 08:21:45.656056,2020-11-22 00:21:13.094995,-8.674192,16.172832,-15.830461,18.516756,-14.659697,70.832460,55.630669,55.910844,37.441586,37.931177,-1,9615
7799079856941240657,2021-05-31 03:00:23.195708,2021-06-01 10:42:55.955341,-5.364739,2.661892,4.436647,14.299585,-2.485858,6.723224,55.415358,55.972630,37.112612,38.135084,-1,10000
1962059670016140595,2021-01-27 11:13:22.314946,2021-01-30 22:30:35.496097,-22.226929,13.426127,-14.481961,12.061321,-8.429615,11.450852,55.541267,55.894053,37.410483,37.861344,-1,9963
-4059047758645465466,2021-06-26 06:25:44.114194,2021-06-28 07:16:43.971905,-10.997766,7.966706,4.566940,17.484856,0.769736,11.943475,55.598380,55.721652,37.283642,37.740554,-1,2031


(314, 14)

In [11]:
# Attach accelerometer stats to every GPS driver (left join keeps all GPS drivers).
# NOTE(review): `fraud` is part of the join key, so a driver whose GPS and accel
# labels differ would get NaN accel stats - confirm this is intended.
df_stats = df_gps_stats.merge(
    df_accel_stats,
    on=["driver_hash", "fraud"],
    how="left",
    suffixes=("_gps", "_accel"),
)
df_stats

Unnamed: 0_level_0,time_min_gps,time_max_gps,lat_min_gps,lat_max_gps,lon_min_gps,lon_max_gps,gps_time_min,gps_time_max,fraud,gps_points,...,x_max,y_min,y_max,z_min,z_max,lat_min_accel,lat_max_accel,lon_min_accel,lon_max_accel,accel_points
driver_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025312357143911724,2021-04-13 11:11:07,2021-04-14 21:51:11,55.653855,55.790612,37.642853,37.786800,2021-04-13 11:11:07,2021-04-14 21:51:11,1,16948,...,35.752613,-32.134975,24.559721,-26.265590,23.418884,55.727027,55.789206,37.656848,37.806067,9997.0
1293535473153840726,2021-06-23 06:00:57,2021-06-23 15:10:17,55.687417,55.699644,37.607736,37.668832,2021-06-23 06:00:50,2021-06-23 15:09:24,1,17,...,55.207897,-46.914383,27.238832,-17.652449,39.628826,55.635698,55.771440,37.555674,37.804711,9675.0
-8729475955540025841,2021-04-30 11:26:15,2021-05-01 19:41:53,55.803774,55.850894,37.322341,37.382181,2021-04-30 11:26:15,2021-05-01 19:41:53,1,3087,...,32.680298,-28.881600,18.352398,-38.914400,19.795599,55.827616,55.857243,37.282913,37.422036,2521.0
-7745607963253656189,2021-07-30 04:09:24,2021-08-01 02:08:49,55.364459,55.847576,37.509411,37.843447,2021-07-30 04:09:24,2021-08-01 02:08:49,1,15994,...,9.807000,-13.024000,11.550000,-9.807000,21.280000,55.499130,55.847576,37.515374,37.924558,9471.0
-2132251359044308970,2021-07-29 03:03:02,2021-07-31 02:52:34,55.786957,55.829857,37.680754,37.843200,2021-07-29 03:03:01,2021-07-31 02:52:34,1,7749,...,31.546068,-42.290356,14.871704,-13.049031,38.862152,55.786957,55.827484,37.671778,37.829731,9997.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4519569983208779248,2020-11-19 06:27:52,2020-11-21 02:48:38,55.662534,55.910985,37.447272,37.866314,2020-11-19 06:27:52,2020-11-21 02:48:38,-1,29755,...,16.172832,-15.830461,18.516756,-14.659697,70.832460,55.630669,55.910844,37.441586,37.931177,9615.0
7799079856941240657,2021-06-01 03:38:03,2021-06-02 11:47:41,54.619275,55.925171,37.283649,39.772490,2021-06-01 03:38:03,2021-06-02 11:47:12,-1,7870,...,2.661892,4.436647,14.299585,-2.485858,6.723224,55.415358,55.972630,37.112612,38.135084,10000.0
1962059670016140595,2021-01-28 03:05:30,2021-01-30 00:08:06,55.559225,55.894053,37.410392,37.774597,2021-01-28 03:05:30,2021-01-30 00:08:06,-1,37764,...,13.426127,-14.481961,12.061321,-8.429615,11.450852,55.541267,55.894053,37.410483,37.861344,9963.0
-4059047758645465466,2021-06-26 06:14:35,2021-06-27 22:51:56,55.545654,55.713234,37.283642,37.863487,2021-06-26 06:14:34,2021-06-27 22:51:56,-1,1572,...,7.966706,4.566940,17.484856,0.769736,11.943475,55.598380,55.721652,37.283642,37.740554,2031.0


In [12]:
# Bounding-box columns (lat/lon min and max) from each source.
gps_bounds = ["lat_min_gps", "lat_max_gps", "lon_min_gps", "lon_max_gps"]
accel_bounds = ["lat_min_accel", "lat_max_accel", "lon_min_accel", "lon_max_accel"]

coord_stats = df_stats[gps_bounds + accel_bounds]
describe(coord_stats)

Unnamed: 0,0,count,mean,std,min,25%,50%,75%,max
lat_min_gps,float64,317.0,55.52639,0.851628,42.97998,55.516224,55.601025,55.681427,56.733843
lat_max_gps,float64,317.0,55.952785,1.497066,55.44221,55.800431,55.877871,55.938835,82.434317
lon_min_gps,float64,317.0,37.356928,0.670567,26.101324,37.283731,37.38787,37.489145,38.20247
lon_max_gps,float64,317.0,38.645384,8.592781,37.264402,37.667467,37.813341,37.90256,135.459743
lat_min_accel,float64,314.0,55.625413,0.146027,55.316289,55.544644,55.615212,55.709606,56.735588
lat_max_accel,float64,314.0,55.951227,1.504217,55.473642,55.796322,55.887361,55.944633,82.434317
lon_min_accel,float64,314.0,37.409076,0.196796,36.314257,37.284149,37.39246,37.496839,38.20247
lon_max_accel,float64,314.0,38.011905,4.015636,37.158017,37.657798,37.792386,37.900904,108.855677


(317, 8)

In [13]:
# First / last timestamp per driver for each time column.
time_columns = [
    "gps_time_min", "gps_time_max",
    "time_min_gps", "time_max_gps",
    "time_min_accel", "time_max_accel",
]
time_stats = df_stats[time_columns]
describe(time_stats)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max
gps_time_min,datetime64[ns],317,2021-04-05 15:56:02.741325056,2020-10-25 03:59:56.000000,2021-01-15 07:38:04.000000000,2021-04-11 03:12:01.000000000,2021-06-24 03:07:37.000000000,2021-08-26 21:04:44.000000
gps_time_max,datetime64[ns],317,2021-04-07 07:17:24.687697152,2020-10-27 02:12:44.000000,2021-01-17 00:46:29.000000000,2021-04-13 02:46:24.000000000,2021-06-26 01:40:07.000000000,2021-08-28 02:47:58.000000
time_min_gps,datetime64[ns],317,2021-04-05 15:59:06.463722496,2020-10-25 03:59:58.000000,2021-01-15 07:42:24.000000000,2021-04-11 03:12:00.000000000,2021-06-24 03:13:20.000000000,2021-08-26 21:04:45.000000
time_max_gps,datetime64[ns],317,2021-04-07 07:30:00.492113664,2020-10-27 02:12:46.000000,2021-01-17 00:46:29.000000000,2021-04-13 02:46:24.000000000,2021-06-26 02:57:06.000000000,2021-08-28 02:47:58.000000
time_min_accel,datetime64[ns],314,2020-12-31 07:21:14.930175744,2015-03-01 05:46:42.832594,2020-12-26 09:34:33.805658880,2021-03-20 02:56:20.050811904,2021-06-13 00:25:52.988546816,2021-08-27 09:18:28.711471
time_max_accel,datetime64[ns],314,2021-01-02 18:21:24.522915072,2015-03-04 21:23:56.196057,2020-12-27 22:53:46.581299200,2021-03-21 19:36:39.848911104,2021-06-15 13:25:14.277254656,2021-08-29 00:03:16.287344


(317, 6)

In [14]:
# Per-driver extremes of the three accelerometer axes.
axis_columns = [
    "x_min", "x_max",
    "y_min", "y_max",
    "z_min", "z_max",
]
accel_stats = df_stats[axis_columns]
describe(accel_stats)

Unnamed: 0,0,count,mean,std,min,25%,50%,75%,max
x_min,float64,314.0,-9.723516,8.152989,-71.1054,-11.496291,-7.548169,-4.796537,4.150349
x_max,float64,314.0,9.967599,8.442713,-6.336255,4.97548,7.864007,11.787787,65.15289
y_min,float64,314.0,-4.014984,10.353671,-74.791626,-8.267329,-1.435346,3.255216,9.366169
y_max,float64,314.0,14.638243,5.147554,1.053,11.9655,13.842396,16.308902,55.229446
z_min,float64,314.0,-6.741006,7.938663,-65.50625,-8.875272,-5.185785,-1.988436,3.983
z_max,float64,314.0,14.688943,8.991242,-0.991969,10.319615,12.441469,16.416442,76.47559


(317, 6)

In [15]:
# Time windows tried when pairing each GPS point with its nearest accel sample.
matched_tolerance = ["5s", "10s", "20s", "30s", "60s", "120s", "180s", "300s"]

matched_results = {}
for driver_hash in tqdm(df_stats.index):
    # merge_asof requires both sides to be sorted by their join key, so sort
    # explicitly instead of relying on the order of the input files.
    driver_gps = df_gps[df_gps["driver_hash"] == driver_hash].sort_values("gps_time", kind="stable")
    driver_accel = df_accel[df_accel["driver_hash"] == driver_hash].sort_values("time", kind="stable")

    # Drivers without accel data keep NaN for every tolerance.
    matched_counts = [np.nan for _ in matched_tolerance]
    if driver_accel.shape[0] > 0:
        # A GPS row counts as matched when the merged row has more non-null
        # values than the GPS frame has columns, i.e. at least one value came
        # from the accel side.
        min_non_na = driver_gps.shape[1] + 1
        for i, tol in enumerate(matched_tolerance):
            # NOTE(review): allow_exact_matches=False skips accel samples whose
            # timestamp equals gps_time exactly - confirm this is intended.
            df_matched = pd.merge_asof(driver_gps, driver_accel,
                          left_on="gps_time",
                          right_on="time",
                          direction="nearest",
                          tolerance=pd.Timedelta(tol),
                          suffixes=("", "_r"),
                          allow_exact_matches=False)

            # Keep only rows with at least `min_non_na` non-null values.
            df_matched = df_matched.dropna(thresh=min_non_na)

            matched_counts[i] = df_matched.shape[0]

    matched_results[driver_hash] = matched_counts

columns = ["tol_" + c for c in matched_tolerance]
matched_stats = pd.DataFrame.from_dict(matched_results, orient="index", columns=columns)

# Share of each driver's GPS points that found an accel sample, per tolerance.
describe(matched_stats.divide(df_stats["gps_points"], axis=0))
display(matched_stats)

100%|██████████| 317/317 [00:13<00:00, 24.29it/s]


Unnamed: 0,0,count,mean,std,min,25%,50%,75%,max
tol_5s,float64,314.0,0.07461,0.044423,0.0,0.036418,0.084757,0.113181,0.162423
tol_10s,float64,314.0,0.148177,0.088459,0.0,0.073786,0.167872,0.224974,0.317903
tol_20s,float64,314.0,0.295358,0.17644,0.0,0.145601,0.335828,0.448743,0.641941
tol_30s,float64,314.0,0.440069,0.262993,0.0,0.216606,0.49918,0.669111,0.952036
tol_60s,float64,314.0,0.465923,0.273088,0.0,0.237303,0.52825,0.695356,0.965421
tol_120s,float64,314.0,0.492321,0.282532,0.0,0.251752,0.56045,0.735072,0.978806
tol_180s,float64,314.0,0.513453,0.289346,0.0,0.26594,0.587747,0.760514,0.991634
tol_300s,float64,314.0,0.546523,0.299204,0.0,0.292744,0.634658,0.805734,0.992192


(317, 8)

Unnamed: 0,tol_5s,tol_10s,tol_20s,tol_30s,tol_60s,tol_120s,tol_180s,tol_300s
2025312357143911724,1789.0,3572.0,7139.0,10659.0,11186.0,11525.0,11802.0,12126.0
1293535473153840726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-8729475955540025841,161.0,308.0,599.0,891.0,979.0,1085.0,1187.0,1373.0
-7745607963253656189,1415.0,2721.0,5444.0,8099.0,8356.0,8667.0,8930.0,9417.0
-2132251359044308970,691.0,1281.0,2565.0,3836.0,4215.0,4343.0,4410.0,4581.0
...,...,...,...,...,...,...,...,...
4519569983208779248,2474.0,4834.0,9755.0,14405.0,15039.0,15922.0,16651.0,17742.0
7799079856941240657,687.0,1344.0,2731.0,4080.0,4151.0,4260.0,4372.0,4569.0
1962059670016140595,4875.0,9727.0,19416.0,29059.0,30324.0,31491.0,32349.0,33590.0
-4059047758645465466,195.0,383.0,760.0,1131.0,1202.0,1230.0,1252.0,1302.0


In [16]:
# Point counts per driver next to the matched-point counts for every tolerance.
points_stats = (
    df_stats[["fraud", "gps_points", "accel_points"]]
    .copy()
    .merge(matched_stats, left_index=True, right_index=True)
)

# Save points_stats itself (not df_accel_stats) under its own file name.
# The merged index has no name (see the table below), so label it explicitly.
points_stats.to_csv(f"{DATA_PATH}/points_stats.csv", index_label="driver_hash")

points_stats

Unnamed: 0,fraud,gps_points,accel_points,tol_5s,tol_10s,tol_20s,tol_30s,tol_60s,tol_120s,tol_180s,tol_300s
2025312357143911724,1,16948,9997.0,1789.0,3572.0,7139.0,10659.0,11186.0,11525.0,11802.0,12126.0
1293535473153840726,1,17,9675.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-8729475955540025841,1,3087,2521.0,161.0,308.0,599.0,891.0,979.0,1085.0,1187.0,1373.0
-7745607963253656189,1,15994,9471.0,1415.0,2721.0,5444.0,8099.0,8356.0,8667.0,8930.0,9417.0
-2132251359044308970,1,7749,9997.0,691.0,1281.0,2565.0,3836.0,4215.0,4343.0,4410.0,4581.0
...,...,...,...,...,...,...,...,...,...,...,...
4519569983208779248,-1,29755,9615.0,2474.0,4834.0,9755.0,14405.0,15039.0,15922.0,16651.0,17742.0
7799079856941240657,-1,7870,10000.0,687.0,1344.0,2731.0,4080.0,4151.0,4260.0,4372.0,4569.0
1962059670016140595,-1,37764,9963.0,4875.0,9727.0,19416.0,29059.0,30324.0,31491.0,32349.0,33590.0
-4059047758645465466,-1,1572,2031.0,195.0,383.0,760.0,1131.0,1202.0,1230.0,1252.0,1302.0


In [17]:
# Put all per-driver tables side by side (aligned on the driver index) and save them.
stat_tables = [points_stats, time_stats, coord_stats, accel_stats]
drivers_stats = pd.concat(stat_tables, axis=1).copy()
drivers_stats.to_csv(f"{DATA_PATH}/drivers_stats.csv")
drivers_stats

Unnamed: 0,fraud,gps_points,accel_points,tol_5s,tol_10s,tol_20s,tol_30s,tol_60s,tol_120s,tol_180s,...,lat_min_accel,lat_max_accel,lon_min_accel,lon_max_accel,x_min,x_max,y_min,y_max,z_min,z_max
2025312357143911724,1,16948,9997.0,1789.0,3572.0,7139.0,10659.0,11186.0,11525.0,11802.0,...,55.727027,55.789206,37.656848,37.806067,-26.952726,35.752613,-32.134975,24.559721,-26.265590,23.418884
1293535473153840726,1,17,9675.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,55.635698,55.771440,37.555674,37.804711,-26.903645,55.207897,-46.914383,27.238832,-17.652449,39.628826
-8729475955540025841,1,3087,2521.0,161.0,308.0,599.0,891.0,979.0,1085.0,1187.0,...,55.827616,55.857243,37.282913,37.422036,-21.925700,32.680298,-28.881600,18.352398,-38.914400,19.795599
-7745607963253656189,1,15994,9471.0,1415.0,2721.0,5444.0,8099.0,8356.0,8667.0,8930.0,...,55.499130,55.847576,37.515374,37.924558,-14.538000,9.807000,-13.024000,11.550000,-9.807000,21.280000
-2132251359044308970,1,7749,9997.0,691.0,1281.0,2565.0,3836.0,4215.0,4343.0,4410.0,...,55.786957,55.827484,37.671778,37.829731,-27.464117,31.546068,-42.290356,14.871704,-13.049031,38.862152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4519569983208779248,-1,29755,9615.0,2474.0,4834.0,9755.0,14405.0,15039.0,15922.0,16651.0,...,55.630669,55.910844,37.441586,37.931177,-8.674192,16.172832,-15.830461,18.516756,-14.659697,70.832460
7799079856941240657,-1,7870,10000.0,687.0,1344.0,2731.0,4080.0,4151.0,4260.0,4372.0,...,55.415358,55.972630,37.112612,38.135084,-5.364739,2.661892,4.436647,14.299585,-2.485858,6.723224
1962059670016140595,-1,37764,9963.0,4875.0,9727.0,19416.0,29059.0,30324.0,31491.0,32349.0,...,55.541267,55.894053,37.410483,37.861344,-22.226929,13.426127,-14.481961,12.061321,-8.429615,11.450852
-4059047758645465466,-1,1572,2031.0,195.0,383.0,760.0,1131.0,1202.0,1230.0,1252.0,...,55.598380,55.721652,37.283642,37.740554,-10.997766,7.966706,4.566940,17.484856,0.769736,11.943475
