Папка на GDrive:

https://drive.google.com/drive/folders/1FeUwrVfxp09xAifiMngwtfrRZ8nhtj8T?usp=sharing


In [1]:
from os import listdir

USE_GDRIVE = False


if USE_GDRIVE:
    from google.colab import drive
    drive.mount("/home/GDrive")
    DATA_PATH = f"/home/GDrive/MyDrive/made_2021_fraud_project/data"
else:
    DATA_PATH = "../../data"

listdir(DATA_PATH)

['unlabeled',
 'good_data',
 'all_gps.csv',
 'all_accel.csv',
 'drivers_with_gps_and_sl_problems',
 'rides_of_drivers.csv']

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import folium
from haversine import haversine

%matplotlib inline

In [3]:
def describe(df):
    display(pd.concat((df.dtypes, df.describe(datetime_is_numeric=True).T), axis=1))
    display(df.shape)

In [4]:
# [1, 0, None]
FORCE_FRAUD = 0

if FORCE_FRAUD == 1:
    driver_ids = listdir(f"{DATA_PATH}/drivers_with_gps_and_sl_problems")
    CUR_DRIVER = np.random.choice(driver_ids)
    CUR_PATH = f"{DATA_PATH}/drivers_with_gps_and_sl_problems/{CUR_DRIVER}"
    CUR_COLOR = "red"
elif FORCE_FRAUD == 0:
    driver_ids = listdir(f"{DATA_PATH}/good_data")
    CUR_DRIVER = np.random.choice(driver_ids)
    CUR_PATH = f"{DATA_PATH}/good_data/{CUR_DRIVER}"
    CUR_COLOR = "green"
else:
    driver_ids = listdir(f"{DATA_PATH}/unlabeled")
    CUR_DRIVER = np.random.choice(driver_ids)
    CUR_PATH = f"{DATA_PATH}/unlabeled/{CUR_DRIVER}"
    CUR_COLOR = "yellow"

display(CUR_DRIVER, CUR_PATH, CUR_COLOR)

'7934040023444826650'

'../../data/good_data/7934040023444826650'

'green'

In [5]:
df_gps = pd.read_csv(f"{CUR_PATH}/track.csv", parse_dates=[0, 3])
describe(df_gps)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],46228,2021-04-26 01:56:20.086073344,2021-04-25 03:07:28,2021-04-25 14:09:24.250000128,2021-04-25 22:28:16,2021-04-26 14:14:20.500000,2021-04-27 04:39:33,
lat,float64,46228,55.7166,55.5486,55.6482,55.6956,55.7778,56.024,0.103536
lon,float64,46228,37.6082,37.4001,37.5447,37.6087,37.6656,37.8544,0.0872182
gps_time,datetime64[ns],46228,2021-04-26 01:56:19.444038400,2021-04-25 03:07:23,2021-04-25 14:09:24.250000128,2021-04-25 22:28:16,2021-04-26 14:14:20.500000,2021-04-27 02:49:03,


(46228, 4)

In [6]:
df_accel = pd.read_csv(f"{CUR_PATH}/accelerometer.csv", parse_dates=[0]).sort_values(by="time")
df_accel["time"] = df_accel["time"].dt.tz_convert(None) + pd.DateOffset(hours=3)
describe(df_accel)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],9986,2017-12-18 23:09:27.877084416,2017-12-17 06:53:23.138773,2017-12-18 06:42:53.805615616,2017-12-18 20:43:45.596940544,2017-12-19 17:24:25.226436352,2017-12-20 12:27:59.729558,
x,float64,9986,0.356508,-8.82503,-0.110133,0.483629,0.876278,8.24563,0.948899
y,float64,9986,8.57605,4.33829,8.03973,8.62392,9.08719,17.8847,0.804434
z,float64,9986,4.29169,-12.9862,3.36625,4.34787,5.31034,13.556,1.57485
lat,float64,9986,55.7334,55.5187,55.6534,55.7144,55.7965,56.0238,0.109012
lon,float64,9986,37.595,37.1457,37.5358,37.6033,37.6665,37.9195,0.123373


(9986, 6)

### Первый вариант

Сплитить по фиксированным временным интервалам: pandas.Series.dt.floor

In [7]:
def split_by_time_floor(df: pd.DataFrame, datetime_column: str, group_idx_column: str, freq: str):
    res = df.copy()
    res[group_idx_column] = res[datetime_column].dt.floor(freq=freq)
    res[group_idx_column] = res[group_idx_column] \
        .map({d: i for i, d in enumerate(res[group_idx_column].unique())})
    return res

In [8]:
def plot_time_intervals(m, df, column, label, color, **kwargs):
    lgd_txt = '<span style="color: {col};">{txt}</span>'
    fg = folium.FeatureGroup(name=lgd_txt.format(txt=label, col=color)).add_to(m)
    for idx in df[column].unique():
        track = df[df["idx"] == idx]
        folium.PolyLine(track[["lat", "lon"]], color=color, **kwargs).add_to(fg)

In [9]:
SPLIT_FREQ = "5min"

center = (df_gps["lat"].mean(), df_gps["lon"].mean())
m = folium.Map(location=center, zoom_start=10)

gps_splits = split_by_time_floor(df_gps, "time", "idx", freq=SPLIT_FREQ)
plot_time_intervals(m, gps_splits, "idx", color="blue", weight=10, label="gps", opacity=0.8)

accel_splits = split_by_time_floor(df_accel, "time", "idx", freq=SPLIT_FREQ)
plot_time_intervals(m, accel_splits, "idx", color=CUR_COLOR, weight=5, label="accel", opacity=0.8)

number_of_splits_gps = len(gps_splits.idx.unique())
display(f"Number of splits gps: {number_of_splits_gps}")
number_of_splits_accel = len(accel_splits.idx.unique())
display(f"Number of splits accel: {number_of_splits_accel}")

folium.map.LayerControl('topright', collapsed= False).add_to(m)

m

'Number of splits gps: 428'

'Number of splits accel: 533'

In [11]:
SPLIT_FREQ = "30min"

gps_splits = split_by_time_floor(df_gps, "time", "idx", freq=SPLIT_FREQ)
accel_splits = split_by_time_floor(df_accel, "time", "idx", freq=SPLIT_FREQ)

number_of_splits_gps = len(gps_splits.idx.unique())
display(f"Number of splits gps: {number_of_splits_gps}")
number_of_splits_accel = len(accel_splits.idx.unique())
display(f"Number of splits accel: {number_of_splits_accel}")

display(gps_splits["idx"].unique())
display(accel_splits["idx"].unique())

'Number of splits gps: 85'

'Number of splits accel: 100'

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

### Второй вариант

Сплитить по разности между временными интервалами: pandas.Series.dt.diff + pandas.Series.dt.round

In [12]:
def split_by_time_diff(df: pd.DataFrame, datetime_column: str, group_idx_column: str, freq=None):
    res = df.copy()
    diff_column = f"{datetime_column}_diff"
    res[diff_column] = res[datetime_column].diff()
    res[diff_column].fillna(res[diff_column].max(), inplace=True)
    res[group_idx_column] = res[diff_column].round(freq).cumsum().dt.seconds
    return res

In [13]:
SPLIT_FREQ = "5min"

center = (df_gps["lat"].mean(), df_gps["lon"].mean())
m = folium.Map(location=center, zoom_start=10)

gps_splits = split_by_time_diff(df_gps, "gps_time", "idx", freq=SPLIT_FREQ)
plot_time_intervals(m, gps_splits, "idx", color="blue", weight=10, label="gps", opacity=0.8)

accel_splits = split_by_time_diff(df_accel, "time", "idx", freq=SPLIT_FREQ)
plot_time_intervals(m, accel_splits, "idx", color=CUR_COLOR, weight=5, label="accel", opacity=0.8)

number_of_splits_gps = len(gps_splits.idx.unique())
display(f"Number of splits gps: {number_of_splits_gps}")
number_of_splits_accel = len(accel_splits.idx.unique())
display(f"Number of splits accel: {number_of_splits_accel}")

folium.map.LayerControl('topright', collapsed= False).add_to(m)

m

'Number of splits gps: 25'

'Number of splits accel: 47'

In [14]:
SPLIT_FREQ = "30min"

gps_splits = split_by_time_diff(df_gps, "time", "idx", freq=SPLIT_FREQ)
accel_splits = split_by_time_diff(df_accel, "time", "idx", freq=SPLIT_FREQ)

display(gps_splits["idx"].value_counts())
display(accel_splits["idx"].value_counts())

55800    21293
23400    17711
12600     5345
54000     1739
10800       84
14400       33
61200        3
28800        2
34200        2
70200        2
27000        2
16200        1
25200        1
68400        1
18000        1
41400        1
73800        1
21600        1
52200        1
59400        1
30600        1
57600        1
81000        1
Name: idx, dtype: int64

45000    1759
73800    1557
46800    1537
43200    1264
9000     1219
12600     801
0         790
10800     582
48600     263
41400     214
Name: idx, dtype: int64