In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import folium
from haversine import haversine
from os import listdir


%matplotlib inline

In [2]:
DATA_PATH = "../../data"
FRAUD_PATH = "../../data/drivers_with_gps_and_sl_problems"
PROPER_PATH = "../../data/good_data"
UNKNOWN_PATH = "../../data/unlabeled"

listdir(DATA_PATH)

['unlabeled',
 'good_data',
 'all_accel.csv',
 'all_tracks.csv',
 'drivers_with_gps_and_sl_problems',
 'rides_of_drivers.csv']

In [3]:
def describe(df):
    display(pd.concat((df.dtypes, df.describe(datetime_is_numeric=True).T), axis=1))

In [4]:
def load_gps_data_from_folders(path):
    is_digit = lambda s: (s.startswith('-') and s[1:].isdigit()) or s.isdigit()
    idx = np.array([s for s in listdir(path) if is_digit(s)]).astype(np.int64)
    
    result = pd.read_csv(f"{path}/{idx[0]}/track.csv", parse_dates=[0, 3])
    result = result.sort_values(by="gps_time")
    result["driver_hash"] = idx[0]

    for x in idx[1:]:
        df = pd.read_csv(f"{path}/{x}/track.csv", parse_dates=[0, 3])
        df = df.sort_values(by="gps_time")
        df["driver_hash"] = x
        result = pd.concat((result, df), axis=0)
    return result

In [5]:
df_gps_fraud = load_gps_data_from_folders(FRAUD_PATH)
df_gps_fraud["fraud"] = 1
describe(df_gps_fraud)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],107535,2021-05-10 07:18:06.560756736,2020-12-22 06:35:55,2021-02-11 18:05:14,2021-04-14 19:41:17,2021-07-31 01:47:13,2021-08-20 12:43:05,
lat,float64,107535,55.7452,42.98,55.7361,55.7564,55.7752,56.3264,0.143642
lon,float64,107535,37.6533,37.2836,37.5676,37.6698,37.7251,132.415,0.87641
gps_time,datetime64[ns],107535,2021-05-10 07:18:02.733910016,2020-12-22 06:26:02,2021-02-11 18:05:13.500000,2021-04-14 19:41:17,2021-07-31 01:47:13,2021-08-20 12:26:47,
driver_hash,int64,107535,-2.73693e+18,-8.91541e+18,-6.7159e+18,-3.96276e+18,2.02531e+18,8.56974e+18,4.11427e+18
fraud,int64,107535,1,1,1,1,1,1,0.0


In [6]:
df_gps_proper = load_gps_data_from_folders(PROPER_PATH)
df_gps_proper["fraud"] = 0
describe(df_gps_proper)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],958574,2021-04-11 22:21:43.552323072,2020-11-30 03:00:02,2021-01-16 21:05:52.500000,2021-04-07 22:13:06.500000,2021-06-22 22:16:56.250000128,2021-08-25 22:47:15,
lat,float64,958574,55.734,55.3926,55.6604,55.7435,55.8066,56.024,0.10941
lon,float64,958574,37.6157,36.8129,37.4999,37.613,37.7306,38.1199,0.164338
gps_time,datetime64[ns],958574,2021-04-11 22:21:38.431624192,2020-11-30 03:00:02,2021-01-16 21:05:52.500000,2021-04-07 22:13:06.500000,2021-06-22 22:16:56.250000128,2021-08-25 22:47:15,
driver_hash,int64,958574,8.88151e+17,-8.85545e+18,-2.32611e+18,2.5462e+17,4.80524e+18,8.89355e+18,4.7696e+18
fraud,int64,958574,0,0,0,0,0,0,0.0


In [7]:
df_gps_unknown = load_gps_data_from_folders(UNKNOWN_PATH)
df_gps_unknown["fraud"] = np.nan
describe(df_gps_unknown)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],3984183.0,2021-04-04 04:41:20.900185600,2020-10-25 03:59:58,2021-01-14 07:38:30.500000,2021-04-06 17:14:19,2021-06-17 06:29:28,2021-08-28 02:47:58,
lat,float64,3984180.0,55.7452,48.4742,55.6677,55.7506,55.8301,82.4343,0.183564
lon,float64,3984180.0,37.5975,26.1013,37.4997,37.58,37.6899,135.46,0.420532
gps_time,datetime64[ns],3984183.0,2021-04-04 04:41:19.051149824,2020-10-25 03:59:56,2021-01-14 07:38:30.500000,2021-04-06 17:14:19,2021-06-17 06:29:28,2021-08-28 02:47:58,
driver_hash,int64,3984180.0,2.44972e+17,-9.21858e+18,-4.29668e+18,-3.2897e+15,4.51957e+18,9.20647e+18,5.3914e+18
fraud,float64,0.0,,,,,,,


In [8]:
df_gps = pd.concat((df_gps_fraud, df_gps_proper, df_gps_unknown), axis=0)
describe(df_gps)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],5050292.0,2021-04-06 10:23:00.788006400,2020-10-25 03:59:58,2021-01-16 06:40:41.750000128,2021-04-07 17:01:10.500000,2021-06-22 15:23:15.499999744,2021-08-28 02:47:58,
lat,float64,5050290.0,55.743,42.98,55.6674,55.7496,55.8242,82.4343,0.171211
lon,float64,5050290.0,37.6021,26.1013,37.5009,37.5848,37.7032,135.46,0.401377
gps_time,datetime64[ns],5050292.0,2021-04-06 10:22:58.275877632,2020-10-25 03:59:56,2021-01-16 06:40:38.749999872,2021-04-07 17:01:10.500000,2021-06-22 15:23:15.499999744,2021-08-28 02:47:58,
driver_hash,int64,5050290.0,3.03558e+17,-9.21858e+18,-4.06958e+18,1.02237e+17,4.79299e+18,9.20647e+18,5.27957e+18
fraud,float64,1066110.0,0.100867,0,0,0,0,1,0.301152


In [9]:
df_gps

Unnamed: 0,time,lat,lon,gps_time,driver_hash,fraud
0,2021-04-13 11:11:07,55.765980,37.707788,2021-04-13 11:11:07,2025312357143911724,1.0
1,2021-04-13 11:11:08,55.765980,37.707788,2021-04-13 11:11:07,2025312357143911724,1.0
2,2021-04-13 11:11:08,55.754542,37.697707,2021-04-13 11:11:07,2025312357143911724,1.0
3,2021-04-13 11:11:08,55.754542,37.697707,2021-04-13 11:11:08,2025312357143911724,1.0
4,2021-04-13 11:11:10,55.754542,37.697707,2021-04-13 11:11:09,2025312357143911724,1.0
...,...,...,...,...,...,...
11157,2021-02-05 18:32:06,55.935577,37.517716,2021-02-05 18:32:06,-2035413153418187612,
11158,2021-02-05 18:32:06,55.935580,37.517710,2021-02-05 18:32:06,-2035413153418187612,
11159,2021-02-05 18:32:09,55.935572,37.517723,2021-02-05 18:32:09,-2035413153418187612,
11160,2021-02-05 18:32:10,55.935572,37.517724,2021-02-05 18:32:10,-2035413153418187612,


In [10]:
df_gps.to_csv(f"{DATA_PATH}/all_gps.csv", index=False)
'Done'

'Done'

In [11]:
def load_accel_data_from_folders(path):
    is_digit = lambda s: (s.startswith('-') and s[1:].isdigit()) or s.isdigit()
    idx = np.array([s for s in listdir(path) if is_digit(s)]).astype(np.int64)
    
    result = pd.read_csv(f"{path}/{idx[0]}/accelerometer.csv", parse_dates=[0])
    if result.shape[0] > 0:
        result["time"] = result["time"].dt.tz_convert(None) + pd.DateOffset(hours=3)
        result = result.sort_values(by="time")
    result["driver_hash"] = idx[0]

    for x in idx[1:]:
        df = pd.read_csv(f"{path}/{x}/accelerometer.csv", parse_dates=[0])
        if df.shape[0] > 0:
            df["time"] = df["time"].dt.tz_convert(None) + pd.DateOffset(hours=3)
            df = df.sort_values(by="time")
        df["driver_hash"] = x
        result = pd.concat((result, df), axis=0)
    return result

In [12]:
df_accel_fraud = load_accel_data_from_folders(FRAUD_PATH)
df_accel_fraud["fraud"] = 1
describe(df_accel_fraud)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],105816,2021-05-26 12:35:30.161651200,2020-12-21 09:25:48.474028,2021-03-18 21:17:44.451609856,2021-06-22 17:28:10.526172672,2021-07-29 12:03:39.636103680,2021-08-22 14:45:29.417931,
x,float64,105816,0.176285,-71.1054,-0.872088,0,1.053,65.1529,3.89279
y,float64,105816,3.94255,-74.7916,0.062192,5.04781,8.66703,33.5116,5.63456
z,float64,105816,5.23198,-55.1792,2.22182,5.51863,8.772,76.4756,4.14453
lat,float64,105816,55.7553,55.4259,55.733,55.7566,55.793,55.8995,0.0640573
lon,float64,105816,37.6374,37.2782,37.5725,37.6546,37.7327,37.9712,0.127567
driver_hash,int64,105816,-2.79601e+18,-8.91541e+18,-6.7159e+18,-3.96276e+18,1.29354e+18,8.56974e+18,4.1981e+18
fraud,int64,105816,1,1,1,1,1,1,0.0


In [13]:
df_accel_proper = load_accel_data_from_folders(PROPER_PATH)
df_accel_proper["fraud"] = 0
describe(df_accel_proper)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],229709,2017-09-28 15:26:15.470369792,2015-03-01 05:46:42.832594,2016-01-19 15:17:03.967033088,2017-09-12 08:23:17.829850880,2019-05-19 23:20:39.848606976,2020-04-15 18:07:27.919088,
x,float64,229709,-0.0839609,-19.8078,-0.478541,0.0957681,0.735042,57.0202,1.98267
y,float64,229709,8.11527,-37.0974,7.95354,8.97347,9.51246,23.3913,2.64659
z,float64,229709,3.78649,-65.5062,2.22084,3.40934,5.18501,28.9555,2.46283
lat,float64,229709,55.7377,55.3163,55.6746,55.7453,55.7977,56.178,0.100379
lon,float64,229709,37.6042,36.813,37.4961,37.5936,37.7091,38.4464,0.157964
driver_hash,int64,229709,9.55708e+17,-8.85545e+18,-3.75461e+18,2.69343e+17,4.9574e+18,8.89355e+18,4.96158e+18
fraud,int64,229709,0,0,0,0,0,0,0.0


In [14]:
df_accel_unknown = load_accel_data_from_folders(UNKNOWN_PATH)
df_accel_unknown["fraud"] = np.nan
describe(df_accel_unknown)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],1837464.0,2021-04-04 21:19:12.671973888,2020-10-24 08:45:27.981150,2021-01-13 08:30:16.014356224,2021-04-13 19:06:15.511302656,2021-06-15 15:39:44.727341568,2021-08-29 00:03:16.287344,
x,float64,1837460.0,0.436646,-48.2001,-0.345962,0.201113,0.783502,36.4469,2.28938
y,float64,1837460.0,7.8135,-49.1889,7.776,8.885,9.51231,55.2294,3.18575
z,float64,1837460.0,3.75819,-31.7222,1.96325,3.69126,5.24825,72.5276,2.63675
lat,float64,1837460.0,55.7491,55.3592,55.673,55.7511,55.8301,82.4343,0.179542
lon,float64,1837460.0,37.5986,36.3143,37.4993,37.5836,37.6791,108.856,0.403864
driver_hash,int64,1837460.0,-9.16892e+16,-9.21858e+18,-4.87201e+18,-4.43966e+17,4.42743e+18,9.20647e+18,5.40781e+18
fraud,float64,0.0,,,,,,,


In [15]:
df_accel = pd.concat((df_accel_fraud, df_accel_proper, df_accel_unknown), axis=0)
describe(df_accel)

Unnamed: 0,0,count,mean,min,25%,50%,75%,max,std
time,datetime64[ns],2172989.0,2020-11-22 15:27:12.714645504,2015-03-01 05:46:42.832594,2020-12-18 06:53:34.205120,2021-03-18 16:54:00.549220096,2021-06-12 11:02:18.664983040,2021-08-29 00:03:16.287344,
x,float64,2172990.0,0.368934,-71.1054,-0.371101,0.184359,0.785298,65.1529,2.36914
y,float64,2172990.0,7.6569,-74.7916,7.63271,8.85376,9.50019,55.2294,3.4034
z,float64,2172990.0,3.83295,-65.5062,2.01402,3.69992,5.35365,76.4756,2.73072
lat,float64,2172990.0,55.7482,55.3163,55.6768,55.7512,55.8214,82.4343,0.16893
lon,float64,2172990.0,37.601,36.3143,37.5016,37.5863,37.686,108.856,0.376061
driver_hash,int64,2172990.0,-1.12657e+17,-9.21858e+18,-4.47912e+18,-4.14168e+17,4.43147e+18,9.20647e+18,5.3536e+18
fraud,float64,335525.0,0.315374,0,0,0,1,1,0.464666


In [16]:
df_accel

Unnamed: 0,time,x,y,z,lat,lon,driver_hash,fraud
58,2021-04-12 11:18:26.068266,0.931344,9.692925,2.159570,55.756474,37.695483,2025312357143911724,1.0
4250,2021-04-12 11:18:26.070640,0.933739,9.665392,2.076970,55.756474,37.695483,2025312357143911724,1.0
59,2021-04-12 11:18:26.072441,0.946907,9.671378,2.108095,55.756454,37.695475,2025312357143911724,1.0
4254,2021-04-12 11:18:26.073274,0.938527,9.676167,2.099715,55.756454,37.695470,2025312357143911724,1.0
60,2021-04-12 11:18:26.074903,0.942118,9.677363,2.090138,55.756454,37.695470,2025312357143911724,1.0
...,...,...,...,...,...,...,...,...
1977,2021-02-06 18:07:59.180188,-1.386352,8.256638,4.687729,55.817333,37.566908,-2035413153418187612,
6087,2021-02-06 18:07:59.182144,-1.230728,8.393112,4.254379,55.817330,37.566903,-2035413153418187612,
6086,2021-02-06 18:07:59.182711,-1.222351,8.415863,4.156219,55.817332,37.566905,-2035413153418187612,
8067,2021-02-06 18:07:59.183246,-1.944199,8.464935,4.336975,55.817331,37.566905,-2035413153418187612,


In [17]:
df_accel.to_csv(f"{DATA_PATH}/all_accel.csv", index=False)
'Done'

'Done'

In [18]:
df_gps["gps_time"].diff().describe()

count                        5050291
mean     -1 days +23:59:58.859008917
std        1 days 00:10:34.182964582
min              -290 days +06:02:33
25%                  0 days 00:00:02
50%                  0 days 00:00:03
75%                  0 days 00:00:03
max                291 days 23:30:50
Name: gps_time, dtype: object

In [19]:
df_accel["time"].diff().describe()

count                           2172988
mean        -1 days +23:59:57.426848706
std           3 days 12:36:57.211233964
min      -2148 days +12:28:45.004232992
25%              0 days 00:00:00.000547
50%              0 days 00:00:00.000772
75%              0 days 00:00:00.001918
max           1466 days 06:21:30.886280
Name: time, dtype: object

In [22]:
def map_driver(driver_gps, driver_accel):
    center = (driver_gps["lat"].mean(), driver_gps["lon"].mean())
    color = driver_accel.head(1)["fraud"].replace({1:"red", 0:"green", np.nan:"yellow"}).values[0]
    m = folium.Map(location=center, zoom_start=10, legend=True)
    folium.PolyLine(driver_gps[["lat", "lon"]], color='blue', weight=4, label="tracks", opacity=0.8).add_to(m)
    folium.PolyLine(driver_accel[["lat", "lon"]], color=color, weight=2, label="accel", opacity=0.8).add_to(m)
    return m

In [23]:
FORCE_FRAUD = None

if not FORCE_FRAUD is None:
    cur_driver = np.random.choice(df_gps.loc[df_gps["fraud"] == FORCE_FRAUD, "driver_hash"].unique())
else:
    cur_driver = np.random.choice(df_gps["driver_hash"].unique())

driver_gps = df_gps[df_gps["driver_hash"] == cur_driver]
driver_accel = df_accel[df_accel["driver_hash"] == cur_driver]

display(
    f"Current Driver:{cur_driver}",
    driver_gps.describe(datetime_is_numeric=True).T,
    driver_accel.describe(datetime_is_numeric=True).T
)

map_driver(driver_gps, driver_accel)

'Current Driver:-6916466105927014875'

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
time,11105,2021-08-20 07:00:29.858622208,2021-08-19 08:45:26,2021-08-19 21:28:40,2021-08-20 09:29:39,2021-08-20 16:51:52,2021-08-21 01:15:30,
lat,11105,55.8395,55.7274,55.8153,55.827,55.8449,55.96,0.0477252
lon,11105,37.3775,37.2642,37.3176,37.3537,37.3989,37.6606,0.100103
gps_time,11105,2021-08-20 07:00:29.507518976,2021-08-19 08:45:26,2021-08-19 21:28:40,2021-08-20 09:29:39,2021-08-20 16:51:52,2021-08-21 01:15:30,
driver_hash,11105,-6.91647e+18,-6.91647e+18,-6.91647e+18,-6.91647e+18,-6.91647e+18,-6.91647e+18,1024.05
fraud,0,,,,,,,


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
time,2459,2021-08-20 09:10:54.929695488,2021-08-19 10:03:24.641019,2021-08-19 22:00:13.045570560,2021-08-19 22:36:13.440205056,2021-08-21 08:53:17.057464832,2021-08-21 16:10:13.776379,
x,2459,0.0706128,-8.50211,-0.557849,-0.189142,0.426168,7.94666,1.80228
y,2459,8.63446,-3.03585,9.08629,9.53401,9.79977,12.9027,2.7298
z,2459,2.38073,-8.14058,0.990002,1.78129,2.67672,12.3496,2.53871
lat,2459,55.8572,55.8118,55.8185,55.8306,55.9078,55.96,0.0489069
lon,2459,37.3515,37.2844,37.3062,37.353,37.394,37.4197,0.0411699
driver_hash,2459,-6.91647e+18,-6.91647e+18,-6.91647e+18,-6.91647e+18,-6.91647e+18,-6.91647e+18,1024.21
fraud,0,,,,,,,
