In [229]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [230]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings raised by later cells, so problems there will not be visible.
warnings.filterwarnings('ignore')

In [231]:
# Load the GPS points; the path is relative to the notebook's working directory.
df = pd.read_csv('gps_data_outlier.csv')

In [232]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,time,lat,lon,gps_time,driver_hash,fraud,gps_delta,gps_distance,gps_speed,gps_accel,gps_outlier,gps_staying,tm_delta,tm_distance,tm_speed,tm_accel,tm_outlier,tm_staying
0,2020-11-10 03:08:12,55.473589,37.70587,2020-11-10 03:08:12,-9218579406240981296,-1,191031533.0,0.0,0.0,inf,0.0,0.0,191031533.0,0.0,0.0,inf,0.0,0.0
1,2020-11-10 03:42:48,55.473589,37.70587,2020-11-10 03:42:48,-9218579406240981296,-1,2076.0,0.0,0.0,0.0,0.0,0.0,2076.0,0.0,0.0,0.0,0.0,0.0
2,2020-11-10 06:38:31,55.473589,37.70587,2020-11-10 06:38:30,-9218579406240981296,-1,10542.0,0.0,0.0,0.0,0.0,0.0,10543.0,0.0,0.0,0.0,0.0,0.0
3,2020-11-10 06:54:36,55.473589,37.70587,2020-11-10 06:54:36,-9218579406240981296,-1,966.0,0.0,0.0,0.0,0.0,0.0,965.0,0.0,0.0,0.0,0.0,0.0
4,2020-11-10 06:56:45,55.47352,37.705963,2020-11-10 06:56:45,-9218579406240981296,-1,129.0,9.6,0.074798,0.00058,1.0,0.0,129.0,9.6,0.074798,0.00058,1.0,0.0


## Разбиваем данные на поездки: пауза более 5 минут между точками начинает новую поездку

In [233]:
# Order the points per driver and then by GPS timestamp.
# gps_time is not parsed on load, so this is a string sort; for the
# 'YYYY-MM-DD HH:MM:SS' values shown in the preview that matches chronological order.
df = df.sort_values(by=['driver_hash', 'gps_time'])

In [234]:
def trip_start(df):
    """Flag the rows of one driver's track that open a new trip.

    A row opens a trip when more than 300 seconds (5 minutes) separate its
    ``gps_time`` from the previous row's. The first row always opens a trip.

    Parameters
    ----------
    df : pandas.DataFrame
        Points of a single driver, already ordered by time, with a
        ``gps_time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    list of int
        One 0/1 flag per row of ``df``; an empty list for an empty frame.
    """
    if len(df) == 0:
        return []

    # Parse the whole column once instead of twice per row.
    stamps = pd.to_datetime(df['gps_time'])

    # total_seconds() is the full length of the gap. The `.seconds` attribute
    # is only the sub-day component, so a pause of one day and ten seconds
    # would read as 10 and be missed.
    gaps = stamps.diff().dt.total_seconds()

    # The first gap is NaN (no previous row); NaN > 300 is False, so the first
    # flag is set explicitly below.
    flags = (gaps > 300).astype(int)
    flags.iloc[0] = 1

    return flags.tolist()

In [235]:
# Label trip starts separately for every driver so that the first point of a
# driver is never compared with the last point of the previous one.
frames = []

# groupby splits the frame once; sort=False keeps drivers in order of first
# appearance, the same order df.driver_hash.unique() yields. Filtering with a
# boolean mask per driver would rescan the full frame on every iteration.
for _, group in tqdm(df.groupby('driver_hash', sort=False)):
    group = group.copy()  # own copy, so the new column is not written into a slice of df
    group['is_trip_start'] = trip_start(group)
    frames.append(group)

# pd.concat raises on an empty list, so an empty input frame is handled explicitly.
if frames:
    df_with_trip = pd.concat(frames)
else:
    df_with_trip = df.assign(is_trip_start=pd.Series(dtype=int))

100%|██████████| 317/317 [18:06<00:00,  3.43s/it]


In [236]:
# Preview the frame with the new is_trip_start column.
df_with_trip.head()

Unnamed: 0,time,lat,lon,gps_time,driver_hash,fraud,gps_delta,gps_distance,gps_speed,gps_accel,gps_outlier,gps_staying,tm_delta,tm_distance,tm_speed,tm_accel,tm_outlier,tm_staying,is_trip_start
0,2020-11-10 03:08:12,55.473589,37.70587,2020-11-10 03:08:12,-9218579406240981296,-1,191031533.0,0.0,0.0,inf,0.0,0.0,191031533.0,0.0,0.0,inf,0.0,0.0,1
1,2020-11-10 03:42:48,55.473589,37.70587,2020-11-10 03:42:48,-9218579406240981296,-1,2076.0,0.0,0.0,0.0,0.0,0.0,2076.0,0.0,0.0,0.0,0.0,0.0,1
2,2020-11-10 06:38:31,55.473589,37.70587,2020-11-10 06:38:30,-9218579406240981296,-1,10542.0,0.0,0.0,0.0,0.0,0.0,10543.0,0.0,0.0,0.0,0.0,0.0,1
3,2020-11-10 06:54:36,55.473589,37.70587,2020-11-10 06:54:36,-9218579406240981296,-1,966.0,0.0,0.0,0.0,0.0,0.0,965.0,0.0,0.0,0.0,0.0,0.0,1
4,2020-11-10 06:56:45,55.47352,37.705963,2020-11-10 06:56:45,-9218579406240981296,-1,129.0,9.6,0.074798,0.00058,1.0,0.0,129.0,9.6,0.074798,0.00058,1.0,0.0,0


In [237]:
# Cache the labelled frame on disk; to_csv with default arguments also writes the index.
df_with_trip.to_csv('gps_data_outlier_trips.csv')
# Uncomment to reload the cached result instead of recomputing the trip flags.
# NOTE(review): reading it back this way will likely add the saved index as an
# extra unnamed column unless index_col=0 is passed - confirm before relying on it.
# df_with_trip = pd.read_csv('gps_data_outlier_trips.csv')

In [239]:
# Each row with is_trip_start == 1 opens one trip, so summing the flag counts the trips.
print(f'Количество поездок: {df_with_trip.is_trip_start.sum()}')

Количество поездок: 17167


## Заполнение пропусков и невалидных данных

In [240]:
# Treat infinite values of either sign as missing. np.nan keeps the numeric
# columns as floats, whereas replacing with None can turn them into object dtype.
df_with_trip = df_with_trip.replace([np.inf, -np.inf], np.nan)

# Carry the last valid speed/acceleration forward into the gaps.
# A gap in the very first row has nothing before it and stays NaN.
# NOTE(review): the fill runs over the whole frame, so a gap at the start of a
# driver's track takes the previous driver's last value - confirm this is intended.
motion_columns = ['gps_speed', 'gps_accel', 'tm_speed', 'tm_accel']
df_with_trip[motion_columns] = df_with_trip[motion_columns].ffill()

## Добавляем бинарные признаки рабочих дней и часов пик

In [241]:
def is_buisness_day(date):
    """Return 1 when the date part of ``date`` is a business day, otherwise 0.

    ``date`` is a string whose first whitespace-separated token is an ISO
    date, e.g. ``'2020-11-10 03:08:12'``. The check is ``np.is_busday`` with
    its default settings (Monday-Friday week, no holiday calendar).
    """
    day_part = date.split()[0]
    return 1 if np.is_busday(day_part) else 0

def is_rush_hour(time):
    """Return 1 when the hour of ``time`` is 8-9 or 17-19, otherwise 0.

    ``time`` is anything ``pd.to_datetime`` turns into a single timestamp.
    """
    rush_hours = (8, 9, 17, 18, 19)
    return int(pd.to_datetime(time).hour in rush_hours)

In [242]:
# Derive both 0/1 calendar flags from the GPS timestamp, one value per row.
gps_timestamps = df_with_trip['gps_time']
df_with_trip['is_business_day'] = gps_timestamps.map(is_buisness_day)
df_with_trip['is_rush_hour'] = gps_timestamps.map(is_rush_hour)

In [243]:
# Checkpoint: write the frame, including the calendar flag columns, to disk.
df_with_trip.to_csv('gps_data_outlier_with_flags.csv')

## Новые признаки на основе агрегации по поездкам

In [244]:
# Names of the per-trip features, grouped by the quantity they summarise.
timedelta_features = [
    'trip_duration',
    'trip_mean_timedelta',
    'trip_median_timedelta',
    'trip_max_timedelta',
    'trip_cum_timedelta',
]
distance_features = [
    'trip_length',
    'trip_mean_dist',
    'trip_median_dist',
    'trip_max_dist',
    'trip_cum_dist',
]
speed_features = ['trip_mean_speed', 'trip_median_speed', 'trip_max_speed']
accel_features = ['trip_mean_accel', 'trip_median_accel', 'trip_max_accel']

new_columns = timedelta_features + distance_features + speed_features + accel_features

# Add every feature for both signal prefixes (all gps_ columns first, then
# all tm_ columns), filled with ones as a placeholder value.
for name in [pref + col for pref in ('gps_', 'tm_') for col in new_columns]:
    df_with_trip[name] = np.ones(len(df_with_trip))

In [245]:
# Columns that are reset to zero on the first row of every trip.
initial_states = ['gps_delta', 'gps_distance', 'gps_speed', 'gps_accel', 'gps_outlier', 'gps_staying',
                  'tm_delta', 'tm_distance', 'tm_speed', 'tm_accel', 'tm_outlier', 'tm_staying']

# source column suffix -> [(feature suffix, aggregation over one trip), ...]
# 'cumsum' is a running total within the trip; the other aggregations give a
# single value that is repeated on every row of the trip.
trip_features = {
    'delta': [
        ('trip_duration', 'sum'),
        ('trip_mean_timedelta', 'mean'),
        ('trip_median_timedelta', 'median'),
        ('trip_max_timedelta', 'max'),
        ('trip_cum_timedelta', 'cumsum'),
    ],
    'distance': [
        ('trip_length', 'sum'),
        ('trip_mean_dist', 'mean'),
        ('trip_median_dist', 'median'),
        ('trip_max_dist', 'max'),
        ('trip_cum_dist', 'cumsum'),
    ],
    'speed': [
        ('trip_mean_speed', 'mean'),
        ('trip_median_speed', 'median'),
        ('trip_max_speed', 'max'),
    ],
    'accel': [
        ('trip_mean_accel', 'mean'),
        ('trip_median_accel', 'median'),
        ('trip_max_accel', 'max'),
    ],
}
frames = []

# One pass over the drivers, in order of first appearance (sort=False).
for driver, group in tqdm(df_with_trip.groupby('driver_hash', sort=False)):
    is_start = group['is_trip_start'].to_numpy() == 1
    # Running count of trip starts: every row gets the number of its trip (1, 2, ...).
    trip_id = is_start.cumsum()

    # Rows that precede the driver's first trip start belong to no trip and are
    # left out, exactly as when trips are sliced between consecutive start positions.
    in_trip = trip_id > 0
    driver_df = group[in_trip].copy()
    is_start, trip_id = is_start[in_trip], trip_id[in_trip]
    if driver_df.empty:
        continue

    # Reset the initial state of every trip by assigning 0 directly. Multiplying
    # by zero would leave NaN or inf values as NaN, and writing a mixed-type row
    # back into the frame can upcast its columns to object dtype.
    driver_df.loc[is_start, initial_states] = 0

    for pref in ['gps_', 'tm_']:
        for source, features in trip_features.items():
            # Coerce to numeric so the aggregations also work if an earlier
            # step left the column with object dtype.
            values = pd.to_numeric(driver_df[pref + source], errors='coerce')
            by_trip = values.groupby(trip_id)
            for feature, how in features:
                # Assigned directly, so the result does not depend on the
                # feature columns having been pre-filled with ones.
                if how == 'cumsum':
                    driver_df[pref + feature] = by_trip.cumsum()
                else:
                    driver_df[pref + feature] = by_trip.transform(how)

    # Collect and concatenate once at the end; growing a frame with pd.concat
    # inside the loop copies all previous rows on every step.
    frames.append(driver_df)

preproc_df = pd.concat(frames)

100%|██████████| 317/317 [11:32<00:00,  2.19s/it]


In [263]:
# Spot-check a single row, selected by its index label, to inspect its values.
preproc_df.loc[2243572]

time                         2021-07-24 21:32:04
lat                                      55.7486
lon                                      37.8155
gps_time                     2021-07-24 21:32:04
driver_hash                  -758233304813720508
fraud                                         -1
gps_delta                                      3
gps_distance                                63.3
gps_speed                                21.1008
gps_accel                                 -14.39
gps_outlier                                    0
gps_staying                                    0
tm_delta                                       3
tm_distance                                 63.3
tm_speed                                 21.1008
tm_accel                                  -14.39
tm_outlier                                     0
tm_staying                                     0
is_trip_start                                  0
is_buisness_day                                0
is_rush_hour        

In [247]:
# Write the frame with the per-trip features to disk.
preproc_df.to_csv('gps_data_outlier_preprocessed.csv')

## Кодирование циклических переменных времени в sin/cos

[Статья](http://blog.davidkaleko.com/feature-engineering-cyclical-features.html)

In [316]:
# part code -> (datetime attribute holding the component, length of its cycle)
_TIME_CYCLES = {'h': ('hour', 24), 'm': ('minute', 60), 's': ('second', 60)}


def _cycle_angle(x, part):
    """Return the chosen time component of ``x`` as an angle in radians on its cycle."""
    stamp = pd.to_datetime(x)
    attr, period = _TIME_CYCLES[part]
    # A parsed Series exposes datetime fields through `.dt`; a Timestamp and a
    # DatetimeIndex expose them directly.
    fields = stamp.dt if isinstance(stamp, pd.Series) else stamp
    return getattr(fields, attr) * (2. * np.pi / period)


def sin_transform(x, part='h'):
    """Sine of a time component placed on its cycle.

    Parameters
    ----------
    x : str, timestamp, array-like or pandas.Series
        Anything ``pd.to_datetime`` can parse.
    part : {'h', 'm', 's'}, default 'h'
        Component to encode: hour (period 24), minute (60) or second (60).

    Returns
    -------
    float for a single timestamp; an array-like of the same length for
    array-like or Series input.

    Raises
    ------
    KeyError
        If ``part`` is not one of 'h', 'm', 's'.
    """
    return np.sin(_cycle_angle(x, part))


def cos_transform(x, part='h'):
    """Cosine of a time component placed on its cycle.

    Takes the same arguments, returns the same shapes and raises the same
    ``KeyError`` as ``sin_transform``; together the two values locate the
    component on a circle.
    """
    return np.cos(_cycle_angle(x, part))

In [317]:
# Parse the timestamps once and transform whole arrays. Calling the transforms
# row by row parses every timestamp six times.
# NOTE(review): assumes all gps_time values share one format, as in the preview.
gps_stamps = pd.to_datetime(preproc_df.gps_time.values)
time_parts = [('hours', 'h'), ('min', 'm'), ('sec', 's')]

# np.asarray drops any index from the result so the values are assigned by position.
for suffix, part in time_parts:
    preproc_df['sin_' + suffix] = np.asarray(sin_transform(gps_stamps, part=part))

for suffix, part in time_parts:
    preproc_df['cos_' + suffix] = np.asarray(cos_transform(gps_stamps, part=part))

In [325]:
# Write the frame again under the same file name as the earlier save,
# replacing that file with the version that includes the sin/cos columns.
preproc_df.to_csv('gps_data_outlier_preprocessed.csv')

In [326]:
# Preview the feature table after the cyclical time columns were added.
preproc_df.head()

Unnamed: 0,time,lat,lon,gps_time,driver_hash,fraud,gps_delta,gps_distance,gps_speed,gps_accel,...,tm_trip_max_speed,tm_trip_mean_accel,tm_trip_median_accel,tm_trip_max_accel,sin_hours,sin_min,sin_sec,cos_hours,cos_min,cos_sec
0,2020-11-10 03:08:12,55.4736,37.7059,2020-11-10 03:08:12,-9218579406240981296,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.707107,0.743145,0.9510565,0.7071068,0.669131,0.309017
1,2020-11-10 03:42:48,55.4736,37.7059,2020-11-10 03:42:48,-9218579406240981296,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.707107,-0.951057,-0.9510565,0.7071068,-0.309017,0.309017
2,2020-11-10 06:38:31,55.4736,37.7059,2020-11-10 06:38:30,-9218579406240981296,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.743145,1.224647e-16,6.123234000000001e-17,-0.669131,-1.0
3,2020-11-10 06:54:36,55.4736,37.7059,2020-11-10 06:54:36,-9218579406240981296,-1,0,0.0,0.0,0.0,...,4259.41581,266.233163,0.00029,1419.780337,1.0,-0.587785,-0.5877853,6.123234000000001e-17,0.809017,-0.809017
4,2020-11-10 06:56:45,55.4735,37.706,2020-11-10 06:56:45,-9218579406240981296,-1,129,9.6,0.074798,0.00058,...,4259.41581,266.233163,0.00029,1419.780337,1.0,-0.406737,-1.0,6.123234000000001e-17,0.913545,-1.83697e-16
