In [1]:
import pandas as pd
import numpy as np
from polyline.codec import PolylineCodec
from geopy import distance
from datetime import datetime

In [2]:
# we don't need all dataset (only subset)
df = pd.read_csv('./final_data/train.csv', nrows=1000)

In [3]:
df.head(5)

Unnamed: 0,Id,main_id_locality,ETA,RTA,OrderedDate,latitude,del_latitude,longitude,del_longitude,EDA,...,GoodArrived,ready_latitude,ready_longitude,onway_latitude,onway_longitude,arrived_latitude,arrived_longitude,center_latitude,center_longitude,route
0,0,1078,226.0,188.0,2020-02-12 19:12:06,55.826019,55.825581,49.134529,49.126949,1.0,...,2020-02-12 19:18:14,55.825647,49.134115,55.826261,49.134137,55.825706,49.127136,55.794388,49.111531,gnvsIaq{jHChA??uC???OPG^F^NRzKBd@AN[r@???`@`@`...
1,1,1078,718.0,725.0,2020-02-12 19:12:22,55.795502,55.820911,49.13147,49.11536,5.0,...,2020-02-12 19:31:47,55.795591,49.132281,55.795544,49.131552,55.821531,49.115685,55.794388,49.111531,sqpsI}~zjHyAr]e@lMk@fLaBlb@i@rLKhBCdAUxEGlCg@f...
2,2,1078,612.0,764.0,2020-02-12 19:12:44,55.79105,55.819962,49.22607,49.176628,5.0,...,2020-02-12 19:28:09,55.791099,49.226066,55.791099,49.226066,55.819765,49.177432,55.794388,49.111531,auosI}mmkH?LHd@KhC??o@w@[g@m@iAUk@??{G|OiB`Ek@...
3,3,1078,1560.0,1412.0,2020-02-12 19:12:44,55.753899,55.82468,49.188519,49.0937,13.0,...,2020-02-12 19:42:41,55.754116,49.188853,55.754116,49.188853,55.824734,49.094013,55.794388,49.111531,{lhsIiffkHmKN_C?mIPwMJ??Si@gA{B??Wq@MRCJTp@hAd...
4,4,1078,1528.0,893.0,2020-02-12 19:12:45,55.822361,55.786758,49.069092,49.143501,9.0,...,2020-02-12 19:32:19,55.822483,49.069659,55.822617,49.069229,55.786936,49.14333,55.794388,49.111531,yxusI{xnjHgAfG??}IuHkAqA??pIoe@VsA??dAkG`BuH??...


# 1. RTA based on driver data

In [4]:
def calc_rta(good_arrived, client_collected):
    ga_dt = datetime.strptime(good_arrived, '%Y-%m-%d %H:%M:%S')
    cc_dt = datetime.strptime(client_collected, '%Y-%m-%d %H:%M:%S')
    assert ga_dt >= cc_dt
    dif = ga_dt - cc_dt
    #assert dif.days == 0 # ( for taxi?)
    if dif.days != 0:
        print(good_arrived, client_collected)
    dif_in_seconds = dif.seconds
    return float(dif_in_seconds)

In [5]:
df['driver_rta'] = df.apply(lambda row: calc_rta(row['GoodArrived'], row['ClientCollected']), axis=1)

In [6]:
df.head(5)

Unnamed: 0,Id,main_id_locality,ETA,RTA,OrderedDate,latitude,del_latitude,longitude,del_longitude,EDA,...,ready_latitude,ready_longitude,onway_latitude,onway_longitude,arrived_latitude,arrived_longitude,center_latitude,center_longitude,route,driver_rta
0,0,1078,226.0,188.0,2020-02-12 19:12:06,55.826019,55.825581,49.134529,49.126949,1.0,...,55.825647,49.134115,55.826261,49.134137,55.825706,49.127136,55.794388,49.111531,gnvsIaq{jHChA??uC???OPG^F^NRzKBd@AN[r@???`@`@`...,188.0
1,1,1078,718.0,725.0,2020-02-12 19:12:22,55.795502,55.820911,49.13147,49.11536,5.0,...,55.795591,49.132281,55.795544,49.131552,55.821531,49.115685,55.794388,49.111531,sqpsI}~zjHyAr]e@lMk@fLaBlb@i@rLKhBCdAUxEGlCg@f...,725.0
2,2,1078,612.0,764.0,2020-02-12 19:12:44,55.79105,55.819962,49.22607,49.176628,5.0,...,55.791099,49.226066,55.791099,49.226066,55.819765,49.177432,55.794388,49.111531,auosI}mmkH?LHd@KhC??o@w@[g@m@iAUk@??{G|OiB`Ek@...,764.0
3,3,1078,1560.0,1412.0,2020-02-12 19:12:44,55.753899,55.82468,49.188519,49.0937,13.0,...,55.754116,49.188853,55.754116,49.188853,55.824734,49.094013,55.794388,49.111531,{lhsIiffkHmKN_C?mIPwMJ??Si@gA{B??Wq@MRCJTp@hAd...,1412.0
4,4,1078,1528.0,893.0,2020-02-12 19:12:45,55.822361,55.786758,49.069092,49.143501,9.0,...,55.822483,49.069659,55.822617,49.069229,55.786936,49.14333,55.794388,49.111531,yxusI{xnjHgAfG??}IuHkAqA??pIoe@VsA??dAkG`BuH??...,893.0


# 2. Initial metric value (ETA, driver_rta)

In [7]:
from sklearn.metrics import mean_squared_error

# for example, rmse

def rta_metric(eta, rta):
    return np.sqrt(mean_squared_error(eta, rta))

# 3. Missing value columns

In [8]:
df.isnull().any()

Id                    False
main_id_locality      False
ETA                   False
RTA                   False
OrderedDate           False
latitude              False
del_latitude          False
longitude             False
del_longitude         False
EDA                   False
RDA                   False
ReadyForCollection    False
ClientCollected       False
GoodArrived           False
ready_latitude         True
ready_longitude        True
onway_latitude         True
onway_longitude        True
arrived_latitude       True
arrived_longitude      True
center_latitude       False
center_longitude      False
route                 False
driver_rta            False
dtype: bool

There are some missing values

# 4. Additional columns

In order to use some distance function, we added some useful columns

In [9]:
def time_dif(eta, driver_rta):
    """
    some heuristics for incorrect data filled in by a driver
    """
    return np.square(eta - driver_rta)

In [10]:
df['time_dif'] = df.apply(lambda row: time_dif(row['ETA'], row['driver_rta']), axis=1)

In [11]:
def start_point_dif(latitude, longitude,
                    onway_latitude, onway_longitude):
    start = latitude, longitude
    try:
        onway = onway_latitude, onway_longitude
        return distance.great_circle(start, onway).m
    except:
        return None

In [12]:
df['start_dif'] = df.apply(lambda row: start_point_dif(row['latitude'], row['longitude'],
                                                   row['onway_latitude'], row['onway_longitude']), axis=1)

In [13]:
def end_point_dif(del_latitude, del_longitude,
                 arrived_latitude, arrived_longitude):
    end = del_latitude, del_longitude
    try:
        arrived = arrived_latitude, arrived_longitude
        return distance.great_circle(end, arrived).m
    except:
        return None

In [14]:
df['end_dif'] = df.apply(lambda row: start_point_dif(row['del_latitude'], row['del_longitude'],
                                                   row['arrived_latitude'], row['arrived_longitude']), axis=1)

In [15]:
df.dropna(subset=['time_dif', 'start_dif', 'end_dif'], inplace=True)

In [16]:
df.isnull().any()

Id                    False
main_id_locality      False
ETA                   False
RTA                   False
OrderedDate           False
latitude              False
del_latitude          False
longitude             False
del_longitude         False
EDA                   False
RDA                   False
ReadyForCollection    False
ClientCollected       False
GoodArrived           False
ready_latitude        False
ready_longitude       False
onway_latitude        False
onway_longitude       False
arrived_latitude      False
arrived_longitude     False
center_latitude       False
center_longitude      False
route                 False
driver_rta            False
time_dif              False
start_dif             False
end_dif               False
dtype: bool

In [17]:
df.shape

(995, 27)

# 5. DBSCAN

In [18]:
X = df.iloc[:, -3:]
X

Unnamed: 0,time_dif,start_dif,end_dif
0,1444.0,36.337392,18.215878
1,49.0,6.983666,71.764889
2,23104.0,5.426626,54.780441
3,21904.0,31.961390,20.453490
4,403225.0,29.757272,22.437192
...,...,...,...
995,109561.0,11.894529,44.113892
996,22500.0,26.578383,74.984939
997,187489.0,170.545390,42.114903
998,2809.0,37.221438,22.269535


In [19]:
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

In [20]:
X = StandardScaler().fit_transform(X)
X

array([[-0.63115279, -0.02932349, -0.10852353],
       [-0.64225531, -0.35781957,  0.07028301],
       [-0.45876528, -0.37524433,  0.01356991],
       ...,
       [ 0.84954132,  1.47259152, -0.02872183],
       [-0.62028904, -0.01943018, -0.09498789],
       [-0.42068245,  0.57393605,  0.02074379]])

In [21]:
db = DBSCAN(eps=0.5, min_samples=7).fit(X)

In [22]:
labels = db.labels_

n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print(n_clusters_, n_noise_)

1 35


In [23]:
labels

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,
       -1,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0, -1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,
        0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0, -1,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

**Save points without noisy data**

In [24]:
# reset index after dropping missing values earlier
df.reset_index(inplace=True)

In [25]:
good_points = []
for idx in range(len(labels)):
    if labels[idx] == 0:
        good_points.append(
            [df['time_dif'][idx], df['start_dif'][idx], df['end_dif'][idx]]
        )


In [26]:
len(good_points)

960

In [27]:
df.shape

(995, 28)

In [28]:
good_points[:5]

[[1444.0, 36.33739196178953, 18.215878331474354],
 [49.0, 6.983666440259884, 71.76488868064301],
 [23104.0, 5.426626081726999, 54.78044102404102],
 [21904.0, 31.961390437602823, 20.453490377542874],
 [403225.0, 29.75727249569987, 22.437192279089864]]

# 6. For full dataset

In [29]:
df = pd.read_csv('./final_data/train.csv', nrows=5000)

In [30]:
df['driver_rta'] = df.apply(lambda row: calc_rta(row['GoodArrived'], row['ClientCollected']), axis=1)

In [31]:
df['time_dif'] = df.apply(lambda row: time_dif(row['ETA'], row['driver_rta']), axis=1)

In [32]:
df['start_dif'] = df.apply(lambda row: start_point_dif(row['latitude'], row['longitude'],
                                                   row['onway_latitude'], row['onway_longitude']), axis=1)

In [33]:
df['end_dif'] = df.apply(lambda row: start_point_dif(row['del_latitude'], row['del_longitude'],
                                                   row['arrived_latitude'], row['arrived_longitude']), axis=1)

In [34]:
df.dropna(subset=['time_dif', 'start_dif', 'end_dif'], inplace=True)

In [35]:
# reset index after dropping missing values earlier
df.reset_index(inplace=True)

In [36]:
df.shape

(4982, 28)

In [37]:
df.head(5)

Unnamed: 0,index,Id,main_id_locality,ETA,RTA,OrderedDate,latitude,del_latitude,longitude,del_longitude,...,onway_longitude,arrived_latitude,arrived_longitude,center_latitude,center_longitude,route,driver_rta,time_dif,start_dif,end_dif
0,0,0,1078,226.0,188.0,2020-02-12 19:12:06,55.826019,55.825581,49.134529,49.126949,...,49.134137,55.825706,49.127136,55.794388,49.111531,gnvsIaq{jHChA??uC???OPG^F^NRzKBd@AN[r@???`@`@`...,188.0,1444.0,36.337392,18.215878
1,1,1,1078,718.0,725.0,2020-02-12 19:12:22,55.795502,55.820911,49.13147,49.11536,...,49.131552,55.821531,49.115685,55.794388,49.111531,sqpsI}~zjHyAr]e@lMk@fLaBlb@i@rLKhBCdAUxEGlCg@f...,725.0,49.0,6.983666,71.764889
2,2,2,1078,612.0,764.0,2020-02-12 19:12:44,55.79105,55.819962,49.22607,49.176628,...,49.226066,55.819765,49.177432,55.794388,49.111531,auosI}mmkH?LHd@KhC??o@w@[g@m@iAUk@??{G|OiB`Ek@...,764.0,23104.0,5.426626,54.780441
3,3,3,1078,1560.0,1412.0,2020-02-12 19:12:44,55.753899,55.82468,49.188519,49.0937,...,49.188853,55.824734,49.094013,55.794388,49.111531,{lhsIiffkHmKN_C?mIPwMJ??Si@gA{B??Wq@MRCJTp@hAd...,1412.0,21904.0,31.96139,20.45349
4,4,4,1078,1528.0,893.0,2020-02-12 19:12:45,55.822361,55.786758,49.069092,49.143501,...,49.069229,55.786936,49.14333,55.794388,49.111531,yxusI{xnjHgAfG??}IuHkAqA??pIoe@VsA??dAkG`BuH??...,893.0,403225.0,29.757272,22.437192


In [38]:
INITIAL_METRIC = rta_metric(df['ETA'], df['driver_rta'])
INITIAL_SIZE = df.shape[0]
print(INITIAL_METRIC, INITIAL_SIZE)

367.91136838913235 4982


In [39]:
from sklearn.metrics.pairwise import euclidean_distances
THRESHOLD = 1000
def calc_dist(time_dif, start_dif, end_dif):
    idx = 0
    for p1 in good_points:
        p2 = time_dif, start_dif, end_dif
        dist = euclidean_distances((p1,p2))[0][1]
        if dist <= THRESHOLD:
            return True
        # don't wait a long!
        idx += 1
        if idx >= 300:
            break
    return False

In [40]:
df['is_good'] = df.apply(lambda row: calc_dist(row['time_dif'], row['start_dif'],
                                                   row['end_dif']), axis=1)

In [41]:
df.head(5)

Unnamed: 0,index,Id,main_id_locality,ETA,RTA,OrderedDate,latitude,del_latitude,longitude,del_longitude,...,arrived_latitude,arrived_longitude,center_latitude,center_longitude,route,driver_rta,time_dif,start_dif,end_dif,is_good
0,0,0,1078,226.0,188.0,2020-02-12 19:12:06,55.826019,55.825581,49.134529,49.126949,...,55.825706,49.127136,55.794388,49.111531,gnvsIaq{jHChA??uC???OPG^F^NRzKBd@AN[r@???`@`@`...,188.0,1444.0,36.337392,18.215878,True
1,1,1,1078,718.0,725.0,2020-02-12 19:12:22,55.795502,55.820911,49.13147,49.11536,...,55.821531,49.115685,55.794388,49.111531,sqpsI}~zjHyAr]e@lMk@fLaBlb@i@rLKhBCdAUxEGlCg@f...,725.0,49.0,6.983666,71.764889,True
2,2,2,1078,612.0,764.0,2020-02-12 19:12:44,55.79105,55.819962,49.22607,49.176628,...,55.819765,49.177432,55.794388,49.111531,auosI}mmkH?LHd@KhC??o@w@[g@m@iAUk@??{G|OiB`Ek@...,764.0,23104.0,5.426626,54.780441,True
3,3,3,1078,1560.0,1412.0,2020-02-12 19:12:44,55.753899,55.82468,49.188519,49.0937,...,55.824734,49.094013,55.794388,49.111531,{lhsIiffkHmKN_C?mIPwMJ??Si@gA{B??Wq@MRCJTp@hAd...,1412.0,21904.0,31.96139,20.45349,True
4,4,4,1078,1528.0,893.0,2020-02-12 19:12:45,55.822361,55.786758,49.069092,49.143501,...,55.786936,49.14333,55.794388,49.111531,yxusI{xnjHgAfG??}IuHkAqA??pIoe@VsA??dAkG`BuH??...,893.0,403225.0,29.757272,22.437192,True


In [42]:
selector = (df['is_good'] == True)
df = df[selector]
CUR_SIZE = df.shape[0]
CUR_METRIC = rta_metric(df['ETA'], df['driver_rta'])

# Conclusion

In [43]:
CUR_SIZE, CUR_METRIC

(4204, 209.93020905444777)

In [44]:
df.shape

(4204, 29)

In [45]:
CUR_SIZE / INITIAL_SIZE

0.8438378161380972

In [46]:
INITIAL_METRIC

367.91136838913235