In [1]:
"""
Count total distances covered by a self-driving car in autopilot mode and in manual control mode
"""

'\nCount total distances covered by a self-driving car in autopilot mode and in manual control mode\n'

In [2]:
from urllib.request import urlopen
from math import radians
import numpy as np
import pandas as pd

In [3]:
def get_switch(s):
    """ Get control_switch_on state
    
        Args:
            s - string
            
        Returns:
            time 
            switch state: True/False
    """
    switch, ts = s.split(',')
    _, control = switch.split(':')
    control = control == 'true'
    _, ts = ts.split(':')
    ts = float(ts.replace('}\n',''))
    #ts = ts.replace('}\n','')
    return(ts, control)

In [4]:
def get_geo(s):
    """Get geo location
    
        Args:
            s - string
            
        Returns:
            time, lat, long
            
    """
    geo, ts = s.split('},')
    _, ts = ts.split(':')
    ts = ts.replace('}\n','')
    lat, lon = geo.split(',')
    _, lon = lon.split(':')
    _, lat = lat.split('"lat":')
    return(float(ts), float(lat), float(lon))
    #return ts,lat,lon

In [5]:
%%time
#file_name = "data"
#with open(file_name) as file: 

url = "https://sdcimages.s3.yandex.net/test_task/data"
file  = urlopen(url)

count = 0
switch_times = []
switch_on = []
geo_times = []
lats = []
lons = []
for string in file: 
    count += 1
    line = string.decode("utf-8")
    if line.find('control_switch_on') > 0: 
        ts, on = get_switch(line)
        switch_times.append(ts)
        switch_on.append(on)
    elif line.find('geo') > 0:
        ts, lat, lon = get_geo(line)
        geo_times.append(ts)
        lats.append(lat)
        lons.append(lon)
    else:
        print("*** Unknown format: ", line)
        break;
            
print("*** Total lines: ", count)


*** Total lines:  55884
CPU times: user 388 ms, sys: 22.1 ms, total: 410 ms
Wall time: 1.37 s


In [6]:
switch_on_df = pd.DataFrame({'time':switch_times, 'on':switch_on})
switch_on_df = switch_on_df.sort_values(by='time').copy()
print('*** control_switch_on:')
print(switch_on_df.info())
#print(switch_on_df.head())

*** control_switch_on:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24737 entries, 1563 to 8020
Data columns (total 2 columns):
time    24737 non-null float64
on      24737 non-null bool
dtypes: bool(1), float64(1)
memory usage: 410.7 KB
None


In [7]:
print('*** Switch value counts:')
print(switch_on_df['on'].value_counts())

*** Switch value counts:
True     19982
False     4755
Name: on, dtype: int64


In [8]:
geo_df = pd.DataFrame({'time':geo_times, 'lat':lats, 'lon':lons})
geo_df = geo_df.sort_values(by='time').copy()
print("*** locations: ")
print(geo_df.info())
#geo_df.head()

*** locations: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 31147 entries, 256 to 8952
Data columns (total 3 columns):
time    31147 non-null float64
lat     31147 non-null float64
lon     31147 non-null float64
dtypes: float64(3)
memory usage: 973.3 KB
None


In [9]:
# Left join locations with `control_switch_on` matching on nearest time
df = pd.merge_asof(geo_df, switch_on_df.assign(time=switch_on_df["time"].astype(float)), on="time")

In [10]:
print("*** Left join locations with `control_switch_on` ")
print(df.info())
print(df.head())

*** Left join locations with `control_switch_on` 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 31147 entries, 0 to 31146
Data columns (total 4 columns):
time    31147 non-null float64
lat     31147 non-null float64
lon     31147 non-null float64
on      30920 non-null object
dtypes: float64(3), object(1)
memory usage: 1.2+ MB
None
           time        lat         lon   on
0  1.546825e+18  36.108921 -115.155588  NaN
1  1.546825e+18  36.108921 -115.155588  NaN
2  1.546825e+18  36.108921 -115.155588  NaN
3  1.546825e+18  36.108921 -115.155588  NaN
4  1.546825e+18  36.108921 -115.155588  NaN


In [11]:
print("*** Switch value counts (joined with locations):\n",df['on'].value_counts())

*** Switch value counts (joined with locations):
 True     24976
False     5944
Name: on, dtype: int64


In [12]:
no_switch_info = df[df['time'] <= min(switch_on_df['time'].values)].copy()
print("*** Number of location records for which value of `control_switch_on` is not konown: ",
      len(no_switch_info))

*** Number of location records for which value of `control_switch_on` is not konown:  227


In [13]:
# Drop records with unknown switch
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30920 entries, 227 to 31146
Data columns (total 4 columns):
time    30920 non-null float64
lat     30920 non-null float64
lon     30920 non-null float64
on      30920 non-null object
dtypes: float64(3), object(1)
memory usage: 1.2+ MB


In [14]:
#df.head()

In [15]:
def distance(s_lat, s_lng, e_lat, e_lng):
    """  Find distance between two latitude-longitude coordinates with Haversine formula
    
        Args:
            s_lat, s_lng - coordinates of the first point
            e_lat, e_lng - coordinates of the second point
            
        Returns:
            distances in km
    """    
   
    
    R = 6371 # approximate radius of earth in km
    
    s_lat = np.deg2rad(s_lat)                    
    s_lng = np.deg2rad(s_lng)     
    e_lat = np.deg2rad(e_lat)                       
    e_lng = np.deg2rad(e_lng)  
    
    d = np.sin((e_lat - s_lat)/2)**2 + np.cos(s_lat)*np.cos(
        e_lat) * np.sin((e_lng - s_lng)/2)**2
    
    return 2 * R * np.arcsin(np.sqrt(d)) 

In [16]:
# Calculate the distance between adjacent points in a vectorize way with shift:
df['dist'] = distance(df['lat'], df['lon'], 
                      df['lat'].shift(-1), df['lon'].shift(-1))

ModuleNotFoundError: No module named 'numpy.core._multiarray_umath'

In [17]:
df.head()

Unnamed: 0,time,lat,lon,on,dist
227,1.546825e+18,36.108921,-115.155588,False,7.073231e-13
228,1.546825e+18,36.108921,-115.155588,False,1.414646e-12
229,1.546825e+18,36.108921,-115.155588,False,0.0
230,1.546825e+18,36.108921,-115.155588,False,7.073231e-13
231,1.546825e+18,36.108921,-115.155588,False,7.073231e-13


In [18]:
total_dist = df['dist'].sum() # total distance
auto_pilot_on_distance = df.loc[df['on'],'dist'].sum() # with autopilot 
auto_pilot_off_distance = total_dist - auto_pilot_on_distance

In [19]:
print("*** Distances: ")
print("    - On autopilot (control_switch_on = true): {} km ".format(auto_pilot_on_distance))
print("    - With manual control (control_switch_on = false): {} km".format(auto_pilot_off_distance))

*** Distances: 
    - On autopilot (control_switch_on = true): 24486.18852885283 km 
    - With manual control (control_switch_on = false): 24482.164359693365 km
