## Loading sample file

In [1]:
import pandas as pd
from utils import load_df, DATASETS_DIR


df = load_df()

## Data preprocessing

In [2]:
def normalize(df):
    normalized = df.explode(column='cars_list', ignore_index=True)
    normalized = normalized.rename(columns={'cars_list': 'car_id'})
    normalized = normalized.astype(dtype={'car_id': 'int64'})
    return normalized


def round_coordinates(df):
    for col in ['latitude', 'longitude']:
        df[col] = df[col].round(decimals=3)
    return df


preprocessed = normalize(df)
preprocessed = round_coordinates(preprocessed)

preprocessed

Unnamed: 0,latitude,longitude,car_id,timestamp
0,32.090,34.797,138,2020-02-25 10:48:02+00:00
1,32.116,34.795,64,2020-02-25 10:48:02+00:00
2,32.119,34.822,120,2020-02-25 10:48:02+00:00
3,32.112,34.838,110,2020-02-25 10:48:02+00:00
4,32.143,34.793,190,2020-02-25 10:48:02+00:00
...,...,...,...,...
1773975,32.075,34.779,180,2019-10-03 15:15:02+00:00
1773976,32.102,34.792,214,2019-10-03 15:15:02+00:00
1773977,32.102,34.792,145,2019-10-03 15:15:02+00:00
1773978,32.131,34.795,243,2019-10-03 15:15:02+00:00


## Exploring data

In [3]:
total_ids = len(preprocessed.car_id.unique())
print(f'Total number of unique car_ids: {total_ids}')

max_id = preprocessed.car_id.max()
print(f'Max car_id: {max_id}')

grouped_by_id = preprocessed.groupby(['car_id']).size()
idx_max = grouped_by_id.idxmax()
print(f'Most popular: car_id={idx_max}, count={grouped_by_id[idx_max]}')

Total number of unique car_ids: 261
Max car_id: 272
Most popular: car_id=180, count=7327


## Choosing first and last records with unique coordinates for `car_id=3`

In [4]:
CAR_ID_3_FILE = 'car-id-3.csv'


def filter_unique_for_id(df, id):
    df = df[df['car_id'] == id].sort_values(by=['timestamp'])
    return pd.concat([
        df.drop_duplicates(['latitude', 'longitude'], keep='first'),
        df.drop_duplicates(['latitude', 'longitude'], keep='last'),
    ]).sort_values(by=['timestamp'])


df_car_id_3 = filter_unique_for_id(df=preprocessed, id=3)
df_car_id_3.to_csv(DATASETS_DIR + CAR_ID_3_FILE, index=False)
df_car_id_3

Unnamed: 0,latitude,longitude,car_id,timestamp
436965,32.119,34.820,3,2018-07-14 00:02:13+00:00
468927,32.119,34.820,3,2018-07-14 09:08:08+00:00
424428,32.076,34.772,3,2018-07-14 09:33:23+00:00
424428,32.076,34.772,3,2018-07-14 09:33:23+00:00
419997,32.076,34.771,3,2018-07-14 09:38:25+00:00
...,...,...,...,...
371981,32.057,34.761,3,2020-02-17 22:24:03+00:00
386164,32.075,34.801,3,2020-02-17 22:57:02+00:00
352305,32.075,34.801,3,2020-02-17 23:57:02+00:00
896,32.114,34.828,3,2020-02-24 21:03:02+00:00


## Show on map

In [5]:
import folium 


def show_on_map(df):
    cars_map = folium.Map([32.09, 34.8], zoom_start=13, tiles="Stamen toner")
    for latitude, longitude, car_id in zip(df.latitude, df.longitude, df.car_id):
        folium.CircleMarker(
            [latitude, longitude],radius=10, color=None,
            fill_color='red',fill_opacity=0.3,
            tooltip="car_id : "+str(car_id),
        ).add_to(cars_map)
    return cars_map


show_on_map(df_car_id_3)