In [88]:
# Test the distance covered correction method

In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pydeck as pdk

In [90]:
# Load the recorded path into a pandas DataFrame
# (the file is tab-separated despite its .csv extension, hence sep="\t")
df = pd.read_csv("./data/path.csv", sep="\t")

In [91]:
# Sort by time ascending, so that consecutive rows are consecutive in time
# (the plotting and distance code below works on pairs of adjacent rows)
df = df.sort_values(by="time")

In [92]:
# Rename latitude and longitude columns to the shorter "lat" / "lon"
# names that the rest of the notebook refers to
df = df.rename(columns={"latitude": "lat", "longitude": "lon"})

In [93]:
# Summary statistics of the numeric columns, as a quick sanity check of the loaded data
df.describe()

Unnamed: 0,idx,moveId,time,lat,lon,accuracy,speed,segmentLength,segmentSpeed,closeToRail,closeToTram,closeToBus,onWater,isPowerSaving
count,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0
mean,51.5,1.0,1740065000.0,46.883868,7.295588,217.524939,6.760323,683.520833,20.785245,0.365766,4.812979,1.576886,0.020833,0.0
std,14.0,0.0,513.2408,0.037151,0.080498,422.29731,10.433594,553.863375,19.79511,0.417863,0.906458,1.930658,0.144338,0.0
min,28.0,1.0,1740064000.0,46.802477,7.150853,10.0,0.0,0.0,0.0,0.000666,0.466773,0.021098,0.0,0.0
25%,39.75,1.0,1740065000.0,46.863601,7.22785,40.375,0.0,151.75,6.277095,0.058591,5.0,0.333431,0.0,0.0
50%,51.5,1.0,1740065000.0,46.886758,7.313016,98.400002,0.594964,601.0,18.605471,0.18584,5.0,0.520284,0.0,0.0
75%,63.25,1.0,1740066000.0,46.912727,7.361409,100.0,12.092499,1080.0,29.105625,0.549612,5.0,1.689202,0.0,0.0
max,75.0,1.0,1740066000.0,46.944105,7.405831,1799.999023,33.360001,2107.0,107.210121,1.772755,5.0,5.0,1.0,0.0


In [108]:
# Compute distance using Haversine formula
def compute_distance(old_location, new_location):
    """Return the great-circle distance in meters between two locations.

    Uses the Haversine formula on a sphere of radius 6,371 km.

    Parameters
    ----------
    old_location, new_location : (lat, lon)
        Coordinates in decimal degrees. Each component may be a scalar or an
        array; arrays are processed element-wise.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) in meters.
    """
    R = 6_371_000  # Radius of the Earth in meters
    lat1, lon1 = np.radians(old_location)
    lat2, lon2 = np.radians(new_location)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

In [160]:
# Plot consecutive points as lines on a map, with pydeck (in the notebook!)
# Build paths from consecutive points
def plot_paths(df, lon, lat, color=[255, 0, 0]):
    """Draw the rows of ``df`` as a chain of line segments on a pydeck map.

    Every pair of consecutive rows becomes one two-point path, so each
    segment can be picked and highlighted individually.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the positions, one row per point, already in the
        order in which the points should be connected.
    lon, lat : str
        Names of the longitude and latitude columns of ``df``.
    color : list
        Colour passed to the layer as ``get_color``.

    Returns
    -------
    pydeck.Deck
        Deck with a single PathLayer, centred on the mean position.
    """
    lon_values = df[lon]
    lat_values = df[lat]

    # One entry per segment: [start point, end point], each as [lon, lat]
    segments = [
        {
            "path": [
                [lon_values.iloc[start], lat_values.iloc[start]],
                [lon_values.iloc[start + 1], lat_values.iloc[start + 1]],
            ],
            "name": f"Path {start}",
        }
        for start in range(len(df) - 1)
    ]

    path_layer = pdk.Layer(
        "PathLayer",
        data=segments,
        get_path="path",
        get_color=color,
        get_width=5,
        pickable=True,
        auto_highlight=True,
    )

    # Centre the camera on the average position of all points
    view_state = pdk.ViewState(
        latitude=lat_values.mean(),
        longitude=lon_values.mean(),
        zoom=12,
        pitch=20,
    )

    return pdk.Deck(path_layer, initial_view_state=view_state)

In [161]:
# Plot original paths (raw recorded positions, drawn in plot_paths' default colour)
plot_paths(df, "lon", "lat").show()

In [162]:
# Compute total distance as sum of segmentLength
# (the uncorrected baseline that the cleaned distance is compared against)
print("Total distance: ", df["segmentLength"].sum())

Total distance:  32809


In [163]:
# Clean distance covered by vehicle
def clean_distance(df, vehicle="TRAIN", window_size=1, threshold=0.8, min_distance=1):
    """Estimate the distance covered using only points close to the vehicle's network.

    Rows whose proximity value for ``vehicle`` is below ``threshold`` are kept,
    their coordinates are smoothed with a trailing rolling mean, the first and
    last rows of ``df`` are re-attached unsmoothed, and the distances between
    consecutive points are summed with ``compute_distance``.

    Parameters
    ----------
    df : pandas.DataFrame
        Points ordered by time. Must contain ``lat``, ``lon``,
        ``segmentLength`` and the proximity column for ``vehicle``.
    vehicle : {"TRAIN", "TRAM", "BUS"}
        Selects the proximity column: ``closeToRail``, ``closeToTram`` or
        ``closeToBus`` respectively.
    window_size : int
        Number of rows in the rolling mean applied to the kept coordinates.
    threshold : float
        Rows are kept when their proximity value is strictly below this.
    min_distance : float
        If the computed distance is below this value, the sum of
        ``segmentLength`` is returned instead.

    Returns
    -------
    (total_distance, df_c)
        ``df_c`` always has ``lat_avg`` / ``lon_avg`` columns. When no row
        passes the filter, ``df_c`` is empty and ``total_distance`` is the sum
        of ``segmentLength``.

    Raises
    ------
    ValueError
        If ``vehicle`` is not one of the supported types.
    """
    proximity_columns = {
        "TRAIN": "closeToRail",
        "TRAM": "closeToTram",
        "BUS": "closeToBus",
    }
    if vehicle not in proximity_columns:
        raise ValueError(
            f"Unknown vehicle {vehicle!r}; expected one of {sorted(proximity_columns)}"
        )

    original_distance = df["segmentLength"].sum()

    # Filter activities based on vehicle type and threshold
    df_c = df[df[proximity_columns[vehicle]] < threshold].copy()

    if df_c.empty:
        # Nothing usable: fall back to the original distance, but keep the
        # returned frame's columns consistent with the non-empty case.
        df_c["lat_avg"] = df_c["lat"]
        df_c["lon_avg"] = df_c["lon"]
        return original_distance, df_c

    # Apply a moving window to reduce variations in the data
    df_c["lat_avg"] = df_c["lat"].rolling(window=window_size, min_periods=1).mean()
    df_c["lon_avg"] = df_c["lon"].rolling(window=window_size, min_periods=1).mean()

    # Anchor the path on the true start and end of the trip: the first and
    # last rows of df are re-attached with their original (unsmoothed)
    # coordinates. This is unconditional, so an endpoint that survived the
    # filter appears twice.
    df_initial = df.iloc[[0]].copy()
    df_initial["lat_avg"] = df_initial["lat"]
    df_initial["lon_avg"] = df_initial["lon"]

    df_final = df.iloc[[-1]].copy()
    df_final["lat_avg"] = df_final["lat"]
    df_final["lon_avg"] = df_final["lon"]

    df_c = pd.concat([df_initial, df_c, df_final])

    # Compute the distance between consecutive averaged locations.
    # Plain arrays avoid building a row Series for every lookup.
    lats = df_c["lat_avg"].to_numpy()
    lons = df_c["lon_avg"].to_numpy()
    total_distance = 0
    for i in range(1, len(df_c)):
        old_location = (lats[i - 1], lons[i - 1])
        new_location = (lats[i], lons[i])
        total_distance += compute_distance(old_location, new_location)

    # If the distance is too short, return the original distance
    if total_distance < min_distance:
        total_distance = original_distance

    return total_distance, df_c


In [185]:
# Ground truth the corrected estimates are ranked against
real_distance = 28_100

# Grid search: run the correction for every (window size, threshold) pair
results = {}
for window_size in (1, 2, 3, 4):
    for threshold in (0.5, 0.8, 1, 1.5):
        key = f"w{window_size}t{threshold}"
        results[key] = clean_distance(
            df,
            vehicle="TRAIN",
            window_size=window_size,
            threshold=threshold,
        )

# Rank every configuration by its absolute error and print the three closest
ranked = sorted(results.items(), key=lambda item: abs(item[1][0] - real_distance))
best_results = ranked[:3]
for name, (distance, df_c) in best_results:
    print(name, distance)

w2t1 28445.795879515437
w2t1.5 28504.321543984846
w1t0.8 27540.755797877926


In [186]:
# Get the best result (first entry of best_results) and its cleaned frame
best_result = best_results[0]
df_c = best_result[1][1]


def _consecutive_path_layer(frame, lon, lat, label, color):
    """Build a PathLayer with one two-point path per pair of consecutive rows of ``frame``."""
    segments = [
        {
            "path": [
                [frame[lon].iloc[i], frame[lat].iloc[i]],
                [frame[lon].iloc[i + 1], frame[lat].iloc[i + 1]],
            ],
            "name": f"{label} Path {i}",
        }
        for i in range(len(frame) - 1)
    ]
    return pdk.Layer(
        "PathLayer",
        data=segments,
        get_path="path",
        get_color=color,
        get_width=5,
        pickable=True,
        auto_highlight=True,
    )


# Plot the best result and the original on the same map:
# original track in red, cleaned track in green
layers = [
    _consecutive_path_layer(df, "lon", "lat", "Original", [255, 0, 0]),
    _consecutive_path_layer(df_c, "lon_avg", "lat_avg", "Cleaned", [0, 255, 0]),
]

# Create a deck with both path layers, centred on the mean original position
view_state = pdk.ViewState(
    latitude=df["lat"].mean(),
    longitude=df["lon"].mean(),
    zoom=12,
    pitch=20,
)
deck = pdk.Deck(layers, initial_view_state=view_state)

deck.show()

In [None]:
# Plot corrected paths on their own, using the smoothed lon_avg / lat_avg
# coordinates of the cleaned frame selected above
plot_paths(df_c, "lon_avg", "lat_avg").show()