In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.parquet as pq
import os

In [22]:
import glob

# Directory holding the input parquet files, relative to the notebook's
# working directory. Update if the data lives elsewhere.
parquet_dir = "./parquet"

# Find all .parquet files in the directory; sorted so the concatenation order
# (and therefore the resulting row order) is deterministic across runs.
parquet_files = sorted(glob.glob(os.path.join(parquet_dir, "*.parquet")))

print(f"Found {len(parquet_files)} Parquet files.")

# Read and concatenate all parquet files into a single DataFrame
if parquet_files:
    df_all = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)
    # Show basic info
    df_all.info()
    # A bare `df_all.head()` here is not the cell's last top-level expression
    # (the `if` statement is), so Jupyter would silently discard it. Render it
    # explicitly instead.
    display(df_all.head())
else:
    print("No Parquet files found in the directory.")
    # Keep the name defined so the variable exists even without input data.
    df_all = pd.DataFrame()


Found 40 Parquet files.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39775 entries, 0 to 39774
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mf_bytes     39775 non-null  object 
 1   TimeStamp    39775 non-null  float64
 2   MessageType  39775 non-null  object 
 3   Geohash      39775 non-null  object 
 4   Latitude     39775 non-null  float64
 5   Longitude    39775 non-null  float64
dtypes: float64(3), object(3)
memory usage: 1.8+ MB


In [29]:
import pandas as pd

# Convert the float timestamp (interpreted as seconds, see `unit='s'`) to datetime
df_all['Time'] = pd.to_datetime(df_all['TimeStamp'], unit='s')

# Truncate each timestamp down to the start of its 1-minute bucket. Note this
# is a floor, not a round-to-nearest. The lowercase 'min' alias is the
# canonical spelling; change to '30s', '5min', etc. as needed.
df_all['TimeBucket'] = df_all['Time'].dt.floor('1min')

# Count messages per (time bucket, geohash) pair
grouped = df_all.groupby(['TimeBucket', 'Geohash'])

group_sizes = grouped.size().reset_index(name='Count')
# Ten busiest (bucket, geohash) combinations
group_sizes.sort_values('Count', ascending=False).head(10)



Unnamed: 0,TimeBucket,Geohash,Count
7136,2025-08-06 00:07:00,9t9p75yd,108
6420,2025-08-05 23:03:00,9t9p75y9,56
7097,2025-08-06 00:05:00,9t9p75yd,43
7109,2025-08-06 00:06:00,9t9p75yd,34
6944,2025-08-05 23:29:00,9tbq8c1g,30
6736,2025-08-05 23:25:00,9tbq2v6h,30
6891,2025-08-05 23:28:00,9tbq8b1c,30
6987,2025-08-05 23:30:00,9tbq8ccy,28
6534,2025-08-05 23:07:00,9tcj2hb8,20
7642,2025-08-06 00:59:00,9tcj2hb8,20


In [38]:
# Geohash cells selected for closer inspection
focus_geohashes = ['9tbq2v6h', '9tbq8b1c', '9tbq8c1g', '9tbq8ccy']
df_focus = df_all.loc[df_all['Geohash'].isin(focus_geohashes)]

print("Message count per geohash:")
print(df_focus['Geohash'].value_counts())

# Order rows by geohash, then chronologically, and renumber them from zero so
# the displayed table reads top to bottom.
df_focus_sorted = (
    df_focus
    .sort_values(by=["Geohash", "Time"])
    .reset_index(drop=True)
)
df_focus_sorted



Message count per geohash:
Geohash
9tbq8b1c    36
9tbq2v6h    30
9tbq8c1g    30
9tbq8ccy    28
Name: count, dtype: int64


Unnamed: 0,mf_bytes,TimeStamp,MessageType,Geohash,Latitude,Longitude,Time,TimeBucket
0,b'\x00\x14%\x01\xfd\xa8\xa1\x07\xc1\x03d\xcc)\...,1.754436e+09,BasicSafetyMessage,9tbq2v6h,33.471927,-112.112441,2025-08-05 23:25:01.161494016,2025-08-05 23:25:00
1,b'\x00\x14%\x01\xfd\xa8\xa1\x07\xc1\x03d\xcc)\...,1.754436e+09,BasicSafetyMessage,9tbq2v6h,33.471927,-112.112441,2025-08-05 23:25:01.162800128,2025-08-05 23:25:00
2,b'\x00\x14%\x02=\xa8\xa1\x07\xc1\xfe$\xcc)\xfe...,1.754436e+09,BasicSafetyMessage,9tbq2v6h,33.471974,-112.112455,2025-08-05 23:25:02.156740864,2025-08-05 23:25:00
3,b'\x00\x14%\x02=\xa8\xa1\x07\xc1\xfe$\xcc)\xfe...,1.754436e+09,BasicSafetyMessage,9tbq2v6h,33.471974,-112.112455,2025-08-05 23:25:02.158128128,2025-08-05 23:25:00
4,b'\x00\x14%\x02}\xa8\xa1\x07\xc2\xf8$\xcc*\x87...,1.754436e+09,BasicSafetyMessage,9tbq2v6h,33.472001,-112.112463,2025-08-05 23:25:03.156142080,2025-08-05 23:25:00
...,...,...,...,...,...,...,...,...
119,b'\x00\x14%\x1a\xbd\xa8\xa1\x07\xe6 \xe4\xce\x...,1.754437e+09,BasicSafetyMessage,9tbq8ccy,33.497114,-112.112921,2025-08-05 23:30:39.179267840,2025-08-05 23:30:00
120,b'\x00\x14%\x1a\xfd\xa8\xa1\x07\xe7\x1a\xe4\xc...,1.754437e+09,BasicSafetyMessage,9tbq8ccy,33.497118,-112.112920,2025-08-05 23:30:40.167043840,2025-08-05 23:30:00
121,b'\x00\x14%\x1a\xfd\xa8\xa1\x07\xe7\x1a\xe4\xc...,1.754437e+09,BasicSafetyMessage,9tbq8ccy,33.497118,-112.112920,2025-08-05 23:30:40.168000000,2025-08-05 23:30:00
122,"b""\x00\x14%\x1b=\xa8\xa1\x07\xe8\x14d\xce\x15i...",1.754437e+09,BasicSafetyMessage,9tbq8ccy,33.497135,-112.112918,2025-08-05 23:30:41.141488128,2025-08-05 23:30:00


In [61]:
import os
import subprocess
import pandas as pd
from pathlib import Path

# 1. Detect repo root and set decoder path RELATIVE to repo root
notebook_dir = Path.cwd()

# Take the nearest directory that contains a `.git` entry, starting with the
# working directory itself and then walking upwards. `Path.parents` does not
# include the path itself, so the starting directory must be checked
# explicitly; otherwise a notebook run from the root of a nested checkout
# would resolve to the enclosing repository instead.
repo_root = next(
    (
        candidate
        for candidate in (notebook_dir, *notebook_dir.parents)
        if (candidate / ".git").exists()
    ),
    notebook_dir,  # fallback if not using git
)

DECODER_PATH = repo_root / "libsm/b2v-libsm/build/bin/decodeToJER"
print("Detected repo root:", repo_root)
print("Decoder Path:", DECODER_PATH)
# Fail fast here rather than once per row when the decoder is invoked later.
if not DECODER_PATH.exists():
    raise FileNotFoundError(f"decodeToJER not found at {DECODER_PATH}")

# 2. Restrict the data to the geohash cells under investigation
focus_geohashes = ['9tbq2v6h', '9tbq8b1c', '9tbq8c1g', '9tbq8ccy']
# Take an explicit copy so columns added to the subset later do not touch
# (or warn about) the full frame.
df_focus = df_all.loc[df_all['Geohash'].isin(focus_geohashes)].copy()
print(f"BSMs in focus: {len(df_focus)}")

# 3. Convert mf_bytes to hex
def mf_bytes_to_hex(val):
    """Return the hex encoding of a raw message payload.

    Parameters
    ----------
    val : bytes, bytearray, str, or anything else
        Either the payload itself, or the textual repr of a bytes literal
        (``"b'...'"`` or ``'b"..."'``), as produced when a bytes column has
        been stringified.

    Returns
    -------
    str or None
        Lowercase hex string of the payload, or ``None`` when ``val`` is not a
        recognised payload form or is a malformed bytes literal.
    """
    import ast  # local import keeps this definition self-contained

    if isinstance(val, (bytes, bytearray)):
        return val.hex()
    # repr() of bytes switches to double quotes when the payload contains a
    # single quote, so both prefixes must be accepted.
    if isinstance(val, str) and val.startswith(("b'", 'b"')):
        # literal_eval only parses literals; unlike eval() it cannot execute
        # code that happens to be embedded in the data file.
        try:
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError):
            return None
        if isinstance(parsed, bytes):
            return parsed.hex()
    return None

# Hex-encode every payload; entries that cannot be converted become None.
df_focus["mf_hex"] = df_focus["mf_bytes"].map(mf_bytes_to_hex)

# 4. Decode each BSM using the C binary
def decode_bsm_hex(hex_str):
    """Decode one hex-encoded message by invoking the external decoder binary.

    Runs ``DECODER_PATH -i <hex_str>`` with a 3 second timeout and returns
    whatever the process wrote to stdout.

    Parameters
    ----------
    hex_str : str
        Hex string passed to the decoder's ``-i`` option.

    Returns
    -------
    str or None
        The decoder's stdout, or ``None`` if there was no usable input or the
        decoder could not be run, timed out, or exited non-zero. Failures are
        reported with a ``[DecodeError]`` line; nothing is raised, so a single
        bad row does not abort a column-wide apply.
    """
    # Rows whose payload could not be hex-encoded arrive as None/NaN. Putting
    # them into the argv list would only yield an opaque TypeError, so report
    # them directly and skip the subprocess call.
    if not isinstance(hex_str, str) or not hex_str:
        print(f"[DecodeError] no hex payload to decode: {hex_str!r}")
        return None
    try:
        result = subprocess.run(
            [str(DECODER_PATH), "-i", hex_str],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            text=True,
            timeout=3,
        )
    except subprocess.CalledProcessError as e:
        # Include the decoder's own diagnostics; the exception text alone only
        # carries the command line and exit status.
        print(f"[DecodeError] {e}: {(e.stderr or '').strip()}")
        return None
    except Exception as e:
        # Best effort: timeouts, a missing binary, etc. must not stop the run.
        print(f"[DecodeError] {e}")
        return None
    return result.stdout

# Run the decoder once per row; rows that fail to decode hold None.
df_focus["jer"] = df_focus["mf_hex"].map(decode_bsm_hex)
print("Decoded BSMs:", df_focus["jer"].notna().sum())

# 5. (Optional) Print the first few successfully decoded outputs for inspection
for decoded in df_focus["jer"].dropna().iloc[:3]:
    print(decoded)


Detected repo root: /home/mj/driveAZ_Analytics
Decoder Path: /home/mj/driveAZ_Analytics/libsm/b2v-libsm/build/bin/decodeToJER
BSMs in focus: 124
Decoded BSMs: 124
{
    "messageId": 20,
    "value": {
        "BasicSafetyMessage": {
            "coreData": {
                "msgCnt": 7,
                "id": "F6A2841F",
                "secMark": 1037,
                "lat": 334719271,
                "long": -1121124409,
                "elev": 3355,
                "accuracy": {
                    "semiMajor": 255,
                    "semiMinor": 255,
                    "orientation": 65535
                },
                "transmission": "unavailable",
                "speed": 365,
                "heading": 27871,
                "angle": 127,
                "accelSet": {
                    "long": 30,
                    "lat": 5,
                    "vert": -4,
                    "yaw": 212
                },
                "brakes": {
                    "wheelBrakes": 

In [62]:
import json
import numpy as np

# 1. Extract id and secMark from the decoded JER string (for each row)
def extract_id_secmark(jer_str):
    """Pull ``id`` and ``secMark`` out of one decoded message.

    ``jer_str`` is parsed as JSON and the two fields are looked up under
    ``value -> BasicSafetyMessage -> coreData``.

    Returns
    -------
    tuple
        ``(id, secMark)``; a field missing from ``coreData`` is ``None``.
        ``(None, None)`` is returned when the input cannot be parsed or does
        not have the expected nesting.
    """
    try:
        core_data = json.loads(jer_str)["value"]["BasicSafetyMessage"]["coreData"]
        return tuple(core_data.get(field) for field in ("id", "secMark"))
    except Exception:
        # Any parse or lookup failure counts as "no data" for this row.
        return None, None

# Expand the (id, secMark) pair returned for each row into two new columns
df_focus[["bsm_id", "bsm_secMark"]] = df_focus["jer"].apply(
    lambda jer_str: pd.Series(extract_id_secmark(jer_str))
)

# 2. Find messages that share an id with the message received just before them
# Ordering by id, then receive time, puts each id's messages next to each other
df_focus_sorted = df_focus.sort_values(["bsm_id", "TimeStamp"])

# Receive time of the previous message with the same id (NaN for the first
# message of each id), and the gap to it in seconds
df_focus_sorted = df_focus_sorted.assign(
    prev_TimeStamp=df_focus_sorted.groupby("bsm_id")["TimeStamp"].shift(1),
)
df_focus_sorted = df_focus_sorted.assign(
    dt_sec=df_focus_sorted["TimeStamp"] - df_focus_sorted["prev_TimeStamp"],
)

# Keep rows that have a predecessor and follow it by less than `threshold` seconds
threshold = 2
nearby = df_focus_sorted.loc[
    df_focus_sorted["dt_sec"].notna() & df_focus_sorted["dt_sec"].lt(threshold)
]

print(f"BSMs with repeated id within {threshold} seconds:")
display(
    nearby.loc[:, ["bsm_id", "TimeStamp", "dt_sec", "Geohash", "Latitude", "Longitude"]].head(10)
)


BSMs with repeated id within 2 seconds:


Unnamed: 0,bsm_id,TimeStamp,dt_sec,Geohash,Latitude,Longitude
22436,F6A2841F,1754436000.0,0.001306,9tbq2v6h,33.471927,-112.112441
22439,F6A2841F,1754436000.0,0.993941,9tbq2v6h,33.471974,-112.112455
22440,F6A2841F,1754436000.0,0.001387,9tbq2v6h,33.471974,-112.112455
22443,F6A2841F,1754436000.0,0.998014,9tbq2v6h,33.472001,-112.112463
22444,F6A2841F,1754436000.0,0.00098,9tbq2v6h,33.472001,-112.112463
22447,F6A2841F,1754436000.0,0.998963,9tbq2v6h,33.472012,-112.112466
22448,F6A2841F,1754436000.0,0.000914,9tbq2v6h,33.472012,-112.112466
22451,F6A2841F,1754436000.0,1.019319,9tbq2v6h,33.472013,-112.112466
22452,F6A2841F,1754436000.0,0.001053,9tbq2v6h,33.472013,-112.112466
22455,F6A2841F,1754436000.0,0.978684,9tbq2v6h,33.472012,-112.112466


In [None]:
import plotly.express as px

# Ensure Time is datetime and truncate it to whole seconds, so that all
# messages received within the same second share one animation frame.
# Lowercase "s" is the canonical alias; the uppercase "S" spelling is deprecated.
df_focus_sorted["Time"] = pd.to_datetime(df_focus_sorted["Time"])
df_focus_sorted["Time_sec"] = df_focus_sorted["Time"].dt.floor("s")

# Convert bsm_id to string so plotly colours it as a discrete category
df_focus_sorted["bsm_id"] = df_focus_sorted["bsm_id"].astype(str)

# Sort by time so animation frames are created in chronological order
df_anim = df_focus_sorted.sort_values("Time_sec")

fig = px.scatter_map(
    df_anim,
    lat="Latitude",
    lon="Longitude",
    color="bsm_id",      # Color by vehicle
    # One frame per second, labelled with a readable timestamp string
    animation_frame=df_anim["Time_sec"].dt.strftime('%Y-%m-%d %H:%M:%S'),
    hover_name="bsm_id",
    zoom=12,
    height=600,
    map_style="open-street-map"
)

fig.update_layout(
    title="BSM Movement by Vehicle ID (Per Second)",
    margin={"r":0, "t":30, "l":0, "b":0},
)
fig.show()



*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/

