In [2]:
import geopandas as gpd

# Load the point layer from the GeoPackage into a GeoDataFrame.
# No layer name is passed, so the reader's default layer selection applies.
points = gpd.read_file("points.gpkg")

In [3]:
import re

def to_snake_case(s):
    """Normalize a label to a lowercase, underscore-separated identifier.

    Steps, in order:
      1. trim leading/trailing whitespace and lowercase the text,
      2. collapse every run of spaces and/or hyphens into a single underscore,
      3. drop every remaining character that is neither a word character
         (letters, digits, underscore) nor a hyphen.

    Args:
        s: The string to normalize.

    Returns:
        The normalized string.
    """
    normalized = s.strip().lower()
    # Runs of spaces/hyphens become one underscore each.
    underscored = re.sub(r'[ -]+', '_', normalized)
    # Punctuation and other non-word characters are removed entirely.
    return re.sub(r'[^\w-]', '', underscored)

# Rename every column of `points` to its snake_case form (order is preserved).
points.columns = list(map(to_snake_case, points.columns))

In [4]:
import pandas as pd

# Make rows that share the same (trajectoryid, epoch) distinguishable in time:
#   1. sort by trajectoryid, epoch, objectid and renumber the index,
#   2. rank rows inside each (trajectoryid, epoch) group (0, 1, 2, ...) and
#      turn the rank into an offset of 100 ms per rank,
#   3. add that offset to epoch (the first row of a group keeps its value),
#   4. drop the temporary offset column again.
points = (
    points
    .sort_values(['trajectoryid', 'epoch', 'objectid'])
    .reset_index(drop=True)
    .assign(ms_offset=lambda df: df.groupby(['trajectoryid', 'epoch']).cumcount() * 100)
    .assign(epoch=lambda df: df['epoch'] + pd.to_timedelta(df['ms_offset'], unit='ms'))
    .drop(columns='ms_offset')
)

In [5]:
# Reproject into EPSG:4326; the result is a separate frame, `points` itself is
# only extended with the two coordinate columns below.
points_wsg84 = points.to_crs(epsg=4326)

# Store the reprojected y coordinate as "lat" and the x coordinate as "lon".
wgs84_geometry = points_wsg84.geometry
points["lat"], points["lon"] = wgs84_geometry.y, wgs84_geometry.x

In [6]:
# Convert trajectoryid to strings and strip a trailing ".0" (left behind when
# the values were stored as floats, e.g. 15419.0 -> "15419.0").
# The pattern is an explicit, anchored regex: a bare '.0' means "any character
# followed by 0" whenever str.replace runs in regex mode, which would corrupt
# IDs such as "16101", and even as a literal it would match anywhere in the
# string rather than only at the end.
points['trajectoryid'] = (
    points['trajectoryid']
    .astype(str)
    .str.replace(r'\.0$', '', regex=True)
)
print("New trajectoryid dtype:", points['trajectoryid'].dtype)
print("\nSample of trajectoryid values:")
print(points['trajectoryid'].head())

New trajectoryid dtype: object

Sample of trajectoryid values:
0    15419
1    15419
2    15419
3    15419
4    15419
Name: trajectoryid, dtype: object


In [7]:
# Report the number of distinct trajectories, then the number of points per trajectory.
trajectory_count = points['trajectoryid'].nunique()
print(f"{trajectory_count} Trajektorien")
print("Datenpunkte pro Trajektorie:")
print(points["trajectoryid"].value_counts())

175 Trajektorien
Datenpunkte pro Trajektorie:
trajectoryid
15728    21114
16442    20582
15433    20311
15419    20106
16653    19793
         ...  
17720       56
16101       40
16471       28
15432        3
17096        2
Name: count, Length: 175, dtype: int64


In [8]:
from math import ceil
from tqdm import tqdm

# Upper bound on the number of points a single (split) trajectory may contain.
MAX_POINTS_PER_SPLIT = 2000


def _balanced_chunk_sizes(n, max_size):
    """Split a count into the fewest chunks of at most `max_size`, as evenly as possible.

    Args:
        n: Total number of items; must be > 0.
        max_size: Maximum number of items per chunk (positive integer).

    Returns:
        A list of chunk sizes that sums to `n`. Sizes differ by at most one,
        with the larger chunks first.
    """
    n_chunks = ceil(n / max_size)
    base, remainder = divmod(n, n_chunks)
    return [base + 1] * remainder + [base] * (n_chunks - remainder)


original_trajectories_count = points['trajectoryid'].nunique()

records = []
for traj_id, group in tqdm(points.groupby('trajectoryid')):
    # Order by objectid because epoch is not strictly monotonically increasing.
    group_sorted = group.sort_values('objectid')
    n = len(group_sorted)
    if n == 0:
        # Guard: an empty group would lead to a division by zero below.
        continue
    sizes = _balanced_chunk_sizes(n, MAX_POINTS_PER_SPLIT)
    start = 0
    for split_i, size in enumerate(sizes):
        end = start + size
        part = group_sorted.iloc[start:end].copy()
        # Only trajectories that are actually split get a "_<index>" suffix.
        part['trajectoryid'] = f"{traj_id}_{split_i}" if len(sizes) > 1 else traj_id
        # Keep the unsplit ID so parts can be traced back to their source trajectory.
        part['original_trajectoryid'] = traj_id
        records.append(part)
        start = end

# concatenate all parts into a new GeoDataFrame (same CRS as the input points)
points_split = gpd.GeoDataFrame(pd.concat(records, ignore_index=True), crs=points.crs)

print(f"Original trajectories: {original_trajectories_count}")
print(f"Split trajectories: {points_split['trajectoryid'].nunique()}")
print("Counts per split:")
print(points_split['trajectoryid'].value_counts())

100%|██████████| 175/175 [00:01<00:00, 156.12it/s]


Original trajectories: 175
Split trajectories: 773
Counts per split:
trajectoryid
16066_0    1999
16066_1    1998
15777_2    1997
15777_1    1997
15777_0    1997
           ... 
17720        56
16101        40
16471        28
15432         3
17096         2
Name: count, Length: 773, dtype: int64


In [9]:
from shapely.geometry import LineString

# Build one geometry per (split) trajectory: a simplified line through its
# points in objectid order, or the bare point when there is only one.
records = []
for traj_id, group in tqdm(points_split.groupby('trajectoryid')):
    ordered_points = list(group.sort_values('objectid').geometry)
    if not ordered_points:
        continue
    if len(ordered_points) > 1:
        vertices = [(pt.x, pt.y) for pt in ordered_points]
        # simplify() is called with a tolerance of 0.1 (in CRS units)
        path_geom = LineString(vertices).simplify(0.1)
    else:
        # a single point cannot form a line; keep it as is (it is buffered below as well)
        path_geom = ordered_points[0]
    records.append({'trajectoryid': traj_id, 'geometry': path_geom})

# One row per trajectory, in the same CRS as the split points
trajectory_lines = gpd.GeoDataFrame(records, crs=points_split.crs)

# Buffered copy of the lines: distance 0.5 in CRS units
# (meters, given that the CRS is EPSG:31256 as noted by the original author)
trajectories = trajectory_lines.copy()
buffered_geometry = trajectories.geometry.buffer(0.5, cap_style=3, join_style=2)
trajectories['geometry'] = buffered_geometry

print("Trajectories (lines):", len(trajectory_lines))
print("Buffered geometries:", len(trajectories))

# Show the first rows of both frames as the cell output
trajectory_lines.head(), trajectories.head()

100%|██████████| 773/773 [00:21<00:00, 36.71it/s]


Trajectories (lines): 773
Buffered geometries: 773


(  trajectoryid                                           geometry
 0      15419_0  LINESTRING (-601.117 337164.375, -639.231 3371...
 1      15419_1  LINESTRING (-820.342 337103.818, -793.796 3371...
 2     15419_10  LINESTRING (-474.25 338311, -465.951 338307.56...
 3      15419_2  LINESTRING (-972.725 337357.733, -960.77 33735...
 4      15419_3  LINESTRING (-993.78 336955.96, -988.094 336953...,
   trajectoryid                                           geometry
 0      15419_0  POLYGON ((-2490.5 337001.286, -2492.318 337002...
 1      15419_1  POLYGON ((-793.884 337109.27, -764.303 337114....
 2     15419_10  POLYGON ((-942.273 338396.614, -943.466 338392...
 3      15419_2  POLYGON ((-1413.771 337354.674, -1413.757 3373...
 4      15419_3  POLYGON ((-987.913 336954.407, -979.113 336950...)

In [10]:
# Bookkeeping columns and their initial values, inserted at positions 1..6
# (i.e. directly after the first column) in this order.
status_columns = [
    ('download_id', None),
    ('download_expires_at', None),
    ('is_sensor1_completed', False),
    ('is_sensor2_completed', False),
    ('is_sensor3_completed', False),
    ('is_sensor4_completed', False),
]
for position, (column, default) in enumerate(status_columns, start=1):
    # DataFrame.insert raises ValueError when the column already exists;
    # skipping existing columns keeps this cell safe to re-run.
    if column not in trajectories.columns:
        trajectories.insert(position, column, default)

In [11]:
# Write the processed points as a named layer of a GeoPackage.
# NOTE(review): "points.gpkg" is also the path this notebook reads its input
# from, so this overwrites the source data — confirm that this is intended.
points.to_file(
    "points.gpkg",
    driver="GPKG",
    layer="kappazunder_image_punkte__ogdwienkappazunderimagepogd",
    index=False,
)

# Write the buffered trajectories (with their status columns) to a separate GeoPackage.
trajectories.to_file(
    "trajectories.gpkg",
    driver="GPKG",
    index=False,
)