In [1]:
pip install numpy soundfile scipy



In [3]:
from google.colab import files

# Open the Colab upload widget; the result is a mapping keyed by uploaded filename.
uploaded = files.upload()
if not uploaded:
    # An empty/cancelled upload would otherwise surface as an opaque IndexError.
    raise RuntimeError('No file was uploaded; re-run this cell and choose a GPX file.')
# The first uploaded file is used as the GPX input by the cells below.
gpx_path = next(iter(uploaded))

Saving vnexpress.gpx to vnexpress.gpx


In [6]:
"""

# --- Google Colab Support ---
# If running in Google Colab, you can upload a GPX file interactively by running:
#
# from google.colab import files
# uploaded = files.upload()
# gpx_path = list(uploaded.keys())[0]
# !python reconstruct_footfalls.py "$gpx_path" output.wav
#
# Then download output.wav using:
# files.download('output.wav')

reconstruct_footfalls.py

Reconstruct pleasant-sounding footfall audio from a GPX file.

Usage:
    python reconstruct_footfalls.py input.gpx output.wav

What it does (short):
- Parses a GPX track (lat, lon, time). If fewer than two points carry timestamps, the points are spread evenly across 60 s.
- Computes ground speed and maps it onto a step rate (cadence) using a simple stride-length model.
- Generates footfall events (left/right alternating) with slight randomness and per-step variation.
- Synthesizes each footfall as the mix of a short filtered noise "impact" and a low-frequency tonal "body" with an exponential decay.
- Pans left/right and applies mild reverberation (simple Schroeder-style reverb) so the result is pleasant.
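- Exports a 60-second stereo WAV (float32). Timestamped tracks are not time-compressed, so only the first 60 s of a longer track are rendered.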

Dependencies:
    numpy, soundfile, scipy

"""

import sys
import xml.etree.ElementTree as ET
from datetime import datetime
import numpy as np
import soundfile as sf
from scipy.signal import lfilter, butter

SAMPLE_RATE = 44100
DURATION = 60.0  # seconds

# ---------- utilities ----------

def haversine_meters(lat1, lon1, lat2, lon2):
    """Great-circle distance in meters between two lat/lon positions.

    All four arguments are in degrees and may be scalars or NumPy arrays
    (evaluated elementwise). The haversine formula is applied on a sphere
    of radius 6,371,000 m.
    """
    earth_radius_m = 6371000.0
    lat1_rad, lat2_rad = np.radians(lat1), np.radians(lat2)
    half_dlat = np.radians(lat2 - lat1) / 2.0
    half_dlon = np.radians(lon2 - lon1) / 2.0
    # "hav" is the haversine of the central angle between the two points.
    hav = np.sin(half_dlat) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon) ** 2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    return earth_radius_m * central_angle


def parse_gpx(gpx_path):
    """Read the points of a GPX file as ``(lat, lon, time_or_None)`` tuples.

    Track points (``trk/trkseg/trkpt``) are preferred; if the file has none,
    route points (``rtept``) are used instead. Element lookups use the ``{*}``
    namespace wildcard, so any GPX namespace (or none) is accepted.

    Args:
        gpx_path: path (or file object) accepted by ``ElementTree.parse``.

    Returns:
        List of ``(lat, lon, t)`` tuples where ``lat``/``lon`` are floats in
        degrees and ``t`` is a timezone-aware ``datetime`` or ``None`` when
        the point has no parseable ``<time>`` child. Timestamps without an
        explicit offset are assumed to be UTC so that every returned datetime
        can be subtracted from every other one.

    Raises:
        ValueError: if no usable track or route point is found.
    """
    # Local import: the file header only imports the `datetime` class.
    from datetime import timezone

    fallback_formats = (
        '%Y-%m-%dT%H:%M:%S.%f%z',
        '%Y-%m-%dT%H:%M:%S%z',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%S',
    )

    def _parse_time(raw):
        """Parse an ISO-8601-style timestamp string; return an aware datetime or None."""
        text = (raw or '').strip()  # XML text is often padded with newlines/indentation
        if not text:
            return None
        if text[-1] in 'Zz':
            # fromisoformat() on older Pythons does not understand the 'Z' suffix
            text = text[:-1] + '+00:00'
        parsed = None
        try:
            parsed = datetime.fromisoformat(text)
        except ValueError:
            # e.g. 1-2 fractional digits, which older fromisoformat() rejects
            for fmt in fallback_formats:
                try:
                    parsed = datetime.strptime(text, fmt)
                    break
                except ValueError:
                    continue
        if parsed is None:
            return None
        if parsed.tzinfo is None:
            # Mixing naive and aware datetimes raises TypeError on subtraction
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    def _to_point(node):
        """Convert a trkpt/rtept element to (lat, lon, time); None if coordinates are unusable."""
        try:
            lat = float(node.attrib['lat'])
            lon = float(node.attrib['lon'])
        except (KeyError, ValueError):
            return None
        time_node = node.find('{*}time')
        stamp = _parse_time(time_node.text) if time_node is not None else None
        return (lat, lon, stamp)

    root = ET.parse(gpx_path).getroot()

    points = []  # list of (lat, lon, time_or_None)
    for trk in root.findall('.//{*}trk'):
        for trkseg in trk.findall('.//{*}trkseg'):
            for trkpt in trkseg.findall('.//{*}trkpt'):
                point = _to_point(trkpt)
                if point is not None:
                    points.append(point)

    # fallback: some files only carry route points (rte/rtept)
    if not points:
        for rtept in root.findall('.//{*}rtept'):
            point = _to_point(rtept)
            if point is not None:
                points.append(point)

    if not points:
        raise ValueError('No track points found in GPX file')
    return points


# ---------- mapping GPX to timeline ----------

def compute_speeds_and_times(points):
    """Derive a relative timeline and per-point ground speed from GPX points.

    Args:
        points: non-empty list of ``(lat, lon, datetime_or_None)`` tuples.

    Returns:
        ``(timeline_seconds, speeds, source_duration)`` where
        ``timeline_seconds`` is an array of seconds per point,
        ``speeds`` is the speed in m/s arriving at each point (0 for the
        first point), and ``source_duration`` is the last timeline value.

        * With at least two timestamped points, times are seconds since the
          first timestamp; points without a timestamp are filled in by linear
          interpolation over the point index.
        * Otherwise the points are spread evenly across ``DURATION`` seconds.

    Raises:
        ValueError: if ``points`` is empty.
    """
    # Local import: the file header only imports the `datetime` class.
    from datetime import timezone

    n = len(points)
    if n == 0:
        raise ValueError('No track points supplied')

    lats = np.array([p[0] for p in points], dtype=float)
    lons = np.array([p[1] for p in points], dtype=float)
    times = [p[2] for p in points]

    # distance from the previous point; the first point has no predecessor
    dists = np.zeros(n)
    if n > 1:
        dists[1:] = haversine_meters(lats[:-1], lons[:-1], lats[1:], lons[1:])

    def _as_aware(stamp):
        # Naive and aware datetimes cannot be subtracted; treat naive ones as UTC.
        return stamp if stamp.tzinfo is not None else stamp.replace(tzinfo=timezone.utc)

    provided_idx = [i for i, t in enumerate(times) if t is not None]
    if len(provided_idx) >= 2:
        first_time = _as_aware(times[provided_idx[0]])
        known_secs = np.array(
            [(_as_aware(times[i]) - first_time).total_seconds() for i in provided_idx]
        )
        # fill untimed points by interpolating over the point index
        timeline_seconds = np.interp(np.arange(n), provided_idx, known_secs)
        source_duration = timeline_seconds[-1]
    else:
        # no timestamps or only one timestamp -> map points evenly across DURATION
        if n > 1:
            timeline_seconds = np.arange(n) / (n - 1) * DURATION
        else:
            # a lone point would otherwise compute 0/0 and produce a NaN timeline
            timeline_seconds = np.zeros(1)
        source_duration = min(timeline_seconds[-1], DURATION)

    # time step arriving at each point, floored at 1 ms to avoid division by zero
    dt_secs = np.maximum(np.diff(np.concatenate(([0.0], timeline_seconds))), 1e-3)
    speeds = dists / dt_secs

    # clamp speeds (avoid infinities)
    speeds = np.nan_to_num(speeds, nan=0.0, posinf=0.0, neginf=0.0)
    return timeline_seconds, speeds, source_duration


# ---------- estimate cadence (steps/sec) ----------

def estimate_step_rate(speeds_m_s, stride_length_m=0.78):
    """Convert ground speed (m/s) into cadence (steps/s).

    Cadence is modelled as ``speed / stride_length_m`` and then limited to
    the range 0.4-4.0 steps/s (24-240 steps per minute), so a stationary
    stretch still yields the minimum cadence.
    """
    min_rate, max_rate = 0.4, 4.0
    raw_rates = speeds_m_s / stride_length_m
    return np.minimum(np.maximum(raw_rates, min_rate), max_rate)


# ---------- generate footfall event times ----------

def synthesize_events(timeline_seconds, step_rates):
    """Turn a per-point cadence curve into jittered footfall times.

    The cadence (steps/s) is interpolated onto a 0.1 s grid and accumulated;
    a footfall is emitted each time the running total crosses an integer,
    with its time found by linear interpolation inside the grid interval.
    Each time then receives Gaussian jitter (smaller at higher cadence) and
    is clipped to ``[0, timeline_seconds[-1]]``.

    Returns:
        ``(event_times, sides)``: an array of times in seconds and a matching
        array of ``'L'``/``'R'`` labels alternating from ``'L'``.
    """
    track_end = timeline_seconds[-1]
    step = 0.1  # grid spacing (s) used to integrate the cadence
    grid = np.arange(0.0, track_end + 1e-8, step)
    rate_on_grid = np.interp(grid, timeline_seconds, step_rates)

    # running total of expected steps at each grid time
    cumulative = np.cumsum(rate_on_grid * step)
    # never search for fewer than two steps
    max_step = max(2, int(np.floor(cumulative[-1])))

    crossings = []
    next_step = 1
    for hi in range(1, len(grid)):
        lo = hi - 1
        while next_step <= max_step and cumulative[hi] >= next_step:
            # fraction of the way through [grid[lo], grid[hi]] where the total hits next_step
            frac = ((next_step - cumulative[lo]) / (cumulative[hi] - cumulative[lo] + 1e-12))
            crossings.append(grid[lo] + frac * (grid[hi] - grid[lo]))
            next_step += 1
    event_times = np.array(crossings)

    # no crossing at all: fall back to 30 evenly spaced events
    if event_times.size == 0:
        event_times = np.linspace(0, track_end, 30)

    # per-event timing jitter; its std-dev shrinks as the local cadence rises
    local_rate = np.interp(event_times, timeline_seconds, step_rates)
    sigma = np.clip(0.04 * (1.0 / (1.0 + local_rate)), 0.01, 0.06)
    event_times = event_times + np.random.normal(scale=sigma)
    event_times = np.clip(event_times, 0.0, track_end)

    # even-indexed events are left feet, odd-indexed are right feet
    sides = np.where(np.arange(len(event_times)) % 2 == 0, 'L', 'R')
    return event_times, sides


# ---------- synthesis building blocks (ASPHALT FOOTSTEPS) ----------

# Asphalt footstep model:
# - Sharp heel transient (2–6 kHz band)
# - Mid-frequency slap (300–900 Hz)
# - Low-frequency body thump (70–160 Hz)
# - Optional toe-off transient ~40–60 ms after heel strike
# - Small randomization for realism

import random

def make_asphalt_heel(sr=SAMPLE_RATE):
    """Heel-strike transient: 30 ms of 2-6 kHz band-passed white noise.

    The burst is shaped by an exponential decay (tau = 12 ms) and scaled so
    its absolute peak is ~1.
    """
    nyquist = sr / 2
    length = int(0.03 * sr)
    white = np.random.randn(length)
    num, den = butter(2, [2000/nyquist, 6000/nyquist], btype='band')
    burst = lfilter(num, den, white)
    burst = burst * exponential_decay_envelope(length, tau_seconds=0.012, sr=sr)
    # the small epsilon guards against dividing by zero on a silent burst
    return burst / (np.max(np.abs(burst)) + 1e-12)


def make_asphalt_mid(sr=SAMPLE_RATE):
    """Mid-frequency "slap": 60 ms of 300-900 Hz band-passed white noise.

    The burst is shaped by an exponential decay (tau = 40 ms) and scaled so
    its absolute peak is ~1.
    """
    nyquist = sr / 2
    length = int(0.06 * sr)
    white = np.random.randn(length)
    num, den = butter(2, [300/nyquist, 900/nyquist], btype='band')
    slap = lfilter(num, den, white)
    slap = slap * exponential_decay_envelope(length, tau_seconds=0.04, sr=sr)
    # the small epsilon guards against dividing by zero on a silent burst
    return slap / (np.max(np.abs(slap)) + 1e-12)


def make_asphalt_body(freq=120.0, sr=SAMPLE_RATE):
    """Low "thump": 200 ms decaying sine at ``freq`` plus a partial at 2.1x ``freq``.

    The partial is mixed in at 0.4 amplitude, the sum is shaped by an
    exponential decay (tau = 90 ms) and scaled so its absolute peak is ~1.
    """
    length = int(0.2 * sr)
    seconds = np.arange(length) / sr
    fundamental = np.sin(2*np.pi*freq*seconds)
    partial = 0.4*np.sin(2*np.pi*(freq*2.1)*seconds)
    thump = (fundamental + partial) * exponential_decay_envelope(length, tau_seconds=0.09, sr=sr)
    # the small epsilon guards against dividing by zero on a silent signal
    return thump / (np.max(np.abs(thump)) + 1e-12)


def make_asphalt_toe(sr=SAMPLE_RATE):
    """Toe-off transient: 20 ms of 2.5-7 kHz band-passed white noise.

    The burst is shaped by an exponential decay (tau = 8 ms) and scaled so
    its absolute peak is ~1.
    """
    nyquist = sr / 2
    length = int(0.02 * sr)
    white = np.random.randn(length)
    num, den = butter(2, [2500/nyquist, 7000/nyquist], btype='band')
    click = lfilter(num, den, white)
    click = click * exponential_decay_envelope(length, tau_seconds=0.008, sr=sr)
    # the small epsilon guards against dividing by zero on a silent burst
    return click / (np.max(np.abs(click)) + 1e-12)


def exponential_decay_envelope(length_samples, tau_seconds, sr=SAMPLE_RATE):
    """Return ``exp(-t / tau_seconds)`` sampled at ``sr`` Hz for ``length_samples`` samples.

    The curve is divided by its (epsilon-padded) maximum so it starts at ~1.
    """
    seconds = np.arange(length_samples) / sr
    decay = np.exp(-seconds / tau_seconds)
    # peak is taken after adding a tiny epsilon, which keeps the divisor non-zero
    peak = np.max(np.abs(decay) + 1e-12)
    return decay / peak


def make_impact(duration_s=0.06, sr=SAMPLE_RATE):
    """Footfall "impact": a short burst of 300 Hz-5 kHz band-passed white noise.

    The burst lasts ``duration_s`` seconds, is shaped by an exponential decay
    (tau = 20 ms) and scaled so its absolute peak is ~0.8.
    """
    nyquist = sr / 2
    length = int(duration_s * sr)
    white = np.random.randn(length)
    # band-pass keeps the burst clicky without the harshest highs
    num, den = butter(2, [300.0 / nyquist, 5000.0 / nyquist], btype='band')
    burst = lfilter(num, den, white)
    burst = burst * exponential_decay_envelope(length, tau_seconds=0.02, sr=sr)
    # scale to a peak of ~0.8; the epsilon guards against a silent burst
    return burst * 0.8 / (np.max(np.abs(burst)) + 1e-12)


def make_body_tone(duration_s=0.25, freq=150.0, sr=SAMPLE_RATE):
    """Footfall "body": a decaying sine at ``freq`` plus its second harmonic.

    The harmonic is mixed in at 0.4 amplitude, the sum is shaped by an
    exponential decay (tau = 80 ms) and scaled so its absolute peak is ~1.
    """
    length = int(duration_s * sr)
    seconds = np.arange(length) / sr
    fundamental = np.sin(2 * np.pi * freq * seconds)
    harmonic = 0.4 * np.sin(2 * np.pi * (freq * 2.0) * seconds)
    shaped = (fundamental + harmonic) * exponential_decay_envelope(length, tau_seconds=0.08, sr=sr)
    # the small epsilon guards against dividing by zero on a silent signal
    return shaped / (np.max(np.abs(shaped)) + 1e-12)


def simple_reverb(stereo_signal, sr=SAMPLE_RATE):
    """Add a few short echoes to a stereo buffer and return the result as a new array.

    ``stereo_signal`` is indexed as ``[channel, sample]`` (the caller passes
    shape ``(2, N)``). Three delayed, attenuated copies are added one after
    another, each stage operating on the output of the previous one; then
    part of a 5 ms delayed copy is subtracted and everything is scaled by 0.9.
    The input array itself is left untouched.
    """
    wet = stereo_signal.copy()
    n_samples = stereo_signal.shape[1]

    # (delay in seconds, gain) for each echo stage
    echo_stages = ((0.029, 0.7), (0.037, 0.65), (0.041, 0.6))
    for delay_s, gain in echo_stages:
        shift = int(delay_s * sr)
        if shift < n_samples:  # skip delays longer than the buffer
            echo = np.zeros_like(wet)
            echo[:, shift:] = wet[:, :-shift] * gain
            wet += echo

    # short 5 ms tap; the net effect is subtracting half of `tap`
    short_shift = int(0.005 * sr)
    if short_shift < n_samples:
        tap = np.zeros_like(wet)
        tap[:, short_shift:] = wet[:, :-short_shift] * 0.5
        wet = wet - tap + 0.5 * tap

    # mild overall attenuation
    wet *= 0.9
    return wet


# ---------- render ----------

def render_audio(event_times, sides, timeline_seconds, source_duration, step_rate_fn_interp, out_duration=DURATION, sr=SAMPLE_RATE):
    """Mix one synthesized footfall per event into a stereo buffer.

    Event times are used as absolute output times: the track is NOT
    time-scaled, so events before 0 s or at/after ``out_duration`` are
    skipped.

    Args:
        event_times: footfall times in seconds (source time).
        sides: per-event ``'L'``/``'R'`` labels; the named side is panned louder.
        timeline_seconds: source timeline; only its last value is used, to cap
            the time passed to ``step_rate_fn_interp``.
        source_duration: unused; kept so existing callers keep working.
        step_rate_fn_interp: callable mapping a source time (s) to the local
            cadence in steps/s.
        out_duration: length of the rendered buffer in seconds.
        sr: sample rate in Hz.

    Returns:
        Array indexed ``[channel, sample]`` with ``int(out_duration * sr)``
        samples per channel, reverberated and peak-normalized to 0.95.
    """
    n_samples = int(out_duration * sr)
    stereo = np.zeros((2, n_samples), dtype=np.float32)
    last_source_time = timeline_seconds[-1]

    for i, t in enumerate(event_times):
        sample_idx = int(t * sr)
        # A negative index would be read as "from the end" by slicing below.
        if sample_idx < 0 or sample_idx >= n_samples:
            continue
        side = sides[i]
        # choose slightly different timbre per foot and per local rate
        local_rate = float(step_rate_fn_interp(min(t, last_source_time)))
        # map local_rate (steps/s) to tonal brightness
        freq = 110.0 + (local_rate - 0.5) * 60.0
        body = make_body_tone(duration_s=0.2 + 0.06 * np.random.rand(), freq=freq, sr=sr)
        impact = make_impact(duration_s=0.04 + 0.03 * np.random.rand(), sr=sr)

        # mix impact + body
        snd = np.zeros(max(len(impact), len(body)), dtype=np.float32)
        snd[:len(impact)] += impact
        snd[:len(body)] += 0.9 * body

        # amplitude scaling by local_rate (faster -> slightly louder),
        # with a random per-step factor in [0.5, 1.0)
        amp = 0.5 + 0.5 * np.tanh((local_rate - 1.0) * 0.8)
        snd *= amp * (0.5 + 0.5 * np.random.rand())

        # simple stereo pan: the stepping side at full level, the other at 0.6
        if side == 'L':
            left = snd
            right = snd * 0.6
        else:
            left = snd * 0.6
            right = snd

        # additive mix, truncating sounds that run past the end of the buffer
        end = min(sample_idx + len(snd), n_samples)
        stereo[0, sample_idx:end] += left[: end - sample_idx]
        stereo[1, sample_idx:end] += right[: end - sample_idx]

    # apply simple reverb to make pleasant
    stereo = simple_reverb(stereo, sr=sr)

    # normalize to avoid clipping (epsilon keeps an all-silent buffer finite)
    peak = np.max(np.abs(stereo)) + 1e-12
    stereo = stereo * (0.95 / peak)
    return stereo


# ---------- main ----------

def main(in_gpx, out_wav):
    """Run the full pipeline: GPX file -> footfall events -> stereo WAV.

    Args:
        in_gpx: path of the GPX file to read.
        out_wav: path of the WAV file to write (float samples at SAMPLE_RATE).

    Progress is reported with print(). The rendered audio always lasts
    DURATION seconds and uses the track's own clock, so for longer tracks
    only the first DURATION seconds are audible.
    """
    print('Parsing GPX...')
    points = parse_gpx(in_gpx)
    print(f'Loaded {len(points)} points')
    timeline_seconds, speeds, source_duration = compute_speeds_and_times(points)
    # render_audio places events at their absolute times; nothing is time-scaled.
    print(f'Source track duration ~ {source_duration:.1f}s (rendering the first {DURATION:.1f}s, no time scaling)')

    step_rates = estimate_step_rate(speeds, stride_length_m=0.78)
    print(f'Average estimated cadence: {np.mean(step_rates)*60:.1f} spm')

    # synthesize events in source time
    event_times, sides = synthesize_events(timeline_seconds, step_rates)
    n_in_window = int(np.count_nonzero(event_times < DURATION))
    print(f'Generated {len(event_times)} footfall events ({n_in_window} within the {DURATION:.1f}s output)')

    # local cadence lookup (source time -> steps/s) used to vary timbre and level
    def step_rate_fn(t):
        return float(np.interp(t, timeline_seconds, step_rates))

    stereo = render_audio(event_times, sides, timeline_seconds, source_duration, step_rate_fn, out_duration=DURATION, sr=SAMPLE_RATE)

    # write file
    print(f'Writing output to {out_wav} ...')
    # soundfile expects (frames, channels), so transpose from (2, N)
    out = stereo.T
    sf.write(out_wav, out, SAMPLE_RATE, subtype='FLOAT')
    print('Done!')


if __name__ == '__main__':
    # Entry point for both notebook and command-line use.
    #
    # Notebook / Colab: an earlier cell is expected to define `gpx_path`
    # (e.g. via google.colab.files.upload()); the result is written to
    # /content/gpx.wav.
    #
    # Command line: python reconstruct_footfalls.py input.gpx output.wav
    #
    # Errors raised by main() are deliberately NOT swallowed, so parse or
    # write failures show their real traceback instead of a generic hint.
    if 'gpx_path' in globals():
        in_gpx = gpx_path
        out_wav = '/content/gpx.wav'
        main(in_gpx, out_wav)
    elif len(sys.argv) == 3 and not sys.argv[1].startswith('-'):
        # The leading-dash check skips Jupyter's own kernel arguments
        # (e.g. "-f <connection file>"), which also arrive via sys.argv.
        in_gpx, out_wav = sys.argv[1], sys.argv[2]
        main(in_gpx, out_wav)
    else:
        print('Run main(in_gpx, out_wav) manually when using Google Colab.')


Parsing GPX...
Loaded 13768 points
Source track duration ~ 13767.0s (will be scaled to 60.0s)
Average estimated cadence: 220.9 spm
Generated 50681 footfall events
Writing output to /content/gpx.wav ...
Done!
