From 29b203e941acc65a872b18d6ca760a3bb4fc340f Mon Sep 17 00:00:00 2001
From: Alexander Kehaya <alexanderkehaya@Alexanders-Mac-mini.local>
Date: Wed, 22 Apr 2026 14:57:23 -0700
Subject: [PATCH] fix: voice tracking highlight and mic stall bugs in word
 tracking mode

Addresses two user-reported bugs: (1) highlight not tracking at the right
speed, jumping erratically or lagging behind speech, and (2) mic appearing
to stall out and stop picking up audio after ~60 seconds.

Root causes identified and fixed:

**Seamless recognition restart (P0)**
- Split cleanupRecognition() so AVAudioEngine stays alive across
  SFSpeechRecognitionTask restarts, eliminating audio gaps
- Add pre-emptive 55-second restart timer to beat Apple's ~60s timeout
- Update matchStartOffset to recognizedCharCount before each restart so
  new sessions match from the correct position
- Guard request swapping and audio-tap buffer appends with an NSLock so a
  restart never races with the audio I/O thread
- Add contextualStrings from remaining source text for better STT accuracy

**Fix fuzzy matching false positives (P1)**
- Remove overly permissive `contains` check from isFuzzyMatch that caused
  "and" to match "demand", "the" to match "other", etc.
- Tighten prefix matching to require minimum 3-char words
- Require an exact match when either word is 2 characters or shorter
  (no edit distance tolerance)
- Fix charLevelMatch skip-both fallback: no longer advances
  lastGoodOrigIndex on genuine mismatches (gibberish no longer matches)
- Fix wordLevelMatch +1 space overcount on last matched word
- Fix unicode scalar vs Character count mismatch in charLevelMatch

**Confidence gating (P2)**
- Replace blind max(charResult, wordResult) with agreement-based selection
- Add sliding window requiring 2-of-3 recent results to agree before
  committing large forward jumps (small steps always pass through)

**Retry resilience (P3)**
- Distinguish timeout errors (code 1110/216) from real errors
- No retry limit for expected timeouts; immediate soft restart
- Backoff with retry limit only for genuine errors

**Architecture cleanup (P4)**
- Merge two polling timers in observeDismiss() into one
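- Capture self weakly in the delayed cleanup closure in observeDismiss()
  (it previously held self strongly); dismiss() now unwraps self once and
  also clears cancellables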
- Add isDismissing guard to prevent double-dismiss
- Fix cancelled-task error callback race in restartTask()

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../Textream/NotchOverlayController.swift     |  67 +++--
 Textream/Textream/SpeechRecognizer.swift      | 271 ++++++++++++++++--
 2 files changed, 278 insertions(+), 60 deletions(-)

diff --git a/Textream/Textream/NotchOverlayController.swift b/Textream/Textream/NotchOverlayController.swift
index d705171..a97d111 100644
--- a/Textream/Textream/NotchOverlayController.swift
+++ b/Textream/Textream/NotchOverlayController.swift
@@ -374,21 +374,27 @@ class NotchOverlayController: NSObject {
     }
 
     func dismiss() {
+        guard !isDismissing else { return }
+        isDismissing = true
+
         // Trigger the shrink animation
         speechRecognizer.shouldDismiss = true
         speechRecognizer.forceStop()
 
         // Wait for animation, then remove panel
         DispatchQueue.main.asyncAfter(deadline: .now() + 0.4) { [weak self] in
-            self?.stopMouseTracking()
-            self?.stopCursorTracking()
-            self?.removeStopButton()
-            self?.removeEscMonitor()
-            self?.panel?.orderOut(nil)
-            self?.panel = nil
-            self?.frameTracker = nil
-            self?.speechRecognizer.shouldDismiss = false
-            self?.onComplete?()
+            guard let self else { return }
+            self.stopMouseTracking()
+            self.stopCursorTracking()
+            self.removeStopButton()
+            self.removeEscMonitor()
+            self.cancellables.removeAll()
+            self.panel?.orderOut(nil)
+            self.panel = nil
+            self.frameTracker = nil
+            self.speechRecognizer.shouldDismiss = false
+            self.isDismissing = false
+            self.onComplete?()
         }
     }
 
@@ -424,41 +430,40 @@ class NotchOverlayController: NSObject {
     }
 
     private func observeDismiss() {
-        // Poll for shouldAdvancePage (next page requested from overlay)
+        // Single timer polls all conditions instead of two separate timers
         Timer.publish(every: 0.1, on: .main, in: .common)
             .autoconnect()
             .sink { [weak self] _ in
                 guard let self else { return }
+
+                // Check for page advance
                 if self.speechRecognizer.shouldAdvancePage {
                     self.speechRecognizer.shouldAdvancePage = false
                     self.onNextPage?()
                 }
-                // Poll for page jump from page picker
+
+                // Check for page jump from page picker
                 if let targetIndex = self.overlayContent.jumpToPageIndex {
                     self.overlayContent.jumpToPageIndex = nil
                     TextreamService.shared.jumpToPage(index: targetIndex)
                 }
-            }
-            .store(in: &cancellables)
 
-        // Poll for shouldDismiss becoming true (from view setting it on completion)
-        Timer.publish(every: 0.1, on: .main, in: .common)
-            .autoconnect()
-            .sink { [weak self] _ in
-                guard let self, self.speechRecognizer.shouldDismiss, !self.isDismissing else { return }
-                self.isDismissing = true
-                // Wait for shrink animation, then cleanup
-                DispatchQueue.main.asyncAfter(deadline: .now() + 0.4) {
-                    self.stopMouseTracking()
-                    self.stopCursorTracking()
-                    self.removeStopButton()
-                    self.removeEscMonitor()
-                    self.cancellables.removeAll()
-                    self.panel?.orderOut(nil)
-                    self.panel = nil
-                    self.frameTracker = nil
-                    self.speechRecognizer.shouldDismiss = false
-                    self.onComplete?()
+                // Check for dismiss
+                if self.speechRecognizer.shouldDismiss, !self.isDismissing {
+                    self.isDismissing = true
+                    DispatchQueue.main.asyncAfter(deadline: .now() + 0.4) { [weak self] in
+                        guard let self else { return }
+                        self.stopMouseTracking()
+                        self.stopCursorTracking()
+                        self.removeStopButton()
+                        self.removeEscMonitor()
+                        self.cancellables.removeAll()
+                        self.panel?.orderOut(nil)
+                        self.panel = nil
+                        self.frameTracker = nil
+                        self.speechRecognizer.shouldDismiss = false
+                        self.onComplete?()
+                    }
                 }
             }
             .store(in: &cancellables)
diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift
index e7e8258..d1151c1 100644
--- a/Textream/Textream/SpeechRecognizer.swift
+++ b/Textream/Textream/SpeechRecognizer.swift
@@ -101,6 +101,11 @@ class SpeechRecognizer {
     private var pendingRestart: DispatchWorkItem?
     private var sessionGeneration: Int = 0
     private var suppressConfigChange: Bool = false
+    private var requestLock = NSLock()
+    private var preemptiveRestartTimer: Timer?
+    /// Sliding window of recent match positions for confidence gating.
+    /// We require 2-of-3 recent results to agree before committing a forward jump.
+    private var recentMatchPositions: [Int] = []
 
     /// Update the source text while preserving the current recognized char count.
     /// Used by Director Mode to live-edit unread text without resetting read progress.
@@ -111,6 +116,7 @@ class SpeechRecognizer {
         normalizedSource = Self.normalize(collapsed)
         recognizedCharCount = min(preservingCharCount, collapsed.count)
         matchStartOffset = recognizedCharCount
+        recentMatchPositions = []
     }
 
     /// Jump highlight to a specific char offset (e.g. when user taps a word)
@@ -118,6 +124,7 @@ class SpeechRecognizer {
         recognizedCharCount = charOffset
         matchStartOffset = charOffset
         retryCount = 0
+        recentMatchPositions = []
         if isListening {
             restartRecognition()
         }
@@ -135,6 +142,7 @@ class SpeechRecognizer {
         recognizedCharCount = 0
         matchStartOffset = 0
         retryCount = 0
+        recentMatchPositions = []
         error = nil
         sessionGeneration += 1
 
@@ -199,35 +207,49 @@ class SpeechRecognizer {
         isListening = false
         sourceText = ""
         retryCount = maxRetries
+        recentMatchPositions = []
         cleanupRecognition()
     }
 
     func resume() {
         retryCount = 0
         matchStartOffset = recognizedCharCount
+        recentMatchPositions = []
         shouldDismiss = false
         beginRecognition()
     }
 
-    private func cleanupRecognition() {
+    private func cleanupRecognitionTask() {
         // Cancel any pending restart to prevent overlapping beginRecognition calls
         pendingRestart?.cancel()
         pendingRestart = nil
 
+        stopPreemptiveTimer()
+
         if let observer = configurationChangeObserver {
             NotificationCenter.default.removeObserver(observer)
             configurationChangeObserver = nil
         }
+        requestLock.lock()
         recognitionRequest?.endAudio()
         recognitionRequest = nil
+        requestLock.unlock()
         recognitionTask?.cancel()
         recognitionTask = nil
+    }
+
+    private func cleanupAudioEngine() {
         if audioEngine.isRunning {
             audioEngine.stop()
         }
         audioEngine.inputNode.removeTap(onBus: 0)
     }
 
+    private func cleanupRecognition() {
+        cleanupRecognitionTask()
+        cleanupAudioEngine()
+    }
+
     /// Coalesces all delayed beginRecognition() calls into a single pending work item.
     /// Any previously scheduled restart is cancelled before the new one is queued.
     private func scheduleBeginRecognition(after delay: TimeInterval) {
@@ -286,6 +308,16 @@ class SpeechRecognizer {
         guard let recognitionRequest else { return }
         recognitionRequest.shouldReportPartialResults = true
 
+        // Add contextual strings from the source text to improve STT accuracy
+        let upcoming = String(sourceText.dropFirst(matchStartOffset))
+        let contextWords = upcoming.split(separator: " ")
+            .map { String($0).lowercased().filter { $0.isLetter || $0.isNumber } }
+            .filter { $0.count >= 5 }
+        let uniqueContextWords = Array(Set(contextWords).prefix(50))
+        if !uniqueContextWords.isEmpty {
+            recognitionRequest.contextualStrings = uniqueContextWords
+        }
+
         let inputNode = audioEngine.inputNode
         let hardwareFormat = inputNode.outputFormat(forBus: 0)
 
@@ -327,7 +359,7 @@ class SpeechRecognizer {
         inputNode.removeTap(onBus: 0)
 
         inputNode.installTap(onBus: 0, bufferSize: 1024, format: tapFormat) { [weak self] buffer, _ in
-            recognitionRequest.append(buffer)
+            self?.appendBufferToRequest(buffer)
 
             guard let channelData = buffer.floatChannelData?[0] else { return }
             let frameLength = Int(buffer.frameLength)
@@ -359,11 +391,33 @@ class SpeechRecognizer {
                     self.matchCharacters(spoken: spoken)
                 }
             }
-            if error != nil {
+            if let error {
                 DispatchQueue.main.async {
                     // If recognitionRequest is nil, cleanup already ran (intentional cancel) — don't retry
                     guard self.recognitionRequest != nil else { return }
-                    if self.isListening && !self.shouldDismiss && !self.sourceText.isEmpty && self.retryCount < self.maxRetries {
+                    guard self.isListening && !self.shouldDismiss && !self.sourceText.isEmpty else {
+                        self.isListening = false
+                        return
+                    }
+
+                    self.matchStartOffset = self.recognizedCharCount
+
+                    // Distinguish timeout errors (expected every ~60s) from real errors.
+                    // Codes 1110 and 216 are treated as timeouts; only `code` is compared, not
+                    // the NSError domain (NOTE(review): confirm the domain these codes come from).
+                    // Timeouts retry immediately with no retry limit; real errors use backoff.
+                    let nsError = error as NSError
+                    let isTimeout = nsError.code == 1110 || nsError.code == 216
+
+                    if isTimeout {
+                        // Expected timeout — restart immediately, no retry limit
+                        self.retryCount = 0
+                        if self.audioEngine.isRunning {
+                            self.restartTask()
+                        } else {
+                            self.scheduleBeginRecognition(after: 0.1)
+                        }
+                    } else if self.retryCount < self.maxRetries {
                         self.retryCount += 1
                         let delay = min(Double(self.retryCount) * 0.5, 1.5)
                         self.scheduleBeginRecognition(after: delay)
@@ -378,6 +432,7 @@ class SpeechRecognizer {
             audioEngine.prepare()
             try audioEngine.start()
             isListening = true
+            startPreemptiveTimer()
         } catch {
             // Transient failure after a device switch — retry with longer delay
             if retryCount < maxRetries {
@@ -391,12 +446,131 @@ class SpeechRecognizer {
     }
 
     private func restartRecognition() {
-        // Reset retries so the fresh engine gets a full set of attempts
         retryCount = 0
         isListening = true
-        // Longer delay to let the audio system fully settle after a device change
-        cleanupRecognition()
-        scheduleBeginRecognition(after: 0.5)
+        if audioEngine.isRunning {
+            restartTask()
+        } else {
+            cleanupRecognition()
+            scheduleBeginRecognition(after: 0.5)
+        }
+    }
+
+    // MARK: - Thread-safe buffer appending
+
+    private func appendBufferToRequest(_ buffer: AVAudioPCMBuffer) {
+        requestLock.lock()
+        recognitionRequest?.append(buffer)
+        requestLock.unlock()
+    }
+
+    // MARK: - Soft restart (task only, keeps audio engine running)
+
+    private func restartTask() {
+        // Update match offset before restarting
+        matchStartOffset = recognizedCharCount
+        recentMatchPositions = []
+
+        // Cancel any pending restart to avoid stale beginRecognition clobbering this session
+        pendingRestart?.cancel()
+        pendingRestart = nil
+
+        // Build the replacement request up front. The swap below is done in two
+        // locked steps (clear, then assign), so the audio tap never appends to a
+        // request after endAudio(); buffers arriving in between are dropped.
+        let newRequest = SFSpeechAudioBufferRecognitionRequest()
+        newRequest.shouldReportPartialResults = true
+
+        // Add contextual strings for the remaining text
+        let upcoming = String(sourceText.dropFirst(matchStartOffset))
+        let contextWords = upcoming.split(separator: " ")
+            .map { String($0).lowercased().filter { $0.isLetter || $0.isNumber } }
+            .filter { $0.count >= 5 }
+        let uniqueWords = Array(Set(contextWords).prefix(50))
+        if !uniqueWords.isEmpty {
+            newRequest.contextualStrings = uniqueWords
+        }
+
+        // Nil out recognitionRequest before cancelling the old task so the
+        // old task's error callback sees nil and skips retry logic. Then set
+        // the new request after cancellation.
+        requestLock.lock()
+        recognitionRequest?.endAudio()
+        recognitionRequest = nil
+        requestLock.unlock()
+        recognitionTask?.cancel()
+        recognitionTask = nil
+
+        requestLock.lock()
+        recognitionRequest = newRequest
+        requestLock.unlock()
+
+        // Start new recognition task
+        guard let speechRecognizer, speechRecognizer.isAvailable else {
+            error = "Speech recognizer not available"
+            isListening = false
+            return
+        }
+
+        let currentGeneration = sessionGeneration
+        recognitionTask = speechRecognizer.recognitionTask(with: newRequest) { [weak self] result, error in
+            guard let self else { return }
+            if let result {
+                let spoken = result.bestTranscription.formattedString
+                DispatchQueue.main.async {
+                    guard self.sessionGeneration == currentGeneration else { return }
+                    self.retryCount = 0
+                    self.lastSpokenText = spoken
+                    self.matchCharacters(spoken: spoken)
+                }
+            }
+            if let error {
+                DispatchQueue.main.async {
+                    guard self.recognitionRequest != nil else { return }
+                    guard self.isListening && !self.shouldDismiss && !self.sourceText.isEmpty else {
+                        self.isListening = false
+                        return
+                    }
+
+                    self.matchStartOffset = self.recognizedCharCount
+
+                    let nsError = error as NSError
+                    let isTimeout = nsError.code == 1110 || nsError.code == 216
+
+                    if isTimeout {
+                        self.retryCount = 0
+                        if self.audioEngine.isRunning {
+                            self.restartTask()
+                        } else {
+                            self.scheduleBeginRecognition(after: 0.1)
+                        }
+                    } else if self.retryCount < self.maxRetries {
+                        self.retryCount += 1
+                        let delay = min(Double(self.retryCount) * 0.5, 1.5)
+                        self.scheduleBeginRecognition(after: delay)
+                    } else {
+                        self.isListening = false
+                    }
+                }
+            }
+        }
+
+        startPreemptiveTimer()
+    }
+
+    // MARK: - Pre-emptive restart timer
+
+    private func startPreemptiveTimer() {
+        preemptiveRestartTimer?.invalidate()
+        preemptiveRestartTimer = Timer.scheduledTimer(withTimeInterval: 55.0, repeats: true) { [weak self] _ in
+            guard let self, self.isListening, !self.sourceText.isEmpty else { return }
+            self.restartTask()
+        }
+    }
+
+    private func stopPreemptiveTimer() {
+        preemptiveRestartTimer?.invalidate()
+        preemptiveRestartTimer = nil
     }
 
     // MARK: - Fuzzy character-level matching
@@ -408,19 +582,57 @@ class SpeechRecognizer {
         // Strategy 2: word-level match (handles STT word substitutions)
         let wordResult = wordLevelMatch(spoken: spoken)
 
-        let best = max(charResult, wordResult)
+        // Use agreement-based selection instead of blind max().
+        // If both strategies agree within a tolerance, use the average.
+        // If they disagree wildly, use the more conservative (lower) result
+        // to avoid false-positive jumps.
+        let best: Int
+        let tolerance = 20 // characters
+        if abs(charResult - wordResult) <= tolerance {
+            best = (charResult + wordResult) / 2
+        } else {
+            best = min(charResult, wordResult)
+        }
 
-        // Only move forward from the match start offset
         let newCount = matchStartOffset + best
-        if newCount > recognizedCharCount {
-            recognizedCharCount = min(newCount, sourceText.count)
+        guard newCount > recognizedCharCount else { return }
+
+        let candidate = min(newCount, sourceText.count)
+
+        // Confidence gating: require 2-of-3 recent results to agree on
+        // forward movement to avoid single-result false-positive jumps.
+        recentMatchPositions.append(candidate)
+        if recentMatchPositions.count > 3 {
+            recentMatchPositions.removeFirst()
+        }
+
+        // Check if at least 2 of the recent positions agree (within tolerance)
+        let agreementThreshold = 10 // characters
+        var confirmed = false
+        if recentMatchPositions.count >= 2 {
+            var agreeCount = 0
+            for pos in recentMatchPositions {
+                if abs(pos - candidate) <= agreementThreshold {
+                    agreeCount += 1
+                }
+            }
+            confirmed = agreeCount >= 2
+        }
+
+        // Small forward movements (at most 15 characters) are always allowed
+        // to keep the highlight responsive for normal reading
+        let smallStep = candidate - recognizedCharCount <= 15
+
+        if confirmed || smallStep {
+            recognizedCharCount = candidate
         }
     }
 
     private func charLevelMatch(spoken: String) -> Int {
         let remainingSource = String(sourceText.dropFirst(matchStartOffset))
-        let src = Array(remainingSource.lowercased().unicodeScalars).map { Character($0) }
-        let spk = Array(Self.normalize(spoken).unicodeScalars).map { Character($0) }
+        // Use Character arrays (not unicodeScalars) so counts match sourceText.count
+        let src = Array(remainingSource.lowercased())
+        let spk = Array(Self.normalize(spoken))
 
         var si = 0
         var ri = 0
@@ -477,10 +689,10 @@ class SpeechRecognizer {
                 }
                 if found { continue }
 
-                // Skip both (substitution)
-                si += 1
+                // No resync found — advance spoken pointer only.
+                // Do NOT advance lastGoodOrigIndex; this is a genuine mismatch,
+                // not a confirmed match position.
                 ri += 1
-                lastGoodOrigIndex = si
             }
         }
 
@@ -517,13 +729,14 @@ class SpeechRecognizer {
                 .filter { $0.isLetter || $0.isNumber }
 
             if srcWord == spkWord || isFuzzyMatch(srcWord, spkWord) {
-                // Count original chars including trailing punctuation, plus space
+                // Count original chars including trailing punctuation
                 matchedCharCount += sourceWords[si].count
-                if si < sourceWords.count - 1 {
-                    matchedCharCount += 1 // space
-                }
                 si += 1
                 ri += 1
+                // Add space separator only if there's a following word
+                if si < sourceWords.count {
+                    matchedCharCount += 1
+                }
             } else {
                 // Try skipping up to 3 spoken words (STT hallucinated words)
                 var foundSpk = false
@@ -581,16 +794,16 @@ class SpeechRecognizer {
         if a.isEmpty || b.isEmpty { return false }
         // Exact match
         if a == b { return true }
-        // One starts with the other (phonetic prefix: "not" ~ "notch")
-        if a.hasPrefix(b) || b.hasPrefix(a) { return true }
-        // One contains the other
-        if a.contains(b) || b.contains(a) { return true }
-        // Shared prefix >= 60% of shorter word
-        let shared = zip(a, b).prefix(while: { $0 == $1 }).count
         let shorter = min(a.count, b.count)
-        if shorter >= 2 && shared >= max(2, shorter * 3 / 5) { return true }
-        // Edit distance tolerance
+        // Prefix match — only for words with at least 3 chars to avoid
+        // false positives like "or" matching "organization"
+        if shorter >= 3 && (a.hasPrefix(b) || b.hasPrefix(a)) { return true }
+        // Shared prefix >= 60% of shorter word (min 3 chars shared)
+        let shared = zip(a, b).prefix(while: { $0 == $1 }).count
+        if shorter >= 3 && shared >= max(3, shorter * 3 / 5) { return true }
+        // Edit distance tolerance — stricter for very short words
         let dist = editDistance(a, b)
+        if shorter <= 2 { return false } // 2-char words must be exact
         if shorter <= 4 { return dist <= 1 }
         if shorter <= 8 { return dist <= 2 }
         return dist <= max(a.count, b.count) / 3