Merged
Changes from all commits (42 commits)
40d7dfb
Reapply changes pertaining to analyzer parametrization (#146)
KristijanArmeni Jun 17, 2025
d20af85
Multi Tenancy Support (#150)
VEDA95 Jun 24, 2025
10f2858
Add macOS code signing and build improvements (#156)
JoeKarow Jun 24, 2025
141fedb
[FEAT] Add dashboard prototype (Shiny for Python) for hashtag analyze…
KristijanArmeni Jun 25, 2025
4ae4e8e
Multi Tenancy Support (got it working this time!) (#163)
VEDA95 Jul 2, 2025
735f415
feat: Add comprehensive AI development assistant integration and docu…
JoeKarow Jul 10, 2025
1a04712
Implementing configuration to get artifacts back when running workflo…
VEDA95 Jul 16, 2025
a00a074
WIP including hashtag analysis as test in CLI app builds
VEDA95 Jul 19, 2025
ab1ed46
WIP working on implementing fix for file not found errors in the exec…
VEDA95 Jul 19, 2025
ad251b2
WIP still working on production build fixes for shiny dashboards.
VEDA95 Jul 19, 2025
18eb8ea
WIP still working on production build fixes for shiny dashboards.
VEDA95 Jul 19, 2025
8afb580
WIP still working on production build fixes for shiny dashboards.
VEDA95 Jul 19, 2025
02e70e6
WIP working on implementing fix for file not found errors in the exec…
VEDA95 Jul 19, 2025
dc28d6d
WIP working on implementing fix for file not found errors in the exec…
VEDA95 Jul 19, 2025
ca94b03
WIP working on implementing fix for file not found errors in the exec…
VEDA95 Jul 19, 2025
8b22d4a
WIP working on implementing fix for file not found errors in the exec…
VEDA95 Jul 19, 2025
5b5edeb
WIP working on implementing fix for file not found errors in the exec…
VEDA95 Jul 19, 2025
dacede9
WIP working on implementing fix for file not found errors in the exec…
VEDA95 Jul 19, 2025
48cd964
Added except statement to silence CancelledError error that occurs wh…
VEDA95 Jul 20, 2025
93c6b95
WIP fixing bug that causes shiny dashboard to be displayed via inste…
VEDA95 Jul 20, 2025
82ad3d1
Cleaning up code...
VEDA95 Jul 20, 2025
4932621
[FEAT] Add tests for the ngram analyzer and reorganize the folders (#…
KristijanArmeni Jul 22, 2025
9ce08c2
Update hashtag dashboard: make lineplot clickable and some UX/estheti…
KristijanArmeni Jul 22, 2025
7029dd2
[FEAT] Add N-gram analysis dashboard in Shiny (#173)
KristijanArmeni Jul 29, 2025
e1eeb89
Merge branch 'main' into develop
JoeKarow Aug 1, 2025
5d62bb5
Update build_exe.yml
JoeKarow Aug 1, 2025
609cb1b
Application-Wide Logging System (#177)
JoeKarow Aug 4, 2025
0dce767
[DOCS] Redo(#149): Add config, documentation, and github workflow for…
JoeKarow Aug 7, 2025
7367d3f
[FIX, FEAT] Add support for native datetime formats, strip timezone i…
KristijanArmeni Aug 29, 2025
920407c
Use Rich for printing tables, prompts and panels in welcome scree (re…
KristijanArmeni Aug 29, 2025
53d9a41
Merge branch 'main' into develop
KristijanArmeni Aug 30, 2025
e910933
feat: limit column width for cols with a lot of characters
KristijanArmeni Aug 30, 2025
85afe1c
[bugfix] Add `pythonjsonlogger` to hidden imports (#198)
KristijanArmeni Aug 31, 2025
3b4e67a
Merge branch 'main' into develop
KristijanArmeni Aug 31, 2025
40b00b0
Issue 201 cli asyncio error bugfix (#203)
VEDA95 Sep 12, 2025
6bef1c9
Rename executable to `cibmangotree`, print feedback message during st…
KristijanArmeni Sep 12, 2025
406658b
feat: Shared Unicode tokenizer service for analyzer ecosystem (#204)
JoeKarow Sep 17, 2025
7ffb337
maint: Reorganize and rename subfolders in `/analyzers` for consisten…
KristijanArmeni Sep 17, 2025
990bf07
Updated naming conventions in CLI (#205)
JMCulhane Sep 18, 2025
baf5ee7
bugfix: update `hashtags_web/app.py` to not use hard-coded column nam…
KristijanArmeni Sep 19, 2025
634cd63
feat: Add method and option to detect number of rows to skip in csv f…
KristijanArmeni Sep 23, 2025
b8c5716
Merge branch 'main' into develop
KristijanArmeni Sep 23, 2025
1 change: 1 addition & 0 deletions .ai-context/README.md
@@ -20,6 +20,7 @@ consistent UX while allowing easy contribution of new analyzers.

- **Core**: Python 3.12, Inquirer (CLI), TinyDB (metadata)
- **Data**: Polars/Pandas, PyArrow, Parquet files
- **Text Processing**: Unicode tokenizer service with scriptio continua support (character-level for CJK/Thai/Southeast Asian scripts, word-level for Latin/Arabic scripts)
- **Web**: Dash, Shiny for Python, Plotly
- **Dev Tools**: Black, isort, pytest, PyInstaller

38 changes: 38 additions & 0 deletions .ai-context/architecture-overview.md
@@ -11,11 +11,16 @@ flowchart TD
App --> Importers[Data Importers]
App --> Preprocessing[Semantic Preprocessor]
App --> Analyzers[Analyzer System]
App --> TokenizerService[Tokenizer Service]

Importers --> Parquet[(Parquet Files)]
Preprocessing --> Parquet
Analyzers --> Parquet

Analyzers --> TokenizerService
TokenizerService --> BasicTokenizer[BasicTokenizer]
BasicTokenizer --> Primary

Analyzers --> Primary[Primary Analyzers]
Analyzers --> Secondary[Secondary Analyzers]
Analyzers --> WebPresenters[Web Presenters]
@@ -68,6 +73,39 @@ Key Classes:
- `FileSelectionState` - File picker state management
- `TableStats` - Data statistics and preview information

### Service Layer (`services/`)

Reusable services that support analyzers and data processing.

Key Services:

- **Tokenizer Service** (`services/tokenizer/`) - Unicode-aware scriptio continua tokenization
- `AbstractTokenizer` - Base interface for tokenizer implementations
- `TokenizerConfig` - Configuration for tokenization behavior
- `BasicTokenizer` - Core implementation with character-level and word-level tokenization
- Character-level: CJK, Thai, Lao, Myanmar, Khmer scripts
- Word-level: Latin, Arabic scripts with space separation
- `TokenType`, `LanguageFamily` - Type definitions for tokenization
- Comprehensive regex patterns and social media entity preservation
- API contract:
- `tokenize(text: str, lang: LanguageFamily | None = None, *, preserve_entities: bool = True) -> list[Token] | Iterator[Token]`
- Thread-safe, stateless; may return an iterator for streaming large inputs.
- Token model:
- `Token { text: str, type: TokenType, start: int, end: int, script: str | None, norm: str | None }`
- Offsets are codepoint indices; guarantee grapheme-cluster boundaries (respect ZWJ/emoji sequences).
- Normalization:
- Apply NFC by default; configurable NFKC for compatibility when requested; never alter preserved entities.
- Language handling:
- If `lang` is None, infer via Unicode Script with overrides for mixed-script; Arabic note: handle proclitics/enclitics, not just spaces.
- Social media entity precedence:
- Detect URL (RFC 3986), @mentions, #hashtags before general tokenization; entities are single, atomic tokens.
- Regex safety:
- Precompile all patterns; avoid catastrophic backtracking; enforce per-call max steps/timeout or fallback to simpler patterns.
- Error and config semantics:
- Invalid config ⇒ explicit exception; defaults sourced from `SettingsContext` with per-analyzer overrides in `AnalysisContext`.
- Performance:
- Optional caching of per-script patterns; zero-copy slicing where possible; streaming mode for texts > N chars (configurable).
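The contract above can be sketched in a few lines. This is a toy illustration, not the shipped service: the `Token` fields mirror the bullets, but `_ENTITY` is a deliberately simplified pattern (no RFC 3986 URL grammar, no script detection):

```python
import re
from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional


class TokenType(Enum):
    WORD = auto()
    HASHTAG = auto()
    MENTION = auto()
    URL = auto()


@dataclass(frozen=True)
class Token:
    text: str
    type: TokenType
    start: int  # codepoint index, inclusive
    end: int    # codepoint index, exclusive
    script: Optional[str] = None
    norm: Optional[str] = None


# Entities are matched before general words so they stay atomic tokens.
_ENTITY = re.compile(
    r"(?P<url>https?://\S+)|(?P<mention>@\w+)|(?P<hashtag>#\w+)|(?P<word>\w+)"
)
_KIND = {
    "url": TokenType.URL,
    "mention": TokenType.MENTION,
    "hashtag": TokenType.HASHTAG,
    "word": TokenType.WORD,
}


def tokenize(text: str) -> list[Token]:
    # Python string indices are already codepoint indices, so start/end
    # satisfy the offset guarantee: text[t.start:t.end] == t.text.
    return [
        Token(m.group(), _KIND[m.lastgroup], m.start(), m.end())
        for m in _ENTITY.finditer(text)
    ]
```

Because entities win the alternation, `#mango` comes back as a single `HASHTAG` token rather than a `#` followed by a word.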

## Data Flow Architecture

### Import → Analysis → Export Pipeline
79 changes: 78 additions & 1 deletion .ai-context/symbol-reference.md
@@ -143,6 +143,80 @@ Base interface for data importers

- `analyzers.suite` - `analyzers/__init__.py` - Central registry of all analyzers

### Tokenizer Service (`services/tokenizer/`)

Unicode-aware text tokenization with scriptio continua (character-level) and space-separated script support, plus social media entity preservation.

#### Core Interface - `services/tokenizer/core/base.py`

**`AbstractTokenizer` class**

Base interface for all tokenizer implementations:

- `__init__(config: TokenizerConfig = None)` - Initialize with configuration
- `tokenize(text: str) -> list[str]` - Basic tokenization into token list
- `config: TokenizerConfig` - Property to access tokenizer configuration
- `_preprocess_text(text: str) -> str` - Apply preprocessing (case, normalization)
- `_postprocess_tokens(tokens: list[str]) -> list[str]` - Filter and clean tokens
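The hook structure reads as a template method. A stand-in sketch (the real base lives in `services/tokenizer/core/base.py`; the `_split` helper and dict-based config used here are illustrative simplifications, not the actual API):

```python
class AbstractTokenizer:
    """Stand-in for the real base class (illustrative only)."""

    def __init__(self, config=None):
        self._config = config or {}

    @property
    def config(self):
        return self._config

    def tokenize(self, text: str) -> list[str]:
        # Template method: shared pre/postprocessing around a
        # subclass-specific splitting step.
        text = self._preprocess_text(text)
        tokens = self._split(text)
        return self._postprocess_tokens(tokens)

    def _split(self, text: str) -> list[str]:
        raise NotImplementedError

    def _preprocess_text(self, text: str) -> str:
        return text.lower() if self._config.get("lowercase") else text

    def _postprocess_tokens(self, tokens: list[str]) -> list[str]:
        min_len = self._config.get("min_token_length", 1)
        return [t for t in tokens if len(t) >= min_len]


class WhitespaceTokenizer(AbstractTokenizer):
    def _split(self, text: str) -> list[str]:
        return text.split()
```

Subclasses override only the splitting step; case handling and token filtering stay shared in the base.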

#### Configuration Types - `services/tokenizer/core/types.py`

**`TokenizerConfig` dataclass**

Comprehensive tokenization configuration:

- Language handling: `fallback_language_family`
- Token filtering: `include_punctuation`, `include_numeric`, `include_emoji`
- Text preprocessing: `case_handling`, `normalize_unicode`
- Social media: `extract_hashtags`, `extract_mentions`, `include_urls`, `include_emails`
- Output control: `min_token_length`, `max_token_length`, `strip_whitespace`

**Core Enums:**

- `LanguageFamily` - Language script families (LATIN, CJK, ARABIC, MIXED, UNKNOWN) - CJK includes all scriptio continua scripts
- `TokenType` - Token classifications (WORD, HASHTAG, MENTION, URL, EMOJI, etc.)
- `CaseHandling` - Case transformation options (PRESERVE, LOWERCASE, UPPERCASE, NORMALIZE)
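Assuming the grouping above, the configuration could be constructed like this (field names follow the list; the defaults shown are guesses, not the shipped ones):

```python
from dataclasses import dataclass
from enum import Enum


class CaseHandling(Enum):
    PRESERVE = "preserve"
    LOWERCASE = "lowercase"
    UPPERCASE = "uppercase"
    NORMALIZE = "normalize"


@dataclass
class TokenizerConfig:
    # Defaults below are illustrative, not the real ones.
    fallback_language_family: str = "LATIN"
    include_punctuation: bool = False
    include_numeric: bool = True
    include_emoji: bool = True
    case_handling: CaseHandling = CaseHandling.LOWERCASE
    normalize_unicode: bool = True
    extract_hashtags: bool = True
    extract_mentions: bool = True
    include_urls: bool = True
    include_emails: bool = False
    min_token_length: int = 1
    max_token_length: int = 64
    strip_whitespace: bool = True
```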

#### Basic Implementation - `services/tokenizer/basic/tokenizer.py`

**`BasicTokenizer` class**

Core tokenizer implementation with Unicode awareness:

- Scriptio continua tokenization: Character-level for CJK, Thai, Lao, Myanmar, Khmer
- Space-separated tokenization: Word-level for Latin, Arabic scripts
- Social media entity preservation (hashtags, mentions, URLs)
- Unicode normalization and proper space handling
- Configurable preprocessing and postprocessing
- Single-pass regex-based token extraction with order preservation
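The character-level vs word-level split can be shown with a toy pattern (one CJK range only; `BasicTokenizer`'s real patterns also cover Thai, Lao, Myanmar, and Khmer, plus emoji and URLs):

```python
import re

# CJK codepoints tokenize per character; Latin runs tokenize per word.
# Social media entities come first in the alternation so they stay intact.
_TOY = re.compile(r"#\w+|@\w+|[\u4e00-\u9fff]|[A-Za-z0-9_']+")


def toy_tokenize(text: str) -> list[str]:
    return _TOY.findall(text)
```

A single left-to-right pass over the alternation preserves token order across script boundaries, mirroring the single-pass extraction noted above.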

#### Pattern Matching - `services/tokenizer/basic/patterns.py`

**Pattern Functions:**

- `get_patterns() -> TokenizerPatterns` - Get singleton TokenizerPatterns instance
- Unicode-aware regex patterns for different script families

**Pattern Classes:**

- `TokenizerPatterns` - Compiled regex patterns for tokenization
- `SOCIAL_PATTERNS` - Social media entity patterns
- `LINGUISTIC_PATTERNS` - Language-specific tokenization patterns
- `FORMATTING_PATTERNS` - Text formatting and structure patterns

#### Service API - `services/tokenizer/__init__.py`

**Convenience Functions:**

- `tokenize_text(text: str, config: TokenizerConfig = None) -> list[str]` - Simple tokenization
- `create_basic_tokenizer(config: TokenizerConfig = None) -> BasicTokenizer` - Factory function

**Public Exports:**

- Core types: `AbstractTokenizer`, `TokenizerConfig`, `TokenList`, `TokenType`, `LanguageFamily`, `CaseHandling`
- Implementation: `BasicTokenizer`
- Factory functions: `create_basic_tokenizer`, `tokenize_text`

## Entry Points

### Main Application
@@ -187,17 +261,20 @@ Base interface for data importers
Application-wide structured JSON logging with configurable levels and automatic rotation.

**Core Functions:**

- `setup_logging(log_file_path: Path, level: int = logging.INFO)` - Configure application logging
- `get_logger(name: str) -> logging.Logger` - Get logger instance for module

**Features:**
- Dual handlers: console (ERROR+) and file (INFO+)

- Dual handlers: console (ERROR+) and file (INFO+)
- JSON-formatted structured logs with timestamps and context
- Automatic log rotation (10MB files, 5 backups)
- CLI-configurable log levels via `--log-level` flag
- Log location: `~/.local/share/MangoTango/logs/mangotango.log`

**Usage Pattern:**

```python
from app.logger import get_logger

logger = get_logger(__name__)
```
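The dual-handler and rotation behavior described above can be approximated with the standard library (a sketch of the idea, not the contents of `app/logger.py`):

```python
import json
import logging
from logging.handlers import RotatingFileHandler
from pathlib import Path


class JsonFormatter(logging.Formatter):
    def format(self, record: logging.LogRecord) -> str:
        # One JSON object per line: timestamp, level, logger name, message.
        return json.dumps({
            "ts": self.formatTime(record),
            "level": record.levelname,
            "name": record.name,
            "msg": record.getMessage(),
        })


def setup_logging(log_file_path: Path, level: int = logging.INFO) -> None:
    log_file_path.parent.mkdir(parents=True, exist_ok=True)
    root = logging.getLogger()
    root.setLevel(level)

    # File handler: INFO+ JSON lines, rotated at 10 MB, 5 backups.
    file_handler = RotatingFileHandler(
        log_file_path, maxBytes=10 * 1024 * 1024, backupCount=5
    )
    file_handler.setFormatter(JsonFormatter())
    file_handler.setLevel(logging.INFO)
    root.addHandler(file_handler)

    # Console handler: errors only.
    console = logging.StreamHandler()
    console.setLevel(logging.ERROR)
    root.addHandler(console)
```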
42 changes: 21 additions & 21 deletions .github/workflows/build_exe.yml
@@ -29,26 +29,26 @@ jobs:
artifact_name: windows
os: windows-2022
version_command: icacls "VERSION" /grant Everyone:F /T /C /Q
move_command: move dist\mangotango.exe dist\mangotango_windows.exe
sha_command: pwsh -c "Get-FileHash -Algorithm SHA1 dist\mangotango_windows.exe | Format-Table Hash -HideTableHeaders > dist\mangotango_windows.exe.sha1"
move_command: move dist\cibmangotree.exe dist\cibmangotree_windows.exe
sha_command: pwsh -c "Get-FileHash -Algorithm SHA1 dist\cibmangotree_windows.exe | Format-Table Hash -HideTableHeaders > dist\cibmangotree_windows.exe.sha1"
list_command: dir dist
check_command: dist\mangotango_windows.exe --noop
check_command: dist\cibmangotree_windows.exe --noop
- platform_name: MacOS (x86)
artifact_name: macos-x86
os: macos-13
move_command: mv dist/mangotango dist/mangotango_macos-x86
sha_command: shasum -a 1 dist/mangotango_macos-x86 > dist/mangotango_macos-x86.sha1
sha_command_pkg: shasum -a 1 dist/mangotango_macos-x86.pkg > dist/mangotango_macos-x86.pkg.sha1
move_command: mv dist/cibmangotree dist/cibmangotree_macos-x86
sha_command: shasum -a 1 dist/cibmangotree_macos-x86 > dist/cibmangotree_macos-x86.sha1
sha_command_pkg: shasum -a 1 dist/cibmangotree_macos-x86.pkg > dist/cibmangotree_macos-x86.pkg.sha1
list_command: ls -ll dist
check_command: dist/mangotango_macos-x86 --noop
check_command: dist/cibmangotree_macos-x86 --noop
- platform_name: MacOS (arm64)
artifact_name: macos-arm64
os: macos-15
move_command: mv dist/mangotango dist/mangotango_macos-arm64
sha_command: shasum -a 1 dist/mangotango_macos-arm64 > dist/mangotango_macos-arm64.sha1
sha_command_pkg: shasum -a 1 dist/mangotango_macos-arm64.pkg > dist/mangotango_macos-arm64.pkg.sha1
move_command: mv dist/cibmangotree dist/cibmangotree_macos-arm64
sha_command: shasum -a 1 dist/cibmangotree_macos-arm64 > dist/cibmangotree_macos-arm64.sha1
sha_command_pkg: shasum -a 1 dist/cibmangotree_macos-arm64.pkg > dist/cibmangotree_macos-arm64.pkg.sha1
list_command: ls -ll dist
check_command: dist/mangotango_macos-arm64 --noop
check_command: dist/cibmangotree_macos-arm64 --noop

name: Build ${{ matrix.platform_name }}
runs-on: ${{ matrix.os }}
@@ -130,33 +130,33 @@
APPLE_INST_CERT_ID: ${{secrets.APPLE_INST_CERT_ID}}
APPLE_KEYCHAIN_PASS: ${{secrets.APPLE_KEY_PASS}}
run: |
mkdir -p /tmp/mangotango/
ditto dist/mangotango_${{matrix.artifact_name}} /tmp/mangotango/mangotango
chmod +x /tmp/mangotango/mangotango
mkdir -p /tmp/cibmangotree/
ditto dist/cibmangotree_${{matrix.artifact_name}} /tmp/cibmangotree/cibmangotree
chmod +x /tmp/cibmangotree/cibmangotree
security unlock-keychain -p $APPLE_KEYCHAIN_PASS build.keychain
security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k "$APPLE_KEYCHAIN_PASS" build.keychain
security find-identity -v -p codesigning build.keychain
pkgbuild --identifier "org.mangotango.cli" --timestamp --root /tmp/mangotango --install-location /Applications "./dist/mangotango_${{matrix.artifact_name}}_signed.pkg" --sign "$APPLE_INST_CERT_ID"
pkgbuild --identifier "org.cibmangotree.cli" --timestamp --root /tmp/cibmangotree --install-location /Applications "./dist/cibmangotree_${{matrix.artifact_name}}_signed.pkg" --sign "$APPLE_INST_CERT_ID"

- name: Notarize Mac package
if: runner.os == 'macOS' && inputs.is_release
env:
APPLE_DEV_EMAIL: ${{secrets.APPLE_DEV_EMAIL}}
APPLE_TEAM_ID: ${{secrets.TEAM_ID}}
APP_SPEC_PASS: ${{secrets.APP_SPEC_PASS}}
run: xcrun notarytool submit dist/mangotango_${{matrix.artifact_name}}_signed.pkg --apple-id $APPLE_DEV_EMAIL --team-id $APPLE_TEAM_ID --password $APP_SPEC_PASS --wait
run: xcrun notarytool submit dist/cibmangotree_${{matrix.artifact_name}}_signed.pkg --apple-id $APPLE_DEV_EMAIL --team-id $APPLE_TEAM_ID --password $APP_SPEC_PASS --wait

- name: Staple the notarization ticket
if: runner.os == 'macOS' && inputs.is_release
run: xcrun stapler staple dist/mangotango_${{matrix.artifact_name}}_signed.pkg
run: xcrun stapler staple dist/cibmangotree_${{matrix.artifact_name}}_signed.pkg

- name: Clean up macOS Artifacts
if: runner.os == 'macOS' && inputs.is_release
run: |
rm -rf /tmp/mangotango
rm -rf dist/mangotango_${{matrix.artifact_name}}
rm -rf dist/mangotango_${{matrix.artifact_name}}.pkg
mv dist/mangotango_${{matrix.artifact_name}}_signed.pkg dist/mangotango_${{matrix.artifact_name}}.pkg
rm -rf /tmp/cibmangotree
rm -rf dist/cibmangotree_${{matrix.artifact_name}}
rm -rf dist/cibmangotree_${{matrix.artifact_name}}.pkg
mv dist/cibmangotree_${{matrix.artifact_name}}_signed.pkg dist/cibmangotree_${{matrix.artifact_name}}.pkg

- name: Compute the SHA1 hashsum for macOS .pkg
if: runner.os == 'macOS' && inputs.is_release
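Each `sha_command` above publishes a `.sha1` file next to its artifact; a downloaded build can be re-checked locally with a short helper (illustrative, not part of the workflow; the `.sha1` layout is assumed to be `<hex digest> [filename]`):

```python
import hashlib
from pathlib import Path


def sha1_of(path: Path) -> str:
    """Stream the file so large executables never load fully into memory."""
    h = hashlib.sha1()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()


def verify(artifact: Path, checksum_file: Path) -> bool:
    # shasum appends the filename after the digest, so compare only
    # the first whitespace-separated field.
    expected = checksum_file.read_text().split()[0].strip().lower()
    return sha1_of(artifact) == expected
```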
13 changes: 0 additions & 13 deletions .mcp.json
@@ -14,19 +14,6 @@
],
"env": {}
},
"sequential-thinking": {
"type": "stdio",
"command": "npx",
"args": [
"-y",
"@modelcontextprotocol/server-sequential-thinking"
],
"env": {}
},
"github": {
"type": "http",
"url": "https://api.githubcopilot.com/mcp/"
},
"context7": {
"type": "http",
"url": "https://mcp.context7.com/mcp"
21 changes: 2 additions & 19 deletions CLAUDE.md
@@ -1,15 +1,5 @@
# Claude Code - Mango Tango CLI Integration

## Session Initialization - CRITICAL

**Always start every Claude Code session with**:

```markdown
Read the initial instructions
```

This initializes Serena semantic analysis capabilities and loads project context.

## Project Context

### Core Documentation
@@ -37,14 +27,6 @@ This initializes Serena semantic analysis capabilities and loads project context

### Essential Serena Usage

**Project Onboarding** (done once):

```markdown
- Call `initial_instructions` tool first
- Use `check_onboarding_performed` to verify setup
- If needed, call `onboarding` tool for comprehensive analysis
```

**Symbol-Level Development**:

```markdown
@@ -119,6 +101,7 @@ find_symbol("AppContext", include_body=True)
### Code Development Standards

**Logging Integration:**

```python
from app.logger import get_logger
logger = get_logger(__name__)
Expand Down Expand Up @@ -194,7 +177,7 @@ read_memory("task_completion_checklist") # Before committing

### Symbol Navigation Examples

```python
```markdown
# Find app entry point
find_symbol("main", relative_path="mangotango.py")

10 changes: 5 additions & 5 deletions analyzers/__init__.py
@@ -3,13 +3,13 @@
from .example.example_base import example_base
from .example.example_report import example_report
from .example.example_web import example_web
from .hashtags import hashtags
from .hashtags_web import hashtags_web
from .hashtags.hashtags_base import hashtags
from .hashtags.hashtags_web import hashtags_web
from .ngrams.ngram_stats import ngram_stats
from .ngrams.ngram_web import ngrams_web
from .ngrams.ngrams_base import ngrams
from .temporal import temporal
from .temporal_barplot import temporal_barplot
from .temporal.temporal_base import temporal
from .temporal.temporal_web import temporal_web
from .time_coordination import time_coordination

suite = AnalyzerSuite(
@@ -22,7 +22,7 @@
ngrams_web,
time_coordination,
temporal,
temporal_barplot,
temporal_web,
hashtags,
hashtags_web,
]
6 changes: 0 additions & 6 deletions analyzers/hashtags/__init__.py
@@ -1,6 +0,0 @@
from analyzer_interface import AnalyzerDeclaration

from .interface import interface
from .main import main

hashtags = AnalyzerDeclaration(interface=interface, main=main, is_distributed=True)
6 changes: 6 additions & 0 deletions analyzers/hashtags/hashtags_base/__init__.py
@@ -0,0 +1,6 @@
from analyzer_interface import AnalyzerDeclaration

from .interface import interface
from .main import main

hashtags = AnalyzerDeclaration(interface=interface, main=main, is_distributed=True)
@@ -8,9 +8,9 @@
)
from analyzer_interface.params import TimeBinningParam

COL_AUTHOR_ID = "user_id"
COL_TIME = "time"
COL_POST = "text"
COL_AUTHOR_ID = "Unique UserID"
COL_TIME = "Timestamp"
COL_POST = "Post Content"

PARAM_TIME_WINDOW = "time_window"

File renamed without changes.
@@ -1,6 +1,6 @@
import polars as pl

from ..hashtags.interface import OUTPUT_COL_HASHTAGS, OUTPUT_COL_USERS
from ..hashtags_base.interface import OUTPUT_COL_HASHTAGS, OUTPUT_COL_USERS


def secondary_analyzer(primary_output, timewindow):
@@ -6,7 +6,7 @@
from shiny import reactive, render, ui
from shinywidgets import output_widget, render_widget

from ..hashtags.interface import COL_AUTHOR_ID, COL_POST, COL_TIME
from ..hashtags_base.interface import COL_AUTHOR_ID, COL_POST, COL_TIME
from .analysis import secondary_analyzer
from .plots import (
MANGO_DARK_GREEN,
@@ -368,7 +368,7 @@ def tweets():
pl.col(COL_TIME).dt.strftime("%B %d, %Y %I:%M %p")
)

df_posts = df_posts.rename({"time": "Post date and time", "text": "Text"})
df_posts = df_posts.rename({COL_TIME: "Post date and time", COL_POST: "Text"})

df_posts = df_posts.drop(pl.col(COL_AUTHOR_ID))

Expand Down