civictechdc · JoeKarow · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025 · Oct 15, 2025
diff --git a/analyzers/ngrams/ngrams_base/main.py b/analyzers/ngrams/ngrams_base/main.py
@@ -26,8 +26,12 @@
 def main(context: PrimaryAnalyzerContext):
     # Get parameters with defaults
     parameters = context.params
-    min_n = parameters.get(PARAM_MIN_N, 3)
-    max_n = parameters.get(PARAM_MAX_N, 5)
+    min_n_param = parameters.get(PARAM_MIN_N, 3)
+    max_n_param = parameters.get(PARAM_MAX_N, 5)
+    assert isinstance(min_n_param, int), "min_n parameter must be an integer"
+    assert isinstance(max_n_param, int), "max_n parameter must be an integer"
+    min_n = min_n_param
+    max_n = max_n_param
 
     # Configure tokenizer for social media text processing
     tokenizer_config = TokenizerConfig(
@@ -60,49 +64,96 @@ def get_ngram_rows(ngrams_by_id: dict[str, int]):
             current_row = 0
             for row in df_input.iter_rows(named=True):
                 tokens = tokenize_text(row[COL_MESSAGE_TEXT], tokenizer_config)
+
+                # Count how often each n-gram occurs within this message.
+                # For example, if "go go" appears twice in "go go go now",
+                # we yield one row with count=2 rather than two separate rows.
+                # Each row then means: "this n-gram appears N times in this message".
+                message_ngrams = {}  # {ngram_id: count}
+
                 for ngram in ngrams(tokens, min_n, max_n):
                     serialized_ngram = serialize_ngram(ngram)
                     if serialized_ngram not in ngrams_by_id:
                         ngrams_by_id[serialized_ngram] = len(ngrams_by_id)
                     ngram_id = ngrams_by_id[serialized_ngram]
+
+                    # Count occurrences of this n-gram within this message
+                    message_ngrams[ngram_id] = message_ngrams.get(ngram_id, 0) + 1
+
+                # Yield one row per unique n-gram in this message (with count)
+                for ngram_id, count in message_ngrams.items():
                     yield {
                         COL_MESSAGE_SURROGATE_ID: row[COL_MESSAGE_SURROGATE_ID],
                         COL_NGRAM_ID: ngram_id,
+                        COL_MESSAGE_NGRAM_COUNT: count,
                     }
+
                 current_row = current_row + 1
                 if current_row % 100 == 0:
                     progress.update(current_row / num_rows)
 
         ngrams_by_id: dict[str, int] = {}
-        df_ngram_instances = pl.DataFrame(get_ngram_rows(ngrams_by_id))
-
-    with ProgressReporter("Computing per-message n-gram statistics"):
-        (
-            pl.DataFrame(df_ngram_instances)
-            .group_by(COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID)
-            .agg(pl.len().alias(COL_MESSAGE_NGRAM_COUNT))
-            .sort(by=[COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID])
-            .write_parquet(context.output(OUTPUT_MESSAGE_NGRAMS).parquet_path)
-        )
+        df_ngram_instances = list(get_ngram_rows(ngrams_by_id))
+
+    # get_ngram_rows() already yields one row per (message, n-gram) pair with
+    # its count, so no group-by/aggregate step is needed here.
+    with ProgressReporter("Writing per-message n-gram data"):
+        # No rows means no inferable column types; the else branch supplies an explicit schema
+        if df_ngram_instances:
+            (
+                pl.DataFrame(df_ngram_instances)
+                .sort(by=[COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID])
+                .write_parquet(context.output(OUTPUT_MESSAGE_NGRAMS).parquet_path)
+            )
+        else:
+            # Create empty DataFrame with correct schema
+            pl.DataFrame(
+                {
+                    COL_MESSAGE_SURROGATE_ID: [],
+                    COL_NGRAM_ID: [],
+                    COL_MESSAGE_NGRAM_COUNT: [],
+                },
+                schema={
+                    COL_MESSAGE_SURROGATE_ID: pl.Int64,
+                    COL_NGRAM_ID: pl.Int64,
+                    COL_MESSAGE_NGRAM_COUNT: pl.Int64,
+                },
+            ).write_parquet(context.output(OUTPUT_MESSAGE_NGRAMS).parquet_path)
 
     with ProgressReporter("Outputting n-gram definitions"):
-        (
+        # No n-grams means no inferable column types; the else branch supplies an explicit schema
+        if ngrams_by_id:
+            (
+                pl.DataFrame(
+                    {
+                        COL_NGRAM_ID: list(ngrams_by_id.values()),
+                        COL_NGRAM_WORDS: list(ngrams_by_id.keys()),
+                    }
+                )
+                .with_columns(
+                    [
+                        pl.col(COL_NGRAM_WORDS)
+                        .str.split(" ")
+                        .list.len()
+                        .alias(COL_NGRAM_LENGTH)
+                    ]
+                )
+                .write_parquet(context.output(OUTPUT_NGRAM_DEFS).parquet_path)
+            )
+        else:
+            # Create empty DataFrame with correct schema
             pl.DataFrame(
                 {
-                    COL_NGRAM_ID: list(ngrams_by_id.values()),
-                    COL_NGRAM_WORDS: list(ngrams_by_id.keys()),
-                }
-            )
-            .with_columns(
-                [
-                    pl.col(COL_NGRAM_WORDS)
-                    .str.split(" ")
-                    .list.len()
-                    .alias(COL_NGRAM_LENGTH)
-                ]
-            )
-            .write_parquet(context.output(OUTPUT_NGRAM_DEFS).parquet_path)
-        )
+                    COL_NGRAM_ID: [],
+                    COL_NGRAM_WORDS: [],
+                    COL_NGRAM_LENGTH: [],
+                },
+                schema={
+                    COL_NGRAM_ID: pl.Int64,
+                    COL_NGRAM_WORDS: pl.String,
+                    COL_NGRAM_LENGTH: pl.Int64,
+                },
+            ).write_parquet(context.output(OUTPUT_NGRAM_DEFS).parquet_path)
 
     with ProgressReporter("Outputting messages"):
         (

diff --git a/analyzers/ngrams/test_data/message_authors.parquet b/analyzers/ngrams/test_data/message_authors.parquet
diff --git a/analyzers/ngrams/test_data/message_ngrams.parquet b/analyzers/ngrams/test_data/message_ngrams.parquet
diff --git a/analyzers/ngrams/test_data/ngrams.parquet b/analyzers/ngrams/test_data/ngrams.parquet
diff --git a/analyzers/ngrams/test_data/ngrams_test_input.csv b/analyzers/ngrams/test_data/ngrams_test_input.csv
@@ -11,3 +11,6 @@ user_004,msg_009,We must get up and fight the system.,2024-01-15T21:39:00Z
 user_001,msg_010,Just discovered this amazing new coffee shop downtown! The atmosphere is incredible and the barista really knows their craft.,2024-01-16T22:51:00Z
 user_002,msg_011,Working from home has its perks but I miss the office dynamics sometimes. Finding balance is key.,2024-01-16T23:54:00Z
 user_003,msg_012,Sunday morning thoughts: grateful for family time and peaceful moments. Life's simple pleasures matter most.,2024-01-17T00:57:00Z
+user_001,msg_013,go go go now we need to go,2024-01-18T10:00:00Z
+user_002,msg_014,긴급 긴급 조치 필요합니다,2024-01-18T11:00:00Z
+user_003,msg_015,abc abc abc test,2024-01-18T12:00:00Z