Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
52f66e3
Fix Korean script classification to use space-separated tokenization
JoeKarow Oct 14, 2025
537fcd1
Add support for ordinals, large numbers, hyphens, Korean, and Unicode…
JoeKarow Oct 14, 2025
2e23ea8
Add comprehensive test coverage for issue #236 fixes
JoeKarow Oct 14, 2025
e47021e
Add pattern caching and cashtag support for performance and bot detec…
JoeKarow Oct 15, 2025
16ce987
Add cashtag config and change unicode normalization default for bot d…
JoeKarow Oct 15, 2025
6b9bf49
Optimize tokenizer performance and add mixed-script preservation for …
JoeKarow Oct 15, 2025
6a3505c
Add comprehensive bot detection edge case tests
JoeKarow Oct 15, 2025
53c8a36
Refactor tokenizer test suite for maintainability
JoeKarow Oct 15, 2025
d01aece
Refactor tokenizer tests: consolidate and parametrize test cases
JoeKarow Oct 15, 2025
25437c5
formatting
JoeKarow Oct 15, 2025
a5bcc33
formatting
JoeKarow Oct 15, 2025
b940ddf
refactor: clean up tokenizer test fixtures and remove unused helpers
JoeKarow Oct 15, 2025
488dca5
Merge branch 'develop' into JoeKarow/236-BasicTokenizer-fixes
JoeKarow Oct 17, 2025
6e320b9
fix: Add curly apostrophe and possessive support to Latin word pattern
JoeKarow Oct 22, 2025
3353424
formatting
JoeKarow Oct 22, 2025
0464a7f
refactor(ngrams): Deduplicate n-grams in-loop for clearer semantics
JoeKarow Oct 22, 2025
9b8291a
test(ngrams): Add comprehensive repetition counting edge case tests
JoeKarow Oct 22, 2025
9298bb8
test(ngrams): Augment test data with within-message repetition scenarios
JoeKarow Oct 22, 2025
6917730
docs(ngrams): Add test data validation documentation
JoeKarow Oct 22, 2025
53ee014
cleanup
JoeKarow Oct 22, 2025
44aefd5
Merge branch 'develop' into JoeKarow/241-koren-repetition
KristijanArmeni Oct 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 78 additions & 27 deletions analyzers/ngrams/ngrams_base/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,12 @@
def main(context: PrimaryAnalyzerContext):
# Get parameters with defaults
parameters = context.params
min_n = parameters.get(PARAM_MIN_N, 3)
max_n = parameters.get(PARAM_MAX_N, 5)
min_n_param = parameters.get(PARAM_MIN_N, 3)
max_n_param = parameters.get(PARAM_MAX_N, 5)
assert isinstance(min_n_param, int), "min_n parameter must be an integer"
assert isinstance(max_n_param, int), "max_n parameter must be an integer"
min_n = min_n_param
max_n = max_n_param

# Configure tokenizer for social media text processing
tokenizer_config = TokenizerConfig(
Expand Down Expand Up @@ -60,49 +64,96 @@ def get_ngram_rows(ngrams_by_id: dict[str, int]):
current_row = 0
for row in df_input.iter_rows(named=True):
tokens = tokenize_text(row[COL_MESSAGE_TEXT], tokenizer_config)

# Deduplicate n-grams within this message
# This ensures that if "go go" appears twice in "go go go now",
# we yield one row with count=2, not two separate rows.
# This semantically represents: "This n-gram appears N times in this message"
message_ngrams = {} # {ngram_id: count}

for ngram in ngrams(tokens, min_n, max_n):
serialized_ngram = serialize_ngram(ngram)
if serialized_ngram not in ngrams_by_id:
ngrams_by_id[serialized_ngram] = len(ngrams_by_id)
ngram_id = ngrams_by_id[serialized_ngram]

# Count occurrences of this n-gram within this message
message_ngrams[ngram_id] = message_ngrams.get(ngram_id, 0) + 1

# Yield one row per unique n-gram in this message (with count)
for ngram_id, count in message_ngrams.items():
yield {
COL_MESSAGE_SURROGATE_ID: row[COL_MESSAGE_SURROGATE_ID],
COL_NGRAM_ID: ngram_id,
COL_MESSAGE_NGRAM_COUNT: count,
}

current_row = current_row + 1
if current_row % 100 == 0:
progress.update(current_row / num_rows)

ngrams_by_id: dict[str, int] = {}
df_ngram_instances = pl.DataFrame(get_ngram_rows(ngrams_by_id))

with ProgressReporter("Computing per-message n-gram statistics"):
(
pl.DataFrame(df_ngram_instances)
.group_by(COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID)
.agg(pl.len().alias(COL_MESSAGE_NGRAM_COUNT))
.sort(by=[COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID])
.write_parquet(context.output(OUTPUT_MESSAGE_NGRAMS).parquet_path)
)
df_ngram_instances = list(get_ngram_rows(ngrams_by_id))

# N-gram deduplication is now done in-loop in get_ngram_rows(),
# so we don't need to group and aggregate here.
with ProgressReporter("Writing per-message n-gram data"):
# Handle empty case by providing explicit schema
if df_ngram_instances:
(
pl.DataFrame(df_ngram_instances)
.sort(by=[COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID])
.write_parquet(context.output(OUTPUT_MESSAGE_NGRAMS).parquet_path)
)
else:
# Create empty DataFrame with correct schema
pl.DataFrame(
{
COL_MESSAGE_SURROGATE_ID: [],
COL_NGRAM_ID: [],
COL_MESSAGE_NGRAM_COUNT: [],
},
schema={
COL_MESSAGE_SURROGATE_ID: pl.Int64,
COL_NGRAM_ID: pl.Int64,
COL_MESSAGE_NGRAM_COUNT: pl.Int64,
},
).write_parquet(context.output(OUTPUT_MESSAGE_NGRAMS).parquet_path)

with ProgressReporter("Outputting n-gram definitions"):
(
# Handle empty case by providing explicit schema
if ngrams_by_id:
(
pl.DataFrame(
{
COL_NGRAM_ID: list(ngrams_by_id.values()),
COL_NGRAM_WORDS: list(ngrams_by_id.keys()),
}
)
.with_columns(
[
pl.col(COL_NGRAM_WORDS)
.str.split(" ")
.list.len()
.alias(COL_NGRAM_LENGTH)
]
)
.write_parquet(context.output(OUTPUT_NGRAM_DEFS).parquet_path)
)
else:
# Create empty DataFrame with correct schema
pl.DataFrame(
{
COL_NGRAM_ID: list(ngrams_by_id.values()),
COL_NGRAM_WORDS: list(ngrams_by_id.keys()),
}
)
.with_columns(
[
pl.col(COL_NGRAM_WORDS)
.str.split(" ")
.list.len()
.alias(COL_NGRAM_LENGTH)
]
)
.write_parquet(context.output(OUTPUT_NGRAM_DEFS).parquet_path)
)
COL_NGRAM_ID: [],
COL_NGRAM_WORDS: [],
COL_NGRAM_LENGTH: [],
},
schema={
COL_NGRAM_ID: pl.Int64,
COL_NGRAM_WORDS: pl.String,
COL_NGRAM_LENGTH: pl.Int64,
},
).write_parquet(context.output(OUTPUT_NGRAM_DEFS).parquet_path)

with ProgressReporter("Outputting messages"):
(
Expand Down
Binary file modified analyzers/ngrams/test_data/message_authors.parquet
Binary file not shown.
Binary file modified analyzers/ngrams/test_data/message_ngrams.parquet
Binary file not shown.
Binary file modified analyzers/ngrams/test_data/ngrams.parquet
Binary file not shown.
3 changes: 3 additions & 0 deletions analyzers/ngrams/test_data/ngrams_test_input.csv
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,6 @@ user_004,msg_009,We must get up and fight the system.,2024-01-15T21:39:00Z
user_001,msg_010,Just discovered this amazing new coffee shop downtown! The atmosphere is incredible and the barista really knows their craft.,2024-01-16T22:51:00Z
user_002,msg_011,Working from home has its perks but I miss the office dynamics sometimes. Finding balance is key.,2024-01-16T23:54:00Z
user_003,msg_012,Sunday morning thoughts: grateful for family time and peaceful moments. Life's simple pleasures matter most.,2024-01-17T00:57:00Z
user_001,msg_013,go go go now we need to go,2024-01-18T10:00:00Z
user_002,msg_014,긴급 긴급 조치 필요합니다,2024-01-18T11:00:00Z
user_003,msg_015,abc abc abc test,2024-01-18T12:00:00Z
Loading