diff --git a/analyzers/__init__.py b/analyzers/__init__.py index 9fbf9c9c..2b2723ab 100644 --- a/analyzers/__init__.py +++ b/analyzers/__init__.py @@ -5,9 +5,9 @@ from .example.example_web import example_web from .hashtags import hashtags from .hashtags_web import hashtags_web -from .ngram_stats import ngram_stats -from .ngram_web import ngrams_web -from .ngrams import ngrams +from .ngrams.ngram_stats import ngram_stats +from .ngrams.ngram_web import ngrams_web +from .ngrams.ngrams_base import ngrams from .temporal import temporal from .temporal_barplot import temporal_barplot from .time_coordination import time_coordination diff --git a/analyzers/ngrams/__init__.py b/analyzers/ngrams/__init__.py index 3f2eb6dc..e69de29b 100644 --- a/analyzers/ngrams/__init__.py +++ b/analyzers/ngrams/__init__.py @@ -1,6 +0,0 @@ -from analyzer_interface import AnalyzerDeclaration - -from .interface import interface -from .main import main - -ngrams = AnalyzerDeclaration(interface=interface, main=main, is_distributed=True) diff --git a/analyzers/ngram_stats/__init__.py b/analyzers/ngrams/ngram_stats/__init__.py similarity index 100% rename from analyzers/ngram_stats/__init__.py rename to analyzers/ngrams/ngram_stats/__init__.py diff --git a/analyzers/ngram_stats/interface.py b/analyzers/ngrams/ngram_stats/interface.py similarity index 97% rename from analyzers/ngram_stats/interface.py rename to analyzers/ngrams/ngram_stats/interface.py index 85f055e0..5b904d08 100644 --- a/analyzers/ngram_stats/interface.py +++ b/analyzers/ngrams/ngram_stats/interface.py @@ -1,7 +1,7 @@ from analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface -from ..ngrams import interface as ngrams_interface -from ..ngrams.interface import ( +from ..ngrams_base import interface as ngrams_interface +from ..ngrams_base.interface import ( COL_AUTHOR_ID, COL_MESSAGE_ID, COL_MESSAGE_NGRAM_COUNT, diff --git a/analyzers/ngram_stats/main.py b/analyzers/ngrams/ngram_stats/main.py similarity 
index 99% rename from analyzers/ngram_stats/main.py rename to analyzers/ngrams/ngram_stats/main.py index 09ab5bf6..7aa4f961 100644 --- a/analyzers/ngram_stats/main.py +++ b/analyzers/ngrams/ngram_stats/main.py @@ -5,7 +5,7 @@ from analyzer_interface.context import SecondaryAnalyzerContext from terminal_tools import ProgressReporter -from ..ngrams.interface import ( +from ..ngrams_base.interface import ( COL_AUTHOR_ID, COL_MESSAGE_ID, COL_MESSAGE_NGRAM_COUNT, diff --git a/analyzers/ngram_web/__init__.py b/analyzers/ngrams/ngram_web/__init__.py similarity index 100% rename from analyzers/ngram_web/__init__.py rename to analyzers/ngrams/ngram_web/__init__.py diff --git a/analyzers/ngram_web/factory.py b/analyzers/ngrams/ngram_web/factory.py similarity index 100% rename from analyzers/ngram_web/factory.py rename to analyzers/ngrams/ngram_web/factory.py diff --git a/analyzers/ngram_web/interface.py b/analyzers/ngrams/ngram_web/interface.py similarity index 86% rename from analyzers/ngram_web/interface.py rename to analyzers/ngrams/ngram_web/interface.py index 35b78399..203514a4 100644 --- a/analyzers/ngram_web/interface.py +++ b/analyzers/ngrams/ngram_web/interface.py @@ -1,7 +1,7 @@ from analyzer_interface import WebPresenterInterface from ..ngram_stats import interface as ngram_stats_interface -from ..ngrams import interface as ngrams_interface +from ..ngrams_base import interface as ngrams_interface interface = WebPresenterInterface( id="ngram_repetition_by_poster", diff --git a/analyzers/ngrams/ngrams_base/__init__.py b/analyzers/ngrams/ngrams_base/__init__.py new file mode 100644 index 00000000..3f2eb6dc --- /dev/null +++ b/analyzers/ngrams/ngrams_base/__init__.py @@ -0,0 +1,6 @@ +from analyzer_interface import AnalyzerDeclaration + +from .interface import interface +from .main import main + +ngrams = AnalyzerDeclaration(interface=interface, main=main, is_distributed=True) diff --git a/analyzers/ngrams/interface.py b/analyzers/ngrams/ngrams_base/interface.py 
similarity index 100% rename from analyzers/ngrams/interface.py rename to analyzers/ngrams/ngrams_base/interface.py diff --git a/analyzers/ngrams/main.py b/analyzers/ngrams/ngrams_base/main.py similarity index 95% rename from analyzers/ngrams/main.py rename to analyzers/ngrams/ngrams_base/main.py index 01525717..8b54a6a3 100644 --- a/analyzers/ngrams/main.py +++ b/analyzers/ngrams/ngrams_base/main.py @@ -35,7 +35,7 @@ def main(context: PrimaryAnalyzerContext): & (pl.col(COL_AUTHOR_ID) != "") ) - with ProgressReporter("Generating n-grams") as progress: + with ProgressReporter("Detecting n-grams") as progress: def get_ngram_rows(ngrams_by_id: dict[str, int]): nonlocal progress @@ -63,7 +63,8 @@ def get_ngram_rows(ngrams_by_id: dict[str, int]): ( pl.DataFrame(df_ngram_instances) .group_by(COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID) - .agg(pl.count().alias(COL_MESSAGE_NGRAM_COUNT)) + .agg(pl.len().alias(COL_MESSAGE_NGRAM_COUNT)) + .sort(by=[COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID]) .write_parquet(context.output(OUTPUT_MESSAGE_NGRAMS).parquet_path) ) diff --git a/analyzers/ngrams/test_data/__init__.py b/analyzers/ngrams/test_data/__init__.py new file mode 100644 index 00000000..8906f86c --- /dev/null +++ b/analyzers/ngrams/test_data/__init__.py @@ -0,0 +1,3 @@ +from pathlib import Path + +test_data_dir = Path(__file__).parent.resolve() diff --git a/analyzers/ngrams/test_data/message_authors.parquet b/analyzers/ngrams/test_data/message_authors.parquet new file mode 100644 index 00000000..d1dcb168 Binary files /dev/null and b/analyzers/ngrams/test_data/message_authors.parquet differ diff --git a/analyzers/ngrams/test_data/message_ngrams.parquet b/analyzers/ngrams/test_data/message_ngrams.parquet new file mode 100644 index 00000000..585915e7 Binary files /dev/null and b/analyzers/ngrams/test_data/message_ngrams.parquet differ diff --git a/analyzers/ngrams/test_data/ngram_full.parquet b/analyzers/ngrams/test_data/ngram_full.parquet new file mode 100644 index 00000000..d47cb526 
Binary files /dev/null and b/analyzers/ngrams/test_data/ngram_full.parquet differ diff --git a/analyzers/ngrams/test_data/ngram_stats.parquet b/analyzers/ngrams/test_data/ngram_stats.parquet new file mode 100644 index 00000000..901257ac Binary files /dev/null and b/analyzers/ngrams/test_data/ngram_stats.parquet differ diff --git a/analyzers/ngrams/test_data/ngrams.parquet b/analyzers/ngrams/test_data/ngrams.parquet new file mode 100644 index 00000000..39ee8e0b Binary files /dev/null and b/analyzers/ngrams/test_data/ngrams.parquet differ diff --git a/analyzers/ngrams/test_data/ngrams_test_input.csv b/analyzers/ngrams/test_data/ngrams_test_input.csv new file mode 100644 index 00000000..27853095 --- /dev/null +++ b/analyzers/ngrams/test_data/ngrams_test_input.csv @@ -0,0 +1,13 @@ +user_id,message_id,message_text,timestamp +user_004,msg_001,Urgent action needed before it's late.,2024-01-15T09:03:00Z +user_005,msg_002,Climate emergency requires urgent action.,2024-01-15T12:12:00Z +user_004,msg_003,Urgent action needed to save planet.,2024-01-15T13:15:00Z +user_004,msg_004,"Climate emergency requires immediate response.",2024-01-15T14:18:00Z +user_005,msg_005,Urgent action needed to save planet.,2024-01-15T15:21:00Z +user_004,msg_006,Climate emergency requires massive investment.,2024-01-15T16:24:00Z +user_004,msg_007,Climate emergency requires global cooperation.,2024-01-15T19:33:00Z +user_005,msg_008,Someone needs fight the system soon.,2024-01-15T20:36:00Z +user_004,msg_009,We must get up and fight the system.,2024-01-15T21:39:00Z +user_001,msg_010,Just discovered this amazing new coffee shop downtown! The atmosphere is incredible and the barista really knows their craft.,2024-01-16T22:51:00Z +user_002,msg_011,Working from home has its perks but I miss the office dynamics sometimes. Finding balance is key.,2024-01-16T23:54:00Z +user_003,msg_012,Sunday morning thoughts: grateful for family time and peaceful moments. 
Life's simple pleasures matter most.,2024-01-17T00:57:00Z diff --git a/analyzers/ngrams/test_ngram_stats.py b/analyzers/ngrams/test_ngram_stats.py new file mode 100644 index 00000000..77cfb11b --- /dev/null +++ b/analyzers/ngrams/test_ngram_stats.py @@ -0,0 +1,41 @@ +from pathlib import Path + +from testing import ParquetTestData, test_secondary_analyzer + +from .ngram_stats.interface import OUTPUT_NGRAM_FULL, OUTPUT_NGRAM_STATS, interface +from .ngram_stats.main import main +from .ngrams_base.interface import ( + OUTPUT_MESSAGE, + OUTPUT_MESSAGE_NGRAMS, + OUTPUT_NGRAM_DEFS, +) +from .test_data import test_data_dir + + +# This example shows you how to test a secondary analyzer. +# It runs on pytest. +def test_ngram_stats(): + # You use this test function. + test_secondary_analyzer( + interface, + main, + primary_outputs={ + OUTPUT_MESSAGE_NGRAMS: ParquetTestData( + filepath=str(Path(test_data_dir, OUTPUT_MESSAGE_NGRAMS + ".parquet")) + ), + OUTPUT_NGRAM_DEFS: ParquetTestData( + filepath=str(Path(test_data_dir, OUTPUT_NGRAM_DEFS + ".parquet")) + ), + OUTPUT_MESSAGE: ParquetTestData( + filepath=str(Path(test_data_dir, OUTPUT_MESSAGE + ".parquet")) + ), + }, + expected_outputs={ + OUTPUT_NGRAM_STATS: ParquetTestData( + str(Path(test_data_dir, OUTPUT_NGRAM_STATS + ".parquet")) + ), + OUTPUT_NGRAM_FULL: ParquetTestData( + str(Path(test_data_dir, OUTPUT_NGRAM_FULL + ".parquet")) + ), + }, + ) diff --git a/analyzers/ngrams/test_ngrams_base.py b/analyzers/ngrams/test_ngrams_base.py new file mode 100644 index 00000000..417adf79 --- /dev/null +++ b/analyzers/ngrams/test_ngrams_base.py @@ -0,0 +1,162 @@ +import types +from pathlib import Path + +from preprocessing.series_semantic import datetime_string, identifier, text_catch_all +from testing import CsvTestData, ParquetTestData, test_primary_analyzer + +from .ngrams_base.interface import ( + COL_AUTHOR_ID, + COL_MESSAGE_ID, + COL_MESSAGE_TEXT, + COL_MESSAGE_TIMESTAMP, + OUTPUT_MESSAGE, + OUTPUT_MESSAGE_NGRAMS, + 
OUTPUT_NGRAM_DEFS, + interface, +) +from .ngrams_base.main import main, ngrams, serialize_ngram, tokenize +from .test_data import test_data_dir + +TEST_CSV_FILENAME = "ngrams_test_input.csv" +TEST_STRING = "Mango tree is an open source project." + +# this is the expected output of tokenize() +TEST_TOKENIZED_EXPECTED = [ + "mango", # it's lower cased + "tree", + "is", + "an", + "open", + "source", + "project.", # punctuation is not stripped +] + +NGRAMS_EXPECTED_min1_max3 = [ + ["mango"], + ["mango", "tree"], + ["mango", "tree", "is"], + ["tree"], + ["tree", "is"], + ["tree", "is", "an"], + ["is"], + ["is", "an"], + ["is", "an", "open"], + ["an"], + ["an", "open"], + ["an", "open", "source"], + ["open"], + ["open", "source"], + ["open", "source", "project."], + ["source"], + ["source", "project."], + ["project."], +] + +NGRAMS_EXPECTED_min5_max7 = [ + ["mango", "tree", "is", "an", "open"], + ["mango", "tree", "is", "an", "open", "source"], + ["mango", "tree", "is", "an", "open", "source", "project."], + ["tree", "is", "an", "open", "source"], + ["tree", "is", "an", "open", "source", "project."], + ["is", "an", "open", "source", "project."], +] + +# max=8 exceeds the 7 tokens available, so no 8-grams exist and the expected result is the same as for min5_max7 +NGRAMS_EXPECTED_min5_max8 = [ + ["mango", "tree", "is", "an", "open"], + ["mango", "tree", "is", "an", "open", "source"], + ["mango", "tree", "is", "an", "open", "source", "project."], + ["tree", "is", "an", "open", "source"], + ["tree", "is", "an", "open", "source", "project."], + ["is", "an", "open", "source", "project."], +] + + +def test_tokenize(): + test_tokenized_actual = tokenize(TEST_STRING) + + assert isinstance( + test_tokenized_actual, list + ), "output of tokenize() is not instance of list" + + assert all( + [ + expected_str == actual_str + for expected_str, actual_str in zip( + TEST_TOKENIZED_EXPECTED, test_tokenized_actual + ) + ] + ), "Tokenized strings does not matched expected tokens." 
+ + pass + + +def test_ngrams(): + test_string_tokenized = tokenize(TEST_STRING) + + test_combinations = { + "min1_max3": { + "min_gram_len": 1, + "max_ngram_len": 3, + "n_expected_ngrams_found": 18, + }, + "min5_max7": { + "min_gram_len": 5, + "max_ngram_len": 7, + "n_expected_ngrams_found": 6, + }, + "min5_max8": { + "min_gram_len": 5, + "max_ngram_len": 8, + "n_expected_ngrams_found": 6, + }, + } + + for test_key, test_params in test_combinations.items(): + ngrams_actual = ngrams( + test_string_tokenized, + min=test_params["min_gram_len"], + max=test_params["max_ngram_len"], + ) + + assert isinstance(ngrams_actual, types.GeneratorType) + assert ( + len(list(ngrams_actual)) == test_params["n_expected_ngrams_found"] + ), f"Nr. expected tokens mismatch for {test_key}" + + +def test_serialize_ngram(): + NGRAM_SERIALIZED_EXPECTED_FIRST = "mango tree is an open" + + test_ngrams = list(ngrams(tokenize(TEST_STRING), min=5, max=8)) + + test_ngram_serialized_actual = serialize_ngram(test_ngrams[0]) + + assert NGRAM_SERIALIZED_EXPECTED_FIRST == test_ngram_serialized_actual + + +def test_ngram_analyzer(): + test_primary_analyzer( + interface=interface, + main=main, + input=CsvTestData( + filepath=str(Path(test_data_dir, TEST_CSV_FILENAME)), + semantics={ + COL_AUTHOR_ID: identifier, + COL_MESSAGE_ID: identifier, + COL_MESSAGE_TEXT: text_catch_all, + COL_MESSAGE_TIMESTAMP: datetime_string, + }, + ), + outputs={ + OUTPUT_MESSAGE_NGRAMS: ParquetTestData( + filepath=str(Path(test_data_dir, OUTPUT_MESSAGE_NGRAMS + ".parquet")) + ), + OUTPUT_NGRAM_DEFS: ParquetTestData( + filepath=str(Path(test_data_dir, OUTPUT_NGRAM_DEFS + ".parquet")) + ), + OUTPUT_MESSAGE: ParquetTestData( + filepath=str(Path(test_data_dir, OUTPUT_MESSAGE + ".parquet")) + ), + }, + ) diff --git a/testing/__init__.py b/testing/__init__.py index 2d53a031..962da5f2 100644 --- a/testing/__init__.py +++ b/testing/__init__.py @@ -3,6 +3,7 @@ CsvTestData, ExcelTestData, JsonTestData, + ParquetTestData, 
PolarsTestData, ) from .testers import test_primary_analyzer, test_secondary_analyzer diff --git a/testing/testdata.py b/testing/testdata.py index ec17e75a..1cfd61d8 100644 --- a/testing/testdata.py +++ b/testing/testdata.py @@ -106,6 +106,11 @@ def _load_as_polars(self) -> pl.DataFrame: return pl.read_excel(self.filepath) +class ParquetTestData(FileTestData): + def _load_as_polars(self) -> pl.DataFrame: + return pl.read_parquet(self.filepath) + + class PolarsTestData(TestData): def __init__(self, df: pl.DataFrame): self.df = df