6 changes: 3 additions & 3 deletions analyzers/__init__.py
@@ -5,9 +5,9 @@
 from .example.example_web import example_web
 from .hashtags import hashtags
 from .hashtags_web import hashtags_web
-from .ngram_stats import ngram_stats
-from .ngram_web import ngrams_web
-from .ngrams import ngrams
+from .ngrams.ngram_stats import ngram_stats
+from .ngrams.ngram_web import ngrams_web
+from .ngrams.ngrams_base import ngrams
 from .temporal import temporal
 from .temporal_barplot import temporal_barplot
 from .time_coordination import time_coordination
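For orientation, the layout this PR converges on, reconstructed from the imports in this diff (the two files "renamed without changes" below are not named in this capture, so some of the structure is inferred):

analyzers/ngrams/
├── ngrams_base/         the former top-level ngrams analyzer (__init__.py, interface.py, main.py)
├── ngram_stats/         secondary analyzer (interface.py, main.py, per the test imports)
├── ngram_web/           web presenter
├── test_data/           __init__.py, ngrams_test_input.csv, golden *.parquet files
├── test_ngram_stats.py
└── test_ngrams_base.py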
6 changes: 0 additions & 6 deletions analyzers/ngrams/__init__.py
@@ -1,6 +0,0 @@
-from analyzer_interface import AnalyzerDeclaration
-
-from .interface import interface
-from .main import main
-
-ngrams = AnalyzerDeclaration(interface=interface, main=main, is_distributed=True)
@@ -1,7 +1,7 @@
 from analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface

-from ..ngrams import interface as ngrams_interface
-from ..ngrams.interface import (
+from ..ngrams_base import interface as ngrams_interface
+from ..ngrams_base.interface import (
     COL_AUTHOR_ID,
     COL_MESSAGE_ID,
     COL_MESSAGE_NGRAM_COUNT,
@@ -5,7 +5,7 @@
 from analyzer_interface.context import SecondaryAnalyzerContext
 from terminal_tools import ProgressReporter

-from ..ngrams.interface import (
+from ..ngrams_base.interface import (
     COL_AUTHOR_ID,
     COL_MESSAGE_ID,
     COL_MESSAGE_NGRAM_COUNT,
File renamed without changes.
File renamed without changes.
@@ -1,7 +1,7 @@
 from analyzer_interface import WebPresenterInterface

 from ..ngram_stats import interface as ngram_stats_interface
-from ..ngrams import interface as ngrams_interface
+from ..ngrams_base import interface as ngrams_interface

 interface = WebPresenterInterface(
     id="ngram_repetition_by_poster",
6 changes: 6 additions & 0 deletions analyzers/ngrams/ngrams_base/__init__.py
@@ -0,0 +1,6 @@
+from analyzer_interface import AnalyzerDeclaration
+
+from .interface import interface
+from .main import main
+
+ngrams = AnalyzerDeclaration(interface=interface, main=main, is_distributed=True)
@@ -35,7 +35,7 @@ def main(context: PrimaryAnalyzerContext):
             & (pl.col(COL_AUTHOR_ID) != "")
         )

-    with ProgressReporter("Generating n-grams") as progress:
+    with ProgressReporter("Detecting n-grams") as progress:

         def get_ngram_rows(ngrams_by_id: dict[str, int]):
             nonlocal progress
@@ -63,7 +63,8 @@ def get_ngram_rows(ngrams_by_id: dict[str, int]):
         (
             pl.DataFrame(df_ngram_instances)
             .group_by(COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID)
-            .agg(pl.count().alias(COL_MESSAGE_NGRAM_COUNT))
+            .agg(pl.len().alias(COL_MESSAGE_NGRAM_COUNT))
+            .sort(by=[COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID])
             .write_parquet(context.output(OUTPUT_MESSAGE_NGRAMS).parquet_path)
         )
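Two behavioral notes on this hunk: pl.count() inside an aggregation is deprecated in newer polars releases in favor of pl.len(), and the added sort gives the written parquet a deterministic row order, which the golden-file tests added below depend on. A minimal sketch of the pattern:

import polars as pl

df = pl.DataFrame({"msg": [1, 1, 2], "ngram": ["a b", "a b", "c d"]})
counts = (
    df.group_by("msg", "ngram")
    .agg(pl.len().alias("count"))  # pl.len() replaces the deprecated pl.count()
    .sort(by=["msg", "ngram"])  # stable row order -> reproducible parquet snapshots
)
print(counts)  # (1, "a b") -> 2, (2, "c d") -> 1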
3 changes: 3 additions & 0 deletions analyzers/ngrams/test_data/__init__.py
@@ -0,0 +1,3 @@
+from pathlib import Path
+
+test_data_dir = Path(__file__).parent.resolve()
Binary file not shown.
Binary file added analyzers/ngrams/test_data/message_ngrams.parquet
Binary file not shown.
Binary file added analyzers/ngrams/test_data/ngram_full.parquet
Binary file not shown.
Binary file added analyzers/ngrams/test_data/ngram_stats.parquet
Binary file not shown.
Binary file added analyzers/ngrams/test_data/ngrams.parquet
Binary file not shown.
13 changes: 13 additions & 0 deletions analyzers/ngrams/test_data/ngrams_test_input.csv
@@ -0,0 +1,13 @@
+user_id,message_id,message_text,timestamp
+user_004,msg_001,Urgent action needed before it's late.,2024-01-15T09:03:00Z
+user_005,msg_002,Climate emergency requires urgent action.,2024-01-15T12:12:00Z
+user_004,msg_003,Urgent action needed to save planet.,2024-01-15T13:15:00Z
+user_004,msg_004,"Climate emergency requires immediate response.",2024-01-15T14:18:00Z
+user_005,msg_005,Urgent action needed to save planet.,2024-01-15T15:21:00Z
+user_004,msg_006,Climate emergency requires massive investment.,2024-01-15T16:24:00Z
+user_004,msg_007,Climate emergency requires global cooperation.,2024-01-15T19:33:00Z
+user_005,msg_008,Someone needs fight the system soon.,2024-01-15T20:36:00Z
+user_004,msg_009,We must get up and fight the system.,2024-01-15T21:39:00Z
+user_001,msg_010,Just discovered this amazing new coffee shop downtown! The atmosphere is incredible and the barista really knows their craft.,2024-01-16T22:51:00Z
+user_002,msg_011,Working from home has its perks but I miss the office dynamics sometimes. Finding balance is key.,2024-01-16T23:54:00Z
+user_003,msg_012,Sunday morning thoughts: grateful for family time and peaceful moments. Life's simple pleasures matter most.,2024-01-17T00:57:00Z
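The fixture is deliberately lopsided: user_004 and user_005 repeat near-identical "urgent action" / "climate emergency" phrasings (the coordinated-repetition signal the analyzer should surface), while user_001 through user_003 each post one organic message. A quick sanity check, assuming polars is installed and the repository root is the working directory:

import polars as pl

df = pl.read_csv("analyzers/ngrams/test_data/ngrams_test_input.csv")
# Message counts per author: user_004 -> 6, user_005 -> 3, the rest -> 1 each.
print(df.group_by("user_id").agg(pl.len()).sort("user_id"))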
41 changes: 41 additions & 0 deletions analyzers/ngrams/test_ngram_stats.py
@@ -0,0 +1,41 @@
+from pathlib import Path
+
+from testing import ParquetTestData, test_secondary_analyzer
+
+from .ngram_stats.interface import OUTPUT_NGRAM_FULL, OUTPUT_NGRAM_STATS, interface
+from .ngram_stats.main import main
+from .ngrams_base.interface import (
+    OUTPUT_MESSAGE,
+    OUTPUT_MESSAGE_NGRAMS,
+    OUTPUT_NGRAM_DEFS,
+)
+from .test_data import test_data_dir
+
+
+# This example shows how to test a secondary analyzer.
+# It runs under pytest.
+def test_ngram_stats():
+    # Feed recorded primary outputs in; compare against the golden outputs.
+    test_secondary_analyzer(
+        interface,
+        main,
+        primary_outputs={
+            OUTPUT_MESSAGE_NGRAMS: ParquetTestData(
+                filepath=str(Path(test_data_dir, OUTPUT_MESSAGE_NGRAMS + ".parquet"))
+            ),
+            OUTPUT_NGRAM_DEFS: ParquetTestData(
+                filepath=str(Path(test_data_dir, OUTPUT_NGRAM_DEFS + ".parquet"))
+            ),
+            OUTPUT_MESSAGE: ParquetTestData(
+                filepath=str(Path(test_data_dir, OUTPUT_MESSAGE + ".parquet"))
+            ),
+        },
+        expected_outputs={
+            OUTPUT_NGRAM_STATS: ParquetTestData(
+                str(Path(test_data_dir, OUTPUT_NGRAM_STATS + ".parquet"))
+            ),
+            OUTPUT_NGRAM_FULL: ParquetTestData(
+                str(Path(test_data_dir, OUTPUT_NGRAM_FULL + ".parquet"))
+            ),
+        },
+    )
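Assuming a standard pytest setup, this runs with e.g. pytest analyzers/ngrams/test_ngram_stats.py. The design point: primary_outputs are recorded inputs and expected_outputs the golden baseline, so the ngram_stats step is exercised in isolation from the primary n-gram extraction.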
162 changes: 162 additions & 0 deletions analyzers/ngrams/test_ngrams_base.py
@@ -0,0 +1,162 @@
+import types
+from pathlib import Path
+
+from preprocessing.series_semantic import datetime_string, identifier, text_catch_all
+from testing import CsvTestData, ParquetTestData, test_primary_analyzer
+
+from .ngrams_base.interface import (
+    COL_AUTHOR_ID,
+    COL_MESSAGE_ID,
+    COL_MESSAGE_TEXT,
+    COL_MESSAGE_TIMESTAMP,
+    OUTPUT_MESSAGE,
+    OUTPUT_MESSAGE_NGRAMS,
+    OUTPUT_NGRAM_DEFS,
+    interface,
+)
+from .ngrams_base.main import main, ngrams, serialize_ngram, tokenize
+from .test_data import test_data_dir
+
+TEST_CSV_FILENAME = "ngrams_test_input.csv"
+TEST_STRING = "Mango tree is an open source project."
+
+# Expected output of tokenize()
+TEST_TOKENIZED_EXPECTED = [
+    "mango",  # lowercased
+    "tree",
+    "is",
+    "an",
+    "open",
+    "source",
+    "project.",  # punctuation is not stripped
+]
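From the expected tokens, tokenize() evidently lowercases the input and splits on whitespace without stripping punctuation. A hypothetical one-liner consistent with that behavior, for orientation only (the real implementation lives in ngrams_base/main.py and is not shown in this diff):

def tokenize_sketch(text: str) -> list[str]:
    # Hypothetical stand-in: lowercase, whitespace split, punctuation kept.
    return text.lower().split()

assert tokenize_sketch("Mango tree is an open source project.") == [
    "mango", "tree", "is", "an", "open", "source", "project.",
]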

+NGRAMS_EXPECTED_min1_max3 = [
+    ["mango"],
+    ["mango", "tree"],
+    ["mango", "tree", "is"],
+    ["tree"],
+    ["tree", "is"],
+    ["tree", "is", "an"],
+    ["is"],
+    ["is", "an"],
+    ["is", "an", "open"],
+    ["an"],
+    ["an", "open"],
+    ["an", "open", "source"],
+    ["open"],
+    ["open", "source"],
+    ["open", "source", "project."],
+    ["source"],
+    ["source", "project."],
+    ["project."],
+]
+
+NGRAMS_EXPECTED_min5_max7 = [
+    ["mango", "tree", "is", "an", "open"],
+    ["mango", "tree", "is", "an", "open", "source"],
+    ["mango", "tree", "is", "an", "open", "source", "project."],
+    ["tree", "is", "an", "open", "source"],
+    ["tree", "is", "an", "open", "source", "project."],
+    ["is", "an", "open", "source", "project."],
+]
+
+# When max exceeds the token count, only the n-grams that fit are produced (same as min5_max7).
+NGRAMS_EXPECTED_min5_max8 = [
+    ["mango", "tree", "is", "an", "open"],
+    ["mango", "tree", "is", "an", "open", "source"],
+    ["mango", "tree", "is", "an", "open", "source", "project."],
+    ["tree", "is", "an", "open", "source"],
+    ["tree", "is", "an", "open", "source", "project."],
+    ["is", "an", "open", "source", "project."],
+]
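The three lists pin down the generator's contract: every window of length min through max at each start position, emitted start-major, with windows that would overrun the token list skipped, which is why min5_max8 equals min5_max7. A hypothetical sketch consistent with these expectations (not the implementation from ngrams_base/main.py):

from typing import Iterator


def ngrams_sketch(tokens: list[str], min: int, max: int) -> Iterator[list[str]]:
    # Yield every window of length min..max that fits at each start position.
    for start in range(len(tokens)):
        for length in range(min, max + 1):
            if start + length > len(tokens):
                break
            yield tokens[start : start + length]


def serialize_ngram_sketch(ngram: list[str]) -> str:
    # Space-join the tokens: ["mango", "tree"] -> "mango tree".
    return " ".join(ngram)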


+def test_tokenize():
+    test_tokenized_actual = tokenize(TEST_STRING)
+
+    assert isinstance(
+        test_tokenized_actual, list
+    ), "output of tokenize() is not an instance of list"
+
+    assert all(
+        [
+            expected_str == actual_str
+            for expected_str, actual_str in zip(
+                TEST_TOKENIZED_EXPECTED, test_tokenized_actual
+            )
+        ]
+    ), "Tokenized strings do not match the expected tokens."
+
+    pass


+def test_ngrams():
+    test_string_tokenized = tokenize(TEST_STRING)
+
+    test_combinations = {
+        "min1_max3": {
+            "min_ngram_len": 1,
+            "max_ngram_len": 3,
+            "n_expected_ngrams_found": 18,
+        },
+        "min5_max7": {
+            "min_ngram_len": 5,
+            "max_ngram_len": 7,
+            "n_expected_ngrams_found": 6,
+        },
+        "min5_max8": {
+            "min_ngram_len": 5,
+            "max_ngram_len": 8,
+            "n_expected_ngrams_found": 6,
+        },
+    }
+
+    for test_key, test_params in test_combinations.items():
+        ngrams_actual = ngrams(
+            test_string_tokenized,
+            min=test_params["min_ngram_len"],
+            max=test_params["max_ngram_len"],
+        )
+
+        assert isinstance(ngrams_actual, types.GeneratorType)
+        assert (
+            len(list(ngrams_actual)) == test_params["n_expected_ngrams_found"]
+        ), f"Expected n-gram count mismatch for {test_key}"


+def test_serialize_ngram():
+    NGRAM_SERIALIZED_EXPECTED_FIRST = "mango tree is an open"
+
+    test_ngrams = list(ngrams(tokenize(TEST_STRING), min=5, max=8))
+
+    test_ngram_serialized_actual = serialize_ngram(test_ngrams[0])
+
+    assert NGRAM_SERIALIZED_EXPECTED_FIRST == test_ngram_serialized_actual


+def test_ngram_analyzer():
+    test_primary_analyzer(
+        interface=interface,
+        main=main,
+        input=CsvTestData(
+            filepath=str(Path(test_data_dir, TEST_CSV_FILENAME)),
+            semantics={
+                COL_AUTHOR_ID: identifier,
+                COL_MESSAGE_ID: identifier,
+                COL_MESSAGE_TEXT: text_catch_all,
+                COL_MESSAGE_TIMESTAMP: datetime_string,
+            },
+        ),
+        outputs={
+            OUTPUT_MESSAGE_NGRAMS: ParquetTestData(
+                filepath=str(Path(test_data_dir, OUTPUT_MESSAGE_NGRAMS + ".parquet"))
+            ),
+            OUTPUT_NGRAM_DEFS: ParquetTestData(
+                filepath=str(Path(test_data_dir, OUTPUT_NGRAM_DEFS + ".parquet"))
+            ),
+            OUTPUT_MESSAGE: ParquetTestData(
+                filepath=str(Path(test_data_dir, OUTPUT_MESSAGE + ".parquet"))
+            ),
+        },
+    )
1 change: 1 addition & 0 deletions testing/__init__.py
@@ -3,6 +3,7 @@
     CsvTestData,
     ExcelTestData,
     JsonTestData,
+    ParquetTestData,
     PolarsTestData,
 )
 from .testers import test_primary_analyzer, test_secondary_analyzer
5 changes: 5 additions & 0 deletions testing/testdata.py
@@ -106,6 +106,11 @@ def _load_as_polars(self) -> pl.DataFrame:
         return pl.read_excel(self.filepath)


+class ParquetTestData(FileTestData):
+    def _load_as_polars(self) -> pl.DataFrame:
+        return pl.read_parquet(self.filepath)
+
+
 class PolarsTestData(TestData):
     def __init__(self, df: pl.DataFrame):
         self.df = df
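ParquetTestData follows the same pattern as the CSV, Excel, and JSON loaders above: subclass FileTestData and implement the _load_as_polars hook. Extending the harness to another format is mechanical; a hypothetical Arrow IPC variant, assuming the same base-class contract:

class IpcTestData(FileTestData):
    def _load_as_polars(self) -> pl.DataFrame:
        # Hypothetical: pl.read_ipc loads Arrow IPC / Feather files.
        return pl.read_ipc(self.filepath)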