From 0c219daeeaf7b5f3a12e2a85c9dc4a6940e8b5b6 Mon Sep 17 00:00:00 2001 From: Kristijan Armeni Date: Wed, 16 Jul 2025 15:20:16 -0400 Subject: [PATCH 01/11] test: initial commit, add test_ngrams.py --- analyzers/ngrams/test_ngrams.py | 162 ++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 analyzers/ngrams/test_ngrams.py diff --git a/analyzers/ngrams/test_ngrams.py b/analyzers/ngrams/test_ngrams.py new file mode 100644 index 00000000..2b6a6f69 --- /dev/null +++ b/analyzers/ngrams/test_ngrams.py @@ -0,0 +1,162 @@ +import types +from pathlib import Path + +from preprocessing.series_semantic import datetime_string, identifier, text_catch_all +from testing import CsvTestData, ParquetTestData, test_primary_analyzer + +from .interface import ( + COL_AUTHOR_ID, + COL_MESSAGE_ID, + COL_MESSAGE_TEXT, + COL_MESSAGE_TIMESTAMP, + OUTPUT_MESSAGE, + OUTPUT_MESSAGE_NGRAMS, + OUTPUT_NGRAM_DEFS, + interface, +) +from .main import main, ngrams, serialize_ngram, tokenize +from .test_data import test_data_dir + +TEST_CSV_FILENAME = "ngrams_test_input.csv" +TEST_STRING = "Mango tree is an open source project." 
+ +# this is expected output of tokenize() +TEST_TOKENIZED_EXPECTED = [ + "mango", # it's lower cased + "tree", + "is", + "an", + "open", + "source", + "project.", # punctuation is not stripped +] + +NGRAMS_EXPECTED_min1_max3 = [ + ["mango"], + ["mango", "tree"], + ["mango", "tree", "is"], + ["tree"], + ["tree", "is"], + ["tree", "is", "an"], + ["is"], + ["is", "an"], + ["is", "an", "open"], + ["an"], + ["an", "open"], + ["an", "open", "source"], + ["open"], + ["open", "source"], + ["open", "source", "project."], + ["source"], + ["source", "project."], + ["project."], +] + +NGRAMS_EXPECTED_min5_max7 = [ + ["mango", "tree", "is", "an", "open"], + ["mango", "tree", "is", "an", "open", "source"], + ["mango", "tree", "is", "an", "open", "source", "project."], + ["tree", "is", "an", "open", "source"], + ["tree", "is", "an", "open", "source", "project."], + ["is", "an", "open", "source", "project."], +] + +# if max ngram len exceeds the number of tokens, it just returns all the ngrams that fit (same as max=7 here) +NGRAMS_EXPECTED_min5_max8 = [ + ["mango", "tree", "is", "an", "open"], + ["mango", "tree", "is", "an", "open", "source"], + ["mango", "tree", "is", "an", "open", "source", "project."], + ["tree", "is", "an", "open", "source"], + ["tree", "is", "an", "open", "source", "project."], + ["is", "an", "open", "source", "project."], +] + + +def test_tokenize(): + test_tokenized_actual = tokenize(TEST_STRING) + + assert isinstance( + test_tokenized_actual, list + ), "output of tokenize() is not instance of list" + + assert all( + [ + expected_str == actual_str + for expected_str, actual_str in zip( + TEST_TOKENIZED_EXPECTED, test_tokenized_actual + ) + ] + ), "Tokenized strings does not matched expected tokens." 
+ + pass + + +def test_ngrams(): + test_string_tokenized = tokenize(TEST_STRING) + + test_combinations = { + "min1_max3": { + "min_gram_len": 1, + "max_ngram_len": 3, + "n_expected_ngrams_found": 18, + }, + "min5_max7": { + "min_gram_len": 5, + "max_ngram_len": 7, + "n_expected_ngrams_found": 6, + }, + "min5_max8": { + "min_gram_len": 5, + "max_ngram_len": 8, + "n_expected_ngrams_found": 6, + }, + } + + for test_key, test_params in test_combinations.items(): + ngrams_actual = ngrams( + test_string_tokenized, + min=test_params["min_gram_len"], + max=test_params["max_ngram_len"], + ) + + assert isinstance(ngrams_actual, types.GeneratorType) + assert ( + len(list(ngrams_actual)) == test_params["n_expected_ngrams_found"] + ), f"Nr. expected tokens mismatch for {test_key}" + + +def test_serialize_ngram(): + NGRAM_SERIALIZED_EXPECTED_FIRST = "mango tree is an open" + + test_ngrams = list(ngrams(tokenize(TEST_STRING), min=5, max=8)) + + test_ngram_serialized_actual = serialize_ngram(test_ngrams[0]) + + assert NGRAM_SERIALIZED_EXPECTED_FIRST == test_ngram_serialized_actual + + +def test_ngram_analyzer(): + test_primary_analyzer( + interface=interface, + main=main, + input=CsvTestData( + filepath=str(Path(test_data_dir, TEST_CSV_FILENAME)), + semantics={ + COL_AUTHOR_ID: identifier, + COL_MESSAGE_ID: identifier, + COL_MESSAGE_TEXT: text_catch_all, + COL_MESSAGE_TIMESTAMP: datetime_string, + }, + ), + outputs={ + OUTPUT_MESSAGE_NGRAMS: ParquetTestData( + filepath=str(Path(test_data_dir, OUTPUT_MESSAGE_NGRAMS + ".parquet")) + ), + OUTPUT_NGRAM_DEFS: ParquetTestData( + filepath=str(Path(test_data_dir, OUTPUT_NGRAM_DEFS + ".parquet")) + ), + OUTPUT_MESSAGE: ParquetTestData( + filepath=str(Path(test_data_dir, OUTPUT_MESSAGE + ".parquet")) + ), + }, + ) From 7c67cbe5dd1841a681b4799474371a9b94cb3059 Mon Sep 17 00:00:00 2001 From: Kristijan Armeni Date: Wed, 16 Jul 2025 15:31:03 -0400 Subject: [PATCH 02/11] test: initial commit add .csv and .parquet data for testing --- 
.../ngrams/test_data/message_authors.parquet | Bin 0 -> 3092 bytes .../ngrams/test_data/message_ngrams.parquet | Bin 0 -> 1859 bytes analyzers/ngrams/test_data/ngrams.parquet | Bin 0 -> 3035 bytes analyzers/ngrams/test_data/ngrams_test_input.csv | 13 +++++++++++++ 4 files changed, 13 insertions(+) create mode 100644 analyzers/ngrams/test_data/message_authors.parquet create mode 100644 analyzers/ngrams/test_data/message_ngrams.parquet create mode 100644 analyzers/ngrams/test_data/ngrams.parquet create mode 100644 analyzers/ngrams/test_data/ngrams_test_input.csv diff --git a/analyzers/ngrams/test_data/message_authors.parquet b/analyzers/ngrams/test_data/message_authors.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d1dcb16810624d5e37ed6e6f6affe148766d8bd4 GIT binary patch literal 3092 zcmdT{4NOy46u$3K$^$wD;XPhmjZX|!hd?RvXWYUo@{b6JEg}<2c|fJ5*jE$~4JvU4 z5oi1Z7MH*_bcz!;F-nv%WwJSEbWU(vVpQC8VvNJcHYeNeeQmKM%w=v%wwJc|ynD|1 z&Ueo}_uMpTLP^V&i#hc7dmO@&$um!r=oB@|6vz^NygoLKNj;E=>FoUB4DfSrYlkuS_q1S@H) zpa`~es6C8*X4$^JKuA9V?#ite-bH? 
z`T?<&cC+)evAcy+=P`M$6QVakR7mLycd|h|j-6IxG8gMh3Yh}4$;cF$j2u&>5DcN&;=^$^9N9 zB_M2C3X{6%vbd+|=WD(O)yVCp&ZDockQ-ZFnyxB{!oM>4ughi_@r6y~*C%{ahnQlq>Yk0->A`X~Cnd3?B~{`AGRzP6o5!lztHqCHe(T+8`wgQGW>x{Pkg z7}QLE)c8pG*_{4E3tBfF4sYDDGe8+|_xR(@Znd(e$|=($V-kj%eNJ9DeJ<0~wpOjH zNyN9CysE-PjOrKf9#jDW&TiOtk*!*cON;e^wl``t!x=s z^TOer`mdL4+OOW^KX5%-GtJ9$?cl6wO*ePf?tH7t7!Z4rqh>WO^}n>;@7$^=;+?@r zr?fqW9S4>?Xzu$)chG4n=-84QwY|PSYEIj7i*udpLa&XzLpw~0>c0EK$F3LM?J5pW z_ZM9`y0`1l2l+Sfy`5jaw(U??)QY~W+!=uZQ4-fDzh$|(*^5nLFE$HTZV?Vc>m=$P zA@6lY<@R4JBiK#K$=GDlZLFCrxB@^m0Sl~*=gc_@#k^2>+(aTGss|fUxxn)wBLl~G z5Sw8rP1vNd(>rOEy@NP86CTV(0>DgJ`-h8QYde(HICUM9zwIgXjEUf2*z*|Kjg32& z1PCfUA)PD+GyzUrj{9kn7_zpuu7XY+c#EukV@#LdJ@j~=5vYSe9p_Sr(3ItbZ!huUUii z2`pP^vs;Qr6I8RINJ>O{l@5srm|a{6og$HhoZzJ3r;3IHjIadZ!ZF>3w+NZA>_P=} z3KAGty`ZZJ@qkmvM5R{8EMlV()%f(%JdL_CKQ7iH1Qp^%@Cd~*UMzfnM!X0V5Aua3 z2t^gFQ>rU6m4;#~|G$GLd4Y8mVFt};#8ACYv!fduwyxXz^Dj-RQP}3 EcSW{vt^fc4 literal 0 HcmV?d00001 diff --git a/analyzers/ngrams/test_data/message_ngrams.parquet b/analyzers/ngrams/test_data/message_ngrams.parquet new file mode 100644 index 0000000000000000000000000000000000000000..585915e7a44e3178c70840b94101c2fc55587e89 GIT binary patch literal 1859 zcmWG=3^EjD5k0^t8sj4>A6J?VzW6QUF!!obLw6CR9a3|0)b43`(#W^dEExiRT*)8AKHwXd&@ z+MKjEYpd4Pl|hT0YF};9yu8$FvXX7q7LAJwotl~czTB*SelD}|kq=JX+Di(YIF(yI zII>Hp6gaX9yL@n9=2j_iU}R?bu>StOn(wd9c7I=YcUQ^tQ@z~Z*WTWibA1)-WS37K z%H1kO9*WH@pWNlEpA@>w79aWOCY`;c&`mPA<)f>3bV{MCXt2vi7h!LeLKi`2mXFT- z)=vtYc^Us(y_^$#&f;Os<+EltV}j3`T(r4-#^|I>@EL=He=eQY+j%GGw9dvomriM| zoD+0PV`0sulWH?#f;<~`!{T%kv*;0Kv{>B@jY|m*C<8=+WA)}qHdw53m@+UjF|)9; zv2$>8ar5xc~xH7VG@(P@aO3Es#YU&!ATG~3gdin;2M#d(l zX66=_R@OGQcJ>aAPR=f_ZtfnQUfw>we*OW0LBS!RVc`*xQPDB6aq$U>Ny#axY3Ui6 zS=l+cdHDr}Ma3nhW#tu>Rn;}Mb@dI6P0cN>ZS5VMUEMvsef<+APMSPr>a^)IX3m;D zXYRcD3l=U~ykzOJ$dGXcJA5@m4~N4j=c1u#9VlqdnGOQ zgG22MGdR&<3magf;}Dhd5rwBX1r|jH22hRwW*QMuHPql`0ZB-JNm#IQ1v0>b6_{Z} znV|WAH95aDuLP3ttHkCps%_&1xdFuhNkJw_MD7DJVOdg?K~ju?0U`~w2MmDHF!v)# 
zgUyqKxCkmNBi6tudV~>Z0E48A*a{BOIV@22D_+qo5EJM~9}kwI{QMG8HU?1^RVM7Y z79s@?P@p!TN{GKyrF_;1D+mG20%mMU8A%(pBTO>zIG~^m5W68DDj~LxU(}94?FbLT zNVqE?Axbb>#Xf+{c>^}*3ZD!~(IVC%C@LnlN+z5kPg0jxNO@qy0c!pk^S51e1Y4Z9v;x(}9MA=s=J;K-FML zM@LJL{gqG-Pzz8i&=H|P0jI1aj8!65sAbP&{i zAjbiLivh?zV4s0x!CJFHPJsHusWi&iD?MLvMeQ4u1HJI$lRWlos*lF|9L^7 za!1k5;*!#`@`}o;U0>|pquRS~|AB+mHHQu#Ir`hzJGaqE z7JjDVn;vrFf==_O#O<{H;80+^y=6_=Y+~1I9m2X-TWt1yC*K)pb7u~Jn*KW9o9p0a z3ZUJug`iMW%2=vz`S|K^6)mlt?LXdlnphnt+uodCFK{vZzRHC4QE}nZJ$4;|qGiS>2!F2{ovv6XXnB#iHR^fml9nAuYIGg9>d;>`y69~S zTbDjuuKS|gQsC>ro5feCzswpl%~7wdTXEf-kXsftRyXm4Q!U8c-Md53>sMXat+q~E zM#_9SLCihqsjt3cHoART{~=G;>(uazWtMMxB0X*s`to+peUww?-(X+d{=l3@BJ^ib z?>njDpFMFddpP{q$~|hRd&9bp5&P4*xO{%nqoahmF#On2yt+Gkloh%sTDo-ajB^>k zm(#_Nk{j3Po@pLXhF>I%Igx!T<_>NNo$M2R8dG2?8Ioi7J(s?U$liHJdTjFYh&Q>g zs)@eVNVSL2+-+WnJyzP@VnFikq6H~U)XqB%W6MWI2vr`D-$XtVQitA5o;yMaFD_qo zu*LbnYMOel#p_^IX>qjlt{hdn?*TD5wjx$*1Dz!=({dSOp8as?M+1Ck01u ztel}6uaMIDMvQKn$@h&}{9999%VnCY-+ea0XpC-oR8SLk>}u}~#Z2+d6UshFU-tF1 zRnfvgNl4$7&v`k@fS@(+dMIZ)4x|{`Brf-^uAYcIwdyN+c;Mm4Yw((-=91;RE8z#8 zH>vKXhdX-}ml*mQxZiw32&LdP2KN#vVJ71#8)?ViM{YoOPh)~I)<>5g_wuybM6`Xz z$;f#E?2fyupN;Zm)Yfn8#0f)5hqB1(`hLgAwTvb-+lRpk;S7}Vi|sIGE;W`>q8U=G zy&LEUI8^5D=-N4jYfoZR#vi{3QaXEVYGVG{IUji5zOt27c1GH8txcjs-F#?{)JW6` zR?fUgvt_D&c{!OSu(s8)7C*kAd0ANSqU0?(<%cV!{t-@-j5Ly2zLTK#=ivU(b%ira z99GfO_Re2?#z-{_siO>c-_i6*pCYR6<|sE!O-sML zp7ab{)_-FQa4FJgpvK4F)7bTv+2!)|#9U@YZ|tsVe87(xD*o~0_C2RPezB-({O#80 z5#1>s(W7?oK0mah($Q{}enefzK3QcHxc?E4==(#ff~+~)PM*}OX(&y2V0P-pXj^MY zpXArUU6#34!M$LYau2Sxuyg{zyWYhFd@hW;*n9PX^Vo#7lAbsjW|*fbIp*D(+{r4B z$lE5)+I)D)t{e6x?1_uzl6&d;SF3fZIVV#8URS?wUh!j09?SjA@?ZbriggK&%wAKHBynP58z@>3w8 zLlG8shDW4!7Qkk3Hf{nO9H9av4s5MeLXa1dKWmYJU0~P6eX!FJsV9;@OPgPtJV0y8 z-2f8-#O5I(Vai=hskMV3em3m18jb_2kgWxZGUBAdY{XR?FIyYxkMWQl|C@N>a6B)> z9kGH(0>lv@`-CaN2CPbsllrG^a@{6@ARixqg5%4!#mE&@7JL~$Jiui%0Px}8`hNgh CqcC~^ literal 0 HcmV?d00001 diff --git a/analyzers/ngrams/test_data/ngrams_test_input.csv 
b/analyzers/ngrams/test_data/ngrams_test_input.csv new file mode 100644 index 00000000..27853095 --- /dev/null +++ b/analyzers/ngrams/test_data/ngrams_test_input.csv @@ -0,0 +1,13 @@ +user_id,message_id,message_text,timestamp +user_004,msg_001,Urgent action needed before it's late.,2024-01-15T09:03:00Z +user_005,msg_002,Climate emergency requires urgent action.,2024-01-15T12:12:00Z +user_004,msg_003,Urgent action needed to save planet.,2024-01-15T13:15:00Z +user_004,msg_004,"Climate emergency requires immediate response.",2024-01-15T14:18:00Z +user_005,msg_005,Urgent action needed to save planet.,2024-01-15T15:21:00Z +user_004,msg_006,Climate emergency requires massive investment.,2024-01-15T16:24:00Z +user_004,msg_007,Climate emergency requires global cooperation.,2024-01-15T19:33:00Z +user_005,msg_008,Someone needs fight the system soon.,2024-01-15T20:36:00Z +user_004,msg_009,We must get up and fight the system.,2024-01-15T21:39:00Z +user_001,msg_010,Just discovered this amazing new coffee shop downtown! The atmosphere is incredible and the barista really knows their craft.,2024-01-16T22:51:00Z +user_002,msg_011,Working from home has its perks but I miss the office dynamics sometimes. Finding balance is key.,2024-01-16T23:54:00Z +user_003,msg_012,Sunday morning thoughts: grateful for family time and peaceful moments. 
Life's simple pleasures matter most.,2024-01-17T00:57:00Z From 8bb32db90afcd3382bcdd8f5ab82a15da4abce48 Mon Sep 17 00:00:00 2001 From: Kristijan Armeni Date: Wed, 16 Jul 2025 15:32:17 -0400 Subject: [PATCH 03/11] test: add ParquetTestData class --- testing/__init__.py | 1 + testing/testdata.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/testing/__init__.py b/testing/__init__.py index 2d53a031..962da5f2 100644 --- a/testing/__init__.py +++ b/testing/__init__.py @@ -3,6 +3,7 @@ CsvTestData, ExcelTestData, JsonTestData, + ParquetTestData, PolarsTestData, ) from .testers import test_primary_analyzer, test_secondary_analyzer diff --git a/testing/testdata.py b/testing/testdata.py index ec17e75a..1cfd61d8 100644 --- a/testing/testdata.py +++ b/testing/testdata.py @@ -106,6 +106,11 @@ def _load_as_polars(self) -> pl.DataFrame: return pl.read_excel(self.filepath) +class ParquetTestData(FileTestData): + def _load_as_polars(self) -> pl.DataFrame: + return pl.read_parquet(self.filepath) + + class PolarsTestData(TestData): def __init__(self, df: pl.DataFrame): self.df = df From eba7a579a520e84358a7f5f19163e612285ff9d4 Mon Sep 17 00:00:00 2001 From: Kristijan Armeni Date: Wed, 16 Jul 2025 15:36:23 -0400 Subject: [PATCH 04/11] feat: sort the output of n-gram statistics, change print feedback --- analyzers/ngrams/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/analyzers/ngrams/main.py b/analyzers/ngrams/main.py index 01525717..4328e8d7 100644 --- a/analyzers/ngrams/main.py +++ b/analyzers/ngrams/main.py @@ -35,7 +35,7 @@ def main(context: PrimaryAnalyzerContext): & (pl.col(COL_AUTHOR_ID) != "") ) - with ProgressReporter("Generating n-grams") as progress: + with ProgressReporter("Detecting n-grams") as progress: def get_ngram_rows(ngrams_by_id: dict[str, int]): nonlocal progress @@ -64,6 +64,7 @@ def get_ngram_rows(ngrams_by_id: dict[str, int]): pl.DataFrame(df_ngram_instances) .group_by(COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID) 
.agg(pl.count().alias(COL_MESSAGE_NGRAM_COUNT)) + .sort(by=[COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID]) .write_parquet(context.output(OUTPUT_MESSAGE_NGRAMS).parquet_path) ) From 1d71652d5930156c7e8bb21ee573579d8b315641 Mon Sep 17 00:00:00 2001 From: Kristijan Armeni Date: Wed, 16 Jul 2025 15:37:07 -0400 Subject: [PATCH 05/11] test: add __init__.py --- analyzers/ngrams/test_data/__init__.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 analyzers/ngrams/test_data/__init__.py diff --git a/analyzers/ngrams/test_data/__init__.py b/analyzers/ngrams/test_data/__init__.py new file mode 100644 index 00000000..8906f86c --- /dev/null +++ b/analyzers/ngrams/test_data/__init__.py @@ -0,0 +1,3 @@ +from pathlib import Path + +test_data_dir = Path(__file__).parent.resolve() From 82f76e615cd7c8c4199bce9f83d1c11c84b2f6b8 Mon Sep 17 00:00:00 2001 From: Kristijan Armeni Date: Thu, 17 Jul 2025 16:32:40 -0400 Subject: [PATCH 06/11] refactor: move ngram analyzers to a single ngrams folder --- analyzers/{ => ngrams}/ngram_stats/__init__.py | 0 analyzers/{ => ngrams}/ngram_stats/interface.py | 0 analyzers/{ => ngrams}/ngram_stats/main.py | 0 analyzers/{ => ngrams}/ngram_web/__init__.py | 0 analyzers/{ => ngrams}/ngram_web/factory.py | 0 analyzers/{ => ngrams}/ngram_web/interface.py | 0 analyzers/ngrams/{ => ngrams_base}/__init__.py | 0 analyzers/ngrams/{ => ngrams_base}/interface.py | 0 analyzers/ngrams/{ => ngrams_base}/main.py | 0 analyzers/ngrams/{ => ngrams_base}/test_ngrams.py | 0 10 files changed, 0 insertions(+), 0 deletions(-) rename analyzers/{ => ngrams}/ngram_stats/__init__.py (100%) rename analyzers/{ => ngrams}/ngram_stats/interface.py (100%) rename analyzers/{ => ngrams}/ngram_stats/main.py (100%) rename analyzers/{ => ngrams}/ngram_web/__init__.py (100%) rename analyzers/{ => ngrams}/ngram_web/factory.py (100%) rename analyzers/{ => ngrams}/ngram_web/interface.py (100%) rename analyzers/ngrams/{ => ngrams_base}/__init__.py (100%) rename analyzers/ngrams/{ => 
ngrams_base}/interface.py (100%) rename analyzers/ngrams/{ => ngrams_base}/main.py (100%) rename analyzers/ngrams/{ => ngrams_base}/test_ngrams.py (100%) diff --git a/analyzers/ngram_stats/__init__.py b/analyzers/ngrams/ngram_stats/__init__.py similarity index 100% rename from analyzers/ngram_stats/__init__.py rename to analyzers/ngrams/ngram_stats/__init__.py diff --git a/analyzers/ngram_stats/interface.py b/analyzers/ngrams/ngram_stats/interface.py similarity index 100% rename from analyzers/ngram_stats/interface.py rename to analyzers/ngrams/ngram_stats/interface.py diff --git a/analyzers/ngram_stats/main.py b/analyzers/ngrams/ngram_stats/main.py similarity index 100% rename from analyzers/ngram_stats/main.py rename to analyzers/ngrams/ngram_stats/main.py diff --git a/analyzers/ngram_web/__init__.py b/analyzers/ngrams/ngram_web/__init__.py similarity index 100% rename from analyzers/ngram_web/__init__.py rename to analyzers/ngrams/ngram_web/__init__.py diff --git a/analyzers/ngram_web/factory.py b/analyzers/ngrams/ngram_web/factory.py similarity index 100% rename from analyzers/ngram_web/factory.py rename to analyzers/ngrams/ngram_web/factory.py diff --git a/analyzers/ngram_web/interface.py b/analyzers/ngrams/ngram_web/interface.py similarity index 100% rename from analyzers/ngram_web/interface.py rename to analyzers/ngrams/ngram_web/interface.py diff --git a/analyzers/ngrams/__init__.py b/analyzers/ngrams/ngrams_base/__init__.py similarity index 100% rename from analyzers/ngrams/__init__.py rename to analyzers/ngrams/ngrams_base/__init__.py diff --git a/analyzers/ngrams/interface.py b/analyzers/ngrams/ngrams_base/interface.py similarity index 100% rename from analyzers/ngrams/interface.py rename to analyzers/ngrams/ngrams_base/interface.py diff --git a/analyzers/ngrams/main.py b/analyzers/ngrams/ngrams_base/main.py similarity index 100% rename from analyzers/ngrams/main.py rename to analyzers/ngrams/ngrams_base/main.py diff --git 
a/analyzers/ngrams/test_ngrams.py b/analyzers/ngrams/ngrams_base/test_ngrams.py similarity index 100% rename from analyzers/ngrams/test_ngrams.py rename to analyzers/ngrams/ngrams_base/test_ngrams.py From 959b1a11aa96b36e2b8b8ae4be6dd2f59c56d105 Mon Sep 17 00:00:00 2001 From: Kristijan Armeni Date: Thu, 17 Jul 2025 16:34:21 -0400 Subject: [PATCH 07/11] refactor: update import statements --- analyzers/__init__.py | 6 +++--- analyzers/ngrams/ngram_stats/interface.py | 4 ++-- analyzers/ngrams/ngram_stats/main.py | 2 +- analyzers/ngrams/ngram_web/interface.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/analyzers/__init__.py b/analyzers/__init__.py index 9fbf9c9c..2b2723ab 100644 --- a/analyzers/__init__.py +++ b/analyzers/__init__.py @@ -5,9 +5,9 @@ from .example.example_web import example_web from .hashtags import hashtags from .hashtags_web import hashtags_web -from .ngram_stats import ngram_stats -from .ngram_web import ngrams_web -from .ngrams import ngrams +from .ngrams.ngram_stats import ngram_stats +from .ngrams.ngram_web import ngrams_web +from .ngrams.ngrams_base import ngrams from .temporal import temporal from .temporal_barplot import temporal_barplot from .time_coordination import time_coordination diff --git a/analyzers/ngrams/ngram_stats/interface.py b/analyzers/ngrams/ngram_stats/interface.py index 85f055e0..5b904d08 100644 --- a/analyzers/ngrams/ngram_stats/interface.py +++ b/analyzers/ngrams/ngram_stats/interface.py @@ -1,7 +1,7 @@ from analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface -from ..ngrams import interface as ngrams_interface -from ..ngrams.interface import ( +from ..ngrams_base import interface as ngrams_interface +from ..ngrams_base.interface import ( COL_AUTHOR_ID, COL_MESSAGE_ID, COL_MESSAGE_NGRAM_COUNT, diff --git a/analyzers/ngrams/ngram_stats/main.py b/analyzers/ngrams/ngram_stats/main.py index 09ab5bf6..7aa4f961 100644 --- a/analyzers/ngrams/ngram_stats/main.py +++ 
b/analyzers/ngrams/ngram_stats/main.py @@ -5,7 +5,7 @@ from analyzer_interface.context import SecondaryAnalyzerContext from terminal_tools import ProgressReporter -from ..ngrams.interface import ( +from ..ngrams_base.interface import ( COL_AUTHOR_ID, COL_MESSAGE_ID, COL_MESSAGE_NGRAM_COUNT, diff --git a/analyzers/ngrams/ngram_web/interface.py b/analyzers/ngrams/ngram_web/interface.py index 35b78399..203514a4 100644 --- a/analyzers/ngrams/ngram_web/interface.py +++ b/analyzers/ngrams/ngram_web/interface.py @@ -1,7 +1,7 @@ from analyzer_interface import WebPresenterInterface from ..ngram_stats import interface as ngram_stats_interface -from ..ngrams import interface as ngrams_interface +from ..ngrams_base import interface as ngrams_interface interface = WebPresenterInterface( id="ngram_repetition_by_poster", From b2009fa40aa0acac9c51bf3641b5fb026c71bf67 Mon Sep 17 00:00:00 2001 From: Kristijan Armeni Date: Thu, 17 Jul 2025 16:36:27 -0400 Subject: [PATCH 08/11] refactor: move and rename base test --- .../{ngrams_base/test_ngrams.py => test_ngrams_base.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename analyzers/ngrams/{ngrams_base/test_ngrams.py => test_ngrams_base.py} (97%) diff --git a/analyzers/ngrams/ngrams_base/test_ngrams.py b/analyzers/ngrams/test_ngrams_base.py similarity index 97% rename from analyzers/ngrams/ngrams_base/test_ngrams.py rename to analyzers/ngrams/test_ngrams_base.py index 2b6a6f69..417adf79 100644 --- a/analyzers/ngrams/ngrams_base/test_ngrams.py +++ b/analyzers/ngrams/test_ngrams_base.py @@ -4,7 +4,7 @@ from preprocessing.series_semantic import datetime_string, identifier, text_catch_all from testing import CsvTestData, ParquetTestData, test_primary_analyzer -from .interface import ( +from .ngrams_base.interface import ( COL_AUTHOR_ID, COL_MESSAGE_ID, COL_MESSAGE_TEXT, @@ -14,7 +14,7 @@ OUTPUT_NGRAM_DEFS, interface, ) -from .main import main, ngrams, serialize_ngram, tokenize +from .ngrams_base.main import main, ngrams, 
serialize_ngram, tokenize from .test_data import test_data_dir TEST_CSV_FILENAME = "ngrams_test_input.csv" From 9792c4a167ce2a2a47160af94bf06d3acc744a13 Mon Sep 17 00:00:00 2001 From: Kristijan Armeni Date: Thu, 17 Jul 2025 16:38:34 -0400 Subject: [PATCH 09/11] test: add parquet data for ngram_stats test --- analyzers/ngrams/test_data/ngram_full.parquet | Bin 0 -> 4869 bytes .../ngrams/test_data/ngram_stats.parquet | Bin 0 -> 2217 bytes analyzers/ngrams/test_ngram_stats.py | 41 ++++++++++++++++++ 3 files changed, 41 insertions(+) create mode 100644 analyzers/ngrams/test_data/ngram_full.parquet create mode 100644 analyzers/ngrams/test_data/ngram_stats.parquet create mode 100644 analyzers/ngrams/test_ngram_stats.py diff --git a/analyzers/ngrams/test_data/ngram_full.parquet b/analyzers/ngrams/test_data/ngram_full.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d47cb526ab2f50da2c700d7b40ef08662c9582ad GIT binary patch literal 4869 zcmd5=O>7%Q6rOc9vC}xI(|We+V8lX`AwaMl$7xekq^^@TY3qh05XXrS(Z=357XGVu zlQb1Vh!dbnR2867fy4z-B2=NO7o-SrKs^BxKSJe#fMekX0>n?nd$ZnnleiIxQZdTx z{Jb}BzV|b4#u-y5dZ|d?5NIYr{gmt?WK8stW`XQ(c8g?ZtB34@N})EldfEjdwKgAY zCsy%STS{m*?dhQ*>Z47xDJXlSDn<5GJ47Pgb!LI;Pq^S45=6Jp7afXDPR=i#f8NdB zbp@#y=qF9Sfb1z|D_UVTm!U2xBz@zOF1pwY0`vTnjRHNOCZR3@A0IS?z*wfi1RzGO zy*-UECqivN8lgd|)HK>hx<#bRB2uBW#Vph=H<^5Mb@hIBwlPRuRlGux13f257X)@m zWIs0qHwmbV=8p#YtjqSo4dHAM_F1fC^`fb0X){+UDn(t-=o!TXJxr%Q1Nit8o zJAtM3-S(s4`Xf()cJXC2!3L*-2Epyn$iB@WLFP@Id%ZbGy;Wm1E|)49gB$FVM(J7u zyW(Nzn$TeXbi2An-QZ&M*>)~Qnk*t0lyw*TN#o@Ggs?BYH5czNOQx2et>|S7y5;Mi zTG(0(_q9V(U05FKO;8t)JkiUek85g@Ov8xoa>)+n8uGSX> znzGeC(T%bSMizpqFVI5zp~j%@gStP^(i5$bdpD~NvEjB>?w6~b-kT`-+8wo6+hHqN z4L4Bjy1*`Wu*(4qx2+=UDzPfk&mbv#x$|CjUF(FMC$~w;b}kM9-=HJuMomZ3RdR)b zp2=ZCh7?>b6%GCAC>bhfhLKy+m0WR2H_QSgewdAA^QAd0ucS*Qh(}F$M-1J!^a`9% z;jQ+s#1MPpdva(_pD$Hloq4aJKLjmGtq@ z4G9g9CO=KV1YsBNu6e<0d%@ICnfxGq&?NmJvP*ZdwIKT8zrj`4mt|VC<8dHDUiW_d 
zds`DBJKuisPr>8;a^}(>(2V?2z9D#0KYr^Mb^*Jiy;FIkt9B}z%^VoZ)Cy&8Cy3Wo7yG)CeX$pg;#P?_*{7c7fsgu$++?H| z^nykuc=>d>YL6;-6v0p^c6B7)*{#40K&Tfctb4hOB=8x+SsZtsgBKTXxq}>Zlx3%V zr?ibJ*1}XbFIjLLr`eW*k~f5xO+fW(#_aMZ&a?-6Knu79LV;jl?KV6v|Mh#2-L5y% zt$Wto;%CV%emMMR_xtU>S=lYV6k;Cf(jNFJ8wko?bzvdHT8kAv_JGw9vP0nYAld%lW zse@03m%`!9ru8G7W}6DQaBIhUZ3<|ksLumgTo2T^y1*Be@N41Y<4OF^O(jpKqeqG} z$pfb|sbed1(aZv1nCEhUUbOc|cR-2RZoM_0YzTSfHkH(t(ewnn{i3)AvBKZc7O_T)|*o+yK# z%5#O$NUATs1pR!v$la~ZgH6IKxF#M$dDyAiC(UQ^V-<))Rm}_^h)hqNnok#!%W2rR zvc7(>WUsH{-@Kk1EE7GjGBd*W2594!exr}UQXX%E94`br6i2*aH@*la!gz-PH;v5% ze#_$d0Q;aJY@ygaY{}aazqmBMIMa7*VJ?ZbbaG?Uu5Gkw+`6?6YW~@Urb$TN z+PP0f`=O4iRfi1mlZ@hY!oDa41;5Bprw(Bs`cWq8<`)&Ds7&#ko1{(GZbR86$vgMl zbAIP{f9KvuVj}(^)7Y;nJE26`E(QQQKWzUyu>6ApJvIOZRIq^tc5pxgIH3_-&;)MS z2HT+-JkSDOXayfI;FwB-@5c;<0lSBFKEQm;!yL@v?}VMsT7>SbM9N=T&HEZ)9RQzV zw-@)0w$5Lo8Pkm2;P4o_Ge1#EOpRyL%*}m#PUGiP)A{I)AhGnk##R+74z=R`qos*V z-U=jAR<@82_F0hd*< z61nkGrdW}y!sM$)$)p|!nK)Tnl&Nb{XgeyKLIxgo*G37b_C^LyQIi0CYuopvvt=ur zPg&!|LfOicDjY0+N-@7{k`(HJj20D-wA>W_pjo>Y3=A~o@}>i6=r`?pU8?Mup-cVi z-9!Q1B*qqQTCKYM*hDt35G%vamuzg#R<&PovRu_%bgh{u-7HOBlq0uKD-{YBbK%_$ zs_XX_m>Z{+;~a#jUq+)x$p_n_XIlf_nxn3d*R5>QczvDw48L*MTvq-6^Nqnzdzj9D z+{X5T`JTgnyRRSrgy2taXVYN*<@B!|n_Jx)e#yf)f2*180`r<{J+Drrx@)(^Vdp;+ zz~>U+5BEC2CM_k*jm|5dZ9$Y(Lg7pJ0ydreY@?43hwc{f_|U;e%Bdrnsf50)wbST~ zgoO|h{EDt=oF5w|6X()3C`4I2Ap|~$#r-i{pP8Zcejz5RaR=yGz-7@%)*$7FN692$ zF?1Z0x;n_jBy`HWFr{OKrwJFI@haYv6e2{=LzvVq<8g*5Xv#Pt`r^e@C|*oXJ=Jk2 z6rM&qm&#Yt_7K10lD{m{C$S<(ACtrV#0Q@d+7Ax74 Date: Thu, 17 Jul 2025 16:39:03 -0400 Subject: [PATCH 10/11] initial commit, __init__.py --- analyzers/ngrams/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 analyzers/ngrams/__init__.py diff --git a/analyzers/ngrams/__init__.py b/analyzers/ngrams/__init__.py new file mode 100644 index 00000000..e69de29b From 81b295779f3fbdd72678d4d35f77b683958a20a3 Mon Sep 17 00:00:00 2001 From: Kristijan Armeni Date: Thu, 17 Jul 2025 16:46:34 -0400 Subject: [PATCH 11/11] 
chore: pl.count() deprecated use pl.len() --- analyzers/ngrams/ngrams_base/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analyzers/ngrams/ngrams_base/main.py b/analyzers/ngrams/ngrams_base/main.py index 4328e8d7..8b54a6a3 100644 --- a/analyzers/ngrams/ngrams_base/main.py +++ b/analyzers/ngrams/ngrams_base/main.py @@ -63,7 +63,7 @@ def get_ngram_rows(ngrams_by_id: dict[str, int]): ( pl.DataFrame(df_ngram_instances) .group_by(COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID) - .agg(pl.count().alias(COL_MESSAGE_NGRAM_COUNT)) + .agg(pl.len().alias(COL_MESSAGE_NGRAM_COUNT)) .sort(by=[COL_MESSAGE_SURROGATE_ID, COL_NGRAM_ID]) .write_parquet(context.output(OUTPUT_MESSAGE_NGRAMS).parquet_path) )