Merged
Changes from all commits
19 commits
1d92f70
remove accordion, remove filters from dataframe
KristijanArmeni Jun 29, 2025
f41cbc6
remove vertical line, remove unused dependencies in plots.py
KristijanArmeni Jun 29, 2025
fa8476e
remove date picker, add code to select date by clicking on line plot
KristijanArmeni Jun 29, 2025
1ff639b
fix conflicts, move color declarations to plots.py
KristijanArmeni Jun 29, 2025
bec78f7
format hover labels
KristijanArmeni Jun 29, 2025
aae4c96
[MAINT] move code for placeholder figures to plots.py
KristijanArmeni Jun 30, 2025
1bbedb4
[MAINT] move `clicked_data` higher up
KristijanArmeni Jun 30, 2025
5beacfb
[MAINT] rename bar plot ids to be more specific
KristijanArmeni Jun 30, 2025
ff08099
[FEAT] use place holder fig in hashtag_bar_plot()
KristijanArmeni Jun 30, 2025
e9e21e2
add line break in placeholder figure text
KristijanArmeni Jun 30, 2025
c12656a
update placeholder text
KristijanArmeni Jun 30, 2025
d7a8763
[FEAT] update code to work when secondary_analysis() returns None
KristijanArmeni Jun 30, 2025
5b621cb
improve time window info display
KristijanArmeni Jun 30, 2025
08a6e10
fix tooltip text
KristijanArmeni Jun 30, 2025
9d39de5
remove selected point from hover information in line plot
KristijanArmeni Jun 30, 2025
9c24af8
update text labels in figures
KristijanArmeni Jun 30, 2025
7d2bd50
update analyzer short description, indent long description
KristijanArmeni Jun 30, 2025
6775af1
main: shorten the description of temporal analyzer
KristijanArmeni Jul 1, 2025
218ae39
use ProgressReporter
KristijanArmeni Jun 29, 2025
24 changes: 12 additions & 12 deletions analyzers/hashtags/interface.py
@@ -25,21 +25,21 @@
     id="hashtags",
     version="0.1.0",
     name="Hashtag analysis",
-    short_description="Computes the gini coefficient over hashtag usage",
+    short_description="Computes the concentration of hashtag usage over time.",
     long_description="""
-Analysis of hashtags measures the extent of online coordination among social media users
-by looking at how the usage of hashtags in messages changes over time. Specifically,
-it measures whether certain hashtags are being used more frequently than others (i.e. trending).
+    Analysis of hashtags measures the extent of online coordination among social media users
+    by looking at how the usage of hashtags in messages changes over time. Specifically,
+    it measures whether certain hashtags are being used more frequently than others (i.e. trending).
 
-The intuition behind the analysis is that the users on social media, if coordinated by
-an event, will converge on using a few hashtags more frequently than others
-(e.g. #worldcup at the time when a soccer world cup starts). The (in)equality in
-the distribution of hashtags can be taken as evidence of coordination and is quantified
-using the Gini coefficient (see: https://ourworldindata.org/what-is-the-gini-coefficient).
+    The intuition behind the analysis is that the users on social media, if coordinated by
+    an event, will converge on using a few hashtags more frequently than others
+    (e.g. #worldcup at the time when a soccer world cup starts). The (in)equality in
+    the distribution of hashtags can be taken as evidence of coordination and is quantified
+    using the Gini coefficient (see: https://ourworldindata.org/what-is-the-gini-coefficient).
 
-The results of this test can be used in confirmatory analyses to measure
-the extent of coordination in large datasets collected from social media platforms around
-specific events/timepoints that are hypothesized to have been coordinated.
+    The results of this test can be used in confirmatory analyses to measure
+    the extent of coordination in large datasets collected from social media platforms around
+    specific events/timepoints that are hypothesized to have been coordinated.
     """,
     input=AnalyzerInput(
         columns=[
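For readers skimming the diff, the quantity the updated description refers to is the Gini coefficient computed over how often each hashtag is used within a time window. The project's `gini` helper is imported in `main.py` but is not part of this diff, so the snippet below is only a minimal NumPy sketch of the idea; the function name, formula variant, and example values are illustrative assumptions, not the repository's implementation.

```python
import numpy as np


def gini(counts: np.ndarray) -> float:
    """Gini coefficient of a 1-D array of non-negative hashtag counts.

    0.0 means every hashtag is used equally often; values approaching 1.0
    mean usage is concentrated in a few hashtags (possible trending or
    coordination).
    """
    x = np.sort(np.asarray(counts, dtype=float))  # ascending order
    n = x.size
    if n == 0 or x.sum() == 0.0:
        return 0.0
    # Standard formulation over counts sorted in ascending order.
    index = np.arange(1, n + 1)
    return float(((2 * index - n - 1) * x).sum() / (n * x.sum()))


print(gini(np.array([5, 5, 5, 5])))   # even usage -> 0.0
print(gini(np.array([1, 1, 1, 97])))  # one dominant hashtag -> 0.72
```

An even distribution of usage gives 0.0, while a window dominated by one hashtag pushes the coefficient toward 1.0, which is the signal the analyzer tracks across time windows.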
59 changes: 32 additions & 27 deletions analyzers/hashtags/main.py
@@ -3,6 +3,7 @@
 import polars as pl
 
 from analyzer_interface.context import PrimaryAnalyzerContext
+from terminal_tools import ProgressReporter
 
 from .interface import (
     COL_AUTHOR_ID,
@@ -54,42 +55,46 @@ def hashtag_analysis(data_frame: pl.DataFrame, every="1h") -> pl.DataFrame:
         r"(#\S+)"
     )  # fetch all hashtags based on `#` symbol
 
-    # if hashtag symbol is detected, extract with regex
-    if data_frame.select(has_hashtag_symbols).item():
-        df_input = data_frame.with_columns(extract_hashtags).filter(
-            pl.col(COL_POST) != []
-        )
+    with ProgressReporter("Gathering hashtags..."):
+        # if hashtag symbol is detected, extract with regex
+        if data_frame.select(has_hashtag_symbols).item():
+            df_input = data_frame.with_columns(extract_hashtags).filter(
+                pl.col(COL_POST) != []
+            )
 
-    else:  # otherwise, we assume str: "['hashtag1', 'hashtag2', ...]"
-        raise ValueError(f"The data in {COL_POST} column appear to have no hashtags.")
+        else:  # otherwise, we assume str: "['hashtag1', 'hashtag2', ...]"
+            raise ValueError(
+                f"The data in {COL_POST} column appear to have no hashtags."
+            )
 
     # select columns and sort by time in ascending order
     # (expected by .group_by_dynamic below)
     df_input = df_input.select(pl.col([COL_AUTHOR_ID, COL_TIME, COL_POST])).sort(
         pl.col(COL_TIME)
     )
 
-    # compute gini per timewindow
-    df_out = (
-        df_input.explode(pl.col(COL_POST))
-        .group_by_dynamic(
-            pl.col(COL_TIME), every=every, period=every, start_by="datapoint"
-        )
-        .agg(
-            pl.col(COL_AUTHOR_ID).alias(OUTPUT_COL_USERS),
-            pl.col(COL_POST).alias(OUTPUT_COL_HASHTAGS),
-            pl.col(COL_POST).count().alias(OUTPUT_COL_COUNT),
-            pl.col(COL_POST)
-            .map_batches(gini, returns_scalar=True, return_dtype=pl.Float64)
-            .alias(OUTPUT_COL_GINI),
+    with ProgressReporter("Counting hashtags..."):
+        # compute gini per timewindow
+        df_out = (
+            df_input.explode(pl.col(COL_POST))
+            .group_by_dynamic(
+                pl.col(COL_TIME), every=every, period=every, start_by="datapoint"
+            )
+            .agg(
+                pl.col(COL_AUTHOR_ID).alias(OUTPUT_COL_USERS),
+                pl.col(COL_POST).alias(OUTPUT_COL_HASHTAGS),
+                pl.col(COL_POST).count().alias(OUTPUT_COL_COUNT),
+                pl.col(COL_POST)
+                .map_batches(gini, returns_scalar=True, return_dtype=pl.Float64)
+                .alias(OUTPUT_COL_GINI),
+            )
+            .with_columns(
+                pl.col(OUTPUT_COL_GINI)
+                .rolling_mean(window_size=SMOOTH_WINDOW_SIZE, center=True)
+                .alias(OUTPUT_COL_GINI + "_smooth")
+            )
+            .rename({COL_TIME: OUTPUT_COL_TIMESPAN})
         )
-        .with_columns(
-            pl.col(OUTPUT_COL_GINI)
-            .rolling_mean(window_size=SMOOTH_WINDOW_SIZE, center=True)
-            .alias(OUTPUT_COL_GINI + "_smooth")
-        )
-        .rename({COL_TIME: OUTPUT_COL_TIMESPAN})
-    )
 
     # convert datetime back to string
     df_out = df_out.with_columns(
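To make the reshaped pipeline in `hashtag_analysis()` easier to follow outside the diff, here is a self-contained toy run of the same pattern: extract hashtags with `str.extract_all`, explode them, window with `group_by_dynamic`, compute a per-window Gini via `map_batches`, and smooth with `rolling_mean`. All names and data below (`gini_of_counts`, the column names, the sample posts) are illustrative stand-ins rather than the repository's constants, and `ProgressReporter` is omitted because only its context-manager use is visible in this diff.

```python
from collections import Counter
from datetime import datetime

import polars as pl


def gini_of_counts(hashtags: pl.Series) -> float:
    """Gini coefficient of the usage counts of the hashtags in one time window."""
    counts = sorted(Counter(hashtags.to_list()).values())  # ascending counts
    n, total = len(counts), sum(counts)
    if n == 0 or total == 0:
        return 0.0
    return sum((2 * i - n - 1) * c for i, c in enumerate(counts, start=1)) / (n * total)


raw = pl.DataFrame(
    {
        "timestamp": [
            datetime(2025, 6, 1, 10, 5),
            datetime(2025, 6, 1, 10, 20),
            datetime(2025, 6, 1, 11, 10),
            datetime(2025, 6, 1, 11, 30),
        ],
        "message": [
            "kickoff soon #worldcup #football",
            "watching #worldcup",
            "#worldcup #worldcup goal!",
            "my cat #cats",
        ],
    }
)

result = (
    raw.with_columns(pl.col("message").str.extract_all(r"(#\S+)").alias("hashtags"))
    .filter(pl.col("hashtags") != [])  # drop rows without hashtags
    .sort("timestamp")  # group_by_dynamic expects a sorted index column
    .explode("hashtags")  # one row per hashtag occurrence
    .group_by_dynamic("timestamp", every="1h", period="1h", start_by="datapoint")
    .agg(
        pl.col("hashtags").count().alias("count"),
        pl.col("hashtags")
        .map_batches(gini_of_counts, returns_scalar=True, return_dtype=pl.Float64)
        .alias("gini"),
    )
    # Smooth the per-window Gini, as the analyzer does with rolling_mean.
    .with_columns(
        pl.col("gini").rolling_mean(window_size=2, center=True).alias("gini_smooth")
    )
)
print(result)
```

The `every`/`period` pair keeps the windows non-overlapping, and `start_by="datapoint"` anchors the first window at the earliest post, matching the arguments used in the analyzer's `group_by_dynamic` call.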