Merged
Changes from all commits
19 commits
1d92f70
remove accordion, remove filters from dataframe
KristijanArmeni Jun 29, 2025
f41cbc6
remove vertical line, remove unused dependencies in plots.py
KristijanArmeni Jun 29, 2025
fa8476e
remove date picker, add code to select date by clicking on line plot
KristijanArmeni Jun 29, 2025
1ff639b
fix conflicts, move color declarations to plots.py
KristijanArmeni Jun 29, 2025
bec78f7
format hover labels
KristijanArmeni Jun 29, 2025
aae4c96
[MAINT] move code for placeholder figures to plots.py
KristijanArmeni Jun 30, 2025
1bbedb4
[MAINT] move `clicked_data` higher up
KristijanArmeni Jun 30, 2025
5beacfb
[MAINT] rename bar plot ids to be more specific
KristijanArmeni Jun 30, 2025
ff08099
[FEAT] use place holder fig in hashtag_bar_plot()
KristijanArmeni Jun 30, 2025
e9e21e2
add line break in placeholder figure text
KristijanArmeni Jun 30, 2025
c12656a
update placeholder text
KristijanArmeni Jun 30, 2025
d7a8763
[FEAT] update code to work when secondary_analysis() returns None
KristijanArmeni Jun 30, 2025
5b621cb
improve time window info display
KristijanArmeni Jun 30, 2025
08a6e10
fix tooltip text
KristijanArmeni Jun 30, 2025
9d39de5
remove selected point from hover information in line plot
KristijanArmeni Jun 30, 2025
9c24af8
update text labels in figures
KristijanArmeni Jun 30, 2025
7d2bd50
update analyzer short description, indent long description
KristijanArmeni Jun 30, 2025
6775af1
main: shorten the description of temporal analyzer
KristijanArmeni Jul 1, 2025
218ae39
use ProgressReporter
KristijanArmeni Jun 29, 2025
24 changes: 12 additions & 12 deletions analyzers/hashtags/interface.py
@@ -25,21 +25,21 @@
     id="hashtags",
     version="0.1.0",
     name="Hashtag analysis",
-    short_description="Computes the gini coefficient over hashtag usage",
+    short_description="Computes the concentration of hashtag usage over time.",
     long_description="""
-Analysis of hashtags measures the extent of online coordination among social media users
-by looking at how the usage of hashtags in messages changes over time. Specifically,
-it measures whether certain hashtags are being used more frequently than others (i.e. trending).
+    Analysis of hashtags measures the extent of online coordination among social media users
+    by looking at how the usage of hashtags in messages changes over time. Specifically,
+    it measures whether certain hashtags are being used more frequently than others (i.e. trending).
 
-The intuition behind the analysis is that the users on social media, if coordinated by
-an event, will converge on using a few hashtags more frequently than others
-(e.g. #worldcup at the time when a soccer world cup starts). The (in)equality in
-the distribution of hashtags can be taken as evidence of coordination and is quantified
-using the Gini coefficient (see: https://ourworldindata.org/what-is-the-gini-coefficient).
+    The intuition behind the analysis is that the users on social media, if coordinated by
+    an event, will converge on using a few hashtags more frequently than others
+    (e.g. #worldcup at the time when a soccer world cup starts). The (in)equality in
+    the distribution of hashtags can be taken as evidence of coordination and is quantified
+    using the Gini coefficient (see: https://ourworldindata.org/what-is-the-gini-coefficient).
 
-The results of this test can be used in confirmatory analyses to measure
-the extent of coordination in large datasets collected from social media platforms around
-specific events/timepoints that are hypothesized to have been coordinated.
+    The results of this test can be used in confirmatory analyses to measure
+    the extent of coordination in large datasets collected from social media platforms around
+    specific events/timepoints that are hypothesized to have been coordinated.
     """,
     input=AnalyzerInput(
         columns=[
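For readers skimming the diff, the quantity the updated description refers to is the Gini coefficient computed over how often each hashtag is used within a time window. The project's `gini` helper is imported in `main.py` but is not part of this diff, so the snippet below is only a minimal NumPy sketch of the idea; the function name, formula variant, and example values are illustrative assumptions, not the repository's implementation.

```python
import numpy as np


def gini(counts: np.ndarray) -> float:
    """Gini coefficient of a 1-D array of non-negative hashtag counts.

    0.0 means every hashtag is used equally often; values approaching 1.0
    mean usage is concentrated in a few hashtags (possible trending or
    coordination).
    """
    x = np.sort(np.asarray(counts, dtype=float))  # ascending order
    n = x.size
    if n == 0 or x.sum() == 0.0:
        return 0.0
    # Standard formulation over counts sorted in ascending order.
    index = np.arange(1, n + 1)
    return float(((2 * index - n - 1) * x).sum() / (n * x.sum()))


print(gini(np.array([5, 5, 5, 5])))   # even usage -> 0.0
print(gini(np.array([1, 1, 1, 97])))  # one dominant hashtag -> 0.72
```

An even distribution of usage gives 0.0, while a window dominated by one hashtag pushes the coefficient toward 1.0, which is the signal the analyzer tracks across time windows.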
59 changes: 32 additions & 27 deletions analyzers/hashtags/main.py
@@ -3,6 +3,7 @@
 import polars as pl
 
 from analyzer_interface.context import PrimaryAnalyzerContext
+from terminal_tools import ProgressReporter
 
 from .interface import (
     COL_AUTHOR_ID,
@@ -54,42 +55,46 @@ def hashtag_analysis(data_frame: pl.DataFrame, every="1h") -> pl.DataFrame:
         r"(#\S+)"
     )  # fetch all hashtags based on `#` symbol
 
-    # if hashtag symbol is detected, extract with regex
-    if data_frame.select(has_hashtag_symbols).item():
-        df_input = data_frame.with_columns(extract_hashtags).filter(
-            pl.col(COL_POST) != []
-        )
+    with ProgressReporter("Gathering hashtags..."):
+        # if hashtag symbol is detected, extract with regex
+        if data_frame.select(has_hashtag_symbols).item():
+            df_input = data_frame.with_columns(extract_hashtags).filter(
+                pl.col(COL_POST) != []
+            )
 
-    else:  # otherwise, we assume str: "['hashtag1', 'hashtag2', ...]"
-        raise ValueError(f"The data in {COL_POST} column appear to have no hashtags.")
+        else:  # otherwise, we assume str: "['hashtag1', 'hashtag2', ...]"
+            raise ValueError(
+                f"The data in {COL_POST} column appear to have no hashtags."
+            )
 
     # select columns and sort by time in ascending order
     # (expected by .group_by_dynamic below)
     df_input = df_input.select(pl.col([COL_AUTHOR_ID, COL_TIME, COL_POST])).sort(
         pl.col(COL_TIME)
     )
 
-    # compute gini per timewindow
-    df_out = (
-        df_input.explode(pl.col(COL_POST))
-        .group_by_dynamic(
-            pl.col(COL_TIME), every=every, period=every, start_by="datapoint"
-        )
-        .agg(
-            pl.col(COL_AUTHOR_ID).alias(OUTPUT_COL_USERS),
-            pl.col(COL_POST).alias(OUTPUT_COL_HASHTAGS),
-            pl.col(COL_POST).count().alias(OUTPUT_COL_COUNT),
-            pl.col(COL_POST)
-            .map_batches(gini, returns_scalar=True, return_dtype=pl.Float64)
-            .alias(OUTPUT_COL_GINI),
+    with ProgressReporter("Counting hashtags..."):
+        # compute gini per timewindow
+        df_out = (
+            df_input.explode(pl.col(COL_POST))
+            .group_by_dynamic(
+                pl.col(COL_TIME), every=every, period=every, start_by="datapoint"
+            )
+            .agg(
+                pl.col(COL_AUTHOR_ID).alias(OUTPUT_COL_USERS),
+                pl.col(COL_POST).alias(OUTPUT_COL_HASHTAGS),
+                pl.col(COL_POST).count().alias(OUTPUT_COL_COUNT),
+                pl.col(COL_POST)
+                .map_batches(gini, returns_scalar=True, return_dtype=pl.Float64)
+                .alias(OUTPUT_COL_GINI),
+            )
+            .with_columns(
+                pl.col(OUTPUT_COL_GINI)
+                .rolling_mean(window_size=SMOOTH_WINDOW_SIZE, center=True)
+                .alias(OUTPUT_COL_GINI + "_smooth")
+            )
+            .rename({COL_TIME: OUTPUT_COL_TIMESPAN})
         )
-        .with_columns(
-            pl.col(OUTPUT_COL_GINI)
-            .rolling_mean(window_size=SMOOTH_WINDOW_SIZE, center=True)
-            .alias(OUTPUT_COL_GINI + "_smooth")
-        )
-        .rename({COL_TIME: OUTPUT_COL_TIMESPAN})
-    )
 
     # convert datetime back to string
     df_out = df_out.with_columns(
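To make the reshaped pipeline in `hashtag_analysis()` easier to follow outside the diff, here is a self-contained toy run of the same pattern: extract hashtags with `str.extract_all`, explode them, window with `group_by_dynamic`, compute a per-window Gini via `map_batches`, and smooth with `rolling_mean`. All names and data below (`gini_of_counts`, the column names, the sample posts) are illustrative stand-ins rather than the repository's constants, and `ProgressReporter` is omitted because only its context-manager use is visible in this diff.

```python
from collections import Counter
from datetime import datetime

import polars as pl


def gini_of_counts(hashtags: pl.Series) -> float:
    """Gini coefficient of the usage counts of the hashtags in one time window."""
    counts = sorted(Counter(hashtags.to_list()).values())  # ascending counts
    n, total = len(counts), sum(counts)
    if n == 0 or total == 0:
        return 0.0
    return sum((2 * i - n - 1) * c for i, c in enumerate(counts, start=1)) / (n * total)


raw = pl.DataFrame(
    {
        "timestamp": [
            datetime(2025, 6, 1, 10, 5),
            datetime(2025, 6, 1, 10, 20),
            datetime(2025, 6, 1, 11, 10),
            datetime(2025, 6, 1, 11, 30),
        ],
        "message": [
            "kickoff soon #worldcup #football",
            "watching #worldcup",
            "#worldcup #worldcup goal!",
            "my cat #cats",
        ],
    }
)

result = (
    raw.with_columns(pl.col("message").str.extract_all(r"(#\S+)").alias("hashtags"))
    .filter(pl.col("hashtags") != [])  # drop rows without hashtags
    .sort("timestamp")  # group_by_dynamic expects a sorted index column
    .explode("hashtags")  # one row per hashtag occurrence
    .group_by_dynamic("timestamp", every="1h", period="1h", start_by="datapoint")
    .agg(
        pl.col("hashtags").count().alias("count"),
        pl.col("hashtags")
        .map_batches(gini_of_counts, returns_scalar=True, return_dtype=pl.Float64)
        .alias("gini"),
    )
    # Smooth the per-window Gini, as the analyzer does with rolling_mean.
    .with_columns(
        pl.col("gini").rolling_mean(window_size=2, center=True).alias("gini_smooth")
    )
)
print(result)
```

The `every`/`period` pair keeps the windows non-overlapping, and `start_by="datapoint"` anchors the first window at the earliest post, matching the arguments used in the analyzer's `group_by_dynamic` call.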