diff --git a/analyzers/hashtags/interface.py b/analyzers/hashtags/interface.py
index 70a907ef..e4eeb7ba 100644
--- a/analyzers/hashtags/interface.py
+++ b/analyzers/hashtags/interface.py
@@ -25,21 +25,21 @@
id="hashtags",
version="0.1.0",
name="Hashtag analysis",
- short_description="Computes the gini coefficient over hashtag usage",
+ short_description="Computes the concentration of hashtag usage over time.",
long_description="""
- Analysis of hashtags measures the extent of online coordination among social media users
- by looking at how the usage of hashtags in messages changes over time. Specificaly,
- it measures whether certain hashtags are being used more frequently than others (i.e. trending).
+Analysis of hashtags measures the extent of online coordination among social media users
+by looking at how the usage of hashtags in messages changes over time. Specificaly,
+it measures whether certain hashtags are being used more frequently than others (i.e. trending).
- The intuition behind the analysis is that the users on social media, if coordinated by
- an event, will converge on using a few hasthags more frequently than others
- (e.g. #worldcup at the time when a soccer world cup starts). The (in)equality in
- the distritution of hasthags can be taken as evidence of coordination and is quantified
- using the Gini coefficient (see: https://ourworldindata.org/what-is-the-gini-coefficient).
+The intuition behind the analysis is that the users on social media, if coordinated by
+an event, will converge on using a few hasthags more frequently than others
+(e.g. #worldcup at the time when a soccer world cup starts). The (in)equality in
+the distritution of hasthags can be taken as evidence of coordination and is quantified
+using the Gini coefficient (see: https://ourworldindata.org/what-is-the-gini-coefficient).
- The results of this test can be used in confirmatory analyses to measure
- the extent of coordination in large datasets collected from social media platforms around
- specific events/timepoints that are hypothesized to have been coordinated.
+The results of this test can be used in confirmatory analyses to measure
+the extent of coordination in large datasets collected from social media platforms around
+specific events/timepoints that are hypothesized to have been coordinated.
""",
input=AnalyzerInput(
columns=[
diff --git a/analyzers/hashtags/main.py b/analyzers/hashtags/main.py
index 9f95eaa8..687fb522 100644
--- a/analyzers/hashtags/main.py
+++ b/analyzers/hashtags/main.py
@@ -3,6 +3,7 @@
import polars as pl
from analyzer_interface.context import PrimaryAnalyzerContext
+from terminal_tools import ProgressReporter
from .interface import (
COL_AUTHOR_ID,
@@ -54,14 +55,17 @@ def hashtag_analysis(data_frame: pl.DataFrame, every="1h") -> pl.DataFrame:
r"(#\S+)"
) # fetch all hashtags based on `#` symbol
- # if hashtag symbol is detected, extract with regex
- if data_frame.select(has_hashtag_symbols).item():
- df_input = data_frame.with_columns(extract_hashtags).filter(
- pl.col(COL_POST) != []
- )
+ with ProgressReporter("Gathering hashtags..."):
+ # if hashtag symbol is detected, extract with regex
+ if data_frame.select(has_hashtag_symbols).item():
+ df_input = data_frame.with_columns(extract_hashtags).filter(
+ pl.col(COL_POST) != []
+ )
- else: # otherwise, we assume str: "['hashtag1', 'hashtag2', ...]"
- raise ValueError(f"The data in {COL_POST} column appear to have no hashtags.")
+ else: # otherwise, we assume str: "['hashtag1', 'hashtag2', ...]"
+ raise ValueError(
+ f"The data in {COL_POST} column appear to have no hashtags."
+ )
# select columns and sort by time in ascending order
# (expected by .group_by_dynamic below)
@@ -69,27 +73,28 @@ def hashtag_analysis(data_frame: pl.DataFrame, every="1h") -> pl.DataFrame:
pl.col(COL_TIME)
)
- # compute gini per timewindow
- df_out = (
- df_input.explode(pl.col(COL_POST))
- .group_by_dynamic(
- pl.col(COL_TIME), every=every, period=every, start_by="datapoint"
- )
- .agg(
- pl.col(COL_AUTHOR_ID).alias(OUTPUT_COL_USERS),
- pl.col(COL_POST).alias(OUTPUT_COL_HASHTAGS),
- pl.col(COL_POST).count().alias(OUTPUT_COL_COUNT),
- pl.col(COL_POST)
- .map_batches(gini, returns_scalar=True, return_dtype=pl.Float64)
- .alias(OUTPUT_COL_GINI),
+ with ProgressReporter("Counting hashtags..."):
+ # compute gini per timewindow
+ df_out = (
+ df_input.explode(pl.col(COL_POST))
+ .group_by_dynamic(
+ pl.col(COL_TIME), every=every, period=every, start_by="datapoint"
+ )
+ .agg(
+ pl.col(COL_AUTHOR_ID).alias(OUTPUT_COL_USERS),
+ pl.col(COL_POST).alias(OUTPUT_COL_HASHTAGS),
+ pl.col(COL_POST).count().alias(OUTPUT_COL_COUNT),
+ pl.col(COL_POST)
+ .map_batches(gini, returns_scalar=True, return_dtype=pl.Float64)
+ .alias(OUTPUT_COL_GINI),
+ )
+ .with_columns(
+ pl.col(OUTPUT_COL_GINI)
+ .rolling_mean(window_size=SMOOTH_WINDOW_SIZE, center=True)
+ .alias(OUTPUT_COL_GINI + "_smooth")
+ )
+ .rename({COL_TIME: OUTPUT_COL_TIMESPAN})
)
- .with_columns(
- pl.col(OUTPUT_COL_GINI)
- .rolling_mean(window_size=SMOOTH_WINDOW_SIZE, center=True)
- .alias(OUTPUT_COL_GINI + "_smooth")
- )
- .rename({COL_TIME: OUTPUT_COL_TIMESPAN})
- )
# convert datetime back to string
df_out = df_out.with_columns(
diff --git a/analyzers/hashtags_web/app.py b/analyzers/hashtags_web/app.py
index d45fd5b4..ae61b269 100644
--- a/analyzers/hashtags_web/app.py
+++ b/analyzers/hashtags_web/app.py
@@ -1,13 +1,22 @@
from functools import lru_cache
-import numpy as np
+import plotly.graph_objects as go
import polars as pl
from shiny import reactive, render, ui
from shinywidgets import output_widget, render_widget
from ..hashtags.interface import COL_AUTHOR_ID, COL_POST, COL_TIME
from .analysis import secondary_analyzer
-from .plots import plot_bar_plotly, plot_gini_plotly, plot_users_plotly
+from .plots import (
+ MANGO_DARK_GREEN,
+ _plot_hashtags_placeholder_fig,
+ _plot_users_placeholder_fig,
+ plot_bar_plotly,
+ plot_gini_plotly,
+ plot_users_plotly,
+)
+
+LOGO_URL = "https://raw.githubusercontent.com/CIB-Mango-Tree/CIB-Mango-Tree-Website/main/assets/images/mango-text.PNG"
# https://icons.getbootstrap.com/icons/question-circle-fill/
question_circle_fill = ui.HTML(
@@ -36,9 +45,6 @@ def get_raw_data_subset(time_start, time_end, user_id, hashtag):
)
-# Global variables for CLI integration
-
-
def select_users(secondary_output, selected_hashtag):
users_df = (
secondary_output.filter(pl.col("hashtags") == selected_hashtag)["users_all"]
@@ -58,27 +64,20 @@ def select_users(secondary_output, selected_hashtag):
)
# main panel showing the line plot
-analysis_panel = ui.accordion(
- ui.accordion_panel(
- "",
- [
- ui.card(
- ui.card_header(
- "Full time scale analysis ",
- ui.tooltip(
- ui.tags.span(
- question_circle_fill,
- style="cursor: help; font-size: 14px;",
- ),
- "This analysis shows the gini coefficient over the entire dataset. Select specific timepoints below to explore narrow time windows.",
- placement="top",
- ),
- ),
- ui.input_checkbox("smooth_checkbox", "Show smoothed line", value=False),
- output_widget("line_plot", height="300px"),
- )
- ],
- )
+analysis_panel = ui.card(
+ ui.card_header(
+ "Full time window analysis ",
+ ui.tooltip(
+ ui.tags.span(
+ question_circle_fill,
+ style="cursor: help; font-size: 14px;",
+ ),
+ "This analysis shows the gini coefficient over the entire dataset. Select specific timepoints below to explore narrow time windows.",
+ placement="top",
+ ),
+ ),
+ ui.input_checkbox("smooth_checkbox", "Show smoothed line", value=False),
+ output_widget("line_plot", height="300px"),
)
# panel to show hashtag distributions
@@ -90,18 +89,12 @@ def select_users(secondary_output, selected_hashtag):
question_circle_fill,
style="cursor: help; font-size: 14px;",
),
- "Select a date to display the hashtags that users posted most frequently in the time period starting with that date.",
+ "Displayed are the hashtags that users posted most frequently in the time window starting with selected date.",
placement="top",
),
),
- ui.input_selectize(
- id="date_picker",
- label="Show hashtags for time period starting on:",
- choices=[], # Will be populated by reactive effect
- selected=None,
- width="100%",
- ),
- output_widget("bar_plot", height="1500px"),
+ ui.output_text(id="hashtag_card_info"),
+ output_widget("hashtag_bar_plot", height="1500px"),
max_height="500px",
full_screen=True,
)
@@ -109,23 +102,23 @@ def select_users(secondary_output, selected_hashtag):
# panel to show hashtag count per user distribution
users_plot_panel = ui.card(
ui.card_header(
- "Hashtag usage by users ",
+ "Hashtag usage by accounts ",
ui.tooltip(
ui.tags.span(
question_circle_fill,
style="cursor: help; font-size: 14px;",
),
- "Select a user account to show the number of times it used a specific hashtag.",
+ "Select a hashtag to show the number of times it was used by specific accounts.",
placement="top",
),
),
ui.input_selectize(
id="hashtag_picker",
- label="Show users for hashtag:",
+ label="Show accounts that used hashtag:",
choices=[],
width="100%",
),
- output_widget("user_plot", height="800px"),
+ output_widget("user_bar_plot", height="800px"),
max_height="500px",
full_screen=True,
)
@@ -138,13 +131,13 @@ def select_users(secondary_output, selected_hashtag):
question_circle_fill,
style="cursor: help; font-size: 14px;",
),
- "Inspect the posts containing the hashtag for the specific user in the selected time period.",
+ "Inspect the posts containing the hashtag for the specific user in the selected time window.",
placement="top",
),
),
ui.input_selectize(
id="user_picker",
- label="Show tweets for user:",
+ label="Show posts for account:",
choices=[],
width="100%",
),
@@ -173,6 +166,20 @@ def get_time_step():
return df["timewindow_start"][1] - df["timewindow_start"][0]
return None
+ def _get_timewindow_info_text():
+ """Format selected timewindow into a short info text for feedback"""
+ click_data = clicked_data.get()
+
+ if click_data and hasattr(click_data, "xs") and len(click_data.xs) > 0:
+ timewindow = get_selected_datetime()
+ time_step = get_time_step()
+ timewindow_end = timewindow + time_step
+ format_code = "%B %d, %Y"
+ dates_formatted = f"{timewindow.strftime(format_code)} - {timewindow_end.strftime(format_code)}"
+ return "Time window: " + dates_formatted
+ else:
+ return "Time window not available (select first)"
+
@reactive.effect
def populate_date_choices():
"""Populate date picker choices when data is loaded"""
@@ -191,24 +198,36 @@ def get_selected_datetime_cached(selected_formatted):
df = get_df()
# Find the datetime that matches the formatted string
for dt in df["timewindow_start"].to_list():
- if dt.strftime("%B %d, %Y") == selected_formatted:
+ if dt.strftime("%Y-%m-%d %H:%M") == selected_formatted:
return dt
return df["timewindow_start"].to_list()[0] # fallback
- def get_selected_datetime():
- return get_selected_datetime_cached(input.date_picker())
-
- @reactive.calc
- def selected_date():
- df = get_df()
- x_selected = df.with_columns(
- sel=pl.col("timewindow_start") == input.date_picker()
- ).select(pl.col("sel"))
+ # this will store line plot values when clicked
+ clicked_data = reactive.value(None)
- return np.where(x_selected)[0].item()
+ def get_selected_datetime():
+ """Get date value from when a line plot is clicked on"""
+ click_data = clicked_data.get()
+ if click_data and hasattr(click_data, "xs") and len(click_data.xs) > 0:
+ # Convert the clicked datetime to the format expected by get_selected_datetime_cached
+ clicked_datetime = click_data.xs[0]
+ if hasattr(clicked_datetime, "strftime"):
+ formatted_datetime = clicked_datetime.strftime("%Y-%m-%d %H:%M")
+ return get_selected_datetime_cached(formatted_datetime)
+ else:
+ # If it's already a string, use it directly
+ return get_selected_datetime_cached(clicked_datetime)
+ else:
+ # Return the first datetime as default
+ return get_df()["timewindow_start"][0]
@reactive.calc
def secondary_analysis():
+ # Only run analysis if user has clicked on line plot
+ click_data = clicked_data()
+ if not (click_data and hasattr(click_data, "xs") and len(click_data.xs) > 0):
+ return None
+
timewindow = get_selected_datetime()
df = get_df()
df_out2 = secondary_analyzer(df, timewindow)
@@ -216,7 +235,12 @@ def secondary_analysis():
@reactive.effect
def update_hashtag_choices():
- hashtags = secondary_analysis()["hashtags"].to_list()
+ analysis_result = secondary_analysis()
+ if analysis_result is None:
+ hashtags = []
+ else:
+ hashtags = analysis_result["hashtags"].to_list()
+
ui.update_selectize(
"hashtag_picker",
choices=hashtags,
@@ -226,11 +250,16 @@ def update_hashtag_choices():
@reactive.effect
def update_user_choices():
- df_users = select_users(
- secondary_analysis(), selected_hashtag=input.hashtag_picker()
- ).sort("count", descending=True)
+ analysis_result = secondary_analysis()
+ selected_hashtag = input.hashtag_picker()
- users = df_users["users_all"].to_list()
+ if analysis_result is None or not selected_hashtag:
+ users = []
+ else:
+ df_users = select_users(
+ analysis_result, selected_hashtag=selected_hashtag
+ ).sort("count", descending=True)
+ users = df_users["users_all"].to_list()
ui.update_selectize(
"user_picker",
@@ -239,60 +268,84 @@ def update_user_choices():
session=session,
)
+ # whenever line plot is clicked, update `click_reactive`
+ def on_point_click(trace, points, state):
+ clicked_data.set(points)
+
+ # Get the parent figure widget from the trace
+ fig_widget = trace.parent
+
+ # Remove existing red marker traces
+ traces_to_remove = []
+ for i, existing_trace in enumerate(fig_widget.data):
+ if (
+ hasattr(existing_trace, "marker")
+ and hasattr(existing_trace.marker, "color")
+ and existing_trace.marker.color == MANGO_DARK_GREEN
+ ):
+ traces_to_remove.append(i)
+
+ # Remove old red markers in reverse order
+ for i in reversed(traces_to_remove):
+ fig_widget.data = fig_widget.data[:i] + fig_widget.data[i + 1 :]
+
+ # Add new red marker at clicked point
+ fig_widget.add_scatter(
+ x=[points.xs[0]],
+ y=[points.ys[0]],
+ mode="markers",
+ marker=dict(size=8, color=MANGO_DARK_GREEN),
+ hoverinfo="skip", # Disable hover for this marker
+ showlegend=False,
+ )
+
@render_widget
def line_plot():
- selected_date = get_selected_datetime()
smooth_enabled = input.smooth_checkbox()
+
df = get_df()
- return plot_gini_plotly(df=df, x_selected=selected_date, smooth=smooth_enabled)
+
+ fig = plot_gini_plotly(df=df, smooth=smooth_enabled)
+
+ fig_widget = go.FigureWidget(fig.data, fig.layout)
+ fig_widget.data[0].on_click(on_point_click)
+
+ return fig_widget
@render_widget
- def bar_plot():
- selected_date = get_selected_datetime()
- return plot_bar_plotly(
- data_frame=secondary_analysis(),
- selected_date=selected_date,
- show_title=False,
- )
+ def hashtag_bar_plot():
+ analysis_result = secondary_analysis()
+
+ if analysis_result is not None:
+ selected_date = get_selected_datetime()
+ return plot_bar_plotly(
+ data_frame=analysis_result,
+ selected_date=selected_date,
+ show_title=False,
+ )
+ else:
+ # Return placeholder plot if no data clicked
+ return _plot_hashtags_placeholder_fig()
+
+ @render.text
+ def hashtag_card_info():
+ return _get_timewindow_info_text()
@render_widget
- def user_plot():
+ def user_bar_plot():
+ analysis_result = secondary_analysis()
selected_hashtag = input.hashtag_picker()
- if selected_hashtag:
- users_data = select_users(secondary_analysis(), selected_hashtag)
+
+ if analysis_result is not None and selected_hashtag:
+ users_data = select_users(analysis_result, selected_hashtag)
return plot_users_plotly(users_data)
else:
# Return empty plot if no hashtag selected
- import plotly.graph_objects as go
-
- fig = go.Figure()
- fig.add_annotation(
- x=0.5,
- y=0.5,
- text="Select a hashtag to see user distribution",
- showarrow=False,
- font=dict(size=16),
- xref="paper",
- yref="paper",
- )
- fig.update_layout(
- template="plotly_white",
- xaxis=dict(range=[0, 1]),
- yaxis=dict(range=[0, 1]),
- height=400,
- )
- return fig
+ return _plot_users_placeholder_fig()
@render.text
def tweets_title():
- timewindow = get_selected_datetime()
- time_step = get_time_step()
- if time_step:
- timewindow_end = timewindow + time_step
- format_code = "%B %d, %Y"
- dates_formatted = f"{timewindow.strftime(format_code)} - {timewindow_end.strftime(format_code)}"
- return "Showing posts in time window: " + dates_formatted
- return "Time window information not available"
+ return _get_timewindow_info_text()
@render.data_frame
def tweets():
@@ -319,4 +372,4 @@ def tweets():
df_posts = df_posts.drop(pl.col(COL_AUTHOR_ID))
- return render.DataGrid(df_posts, width="100%", filters=True)
+ return render.DataGrid(df_posts, width="100%")
diff --git a/analyzers/hashtags_web/plots.py b/analyzers/hashtags_web/plots.py
index 938d0ef3..fb259473 100644
--- a/analyzers/hashtags_web/plots.py
+++ b/analyzers/hashtags_web/plots.py
@@ -1,11 +1,13 @@
-import numpy as np
import plotly.graph_objects as go
import polars as pl
-FS = 14
+FS = 16
+MANGO_DARK_GREEN = "#609949"
+MANGO_DARK_ORANGE = "#f3921e"
+LIGHT_BLUE = "#acd7e5"
-def plot_gini_plotly(df: pl.DataFrame, x_selected, smooth: bool = False):
+def plot_gini_plotly(df: pl.DataFrame, smooth: bool = False):
"""Create a plotly line plot with white theme"""
y = df.select(pl.col("gini")).to_numpy().flatten()
@@ -20,6 +22,7 @@ def plot_gini_plotly(df: pl.DataFrame, x_selected, smooth: bool = False):
y=y,
mode="lines",
name="Gini coefficient",
+ hovertemplate="Gini: %{y:.3f}",
line=dict(color="black", width=1.5),
)
)
@@ -33,20 +36,18 @@ def plot_gini_plotly(df: pl.DataFrame, x_selected, smooth: bool = False):
y=y2,
mode="lines",
name="Smoothed",
+ hovertemplate="Gini (Smoothed): %{y:.3f}",
line=dict(color="orange", width=2),
- opacity=0.8,
)
)
- # Add vertical line for selected date (x_selected is now the datetime value directly)
- fig.add_vline(x=x_selected, line_dash="dash", line_color="red", line_width=2)
-
# Update layout with white theme
fig.update_layout(
template="plotly_white",
title="Concentration of hashtags over time",
- xaxis_title="Time",
- yaxis_title="Gini coefficient",
+ xaxis_title="Time window (start date)",
+ yaxis_title="Hashtag Concentration
(Gini coefficient)",
+ hovermode="x unified",
showlegend=False,
height=300,
margin=dict(l=50, r=50, t=50, b=50),
@@ -90,7 +91,7 @@ def plot_bar_plotly(data_frame, selected_date=None, show_title=True):
x=percentages,
y=hashtags,
orientation="h",
- marker_color="#609949",
+ marker_color=MANGO_DARK_GREEN,
hovertemplate="%{y}
%{x:.1f}% of all hashtags",
width=0.8, # Fixed bar width
text=hashtags, # Add text labels on bars
@@ -118,10 +119,10 @@ def plot_bar_plotly(data_frame, selected_date=None, show_title=True):
fig.update_layout(
template="plotly_white",
title=title,
- xaxis_title="% all hashtags in the selected time period",
+ xaxis_title="% all hashtags in the selected time window",
yaxis_title="",
height=dynamic_height,
- margin=dict(l=0, r=100, t=10, b=50),
+ margin=dict(l=0, r=50, t=10, b=50),
showlegend=False,
)
@@ -136,6 +137,29 @@ def plot_bar_plotly(data_frame, selected_date=None, show_title=True):
return fig
+def _plot_hashtags_placeholder_fig():
+ """Create a an empty placeholder figure for hashtags barplot"""
+ fig = go.Figure()
+
+ fig.add_annotation(
+ x=0.5,
+ y=0.5,
+ text="Select a time window on line plot above
hashtags in that time window",
+ showarrow=False,
+ font=dict(size=16),
+ xref="paper",
+ yref="paper",
+ )
+ fig.update_layout(
+ template="plotly_white",
+ xaxis=dict(range=[0, 1]),
+ yaxis=dict(range=[0, 1]),
+ height=400,
+ )
+
+ return fig
+
+
def plot_users_plotly(users_data):
"""Create an interactive plotly bar plot for user distribution"""
@@ -171,7 +195,7 @@ def plot_users_plotly(users_data):
x=counts,
y=users,
orientation="h",
- marker_color="#609949",
+ marker_color=MANGO_DARK_GREEN,
hovertemplate="%{y}
%{x} posts",
width=0.8, # Fixed bar width
text=users, # Add text labels on bars
@@ -202,3 +226,25 @@ def plot_users_plotly(users_data):
fig.update_yaxes(categoryorder="array", categoryarray=users, showticklabels=False)
return fig
+
+
+def _plot_users_placeholder_fig():
+ fig = go.Figure()
+
+ fig.add_annotation(
+ x=0.5,
+ y=0.5,
+ text="Select a time window and a hashtag
to see user distribution",
+ showarrow=False,
+ font=dict(size=16),
+ xref="paper",
+ yref="paper",
+ )
+ fig.update_layout(
+ template="plotly_white",
+ xaxis=dict(range=[0, 1]),
+ yaxis=dict(range=[0, 1]),
+ height=400,
+ )
+
+ return fig
diff --git a/analyzers/temporal/interface.py b/analyzers/temporal/interface.py
index e6396686..22254e2d 100644
--- a/analyzers/temporal/interface.py
+++ b/analyzers/temporal/interface.py
@@ -25,7 +25,7 @@
id="temporal",
version="0.1.0",
name="Time frequency analysis",
- short_description="Counts posting events in custom time intervals to discover potential periodic activity.",
+ short_description="Analyzes periodic activity of posting events.",
long_description=description,
input=AnalyzerInput(
columns=[