# Keyword Analysis for Political Topics

This notebook analyzes tweet mentions of specific keywords by party and creates visualizations.

**Example use case:** Analyzing how different parties discuss "Venezuela", "Ukraine", "Klimawandel", etc.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from pathlib import Path
import yaml

# Database connection
import sys
sys.path.insert(0, str(Path.cwd().parent / 'src'))
from xminer.io.db import engine
from sqlalchemy import text

In [None]:
# --- Load parameters from config ---
# The path is resolved relative to the current working directory.
PARAMS_FILE = Path("../src/xminer/config/parameters.yml")
if not PARAMS_FILE.exists():
    # Explicit raise instead of `assert`, which is skipped under `python -O`.
    raise FileNotFoundError(f"parameters.yml not found: {PARAMS_FILE}")

with PARAMS_FILE.open("r", encoding="utf-8") as f:
    # An empty YAML document parses to None; fall back to an empty dict.
    params = yaml.safe_load(f) or {}

# Reporting period. Both values are coerced to int because they are later
# interpolated into SQL table names (politicians_<MM>_<YYYY>).
YEAR = int(params.get("year", 2025))
MONTH = int(params.get("month", 12))
if not 1 <= MONTH <= 12:
    raise ValueError(f"month must be in 1..12, got {MONTH}")
YM = f"{YEAR:04d}{MONTH:02d}"

# Subtitle shown under every chart title.
STAND_TEXT = f"Erhoben für {MONTH:02d}/{YEAR}"

# Output directory for graphics: <base>/<YYYYMM>/graphics/keywords
GRAPHICS_BASE_DIR = Path(params.get("graphics_base_dir", "../outputs"))

GRAPHICS_DIR = GRAPHICS_BASE_DIR / YM / "graphics" / "keywords"
GRAPHICS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Graphics will be saved to: {GRAPHICS_DIR}")
print(f"Period: {YEAR}-{MONTH:02d}")

In [None]:
# Party colors (consistent with other notebooks).
# Keys are upper-case party labels; get_party_color() looks up the label
# produced by normalize_party(), so the two must stay in sync.
PARTY_COLORS = {
    "CDU/CSU": "#000000",
    "CDU": "#000000",
    "CSU": "#000000",
    "SPD": "#E3000F",
    "GRÜNE": "#1AA64A",
    "BÜNDNIS 90/DIE GRÜNEN": "#1AA64A",
    "DIE LINKE.": "#BE3075",
    "LINKE": "#BE3075",
    "FDP": "#FFED00",
    "AFD": "#009EE0",
    "BSW": "#009688",
    "FW": "#F28F00",
    "SSW": "#00A3E0",
}

# Color used for parties that have no entry in PARTY_COLORS.
DEFAULT_PARTY_COLOR = "#888888"


def normalize_party(p: str) -> str:
    """Normalize a raw party label to one canonical upper-case name.

    Args:
        p: Raw party label (any casing, surrounding whitespace allowed).
            ``None`` and float NaN are treated as "no party".

    Returns:
        ``""`` for missing values, ``"CDU/CSU"`` for CDU or CSU, ``"GRÜNE"``
        for the Green-party spellings, ``"DIE LINKE."`` for the Left-party
        spellings, otherwise the stripped, upper-cased input.
    """
    # `p != p` is only true for NaN; pandas uses NaN for missing values.
    if p is None or (isinstance(p, float) and p != p):
        return ""
    key = str(p).strip().upper()

    if key in {"CDU", "CSU"}:
        return "CDU/CSU"
    # Match "BÜNDNIS 90" rather than a bare "BÜNDNIS" so that
    # "Bündnis Sahra Wagenknecht" is not classified as the Greens.
    if (
        "GRÜN" in key
        or "GRUENE" in key
        or "B90" in key
        or "BÜNDNIS 90" in key
        or "BÜNDNIS90" in key
    ):
        return "GRÜNE"
    if key in {"LINKE", "DIE LINKE", "DIE LINKE."}:
        return "DIE LINKE."
    return key


def get_party_color(party: str) -> str:
    """Return the hex color for a party, or a neutral grey if it is unknown."""
    return PARTY_COLORS.get(normalize_party(party), DEFAULT_PARTY_COLOR)

## Define Keywords to Analyze

In [None]:
# Topics compared across parties in the multi-keyword section
# (matching is case-insensitive). Edit this list to analyze other topics.
KEYWORDS = [
    "Venezuela", "Ukraine", "Klimawandel", "Migration",
    "Wirtschaft", "Bildung", "Gesundheit",
]

# Keyword used for the detailed single-topic analysis; change as needed.
SINGLE_KEYWORD = "Venezuela"

print(
    f"Will analyze keywords: {KEYWORDS}",
    f"\nDetailed analysis for: {SINGLE_KEYWORD}",
    sep="\n",
)

## Single Keyword Analysis (e.g., Venezuela)

In [None]:
# Query per-party tweet statistics for SINGLE_KEYWORD.
# The table name cannot be a bound parameter; it is built only from the
# int-coerced MONTH / YEAR values. The keyword is free text, so it is passed
# as a bound parameter rather than pasted into the SQL string (an apostrophe
# in the keyword would otherwise break the statement).
# NOTE: '%' and '_' inside the keyword still act as ILIKE wildcards.
query = f"""
SELECT
    p.partei_kurz AS party,
    COUNT(*) AS tweet_count,
    COUNT(DISTINCT t.username) AS user_count,
    SUM(t.like_count) AS total_likes,
    SUM(t.retweet_count) AS total_retweets,
    SUM(t.impression_count) AS total_impressions,
    AVG(t.like_count) AS avg_likes,
    AVG(t.impression_count) AS avg_impressions,
    COUNT(t.like_count) AS like_rows,
    COUNT(t.impression_count) AS impression_rows
FROM public.tweets t
JOIN politicians_{MONTH:02d}_{YEAR} p ON t.username = p.username
WHERE t.text ILIKE :pattern
GROUP BY p.partei_kurz
ORDER BY tweet_count DESC
"""

with engine.connect() as conn:
    df_keyword = pd.read_sql(
        text(query),
        conn,
        params={"pattern": f"%{SINGLE_KEYWORD}%"},
    )

# Normalize party names
df_keyword['party_norm'] = df_keyword['party'].apply(normalize_party)

# Columns that can be added up when several raw labels (e.g. CDU and CSU)
# collapse into one normalized party. The *_rows columns hold the number of
# non-NULL values behind each SQL AVG().
additive_cols = [
    'tweet_count',
    'user_count',
    'total_likes',
    'total_retweets',
    'total_impressions',
    'like_rows',
    'impression_rows',
]
# SUM() results may arrive as Decimal objects depending on the driver
# (TODO confirm); coerce so the division below is plain numeric arithmetic.
df_keyword[additive_cols] = df_keyword[additive_cols].apply(
    pd.to_numeric, errors='coerce'
)

# Aggregate by normalized party.
# user_count is summed, which assumes a user appears under only one raw
# party label — TODO confirm against the politicians table.
df_keyword_agg = df_keyword.groupby('party_norm', as_index=False)[additive_cols].sum()

# Recompute the averages from the summed totals. Taking the mean of the
# per-label averages would weight a label with 2 tweets the same as one
# with 200. A zero denominator yields NaN instead of inf.
like_rows = df_keyword_agg['like_rows']
impression_rows = df_keyword_agg['impression_rows']
df_keyword_agg['avg_likes'] = (
    df_keyword_agg['total_likes'] / like_rows.where(like_rows > 0)
)
df_keyword_agg['avg_impressions'] = (
    df_keyword_agg['total_impressions'] / impression_rows.where(impression_rows > 0)
)

df_keyword_agg = (
    df_keyword_agg
    .drop(columns=['like_rows', 'impression_rows'])
    .sort_values('tweet_count', ascending=False)
)

print(f"\nTweets mentioning '{SINGLE_KEYWORD}' by party:")
print(df_keyword_agg[['party_norm', 'tweet_count', 'user_count', 'total_impressions']])

In [None]:
# Create bar chart for single keyword
def create_keyword_bar_chart(df, keyword, metric='tweet_count', title=None):
    """
    Create a horizontal bar chart of one metric per party for a keyword.

    Args:
        df: DataFrame with a 'party_norm' column and a numeric `metric` column.
        keyword: Keyword the data refers to; used in the default title.
        metric: Column to plot. Known columns get a German axis label;
            any other column name is used as the label unchanged.
        title: Chart title. Defaults to "Tweets über '<keyword>' nach Partei".

    Returns:
        plotly.graph_objects.Figure with one bar per row of `df`, with the
        largest value at the top.
    """
    # Ascending sort: plotly draws horizontal bars bottom-up, so the largest
    # value ends up at the top of the chart.
    df_sorted = df.sort_values(metric, ascending=True).copy()

    colors = [get_party_color(party) for party in df_sorted['party_norm']]

    # Metric labels
    metric_labels = {
        'tweet_count': 'Anzahl Tweets',
        'total_impressions': 'Gesamte Impressionen',
        'total_likes': 'Gesamte Likes',
        'user_count': 'Anzahl Nutzer'
    }

    metric_label = metric_labels.get(metric, metric)

    if title is None:
        title = f"Tweets über '{keyword}' nach Partei"

    fig = go.Figure()

    fig.add_trace(go.Bar(
        y=df_sorted['party_norm'],
        x=df_sorted[metric],
        orientation='h',
        # Thin light outline: the CDU/CSU color is black and would otherwise
        # be nearly invisible on the #1a1a1a background.
        marker=dict(color=colors, line=dict(color='#666666', width=1)),
        text=[f"{v:,.0f}" for v in df_sorted[metric]],
        textposition='outside',
        textfont=dict(color='white', size=14),
        hovertemplate=(
            "<b>%{y}</b><br>"
            f"{metric_label}: %{{x:,.0f}}<br>"
            "<extra></extra>"
        )
    ))

    # The period subtitle is rendered under the title.
    title_text = f"{title}<br><sub style='font-size:0.85em;'>{STAND_TEXT}</sub>"

    fig.update_layout(
        title=dict(text=title_text, x=0.5, xanchor='center', font=dict(size=22)),
        xaxis_title=metric_label,
        yaxis_title="",
        plot_bgcolor='#1a1a1a',
        paper_bgcolor='#1a1a1a',
        font=dict(color='white', size=14),
        margin=dict(l=120, r=100, t=120, b=60),
        # Grow with the number of parties, but never below 400 px.
        height=max(400, 60 * len(df_sorted)),
        xaxis=dict(gridcolor='#333333'),
        yaxis=dict(gridcolor='#333333', tickfont=dict(size=16))
    )

    return fig

# Create chart for tweet count
fig_tweets = create_keyword_bar_chart(
    df_keyword_agg,
    SINGLE_KEYWORD,
    metric='tweet_count'
)

# Save for social media (1200x675 at 2x scale).
# The file name is derived from the lower-cased keyword.
save_path = GRAPHICS_DIR / f"{SINGLE_KEYWORD.lower()}_tweets_by_party.png"
fig_tweets.write_image(save_path, width=1200, height=675, scale=2)
print(f"✅ Saved: {save_path}")
fig_tweets.show()

In [None]:
# Create chart for impressions (summed impression_count per party)
fig_impressions = create_keyword_bar_chart(
    df_keyword_agg,
    SINGLE_KEYWORD,
    metric='total_impressions',
    title=f"Reichweite von '{SINGLE_KEYWORD}'-Tweets nach Partei"
)

# Save for social media (1200x675 at 2x scale).
save_path = GRAPHICS_DIR / f"{SINGLE_KEYWORD.lower()}_impressions_by_party.png"
fig_impressions.write_image(save_path, width=1200, height=675, scale=2)
print(f"✅ Saved: {save_path}")
fig_impressions.show()

## Multi-Keyword Comparison

In [None]:
# Query multiple keywords at once
def analyze_multiple_keywords(keywords):
    """
    Count tweets and impressions per keyword and normalized party.

    Runs one ILIKE query per keyword against public.tweets joined with the
    politicians table of the configured month, then merges raw party labels
    through normalize_party().

    Args:
        keywords: Iterable of keyword strings (matched case-insensitively as
            substrings; '%' and '_' inside a keyword act as ILIKE wildcards).

    Returns:
        DataFrame with columns 'keyword', 'party_norm', 'tweet_count' and
        'total_impressions'. Keywords without any match contribute no rows;
        the frame is empty if nothing matched or `keywords` is empty.
    """
    # The table name is built from the int-coerced MONTH / YEAR values; the
    # keyword goes in as a bound parameter so quotes in it cannot break
    # (or inject into) the statement.
    query = text(f"""
        SELECT
            p.partei_kurz AS party,
            COUNT(*) AS tweet_count,
            SUM(t.impression_count) AS total_impressions
        FROM public.tweets t
        JOIN politicians_{MONTH:02d}_{YEAR} p ON t.username = p.username
        WHERE t.text ILIKE :pattern
        GROUP BY p.partei_kurz
    """)

    frames = []
    # One connection for all keywords instead of reconnecting per keyword.
    with engine.connect() as conn:
        for keyword in keywords:
            df_temp = pd.read_sql(query, conn, params={"pattern": f"%{keyword}%"})
            if df_temp.empty:
                continue
            # Tag the rows in pandas rather than selecting a string literal.
            df_temp.insert(0, 'keyword', keyword)
            frames.append(df_temp)

    if not frames:
        # pd.concat([]) raises ValueError; return a well-formed empty frame.
        return pd.DataFrame(
            columns=['keyword', 'party_norm', 'tweet_count', 'total_impressions']
        )

    df_all = pd.concat(frames, ignore_index=True)
    df_all['party_norm'] = df_all['party'].apply(normalize_party)

    # Aggregate by normalized party (e.g. CDU + CSU -> CDU/CSU)
    df_final = (
        df_all.groupby(['keyword', 'party_norm'], as_index=False)
        .agg({'tweet_count': 'sum', 'total_impressions': 'sum'})
    )

    return df_final

# Run the multi-keyword analysis over the configured keyword list
print(f"Analyzing {len(KEYWORDS)} keywords...")
df_multi = analyze_multiple_keywords(KEYWORDS)

# Overall number of keyword matches across all keywords and parties
total_mentions = df_multi['tweet_count'].sum()
print(f"\nTotal keyword mentions: {total_mentions:,}")

# Matches per keyword, most frequent first
print(f"\nKeyword summary:")
mentions_per_keyword = df_multi.groupby('keyword')['tweet_count'].sum()
keyword_summary = mentions_per_keyword.sort_values(ascending=False)
print(keyword_summary)

In [None]:
# Create grouped bar chart comparing keywords across parties
def create_keyword_comparison_chart(df, title="Keyword-Vergleich nach Partei", top_n=5):
    """
    Create a grouped bar chart comparing keyword tweet counts by party.

    Only the `top_n` parties with the most tweets (summed over all keywords)
    are shown, ordered left to right by that total.

    Args:
        df: DataFrame with 'keyword', 'party_norm' and 'tweet_count' columns.
        title: Chart title; the period subtitle is appended below it.
        top_n: Number of parties to include (default 5).

    Returns:
        plotly.graph_objects.Figure with one bar trace per keyword.
    """
    # Get top parties by total tweets (nlargest returns them in descending order)
    top_parties = (
        df.groupby('party_norm')['tweet_count']
        .sum()
        .nlargest(top_n)
        .index.tolist()
    )

    df_top = df[df['party_norm'].isin(top_parties)].copy()

    fig = go.Figure()

    # One palette color per keyword; the palette repeats if there are more
    # keywords than colors.
    colors = px.colors.qualitative.Set2

    for i, keyword in enumerate(df_top['keyword'].unique()):
        # Local name chosen so it does not shadow the notebook-level df_keyword.
        df_kw = df_top[df_top['keyword'] == keyword]

        fig.add_trace(go.Bar(
            name=keyword,
            x=df_kw['party_norm'],
            y=df_kw['tweet_count'],
            marker_color=colors[i % len(colors)],
            text=[f"{v:,.0f}" for v in df_kw['tweet_count']],
            textposition='outside',
            hovertemplate=(
                f"<b>{keyword}</b><br>"
                "Partei: %{x}<br>"
                "Tweets: %{y:,.0f}<br>"
                "<extra></extra>"
            )
        ))

    title_text = f"{title}<br><sub style='font-size:0.85em;'>{STAND_TEXT}</sub>"

    fig.update_layout(
        title=dict(text=title_text, x=0.5, xanchor='center', font=dict(size=20)),
        xaxis_title="Partei",
        yaxis_title="Anzahl Tweets",
        barmode='group',
        plot_bgcolor='#1a1a1a',
        paper_bgcolor='#1a1a1a',
        font=dict(color='white'),
        legend=dict(
            title="Keywords",
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bgcolor='rgba(0,0,0,0.5)'
        ),
        margin=dict(l=60, r=40, t=100, b=60),
        # Pin the party order. Otherwise plotly orders categories by first
        # appearance across traces, which shifts when a party has no tweets
        # for the first keyword.
        xaxis=dict(
            gridcolor='#333333',
            categoryorder='array',
            categoryarray=top_parties,
        ),
        yaxis=dict(gridcolor='#333333')
    )

    return fig

# Create comparison chart (keywords grouped per party)
fig_comparison = create_keyword_comparison_chart(df_multi)

# Save for social media (1200x675 at 2x scale).
save_path = GRAPHICS_DIR / "keywords_comparison_by_party.png"
fig_comparison.write_image(save_path, width=1200, height=675, scale=2)
print(f"✅ Saved: {save_path}")
fig_comparison.show()

In [None]:
# Create heatmap of keywords by party
def create_keyword_heatmap(df, title="Keyword-Heatmap nach Partei"):
    """
    Create a heatmap of tweet counts with keywords as rows and parties as columns.

    Args:
        df: DataFrame with 'keyword', 'party_norm' and 'tweet_count' columns.
        title: Chart title; the period subtitle is appended below it.

    Returns:
        plotly.graph_objects.Figure with the keyword that has the most
        tweets in the top row.
    """
    # Pivot table for heatmap. Counts are summed explicitly: the pivot_table
    # default aggfunc is 'mean', which would understate counts if `df` ever
    # holds more than one row per (keyword, party). Missing pairs become 0.
    pivot = df.pivot_table(
        index='keyword',
        columns='party_norm',
        values='tweet_count',
        aggfunc='sum',
        fill_value=0
    )

    # Sort keywords by total tweets, largest first
    row_order = pivot.sum(axis=1).sort_values(ascending=False).index
    pivot = pivot.loc[row_order]

    fig = go.Figure()

    fig.add_trace(go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale='Viridis',
        text=pivot.values,
        texttemplate='%{text:.0f}',
        textfont=dict(color='white', size=12),
        hovertemplate=(
            "Keyword: %{y}<br>"
            "Partei: %{x}<br>"
            "Tweets: %{z:,.0f}<br>"
            "<extra></extra>"
        ),
        colorbar=dict(title="Tweets")
    ))

    title_text = f"{title}<br><sub style='font-size:0.85em;'>{STAND_TEXT}</sub>"

    fig.update_layout(
        title=dict(text=title_text, x=0.5, xanchor='center', font=dict(size=20)),
        xaxis_title="Partei",
        yaxis_title="Keyword",
        plot_bgcolor='#1a1a1a',
        paper_bgcolor='#1a1a1a',
        font=dict(color='white'),
        margin=dict(l=120, r=100, t=100, b=60),
        # Grow with the number of keywords, but never below 400 px.
        height=max(400, 60 * len(pivot)),
        # Heatmap rows are drawn bottom-up; reverse the axis so the first
        # (largest) keyword is shown at the top.
        yaxis=dict(autorange='reversed')
    )

    return fig

# Create heatmap (keywords x parties)
fig_heatmap = create_keyword_heatmap(df_multi)

# Save for social media (1200x800 at 2x scale).
save_path = GRAPHICS_DIR / "keywords_heatmap.png"
fig_heatmap.write_image(save_path, width=1200, height=800, scale=2)
print(f"✅ Saved: {save_path}")
fig_heatmap.show()

## Top Tweets for Keyword

In [None]:
# Get the 10 tweets mentioning the keyword with the most impressions.
# The keyword is passed as a bound parameter; only the int-derived table
# name is interpolated into the SQL string.
# NULLS LAST: PostgreSQL sorts NULLs first under DESC, which would put
# tweets without impression data at the top of the ranking.
query_top_tweets = f"""
SELECT
    t.username,
    p.partei_kurz AS party,
    p.FULLNAME AS full_name,
    t.text,
    t.created_at,
    t.like_count,
    t.retweet_count,
    t.impression_count,
    t.tweet_id
FROM public.tweets t
JOIN politicians_{MONTH:02d}_{YEAR} p ON t.username = p.username
WHERE t.text ILIKE :pattern
ORDER BY t.impression_count DESC NULLS LAST
LIMIT 10
"""

with engine.connect() as conn:
    df_top_tweets = pd.read_sql(
        text(query_top_tweets),
        conn,
        params={"pattern": f"%{SINGLE_KEYWORD}%"},
    )

df_top_tweets['party_norm'] = df_top_tweets['party'].apply(normalize_party)

PREVIEW_CHARS = 100  # maximum tweet characters shown per entry

print(f"\nTop 10 tweets mentioning '{SINGLE_KEYWORD}' by impressions:\n")
for rank, (_, row) in enumerate(df_top_tweets.iterrows(), start=1):
    tweet_text = row['text']
    # Only add the ellipsis when the text was actually cut off.
    if len(tweet_text) > PREVIEW_CHARS:
        preview = tweet_text[:PREVIEW_CHARS] + "..."
    else:
        preview = tweet_text
    # ',.0f' keeps counts integer-looking even if NULLs turned the column
    # into floats.
    print(f"{rank}. @{row['username']} ({row['party_norm']}) - {row['impression_count']:,.0f} impressions")
    print(f"   {preview}")
    print(f"   Likes: {row['like_count']:,.0f} | RTs: {row['retweet_count']:,.0f}\n")

## Summary

In [None]:
import os

# Final summary: what was analyzed and which chart files exist on disk.
print(f"\n{'='*80}")
print("Keyword Analysis Complete!")
print(f"{'='*80}\n")

print(f"Analyzed keyword: {SINGLE_KEYWORD}")
print(f"Additional keywords: {', '.join(KEYWORDS)}\n")

print(f"Charts saved to: {GRAPHICS_DIR}\n")

# List all PNG files in the output directory with their size in KB.
# This includes files left over from earlier runs, not only this run's charts.
png_files = sorted(GRAPHICS_DIR.glob("*.png"))
for png_file in png_files:
    size = os.path.getsize(png_file) / 1024
    print(f"  📊 {png_file.name} ({size:.1f} KB)")

print(f"\n✅ {len(png_files)} chart(s) ready for social media posting!")