# Simple Keyword Analysis

A fast, simplified notebook for quick keyword analysis: it counts the tweets that mention a single keyword and breaks them down by party.

In [None]:
# Import libraries
import pandas as pd
import plotly.graph_objects as go
from pathlib import Path
import yaml

# Database connection
import sys
sys.path.insert(0, str(Path.cwd().parent / 'src'))
from xminer.io.db import engine
from sqlalchemy import text

print('✅ Libraries imported successfully')

In [None]:
# Configuration: reporting period and output folder, taken from the shared
# parameters file (the path is relative to the notebook's working directory).
PARAMS_FILE = Path("../src/xminer/config/parameters.yml")

# An empty YAML document loads as None, so fall back to an empty mapping and
# let the .get() defaults below take over.
params = yaml.safe_load(PARAMS_FILE.read_text(encoding="utf-8")) or {}

# Reporting period
YEAR = int(params.get("year", 2025))
MONTH = int(params.get("month", 12))
YM = f"{YEAR:04d}{MONTH:02d}"  # zero-padded YYYYMM, used as a folder name
STAND_TEXT = f"Erhoben für {MONTH:02d}/{YEAR}"  # subtitle shown on the charts

# Output folder for this period's keyword graphics; created if it is missing
GRAPHICS_BASE_DIR = Path(params.get("graphics_base_dir", "../outputs"))
GRAPHICS_DIR = GRAPHICS_BASE_DIR.joinpath(YM, "graphics", "keywords")
GRAPHICS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Period: {YEAR}-{MONTH:02d}")
print(f"Output: {GRAPHICS_DIR}")

In [None]:
# Hex color per canonical party label (keys are upper-case).
# Both spellings of the Green party share one color.
_GREEN_HEX = "#1AA64A"

PARTY_COLORS = {
    "CDU/CSU": "#000000",
    "SPD": "#E3000F",
    "GRÜNE": _GREEN_HEX,
    "BÜNDNIS 90/DIE GRÜNEN": _GREEN_HEX,
    "DIE LINKE.": "#BE3075",
    "FDP": "#FFED00",
    "AFD": "#009EE0",
    "BSW": "#009688",
}

def normalize_party(p: str) -> str:
    """Map a raw party label to the canonical upper-case name used for grouping.

    Args:
        p: Raw party label. ``None`` and float NaN (how pandas represents a
            missing value) are treated as "no party".

    Returns:
        ``""`` for a missing label; ``"CDU/CSU"``, ``"BSW"``, ``"GRÜNE"`` or
        ``"DIE LINKE."`` for the known spellings of those parties; otherwise
        the stripped, upper-cased input.
    """
    # NaN is the only float that is not equal to itself; without this check a
    # missing label would turn into the literal party name "NAN".
    if p is None or (isinstance(p, float) and p != p):
        return ""
    key = str(p).strip().upper()
    if key in {"CDU", "CSU"}:
        return "CDU/CSU"
    # Must run before the Green rule: the full name "Bündnis Sahra
    # Wagenknecht" contains "BÜNDNIS" and would otherwise be counted as GRÜNE.
    if key == "BSW" or "WAGENKNECHT" in key:
        return "BSW"
    if key.startswith("GRÜN") or "GRUENE" in key or "B90" in key or "BÜNDNIS" in key:
        return "GRÜNE"
    if key in {"LINKE", "DIE LINKE", "DIE LINKE."}:
        return "DIE LINKE."
    return key

def get_party_color(party: str) -> str:
    """Return the hex color for ``party``, or neutral grey when it is unknown.

    The label is first canonicalized with ``normalize_party`` and then looked
    up in ``PARTY_COLORS``.
    """
    fallback = "#888888"
    canonical = normalize_party(party)
    if canonical in PARTY_COLORS:
        return PARTY_COLORS[canonical]
    return fallback

## Set Your Keyword Here

In [None]:
# Topic to analyze -- edit this value to look at a different keyword.
# It is used in the tweet text filter, the chart titles and the output file names.
KEYWORD = "Venezuela"

print("Analyzing keyword:", KEYWORD)

## Query Database

In [None]:
# Simple query - just party and count.
#
# The search pattern is passed as a bound parameter (:pattern) instead of being
# interpolated into the SQL string, so quotes, colons or other special
# characters in KEYWORD cannot break or alter the statement.
# Any '%' or '_' inside KEYWORD still acts as a LIKE wildcard, as before.
# The table name cannot be bound; it is built only from the integer
# MONTH / YEAR values.
query = f"""
SELECT 
    p.partei_kurz AS party,
    COUNT(*) AS tweet_count
FROM public.tweets t
JOIN politicians_{MONTH:02d}_{YEAR} p ON t.username = p.username
WHERE t.text ILIKE :pattern
GROUP BY p.partei_kurz
ORDER BY tweet_count DESC
"""

print(f"Querying tweets mentioning '{KEYWORD}'...")

with engine.connect() as conn:
    # Case-insensitive substring match: wrap the keyword in wildcards
    df = pd.read_sql(text(query), conn, params={"pattern": f"%{KEYWORD}%"})

print(f"✅ Found {len(df)} parties with tweets mentioning '{KEYWORD}'")
print(f"   Total tweets: {df['tweet_count'].sum():,}\n")

df

In [None]:
# Collapse the raw party labels onto their canonical names
df['party_norm'] = df['party'].map(normalize_party)

# Sum the tweet counts per canonical party, then order ascending for the
# horizontal bar chart (the last row is therefore the party with the most tweets)
per_party = df.groupby('party_norm', as_index=False)['tweet_count'].sum()
df_agg = per_party.sort_values('tweet_count', ascending=True)

print(f"After normalization: {len(df_agg)} parties\n")
df_agg

## Create Visualization

In [None]:
# One color per bar, in the same row order as df_agg
colors = df_agg['party_norm'].map(get_party_color).tolist()

# Horizontal bars: parties on the y axis, tweet counts on the x axis,
# with the formatted count printed next to each bar
bar_trace = go.Bar(
    y=df_agg['party_norm'],
    x=df_agg['tweet_count'],
    orientation='h',
    marker_color=colors,
    text=[f"{count:,.0f}" for count in df_agg['tweet_count']],
    textposition='outside',
    textfont={'color': 'white', 'size': 14},
    hovertemplate="<b>%{y}</b><br>Tweets: %{x:,.0f}<br><extra></extra>",
)

fig = go.Figure()
fig.add_trace(bar_trace)

title = f"Tweets über '{KEYWORD}' nach Partei<br><sub style='font-size:0.85em;'>{STAND_TEXT}</sub>"

# Dark theme; the figure height grows with the number of parties
bar_layout = {
    'title': {'text': title, 'x': 0.5, 'xanchor': 'center', 'font': {'size': 22}},
    'xaxis_title': "Anzahl Tweets",
    'yaxis_title': "",
    'plot_bgcolor': '#1a1a1a',
    'paper_bgcolor': '#1a1a1a',
    'font': {'color': 'white', 'size': 14},
    'margin': {'l': 120, 'r': 100, 't': 120, 'b': 60},
    'height': max(400, 60 * len(df_agg)),
    'xaxis': {'gridcolor': '#333333'},
    'yaxis': {'gridcolor': '#333333', 'tickfont': {'size': 16}},
}
fig.update_layout(**bar_layout)

# Write the PNG into the period's keyword graphics folder
output_file = GRAPHICS_DIR / f"{KEYWORD.lower()}_tweets_by_party.png"
fig.write_image(output_file, width=1200, height=675, scale=2)

print(f"✅ Saved: {output_file}")

# Show the interactive figure inline
fig.show()

## Pie Chart: Percentage Distribution

In [None]:
# Pie chart: each party's share of the keyword tweets
import numpy as np

# Largest slice first
df_pie = df_agg.sort_values('tweet_count', ascending=False)

# Share of the total in percent, rounded to one decimal (used for the printout below)
total = df_pie['tweet_count'].sum()
df_pie['percentage'] = df_pie['tweet_count'].div(total).mul(100).round(1)

# Slice colors, in the same row order as df_pie
pie_colors = df_pie['party_norm'].map(get_party_color).tolist()

pie_trace = go.Pie(
    labels=df_pie['party_norm'],
    values=df_pie['tweet_count'],
    marker={'colors': pie_colors},
    textinfo='label+percent',
    textfont={'size': 14, 'color': 'white'},
    hovertemplate=(
        "<b>%{label}</b><br>Tweets: %{value:,.0f}<br>Anteil: %{percent}<br><extra></extra>"
    ),
)

fig_pie = go.Figure()
fig_pie.add_trace(pie_trace)

title_pie = f"Verteilung der '{KEYWORD}'-Tweets nach Partei<br><sub style='font-size:0.85em;'>{STAND_TEXT}</sub>"

# Dark theme with a vertical legend to the right of the pie
pie_layout = {
    'title': {'text': title_pie, 'x': 0.5, 'xanchor': 'center', 'font': {'size': 22}},
    'paper_bgcolor': '#1a1a1a',
    'font': {'color': 'white'},
    'margin': {'l': 20, 'r': 20, 't': 100, 'b': 20},
    'showlegend': True,
    'legend': {
        'orientation': "v",
        'yanchor': "middle",
        'y': 0.5,
        'xanchor': "left",
        'x': 1.02,
        'bgcolor': 'rgba(0,0,0,0.5)',
    },
}
fig_pie.update_layout(**pie_layout)

# Square export (for Instagram)
output_file_pie = GRAPHICS_DIR / f"{KEYWORD.lower()}_distribution_pie.png"
fig_pie.write_image(output_file_pie, width=1080, height=1080, scale=2)

print(f"✅ Saved pie chart: {output_file_pie}")
print("\nPercentage distribution:")
for party, share, count in zip(
    df_pie['party_norm'], df_pie['percentage'], df_pie['tweet_count']
):
    print(f"  {party}: {share:.1f}% ({count:,} tweets)")

# Show the interactive figure inline
fig_pie.show()

## Get Top Tweets

In [None]:
# Get the top tweets mentioning the keyword, ranked by impressions.
#
# - The search pattern is a bound parameter (:pattern), so special characters
#   in KEYWORD cannot break or alter the SQL. The table name cannot be bound;
#   it is built only from the integer MONTH / YEAR values.
# - NULLS LAST: under DESC ordering PostgreSQL sorts NULLs first, which would
#   let tweets without an impression count fill the top 10.
query_top = f"""
SELECT 
    t.username,
    p.partei_kurz AS party,
    t.text,
    t.like_count,
    t.retweet_count,
    t.impression_count
FROM public.tweets t
JOIN politicians_{MONTH:02d}_{YEAR} p ON t.username = p.username
WHERE t.text ILIKE :pattern
ORDER BY t.impression_count DESC NULLS LAST
LIMIT 10
"""

with engine.connect() as conn:
    df_top = pd.read_sql(text(query_top), conn, params={"pattern": f"%{KEYWORD}%"})

df_top['party_norm'] = df_top['party'].apply(normalize_party)


def _fmt_count(value) -> str:
    """Format a metric with thousands separators; 'n/a' when it is missing."""
    return "n/a" if pd.isna(value) else f"{int(value):,}"


SNIPPET_LEN = 120  # max characters of tweet text shown per entry

print(f"Top 10 tweets mentioning '{KEYWORD}' by impressions:\n")
for rank, row in enumerate(df_top.itertuples(index=False), start=1):
    tweet_text = row.text
    # Only add the ellipsis when the text was actually cut off
    snippet = tweet_text if len(tweet_text) <= SNIPPET_LEN else tweet_text[:SNIPPET_LEN] + "..."
    print(f"{rank}. @{row.username} ({row.party_norm})")
    print(
        f"   💡 {_fmt_count(row.impression_count)} impressions"
        f" | 👍 {_fmt_count(row.like_count)} likes"
        f" | 🔄 {_fmt_count(row.retweet_count)} RTs"
    )
    print(f"   {snippet}\n")

df_top[['username', 'party_norm', 'impression_count', 'like_count', 'retweet_count']]

## Summary

## View Generated Charts

In [None]:
# Text summary of the analysis plus a listing of the PNG files in the output folder
import os

print(f"\n{'='*80}")
print(f"Analysis Complete: {KEYWORD}")
print(f"{'='*80}\n")

print(f"Total tweets mentioning '{KEYWORD}': {df_agg['tweet_count'].sum():,}")
print(f"Parties discussing this topic: {len(df_agg)}")
if df_agg.empty:
    # No tweet matched the keyword: there is no last row to report, and
    # df_agg.iloc[-1] would raise IndexError.
    print("\nTop party: n/a (no matching tweets)\n")
else:
    # df_agg is sorted ascending by tweet_count, so the last row is the leader
    top_row = df_agg.iloc[-1]
    print(f"\nTop party: {top_row['party_norm']} ({top_row['tweet_count']:,} tweets)\n")

png_files = sorted(GRAPHICS_DIR.glob("*.png"))
if png_files:
    print(f"Charts in {GRAPHICS_DIR}:\n")
    for png_file in png_files:
        size = os.path.getsize(png_file) / 1024  # bytes -> KB
        print(f"  📊 {png_file.name} ({size:.1f} KB)")

print("\n✅ Ready for social media posting!")

In [None]:
from IPython.display import Image, display

# (heading, file name, display width) for the two PNGs written by the chart cells
chart_previews = (
    ("📊 Bar Chart:", f"{KEYWORD.lower()}_tweets_by_party.png", 800),
    ("\n🥧 Pie Chart:", f"{KEYWORD.lower()}_distribution_pie.png", 600),
)

for heading, file_name, width_px in chart_previews:
    print(heading)
    display(Image(filename=str(GRAPHICS_DIR / file_name), width=width_px))