In [1]:
import pandas as pd
from pathlib import Path
import yaml

# --- Read parameters.yml (the same file the pipelines consume) ---
PARAMS_FILE = Path("C:/Users/felix/Documents/xminer/src/xminer/config/parameters.yml")
assert PARAMS_FILE.exists(), f"parameters.yml not found: {PARAMS_FILE}"

# An empty YAML document parses to None, so fall back to an empty mapping.
params = yaml.safe_load(PARAMS_FILE.read_text(encoding="utf-8")) or {}

# Reporting period; the defaults apply when the keys are absent from the file.
YEAR = int(params.get("year", 2025))
MONTH = int(params.get("month", 11))
YM = f"{YEAR:04d}{MONTH:02d}"

# Caption shown underneath plot titles (read by plot_party_hbar).
STAND_TEXT = f"Erhoben für {MONTH:02d}/{YEAR}"

# Output root, overridable through the "graphics_base_dir" parameter.
GRAPHICS_BASE_DIR = Path(params.get("graphics_base_dir", r"C:/Users/felix/Documents/xminer/outputs"))

# Per-period graphics folder; created here so later save calls can rely on it.
GRAPHICS_DIR = GRAPHICS_BASE_DIR.joinpath(YM, "graphics")
GRAPHICS_DIR.mkdir(parents=True, exist_ok=True)


In [14]:
import pandas as pd
from pathlib import Path

# Take the reporting period from the values read out of parameters.yml (MONTH / YEAR)
# instead of repeating them as literals, so the loaded data cannot drift away from
# the caption (STAND_TEXT) and the output folder (GRAPHICS_DIR) built from the same values.
# NOTE(review): assumes the politicians file name uses a two-digit month — confirm.
month = f"{MONTH:02d}"
year = f"{YEAR:04d}"

base_path = Path(f'C:/Users/felix/Documents/xminer/data/politicians_{month}.csv')
assert base_path.exists(), f'File not found: {base_path}'

# The politicians export is semicolon-separated.
df_politicians = pd.read_csv(base_path, low_memory=False, sep=';')

print("Shape:", df_politicians.shape)

Shape: (632, 32)


In [15]:
# Build a display name "<academic title> <first given name> <surname>" per politician.
title = df_politicians['AKAD_TITEL'].astype('string').fillna('').str.strip()

# Keep only the first given name. Splitting an empty string yields an empty list, for
# which `.str[0]` returns a missing value; without the trailing fillna that missing
# value would propagate through the concatenation below and blank out the whole
# FULLNAME even when a surname is present.
first = (
    df_politicians['VORNAME'].astype('string').fillna('').str.strip()
    .str.split().str[0]
    .astype('string').fillna('')
)
last  = df_politicians['NACHNAME'].astype('string').fillna('').str.strip()

# NOTE(review): the PRAEFIX / ADEL columns are not part of the name; if they carry
# surname particles, those are dropped here — confirm whether that is intended.
# split/join collapses the extra blanks left behind by empty components.
df_politicians['FULLNAME'] = (title + ' ' + first + ' ' + last).str.split().str.join(' ')


Index(['KEY_COLUMN', 'ID', 'GEBURTSDATUM', 'GEBURTSORT', 'GEBURTSLAND',
       'STERBEDATUM', 'GESCHLECHT', 'FAMILIENSTAND', 'RELIGION', 'BERUF',
       'PARTEI_KURZ', 'VITA_KURZ', 'VEROEFFENTLICHUNGSPFLICHTIGES', 'NACHNAME',
       'VORNAME', 'ORTSZUSATZ', 'ADEL', 'PRAEFIX', 'ANREDE_TITEL',
       'AKAD_TITEL', 'HISTORIE_VON', 'HISTORIE_BIS', 'WP_WP', 'WP_MDBWP_VON',
       'WP_MDBWP_BIS', 'WP_WKR_NUMMER', 'WP_WKR_NAME', 'WP_WKR_LAND',
       'WP_LISTE', 'WP_MANDATSART', 'WP_INSTITUTIONEN', 'USERNAME'],
      dtype='object')

In [4]:
# Locate the per-account monthly tweet aggregate through YM (built from parameters.yml
# in the config cell) rather than through separately maintained year/month strings, so
# the data always matches the period used for STAND_TEXT and GRAPHICS_DIR.
base_path = Path(f'C:/Users/felix/Documents/xminer/outputs/{YM}/tweets/tweets_individual_month_{YM}.csv')
assert base_path.exists(), f'File not found: {base_path}'

df_tweets = pd.read_csv(base_path, low_memory=False)

# Parse datetime columns if present; unparseable values become NaT instead of raising.
for col in ['created_at', 'retrieved_at']:
    if col in df_tweets.columns:
        df_tweets[col] = pd.to_datetime(df_tweets[col], utc=True, errors='coerce')

print("Shape:", df_tweets.shape)

# CDU and CSU are reported together under one label.
UNION_MAP = {"CDU": "CDU/CSU", "CSU": "CDU/CSU"}

def normalize_party(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with a cleaned ``partei_kurz`` column.

    Values are stripped, upper-cased and mapped through ``UNION_MAP``.
    Missing values stay missing instead of becoming the literal text "NAN"
    (which ``astype(str)`` followed by ``upper()`` would produce).

    The input frame is not modified; a copy is returned. When the column is
    absent, ``df`` itself is returned unchanged.
    """
    if "partei_kurz" not in df.columns:
        return df

    out = df.copy()
    raw = out["partei_kurz"]
    normalized = (
        raw
        .astype(str)
        .str.strip()
        .str.upper()
        .replace(UNION_MAP)
    )
    # Blank out the positions that were missing in the input, so they do not
    # show up as a pseudo party in counts or plots.
    out["partei_kurz"] = normalized.where(raw.notna())
    return out

# Harmonise the party labels, then show how many accounts each party contributes.
df_tweets = normalize_party(df_tweets)
df_tweets["partei_kurz"].value_counts()

Shape: (261, 28)


partei_kurz
AFD                      98
CDU/CSU                  71
BÜNDNIS 90/DIE GRÜNEN    39
SPD                      29
DIE LINKE.               24
Name: count, dtype: int64

In [17]:
# Attach each politician's display name to the tweet aggregates.
# The lookup is reduced to one row per non-missing USERNAME so the left join can never
# multiply tweet rows; `validate` makes pandas raise if that guarantee is ever broken.
name_lookup = (
    df_politicians[['USERNAME', 'FULLNAME']]
    .dropna(subset=['USERNAME'])
    .drop_duplicates(subset='USERNAME')
)

# Dropping columns left over from a previous run keeps the cell re-runnable: without it
# a second execution yields USERNAME_x / FULLNAME_y suffix columns and no FULLNAME.
df_tweets = (
    df_tweets
    .drop(columns=['USERNAME', 'FULLNAME'], errors='ignore')
    .merge(
        name_lookup,
        right_on='USERNAME',
        left_on='username',
        how='left',
        validate='many_to_one',
    )
)


In [18]:
def plot_party_hbar(
    df_profiles,
    y_col: str,  # e.g. "username"
    x_col: str,
    top_n: int = 10,
    party_col: str = "partei_kurz",
    title: str | None = None,
    x_label: str | None = None,
    save_name: str | None = None,   # << NEW parameter: filename without path
):
    import pandas as pd
    import plotly.graph_objects as go
    from pathlib import Path

    PARTY_COLORS = {
        "CDU/CSU": "#000000",
        "CDU": "#000000",
        "CSU": "#000000",
        "SPD": "#E3000F",
        "GRÜNE": "#1AA64A",
        "GRUENE": "#1AA64A",
        "B90/GRUENE": "#1AA64A",
        "DIE LINKE": "#BE3075",
        "LINKE": "#BE3075",
        "FDP": "#FFED00",
        "AFD": "#009EE0",
        "BSW": "#009688",
        "FW": "#F28F00",
        "SSW": "#00A3E0",
        "PIRATEN": "#FF8800",
        "PARTEI": "#9E9E9E",
        "ÖDP": "#FF6A00",
        "OEDP": "#FF6A00",
    }

    def _normalize_party_value(p: str) -> str:
        if p is None:
            return ""
        key = str(p).strip().upper()
        if key in {"CDU", "CSU"}:
            return "CDU/CSU"
        if key.startswith("GRÜN") or key.startswith("GRUEN") or "GRUENE" in key or "GRÜNE" in key or "B90" in key:
            return "GRÜNE"
        if key in {"LINKE", "DIE LINKE", "DIE LINKE."}:
            return "DIE LINKE"
        if key in {"ÖDP", "OEDP"}:
            return "ÖDP"
        if key in {"AFD", "ALTERNATIVE FÜR DEUTSCHLAND", "ALTERNATIVE FUER DEUTSCHLAND"}:
            return "AFD"
        return key

    def _resolve_party_colors(series: pd.Series) -> list[str]:
        parties = series.astype("string").fillna("")
        return [PARTY_COLORS.get(_normalize_party_value(p), "#888888") for p in parties]

    def _is_dark_color(hex_color: str) -> bool:
        """Determine if a color is dark based on brightness."""
        hex_color = hex_color.lstrip("#")
        r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
        brightness = (r * 299 + g * 587 + b * 114) / 1000
        return brightness < 140

    for col in (y_col, x_col):
        if col not in df_profiles.columns:
            raise ValueError(f"Missing required column: {col}")
    if "FULLNAME" not in df_profiles.columns:
        raise ValueError("Missing required column: FULLNAME")

    work = df_profiles.copy()
    if party_col not in work.columns:
        work[party_col] = None

    # --- combined label for y-axis ---
    work["_label"] = work["FULLNAME"].astype(str).str.strip() + " (" + work[y_col].astype(str).str.strip() + ")"

    work = work.sort_values(x_col, ascending=False).head(top_n).copy()

    categories = work["_label"].tolist()[::-1]
    work["_y_cat"] = pd.Categorical(work["_label"], categories=categories, ordered=True)

    colors = _resolve_party_colors(work[party_col])

    # Decide whether each bar is short or long
    max_x = work[x_col].max()
    threshold = 0.15 * max_x
    text_positions = ["outside" if x < threshold else "inside" for x in work[x_col]]

    # Choose text colors: white for dark bars (inside), black otherwise
    text_colors = []
    for c, pos in zip(colors, text_positions):
        if pos == "outside":
            text_colors.append("#000000")
        else:
            text_colors.append("#FFFFFF" if _is_dark_color(c) else "#000000")

    x_title = x_label or x_col

    fig = go.Figure(
        go.Bar(
            x=work[x_col],
            y=work["_y_cat"],
            orientation="h",
            marker_color=colors,
            text=[f"{v:,.0f}" for v in work[x_col]],
            textposition=text_positions,
            insidetextanchor="end",
            textfont=dict(
                color=text_colors,
            ),
            customdata=work[[party_col, y_col, "FULLNAME"]].astype(str).values,
            hovertemplate=(
                "Name: %{customdata[2]} (%{customdata[1]})<br>"
                f"{x_title}: %{{x:,.0f}}<br>"
                f"{party_col}: %{{customdata[0]}}<extra></extra>"
            ),
        )
    )

    # ---- Titel + globaler Stand-Text zusammenbauen ----
    try:
        global STAND_TEXT
        stand_text = STAND_TEXT
    except NameError:
        stand_text = None

    if title and stand_text:
        title_text = f"{title}<br><sub style='font-size:0.85em; line-height:0.5;'>{stand_text}</sub>"
        top_margin = 100
    elif title:
        title_text = title
        top_margin = 50
    elif stand_text:
        title_text = stand_text
        top_margin = 60
    else:
        title_text = None
        top_margin = 40

    fig.update_layout(
        title=dict(
            text=title_text,
            x=0.5,
            xanchor="center",
            yanchor="top",
            yref="container",
            font=dict(size=20),
        ),
        xaxis_title=x_title,
        yaxis_title="",
        yaxis=dict(categoryorder="array", categoryarray=categories),
        bargap=0.25,
        margin=dict(l=10, r=40, t=top_margin, b=10),
        height=max(300, 35 * len(work)),
        uniformtext_minsize=8,
        uniformtext_mode="show",
    )

    fig.update_traces(cliponaxis=False, texttemplate="%{text}")

    # ---- Optional: Save the figure ----
    if save_name:
        try:
            global GRAPHICS_DIR
        except NameError:
            raise RuntimeError("GRAPHICS_DIR not defined globally. Initialize it before calling the function.")

        save_path = Path(GRAPHICS_DIR) / f"{save_name}.png"
        fig.write_image(save_path, width=900, height=600, scale=2)
        print(f"✅ Plot saved to: {save_path}")

    return fig


In [19]:
# Preview the first rows of df_tweets after the name merge.
df_tweets.head()

Unnamed: 0,partei_kurz,username,n_tweets,likes_mean,replies_mean,retweets_mean,quotes_mean,bookmarks_mean,impressions_mean,engagement_mean,...,impressions_sum,engagement_sum,like_to_reply_total_ratio,retweet_to_like_total_ratio,engagement_rate_total,followers_latest,verified_share,protected_share,USERNAME,FULLNAME
0,AFD,edgar_naujok,515,0.040777,0.0,693.547573,0.0,0.0,0.864078,693.58835,...,445,357198,,17008.428571,802.692135,928,0.0,0.0,edgar_naujok,Edgar Naujok
1,AFD,KrahMax,431,659.266821,47.283063,164.389791,2.825986,12.821346,14027.770302,886.587007,...,6045969,382119,13.942981,0.249352,0.063202,93088,1.0,0.0,KrahMax,Dr. Maximilian Krah
2,AFD,GtzFrmming,296,196.148649,10.878378,473.614865,0.817568,3.621622,3837.837838,685.081081,...,1136000,202784,18.031056,2.414571,0.178507,37743,0.0,0.0,GtzFrmming,Dr. Götz Frömming
3,AFD,BrandnerSt,251,413.486056,32.334661,188.844622,1.63745,5.462151,5610.553785,641.76494,...,1408249,161083,12.787703,0.456713,0.114385,30702,0.0,0.0,BrandnerSt,Stephan Brandner
4,AFD,Dr_Rainer_Kraft,248,49.044355,3.346774,1.173387,0.076613,0.520161,820.701613,54.16129,...,203534,13432,14.654217,0.023925,0.065994,5997,0.0,0.0,Dr_Rainer_Kraft,Dr. Rainer Kraft


In [20]:
# List the available columns; these are the candidates for x_col in plot_party_hbar.
df_tweets.columns

Index(['partei_kurz', 'username', 'n_tweets', 'likes_mean', 'replies_mean',
       'retweets_mean', 'quotes_mean', 'bookmarks_mean', 'impressions_mean',
       'engagement_mean', 'engagement_rate_mean', 'like_to_reply_mean',
       'retweet_to_like_mean', 'likes_per_1k_followers_mean',
       'engagement_per_1k_followers_mean', 'likes_sum', 'replies_sum',
       'retweets_sum', 'quotes_sum', 'bookmarks_sum', 'impressions_sum',
       'engagement_sum', 'like_to_reply_total_ratio',
       'retweet_to_like_total_ratio', 'engagement_rate_total',
       'followers_latest', 'verified_share', 'protected_share', 'USERNAME',
       'FULLNAME'],
      dtype='object')

In [23]:
# Top 20 accounts ranked by their total number of likes; also saved as PNG.
plot_party_hbar(
    df_profiles=df_tweets,
    y_col='username',
    x_col='likes_sum',
    top_n=20,
    title='MdBs mit den meisten Likes',
    x_label='Summe Likes',
    save_name="top20_sum_likes",
)

✅ Plot saved to: C:\Users\felix\Documents\xminer\outputs\202511\graphics\top20_sum_likes.png


In [24]:
# Top 20 accounts ranked by mean likes per post; also saved as PNG.
# The axis label is rendered into the saved figure, so its spelling matters
# ("Durchschnitt", previously misspelled as "Durschnitt").
plot_party_hbar(
    df_tweets,
    'username',
    'likes_mean',
    top_n=20,
    title='MdBs mit den meisten Likes im Durchschnitt pro Post',
    x_label='Durchschnitt Likes',
    save_name="top20_mean_likes",
)

✅ Plot saved to: C:\Users\felix\Documents\xminer\outputs\202511\graphics\top20_mean_likes.png


In [26]:
# Top 20 accounts ranked by their total number of replies received; also saved as PNG.
plot_party_hbar(
    df_profiles=df_tweets,
    y_col='username',
    x_col='replies_sum',
    top_n=20,
    title='MdBs mit den meisten Antworten auf Posts',
    x_label='Summe Antworten',
    save_name="top20_sum_replies",
)

✅ Plot saved to: C:\Users\felix\Documents\xminer\outputs\202511\graphics\top20_sum_replies.png


In [25]:
# Top 20 accounts ranked by mean replies per post; also saved as PNG.
# The axis label is rendered into the saved figure, so its spelling matters
# ("Durchschnitt", previously misspelled as "Durschnitt").
plot_party_hbar(
    df_tweets,
    'username',
    'replies_mean',
    top_n=20,
    title='MdBs mit den meisten Antworten auf Posts im Durchschnitt pro Post',
    x_label='Durchschnitt Antworten',
    save_name="top20_mean_replies",
)

✅ Plot saved to: C:\Users\felix\Documents\xminer\outputs\202511\graphics\top20_mean_replies.png
