In [17]:
import pandas as pd
from pathlib import Path
import yaml

# --- Load parameters.yml (the same file the pipelines use) ---
PARAMS_FILE = Path("C:/Users/felix/Documents/xminer/src/xminer/config/parameters.yml")
assert PARAMS_FILE.exists(), f"parameters.yml not found: {PARAMS_FILE}"

with PARAMS_FILE.open("r", encoding="utf-8") as f:
    # An empty YAML document loads as None; fall back to an empty mapping.
    params = yaml.safe_load(f) or {}

# Reporting period; the defaults apply when the keys are absent from the file.
YEAR = int(params.get("year", 2025))
MONTH = int(params.get("month", 10))
# Guard against a typo in the config silently creating a bogus output folder.
assert 1 <= MONTH <= 12, f"month must be in 1..12, got {MONTH}"
YM = f"{YEAR:04d}{MONTH:02d}"

# Subtitle picked up by the plotting helpers (they read the global STAND_TEXT).
STAND_TEXT = f"Erhoben für {MONTH:02d}/{YEAR}"

GRAPHICS_BASE_DIR = Path(
    params.get(
        "graphics_base_dir",
        r"C:/Users/felix/Documents/xminer/outputs",
    )
)

# Figures are written to <base>/<YYYYMM>/graphics; create it up front.
GRAPHICS_DIR = GRAPHICS_BASE_DIR / YM / "graphics"
GRAPHICS_DIR.mkdir(parents=True, exist_ok=True)


In [5]:
import pandas as pd
from pathlib import Path

# Derive the period strings from the configuration cell (YEAR / MONTH) so the
# data that is loaded always matches STAND_TEXT and GRAPHICS_DIR instead of
# relying on a second, hard-coded copy of the month and year.
month = f"{MONTH:02d}"
year = f"{YEAR:04d}"

base_path = Path(f'C:/Users/felix/Documents/xminer/data/politicians_{month}.csv')
assert base_path.exists(), f'File not found: {base_path}'

# The master-data export is semicolon-separated.
df_politicians = pd.read_csv(base_path, low_memory=False, sep=';')

print("Shape:", df_politicians.shape)
df_politicians.head(10)

Shape: (632, 32)


Unnamed: 0,KEY_COLUMN,ID,GEBURTSDATUM,GEBURTSORT,GEBURTSLAND,STERBEDATUM,GESCHLECHT,FAMILIENSTAND,RELIGION,BERUF,...,WP_WP,WP_MDBWP_VON,WP_MDBWP_BIS,WP_WKR_NUMMER,WP_WKR_NAME,WP_WKR_LAND,WP_LISTE,WP_MANDATSART,WP_INSTITUTIONEN,USERNAME
0,1,11000756,16.01.1948,Berlin,Deutschland,,mÃ¤nnlich,"geschieden, 3 Kinder",konfessionslos,Rechtsanwalt,...,21.0,25.03.2025,,83.0,Berlin-Treptow-KÃ¶penick,BE,BE,Direktwahl,,GregorGysi
1,2,11002718,18.02.1961,Aachen,Deutschland,,mÃ¤nnlich,"verheiratet, 3 Kinder",rÃ¶misch-katholisch,MinisterprÃ¤sident a. D.,...,21.0,25.03.2025,,86.0,Aachen I,NW,NW,Direktwahl,,ArminLaschet
2,3,11002720,19.01.1968,Dessau,Deutschland,,weiblich,"geschieden, 1 Kind",,Dipl.-Agraringenieurin,...,21.0,25.03.2025,,70.0,Anhalt â€“ Dessau â€“ Wittenberg,ST,ST,Landesliste,,
3,4,11002733,09.06.1961,Lorsch,Deutschland,,mÃ¤nnlich,"verheiratet, 2 Kinder",evangelisch,Dipl.-Mathematiker,...,21.0,25.03.2025,,187.0,BergstraÃŸe,HE,HE,Direktwahl,,meister_schafft
4,5,11002735,11.11.1955,Brilon,Deutschland,,mÃ¤nnlich,"verheiratet, 3 Kinder",rÃ¶misch-katholisch,Rechtsanwalt,...,21.0,25.03.2025,,146.0,Hochsauerlandkreis,NW,NW,Direktwahl,,_FriedrichMerz
5,6,11002735,11.11.1955,Brilon,Deutschland,,mÃ¤nnlich,"verheiratet, 3 Kinder",rÃ¶misch-katholisch,Rechtsanwalt,...,21.0,25.03.2025,,146.0,Hochsauerlandkreis,NW,NW,Direktwahl,,bundeskanzler
6,7,11002754,17.05.1962,DÃ¼ren,Deutschland,,mÃ¤nnlich,"verheiratet, 1 Kind",evangelisch,Politikwissenschaftler,...,21.0,25.03.2025,,89.0,DÃ¼ren,NW,NW,Direktwahl,,_ThomasRachel
7,8,11002765,02.07.1965,Meckenheim,Deutschland,,mÃ¤nnlich,"verheiratet, 3 Kinder",katholisch,Rechtsanwalt,...,21.0,25.03.2025,,97.0,Rhein-Sieg-Kreis II,NW,NW,Direktwahl,,n_roettgen
8,9,11003034,31.03.1958,Stralsund,Deutschland,,mÃ¤nnlich,"geschieden, 2 Kinder",,Wirtschaftswissenschaftler,...,21.0,25.03.2025,,14.0,Rostock â€“ Landkreis Rostock II,MV,MV,Landesliste,,DietmarBartsch
9,10,11003132,03.05.1966,Friedrichroda,Deutschland,,weiblich,"verpartnert, 2 Kinder",evangelisch,MdB,...,21.0,25.03.2025,,192.0,Erfurt â€“ Weimar â€“ Weimarer Land II,TH,TH,Landesliste,,GoeringEckardt


In [24]:
# Monthly per-account tweet statistics.
# NOTE(review): this rebinds `df_politicians`, which an earlier cell uses for the
# MdB master data; a later merge cell reads `USERNAME` from that name — verify
# the intended execution order.
base_path = Path(f'C:/Users/felix/Documents/xminer/outputs/{year}{month}/tweets/tweets_individual_month_{year}{month}.csv')
assert base_path.exists(), f'File not found: {base_path}'

df_politicians = pd.read_csv(base_path, low_memory=False)

# Convert whichever timestamp columns exist to timezone-aware UTC datetimes;
# values that cannot be parsed become NaT.
df_politicians = df_politicians.assign(
    **{
        ts_col: pd.to_datetime(df_politicians[ts_col], utc=True, errors='coerce')
        for ts_col in ('created_at', 'retrieved_at')
        if ts_col in df_politicians.columns
    }
)

print("Shape:", df_politicians.shape)

# CDU and CSU are reported together as one parliamentary group.
UNION_MAP = {"CDU": "CDU/CSU", "CSU": "CDU/CSU"}

def normalize_party(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a normalised ``partei_kurz`` column.

    Party labels are stripped, upper-cased and CDU / CSU are merged into
    "CDU/CSU". Missing labels stay missing instead of becoming the literal
    string "NAN", so they do not form a bogus party in later group-bys.

    The input frame is not modified. If ``partei_kurz`` is absent, ``df`` is
    returned unchanged.
    """
    if "partei_kurz" not in df.columns:
        return df

    raw = df["partei_kurz"]
    cleaned = raw.astype(str).str.strip().str.upper().replace(UNION_MAP)
    # astype(str) stringifies missing values; mask them back to missing.
    return df.assign(partei_kurz=cleaned.where(raw.notna()))

# Normalise the party labels, then show how many accounts each party has.
df_politicians = normalize_party(df_politicians)
df_politicians["partei_kurz"].value_counts()

Shape: (269, 28)


partei_kurz
AFD                      98
CDU/CSU                  74
BÃœNDNIS 90/DIE GRÃœNEN    44
SPD                      29
DIE LINKE.               24
Name: count, dtype: int64

In [27]:
# Per-party totals of tweets and impressions ...
party_totals = df_politicians.groupby("partei_kurz")[["n_tweets", "impressions_sum"]].sum()

# ... plus each party's share of the overall total (fractions between 0 and 1).
df_summary = party_totals.assign(
    n_tweets_pct=party_totals["n_tweets"] / party_totals["n_tweets"].sum(),
    impressions_pct=party_totals["impressions_sum"] / party_totals["impressions_sum"].sum(),
).reset_index()


In [28]:
# Display the per-party summary table built in the previous cell.
df_summary

Unnamed: 0,partei_kurz,n_tweets,impressions_sum,n_tweets_pct,impressions_pct
0,AFD,5330,40307817,0.566479,0.407229
1,BÃœNDNIS 90/DIE GRÃœNEN,1251,19508291,0.132958,0.197092
2,CDU/CSU,1583,28277156,0.168243,0.285683
3,DIE LINKE.,935,5473059,0.099373,0.055294
4,SPD,310,5414432,0.032947,0.054702


In [6]:
# Monthly tweet statistics aggregated per party.
base_path = Path(f'C:/Users/felix/Documents/xminer/outputs/{year}{month}/tweets/tweets_party_month_{year}{month}.csv')
assert base_path.exists(), f'File not found: {base_path}'

df_party = pd.read_csv(base_path, low_memory=False)

# Convert whichever timestamp columns exist to timezone-aware UTC datetimes;
# values that cannot be parsed become NaT.
df_party = df_party.assign(
    **{
        ts_col: pd.to_datetime(df_party[ts_col], utc=True, errors='coerce')
        for ts_col in ('created_at', 'retrieved_at')
        if ts_col in df_party.columns
    }
)

print("Shape:", df_party.shape)
df_party.head(10)

Shape: (6, 17)


Unnamed: 0,partei_kurz,tweets,likes_sum,replies_sum,retweets_sum,quotes_sum,bookmarks_sum,impressions_sum,engagement_sum,engagement_rate_mean,like_to_reply_mean,retweet_to_like_mean,likes_per_1k_followers_mean,engagement_per_1k_followers_mean,verified_share,protected_share,engagement_rate_total
0,AfD,5330,2195551,120787,2428728,10611,43380,40307817,4799057,46.269505,18.75471,0.113577,14.542714,310.722139,0.091182,0.0,0.11906
1,CDU,1449,365784,201473,246021,12200,12680,28206089,838158,22.250144,5.687363,0.112384,12.241352,73.600328,0.184955,0.0,0.029715
2,BÃœNDNIS 90/DIE GRÃœNEN,1251,258859,107863,291761,5918,6991,19508291,671392,13.6771,12.696787,0.132916,8.937697,29.003693,0.47562,0.0,0.034416
3,DIE LINKE.,935,97723,42563,181575,2194,2687,5473059,326742,4.779412,11.266371,0.132406,7.07746,37.056581,0.091979,0.0,0.0597
4,SPD,310,75149,41939,29555,1661,2327,5414432,150631,5.646378,2.154049,0.078079,3.768982,21.828088,0.467742,0.0,0.02782
5,CSU,134,1633,346,1695,20,18,71067,3712,0.067687,2.694101,0.089467,1.550217,8.771599,0.0,0.0,0.052232


Shape: (269, 28)


Unnamed: 0,partei_kurz,username,n_tweets,likes_mean,replies_mean,retweets_mean,quotes_mean,bookmarks_mean,impressions_mean,engagement_mean,...,quotes_sum,bookmarks_sum,impressions_sum,engagement_sum,like_to_reply_total_ratio,retweet_to_like_total_ratio,engagement_rate_total,followers_latest,verified_share,protected_share
241,SPD,Karl_Lauterbach,56,1203.803571,656.607143,165.446429,25.910714,37.160714,81658.607143,2088.928571,...,1451,2081,4572882,116980,1.83337,0.137436,0.025581,1195860,1.0,0.0


In [8]:
import pandas as pd

# Keep only the columns needed for the share calculation (as an independent copy).
sub = df_party.loc[:, ["partei_kurz", "tweets", "engagement_sum"]].copy()

# Report CDU and CSU as one group.
sub["partei_kurz"] = sub["partei_kurz"].replace(dict.fromkeys(("CDU", "CSU"), "CDU/CSU"))

# One row per party with summed tweets and engagement.
result = sub.groupby("partei_kurz", as_index=False).sum()

# Overall totals, used as denominators for the shares.
total_tweets, total_eng = result["tweets"].sum(), result["engagement_sum"].sum()

# Shares are fractions between 0 and 1.
result = result.assign(
    tweets_pct=result["tweets"] / total_tweets,
    engagement_sum_pct=result["engagement_sum"] / total_eng,
)

result


Unnamed: 0,partei_kurz,tweets,engagement_sum,tweets_pct,engagement_sum_pct
0,AfD,5330,4799057,0.566479,0.706815
1,BÃœNDNIS 90/DIE GRÃœNEN,1251,671392,0.132958,0.098884
2,CDU/CSU,1583,841870,0.168243,0.123992
3,DIE LINKE.,935,326742,0.099373,0.048123
4,SPD,310,150631,0.032947,0.022185


In [32]:
def plot_party_stack_tweets_engagement(
    df_party,
    tweets_pct_col: str = "tweets_pct",
    engagement_pct_col: str = "engagement_sum_pct",
    party_col: str = "partei_kurz",
    title: str | None = None,
    save_name: str | None = None,   # filename ohne Pfad/Extension
    min_inside_pct: float = 0.08,   # Schwelle: ab wann Text "inside", sonst "outside"
):
    import pandas as pd
    import plotly.graph_objects as go
    from pathlib import Path

    PARTY_COLORS = {
        "CDU/CSU": "#000000",
        "CDU": "#000000",
        "CSU": "#000000",
        "SPD": "#E3000F",
        "GRÃœNE": "#1AA64A",
        "GRUENE": "#1AA64A",
        "B90/GRUENE": "#1AA64A",
        "DIE LINKE": "#BE3075",
        "LINKE": "#BE3075",
        "FDP": "#FFED00",
        "AFD": "#009EE0",
        "BSW": "#009688",
        "FW": "#F28F00",
        "SSW": "#00A3E0",
        "PIRATEN": "#FF8800",
        "PARTEI": "#9E9E9E",
        "Ã–DP": "#FF6A00",
        "OEDP": "#FF6A00",
    }

    def _normalize_party_value(p: str) -> str:
        if p is None:
            return ""
        key = str(p).strip().upper()
        if key in {"CDU", "CSU"}:
            return "CDU/CSU"
        if key.startswith("GRÃœN") or key.startswith("GRUEN") or "GRUENE" in key or "GRÃœNE" in key or "B90" in key:
            return "GRÃœNE"
        if key in {"LINKE", "DIE LINKE", "DIE LINKE."}:
            return "DIE LINKE"
        if key in {"Ã–DP", "OEDP"}:
            return "Ã–DP"
        if key in {"AFD", "ALTERNATIVE FÃœR DEUTSCHLAND", "ALTERNATIVE FUER DEUTSCHLAND"}:
            return "AFD"
        return key

    def _is_dark_color(hex_color: str) -> bool:
        """Bestimmen, ob eine Farbe 'dunkel' ist (fÃ¼r Textfarbe innen)."""
        hex_color = hex_color.lstrip("#")
        r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
        brightness = (r * 299 + g * 587 + b * 114) / 1000
        return brightness < 140

    # --- basic checks ---
    for col in (party_col, tweets_pct_col, engagement_pct_col):
        if col not in df_party.columns:
            raise ValueError(f"Missing required column: {col}")

    work = df_party.copy()
    work[party_col] = work[party_col].astype(str).str.strip()

    # Zwei Balken: Anteil Tweets / Anteil Engagement
    x_vals = ["Anteil Tweets", "Anteil Impressions"]

    fig = go.Figure()

    # Ein Trace pro Partei (stacked)
    for _, row in work.iterrows():
        p = row[party_col]
        key = _normalize_party_value(p)
        color = PARTY_COLORS.get(key, "#888888")

        # y-Werte (Anteile 0â€“1)
        y_vals = [row[tweets_pct_col], row[engagement_pct_col]]

        # Text als Prozentangabe
        text_vals = [f"{v * 100:.1f} %" if v is not None else "" for v in y_vals]

        # Position & Textfarbe abhÃ¤ngig von Anteil
        text_positions = []
        text_colors = []
        for v in y_vals:
            if v is None:
                text_positions.append("outside")
                text_colors.append("#000000")
                continue

            if v >= min_inside_pct:
                # groÃŸ genug â†’ inside
                text_positions.append("inside")
                text_colors.append("#FFFFFF" if _is_dark_color(color) else "#000000")
            else:
                # kleine Segmente â†’ outside
                text_positions.append("outside")
                text_colors.append("#000000")

        fig.add_bar(
            name=key,
            x=x_vals,
            y=y_vals,
            marker_color=color,
            text=text_vals,
            textposition=text_positions,
            textfont=dict(color=text_colors, size=11),
            hovertemplate=(
                f"Partei: {key}<br>"
                "Kategorie: %{x}<br>"
                "Anteil: %{y:.1%}<extra></extra>"
            ),
        )

    # ---- Titel + globaler Stand-Text wie in plot_party_hbar ----
    try:
        global STAND_TEXT
        stand_text = STAND_TEXT
    except NameError:
        stand_text = None

    if title and stand_text:
        title_text = f"{title}<br><sub style='font-size:0.85em; line-height:0.5;'>{stand_text}</sub>"
        top_margin = 100
    elif title:
        title_text = title
        top_margin = 50
    elif stand_text:
        title_text = stand_text
        top_margin = 60
    else:
        title_text = None
        top_margin = 40

    fig.update_layout(
        title=dict(
            text=title_text,
            x=0.5,
            xanchor="center",
            yanchor="top",
            yref="container",
            font=dict(size=20),
        ),
        barmode="stack",
        xaxis_title="",
        yaxis_title="Anteil",
        yaxis=dict(tickformat=".0%"),
        margin=dict(l=40, r=40, t=top_margin, b=40),
        legend_title_text="Partei",
        uniformtext_minsize=8,
        uniformtext_mode="show",
    )

    # ---- Optional: speichern ----
    if save_name:
        try:
            global GRAPHICS_DIR
        except NameError:
            raise RuntimeError("GRAPHICS_DIR not defined globally. Initialize it before calling the function.")

        save_path = Path(GRAPHICS_DIR) / f"{save_name}.png"
        fig.write_image(save_path, width=900, height=600, scale=2)
        print(f"âœ… Plot saved to: {save_path}")

    return fig


In [33]:
# Stacked comparison of each party's share of tweets vs. impressions,
# based on the aggregated df_summary frame.
fig = plot_party_stack_tweets_engagement(
    df_party=df_summary,
    title="Anteile nach Partei: Tweets vs. Impressions",
    tweets_pct_col="n_tweets_pct",
    engagement_pct_col="impressions_pct",
    save_name="party_share_tweets_impressions",
)
fig.show()


âœ… Plot saved to: C:\Users\felix\Documents\xminer\outputs\202510\graphics\party_share_tweets_impressions.png


In [10]:
import plotly.express as px
import pandas as pd
import numpy as np

# --- Table of key figures and percentiles for the follower delta ---
col = "delta_followers_count"
s = df_profiles[col].dropna()

# (row label, quantile) pairs in display order.
quantile_rows = [
    ("Perzentil 1% (p01)", 0.01),
    ("Perzentil 10% (p10)", 0.10),
    ("Perzentil 25% (p25)", 0.25),
    ("Median (50%)", 0.50),
    ("Perzentil 75% (p75)", 0.75),
    ("Perzentil 90% (p90)", 0.90),
    ("Perzentil 99% (p99)", 0.99),
]
percentiles = {label: s.quantile(q) for label, q in quantile_rows}
percentiles["Durchschnitt (Mittelwert)"] = s.mean()

# Two-column frame: statistic name and its value rounded to two decimals.
stats_series = pd.Series(percentiles).rename_axis("Statistik")
stats_df = stats_series.reset_index(name="Delta Follower-Anzahl").round(2)

print(stats_df)


                   Statistik  Delta Follower-Anzahl
0         Perzentil 1% (p01)                -247.80
1        Perzentil 10% (p10)                 -18.00
2        Perzentil 25% (p25)                  -4.00
3               Median (50%)                   3.00
4        Perzentil 75% (p75)                  25.00
5        Perzentil 90% (p90)                 159.00
6        Perzentil 99% (p99)                2148.40
7  Durchschnitt (Mittelwert)                 105.64


In [11]:
# Render the statistics table with the notebook's rich display.
display(stats_df)

Unnamed: 0,Statistik,Delta Follower-Anzahl
0,Perzentil 1% (p01),-247.8
1,Perzentil 10% (p10),-18.0
2,Perzentil 25% (p25),-4.0
3,Median (50%),3.0
4,Perzentil 75% (p75),25.0
5,Perzentil 90% (p90),159.0
6,Perzentil 99% (p99),2148.4
7,Durchschnitt (Mittelwert),105.64


In [12]:
# Attach title / first name / last name from the MdB master data to each
# profile, matching on the case-insensitive, whitespace-stripped username.
name_cols = ['AKAD_TITEL', 'VORNAME', 'NACHNAME']

# Fail with a clear message if `df_politicians` no longer holds the master
# data (another cell in this notebook rebinds that name to tweet statistics).
missing_cols = {'USERNAME', *name_cols} - set(df_politicians.columns)
assert not missing_cols, (
    f"df_politicians lacks {sorted(missing_cols)}; reload the MdB master data before merging"
)

mdb_names = (
    df_politicians
        # Rows without a username cannot match any profile; without this
        # filter they would all share the bogus key "nan".
        .dropna(subset=['USERNAME'])
        .assign(__key=lambda d: d['USERNAME'].astype(str).str.strip().str.lower())
        # One row per key so the left merge can never multiply profile rows.
        .drop_duplicates(subset='__key')[name_cols + ['__key']]
)

df_profiles = (
    df_profiles
        # Drop name columns left over from a previous run so re-executing the
        # cell does not create *_x / *_y duplicates.
        .drop(columns=name_cols, errors='ignore')
        .assign(__key=lambda d: d['username'].astype(str).str.strip().str.lower())
        .merge(mdb_names, on='__key', how='left', validate='m:1')
        .drop(columns='__key')
)


In [13]:
# Build "<title> <first given name> <last name>" with single spaces.
title = df_profiles['AKAD_TITEL'].astype('string').fillna('').str.strip()
# Only the first given name is used. For an empty VORNAME the split yields an
# empty list and `.str[0]` returns a missing value; without the trailing
# fillna that missing value would turn the whole concatenation into NA.
first = df_profiles['VORNAME'].astype('string').fillna('').str.strip().str.split().str[0].fillna('')
last  = df_profiles['NACHNAME'].astype('string').fillna('').str.strip()

# split/join collapses the extra spaces left by empty parts.
df_profiles['FULLNAME'] = (title + ' ' + first + ' ' + last).str.split().str.join(' ')


In [14]:
# Print column names, non-null counts and dtypes of df_profiles.
df_profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   username               441 non-null    object 
 1   name_curr              441 non-null    object 
 2   partei_kurz            441 non-null    object 
 3   followers_count_prev   441 non-null    int64  
 4   followers_count_curr   441 non-null    int64  
 5   following_count_prev   441 non-null    int64  
 6   following_count_curr   441 non-null    int64  
 7   tweet_count_prev       441 non-null    int64  
 8   tweet_count_curr       441 non-null    int64  
 9   listed_count_prev      441 non-null    int64  
 10  listed_count_curr      441 non-null    int64  
 11  delta_followers_count  441 non-null    float64
 12  delta_following_count  441 non-null    float64
 13  delta_tweet_count      441 non-null    float64
 14  delta_listed_count     441 non-null    float64
 15  pct_fo

In [15]:
# Summary statistics of all numeric columns, one row per column,
# ordered by descending mean.
num = df_profiles.select_dtypes(include=['number'])
numeric_summary = num.describe().T
numeric_summary.sort_values('mean', ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
followers_count_curr,441.0,22313.854875,97232.544999,0.0,939.0,2895.0,9551.0,1203449.0
followers_count_prev,441.0,22208.217687,96766.897717,0.0,919.0,2880.0,9443.0,1195174.0
tweet_count_curr,441.0,4486.988662,8483.188547,0.0,275.0,1510.0,5159.0,81147.0
tweet_count_prev,441.0,4471.226757,8469.08786,0.0,259.0,1498.0,5117.0,81212.0
following_count_curr,441.0,835.131519,981.675655,0.0,178.0,556.0,1095.0,9032.0
following_count_prev,441.0,834.519274,982.188769,0.0,177.0,555.0,1091.0,9010.0
listed_count_prev,441.0,172.62585,333.840372,0.0,19.0,81.0,170.0,3149.0
listed_count_curr,441.0,171.258503,335.151372,0.0,16.0,77.0,171.0,3148.0
delta_followers_count,441.0,105.637188,820.255379,-2127.0,-4.0,3.0,25.0,15238.0
snapshot_span_days,441.0,29.795918,2.468728,0.0,30.0,30.0,30.0,30.0


In [8]:
# Number of profiles per party (missing values counted too); `.get` returns
# None when the column does not exist, in which case a hint is shown instead.
vc = df_profiles.get('partei_kurz')
'partei_kurz missing' if vc is None else vc.value_counts(dropna=False).to_frame('count')

Unnamed: 0_level_0,count
partei_kurz,Unnamed: 1_level_1
CDU/CSU,132
AFD,123
SPD,81
BÃœNDNIS 90/DIE GRÃœNEN,65
DIE LINKE.,39
SSW,1


In [9]:
# Columns to show, restricted to those that actually exist in df_profiles.
_wanted = ['username', 'name', 'partei_kurz', 'followers_count', 'following_count',
           'tweet_count', 'follow_ratio', 'followers_per_tweet']
cols2 = [c for c in _wanted if c in df_profiles.columns]


def _top20_by(metric):
    """Return the 20 rows with the largest `metric` (NaN last), or a hint string if the column is absent."""
    if metric not in df_profiles.columns:
        return f'{metric} missing'
    ranked = df_profiles.sort_values(metric, ascending=False, na_position='last')
    return ranked[cols2].head(20)


top_ratio = _top20_by('follow_ratio')
top_fpt = _top20_by('followers_per_tweet')

top_ratio


Unnamed: 0,username,name,partei_kurz,followers_count,following_count,tweet_count,follow_ratio,followers_per_tweet
364,EskenSaskia,Saskia Esken,SPD,85252,1,90,85252.0,947.244444
320,GregorGysi,Gregor Gysi,DIE LINKE.,497693,123,3597,4046.284553,138.363358
359,Karl_Lauterbach,Prof. Karl Lauterbach,SPD,1195860,323,12468,3702.352941,95.914341
0,Alice_Weidel,Alice Weidel,AFD,1203449,616,5378,1953.650974,223.772592
188,_FriedrichMerz,Friedrich Merz,CDU/CSU,544834,455,3593,1197.437363,151.637629
71,JoernKoenigAfD,"JÃ¶rn KÃ¶nig, MdB",AFD,1182,1,434,1182.0,2.723502
193,bundeskanzler,Bundeskanzler Friedrich Merz,CDU/CSU,128796,109,712,1181.614679,180.893258
2,Tino_Chrupalla,Tino Chrupalla,AFD,209554,373,1652,561.806971,126.848668
199,AussenMinDE,Johann Wadephul,CDU/CSU,27742,50,226,554.84,122.752212
11,BrandnerSt,ðŸ‡©ðŸ‡ªStephan BrandnerðŸ‡©ðŸ‡ª,AFD,29562,57,4767,518.631579,6.201385


In [30]:
def plot_party_hbar(
    df_profiles,
    y_col: str,  # e.g. "username"
    x_col: str,
    top_n: int = 10,
    party_col: str = "partei_kurz",
    title: str | None = None,
    x_label: str | None = None,
    save_name: str | None = None,   # << NEW parameter: filename without path
):
    import pandas as pd
    import plotly.graph_objects as go
    from pathlib import Path

    PARTY_COLORS = {
        "CDU/CSU": "#000000",
        "CDU": "#000000",
        "CSU": "#000000",
        "SPD": "#E3000F",
        "GRÃœNE": "#1AA64A",
        "GRUENE": "#1AA64A",
        "B90/GRUENE": "#1AA64A",
        "DIE LINKE": "#BE3075",
        "LINKE": "#BE3075",
        "FDP": "#FFED00",
        "AFD": "#009EE0",
        "BSW": "#009688",
        "FW": "#F28F00",
        "SSW": "#00A3E0",
        "PIRATEN": "#FF8800",
        "PARTEI": "#9E9E9E",
        "Ã–DP": "#FF6A00",
        "OEDP": "#FF6A00",
    }

    def _normalize_party_value(p: str) -> str:
        if p is None:
            return ""
        key = str(p).strip().upper()
        if key in {"CDU", "CSU"}:
            return "CDU/CSU"
        if key.startswith("GRÃœN") or key.startswith("GRUEN") or "GRUENE" in key or "GRÃœNE" in key or "B90" in key:
            return "GRÃœNE"
        if key in {"LINKE", "DIE LINKE", "DIE LINKE."}:
            return "DIE LINKE"
        if key in {"Ã–DP", "OEDP"}:
            return "Ã–DP"
        if key in {"AFD", "ALTERNATIVE FÃœR DEUTSCHLAND", "ALTERNATIVE FUER DEUTSCHLAND"}:
            return "AFD"
        return key

    def _resolve_party_colors(series: pd.Series) -> list[str]:
        parties = series.astype("string").fillna("")
        return [PARTY_COLORS.get(_normalize_party_value(p), "#888888") for p in parties]

    def _is_dark_color(hex_color: str) -> bool:
        """Determine if a color is dark based on brightness."""
        hex_color = hex_color.lstrip("#")
        r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
        brightness = (r * 299 + g * 587 + b * 114) / 1000
        return brightness < 140

    for col in (y_col, x_col):
        if col not in df_profiles.columns:
            raise ValueError(f"Missing required column: {col}")
    if "FULLNAME" not in df_profiles.columns:
        raise ValueError("Missing required column: FULLNAME")

    work = df_profiles.copy()
    if party_col not in work.columns:
        work[party_col] = None

    # --- combined label for y-axis ---
    work["_label"] = work["FULLNAME"].astype(str).str.strip() + " (" + work[y_col].astype(str).str.strip() + ")"

    work = work.sort_values(x_col, ascending=False).head(top_n).copy()

    categories = work["_label"].tolist()[::-1]
    work["_y_cat"] = pd.Categorical(work["_label"], categories=categories, ordered=True)

    colors = _resolve_party_colors(work[party_col])

    # Decide whether each bar is short or long
    max_x = work[x_col].max()
    threshold = 0.15 * max_x
    text_positions = ["outside" if x < threshold else "inside" for x in work[x_col]]

    # Choose text colors: white for dark bars (inside), black otherwise
    text_colors = []
    for c, pos in zip(colors, text_positions):
        if pos == "outside":
            text_colors.append("#000000")
        else:
            text_colors.append("#FFFFFF" if _is_dark_color(c) else "#000000")

    x_title = x_label or x_col

    fig = go.Figure(
        go.Bar(
            x=work[x_col],
            y=work["_y_cat"],
            orientation="h",
            marker_color=colors,
            text=[f"{v:,.0f}" for v in work[x_col]],
            textposition=text_positions,
            insidetextanchor="end",
            textfont=dict(
                color=text_colors,
            ),
            customdata=work[[party_col, y_col, "FULLNAME"]].astype(str).values,
            hovertemplate=(
                "Name: %{customdata[2]} (%{customdata[1]})<br>"
                f"{x_title}: %{{x:,.0f}}<br>"
                f"{party_col}: %{{customdata[0]}}<extra></extra>"
            ),
        )
    )

    # ---- Titel + globaler Stand-Text zusammenbauen ----
    try:
        global STAND_TEXT
        stand_text = STAND_TEXT
    except NameError:
        stand_text = None

    if title and stand_text:
        title_text = f"{title}<br><sub style='font-size:0.85em; line-height:0.5;'>{stand_text}</sub>"
        top_margin = 100
    elif title:
        title_text = title
        top_margin = 50
    elif stand_text:
        title_text = stand_text
        top_margin = 60
    else:
        title_text = None
        top_margin = 40

    fig.update_layout(
        title=dict(
            text=title_text,
            x=0.5,
            xanchor="center",
            yanchor="top",
            yref="container",
            font=dict(size=20),
        ),
        xaxis_title=x_title,
        yaxis_title="",
        yaxis=dict(categoryorder="array", categoryarray=categories),
        bargap=0.25,
        margin=dict(l=10, r=40, t=top_margin, b=10),
        height=max(300, 35 * len(work)),
        uniformtext_minsize=8,
        uniformtext_mode="show",
    )

    fig.update_traces(cliponaxis=False, texttemplate="%{text}")

    # ---- Optional: Save the figure ----
    if save_name:
        try:
            global GRAPHICS_DIR
        except NameError:
            raise RuntimeError("GRAPHICS_DIR not defined globally. Initialize it before calling the function.")

        save_path = Path(GRAPHICS_DIR) / f"{save_name}.png"
        fig.write_image(save_path, width=900, height=600, scale=2)
        print(f"âœ… Plot saved to: {save_path}")

    return fig


In [17]:
# List the columns currently present in df_profiles.
df_profiles.columns

Index(['username', 'name_curr', 'partei_kurz', 'followers_count_prev',
       'followers_count_curr', 'following_count_prev', 'following_count_curr',
       'tweet_count_prev', 'tweet_count_curr', 'listed_count_prev',
       'listed_count_curr', 'delta_followers_count', 'delta_following_count',
       'delta_tweet_count', 'delta_listed_count', 'pct_followers_count',
       'pct_following_count', 'pct_tweet_count', 'pct_listed_count',
       'retrieved_at_prev', 'retrieved_at_curr', 'snapshot_span_days',
       'AKAD_TITEL', 'VORNAME', 'NACHNAME', 'FULLNAME'],
      dtype='object')

In [31]:
# Top 20 accounts by absolute follower gain.
plot_party_hbar(
    df_profiles=df_profiles,
    y_col='username',
    x_col='delta_followers_count',
    top_n=20,
    title='MdBs mit den meisten neuen Followern auf X',
    x_label='Anzahl der neuer Follower',
    save_name="top20_new_followers",
)

âœ… Plot saved to: C:\Users\felix\Documents\xminer\outputs\202510\graphics\top20_new_followers.png


In [27]:
def plot_party_hbar(
    df_profiles,
    y_col: str,  # e.g. "username"
    x_col: str,
    top_n: int = 10,
    party_col: str = "partei_kurz",
    title: str | None = None,
    x_label: str | None = None,
    save_name: str | None = None,   # << NEW parameter: filename without path
):
    import pandas as pd
    import plotly.graph_objects as go
    from pathlib import Path

    PARTY_COLORS = {
        "CDU/CSU": "#000000",
        "CDU": "#000000",
        "CSU": "#000000",
        "SPD": "#E3000F",
        "GRÃœNE": "#1AA64A",
        "GRUENE": "#1AA64A",
        "B90/GRUENE": "#1AA64A",
        "DIE LINKE": "#BE3075",
        "LINKE": "#BE3075",
        "FDP": "#FFED00",
        "AFD": "#009EE0",
        "BSW": "#009688",
        "FW": "#F28F00",
        "SSW": "#00A3E0",
        "PIRATEN": "#FF8800",
        "PARTEI": "#9E9E9E",
        "Ã–DP": "#FF6A00",
        "OEDP": "#FF6A00",
    }

    def _normalize_party_value(p: str) -> str:
        if p is None:
            return ""
        key = str(p).strip().upper()
        if key in {"CDU", "CSU"}:
            return "CDU/CSU"
        if key.startswith("GRÃœN") or key.startswith("GRUEN") or "GRUENE" in key or "GRÃœNE" in key or "B90" in key:
            return "GRÃœNE"
        if key in {"LINKE", "DIE LINKE", "DIE LINKE."}:
            return "DIE LINKE"
        if key in {"Ã–DP", "OEDP"}:
            return "Ã–DP"
        if key in {"AFD", "ALTERNATIVE FÃœR DEUTSCHLAND", "ALTERNATIVE FUER DEUTSCHLAND"}:
            return "AFD"
        return key

    def _resolve_party_colors(series: pd.Series) -> list[str]:
        parties = series.astype("string").fillna("")
        return [PARTY_COLORS.get(_normalize_party_value(p), "#888888") for p in parties]

    def _is_dark_color(hex_color: str) -> bool:
        """Determine if a color is dark based on brightness."""
        hex_color = hex_color.lstrip("#")
        r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
        brightness = (r * 299 + g * 587 + b * 114) / 1000
        return brightness < 140

    for col in (y_col, x_col):
        if col not in df_profiles.columns:
            raise ValueError(f"Missing required column: {col}")
    if "FULLNAME" not in df_profiles.columns:
        raise ValueError("Missing required column: FULLNAME")

    work = df_profiles.copy()
    if party_col not in work.columns:
        work[party_col] = None

    # --- combined label for y-axis ---
    work["_label"] = work["FULLNAME"].astype(str).str.strip() + " (" + work[y_col].astype(str).str.strip() + ")"

    work = work.sort_values(x_col, ascending=False).head(top_n).copy()

    categories = work["_label"].tolist()[::-1]
    work["_y_cat"] = pd.Categorical(work["_label"], categories=categories, ordered=True)

    colors = _resolve_party_colors(work[party_col])

    # Decide whether each bar is short or long
    max_x = work[x_col].max()
    threshold = 0.15 * max_x
    text_positions = ["outside" if x < threshold else "inside" for x in work[x_col]]

    # Choose text colors: white for dark bars (inside), black otherwise
    text_colors = []
    for c, pos in zip(colors, text_positions):
        if pos == "outside":
            text_colors.append("#000000")
        else:
            text_colors.append("#FFFFFF" if _is_dark_color(c) else "#000000")

    x_title = x_label or x_col

    fig = go.Figure(
        go.Bar(
            x=work[x_col],
            y=work["_y_cat"],
            orientation="h",
            marker_color=colors,
            text=[f"{v*100:.0f} %" for v in work[x_col]],
            textposition=text_positions,
            insidetextanchor="end",
            textfont=dict(
                color=text_colors,
            ),
            customdata=work[[party_col, y_col, "FULLNAME"]].astype(str).values,
            hovertemplate=(
                "Name: %{customdata[2]} (%{customdata[1]})<br>"
                f"{x_title}: %{{x:,.0f}}<br>"
                f"{party_col}: %{{customdata[0]}}<extra></extra>"
            ),
        )
    )

    # ---- Titel + globaler Stand-Text zusammenbauen ----
    try:
        global STAND_TEXT
        stand_text = STAND_TEXT
    except NameError:
        stand_text = None

    if title and stand_text:
        title_text = f"{title}<br><sub style='font-size:0.85em; line-height:0.5;'>{stand_text}</sub>"
        top_margin = 100
    elif title:
        title_text = title
        top_margin = 50
    elif stand_text:
        title_text = stand_text
        top_margin = 60
    else:
        title_text = None
        top_margin = 40

    fig.update_layout(
        title=dict(
            text=title_text,
            x=0.5,
            xanchor="center",
            yanchor="top",
            yref="container",
            font=dict(size=20),

        ),
        xaxis_title=x_title,
        yaxis_title="",
        yaxis=dict(categoryorder="array", categoryarray=categories),
        bargap=0.25,
        margin=dict(l=10, r=40, t=top_margin, b=10),
        height=max(300, 35 * len(work)),
        uniformtext_minsize=8,
        uniformtext_mode="show",
    )
    fig.update_xaxes(tickformat=".0%")

    fig.update_traces(cliponaxis=False, texttemplate="%{text}")

    # ---- Optional: Save the figure ----
    if save_name:
        try:
            global GRAPHICS_DIR
        except NameError:
            raise RuntimeError("GRAPHICS_DIR not defined globally. Initialize it before calling the function.")

        save_path = Path(GRAPHICS_DIR) / f"{save_name}.png"
        fig.write_image(save_path, width=900, height=600, scale=2)
        print(f"âœ… Plot saved to: {save_path}")

    return fig


In [29]:
# Top 20 accounts by relative follower gain.
plot_party_hbar(
    df_profiles=df_profiles,
    y_col='username',
    x_col='pct_followers_count',
    top_n=20,
    title='MdBs mit den prozentual meisten neuen Followern auf X',
    x_label='Neue Follower in Prozent [%]',
    save_name="top20_new_followers_pct",
)

âœ… Plot saved to: C:\Users\felix\Documents\xminer\outputs\202510\graphics\top20_new_followers_pct.png


In [23]:
# Top 20 accounts by number of new posts.
plot_party_hbar(
    df_profiles=df_profiles,
    y_col='username',
    x_col='delta_tweet_count',
    top_n=20,
    title='MdBs mit den meisten neuen Posts auf X',
    x_label='Anzahl neuer Posts',
    save_name="top20_new_posts",
)

âœ… Plot saved to: C:\Users\felix\Documents\xminer\outputs\202510\graphics\top20_new_posts.png


In [45]:
# Ten most-followed AfD accounts.
plot_party_hbar(
    df_profiles=df_profiles.loc[df_profiles["partei_kurz"].eq("AFD")],
    y_col='username',
    x_col='followers_count',
    top_n=10,
    title='MdBs der AfD mit den meisten Followern auf X',
    x_label='Anzahl der Follower',
    save_name="top10_followers_afd",
)

âœ… Plot saved to: C:\Users\felix\Documents\xminer\outputs\202510\graphics\top10_followers_afd.png


In [46]:
# Ten most-followed CDU/CSU accounts.
plot_party_hbar(
    df_profiles=df_profiles.loc[df_profiles["partei_kurz"].eq("CDU/CSU")],
    y_col='username',
    x_col='followers_count',
    top_n=10,
    title='MdBs der CDU/CSU mit den meisten Followern auf X',
    x_label='Anzahl der Follower',
    save_name="top10_followers_cdu",
)

âœ… Plot saved to: C:\Users\felix\Documents\xminer\outputs\202510\graphics\top10_followers_cdu.png


In [47]:
# Ten most-followed accounts of the Greens.
# NOTE(review): the party literal looks mis-encoded in this view; it is kept
# byte-for-byte because it must equal the values stored in `partei_kurz` —
# confirm the notebook's file encoding.
plot_party_hbar(
    df_profiles=df_profiles.loc[df_profiles["partei_kurz"].eq('BÃœNDNIS 90/DIE GRÃœNEN')],
    y_col='username',
    x_col='followers_count',
    top_n=10,
    title='MdBs von BÃœNDNIS 90/DIE GRÃœNEN mit den meisten Followern auf X',
    x_label='Anzahl der Follower',
    save_name="top10_followers_gruene",
)

âœ… Plot saved to: C:\Users\felix\Documents\xminer\outputs\202510\graphics\top10_followers_gruene.png


In [48]:
# Ten most-followed accounts of Die Linke; the label is reused in the title.
party = "DIE LINKE."
plot_party_hbar(
    df_profiles=df_profiles.loc[df_profiles["partei_kurz"].eq(party)],
    y_col='username',
    x_col='followers_count',
    top_n=10,
    title=f'MdBs von {party} mit den meisten Followern auf X',
    x_label='Anzahl der Follower',
    save_name="top10_followers_linke",
)

âœ… Plot saved to: C:\Users\felix\Documents\xminer\outputs\202510\graphics\top10_followers_linke.png


In [49]:
# Ten most-followed SPD accounts.
plot_party_hbar(
    df_profiles=df_profiles.loc[df_profiles["partei_kurz"].eq("SPD")],
    y_col='username',
    x_col='followers_count',
    top_n=10,
    title='MdBs der SPD mit den meisten Followern auf X',
    x_label='Anzahl der Follower',
    save_name="top10_followers_spd",
)

âœ… Plot saved to: C:\Users\felix\Documents\xminer\outputs\202510\graphics\top10_followers_spd.png
