In [310]:
import pandas as pd
import plotly.io as pio
import plotly.express as px

# Benutzerdefiniertes Template definieren
infoviz_template = dict(
    layout=dict(
        template="plotly_white",
        title=dict(
            font=dict(size=20, family="Arial", weight="bold", color="black"),
            y=0.91,  # Titel weiter nach oben
            x=0.05,  # Links ausgerichtet
            xanchor="left",  # Linksbündig
        ),
        xaxis=dict(
            showgrid=False,
            zerolinecolor="lightgrey",
            tickfont=dict(color="grey", size=12),
            title_font=dict(color="grey", weight="bold", size=13),
            title_standoff=15,
            ticklabelposition="outside bottom"
        ),
        yaxis=dict(
            showgrid=True, gridcolor="lightgrey",
            zerolinecolor="lightgrey",
            tickfont=dict(color="grey", size=12),
            title_font=dict(color="grey", weight="bold", size=13),
            title_standoff=15,
            ticklabelposition="outside left"
        ),
    )
)
pio.templates["infoviz"] = infoviz_template

In [311]:
years = [str(year) for year in range(2013, 2025)]

df = pd.read_excel("data.xlsx", sheet_name=years)

data_list = []

for year, df in df.items():
    # Füge das Jahr als neue Spalte hinzu
    df['Jahr'] = year
    
    # Füge den DataFrame der Liste hinzu
    data_list.append(df)

# Alle DataFrames in der Liste zu einem einzigen DataFrame zusammenführen
final_df = pd.concat(data_list, ignore_index=True)


In [312]:
final_df[final_df['Variable'] == 'Fachsemester'] 

Unnamed: 0,Variable,Category,Bachelor Agrarwissenschaften,Bachelor Ernährungswissenschaften,Bachelor Nachwachsende Rohstoffe und Bioressourcen,Bachelor Ökotrophologie,Bachelor Umwelt und globaler Wandel,Master Ernährungswissenschaften,Master Ernährungsökonomie*,Master Agrarökonomie und Betriebsmanagement,...,Master Nutzpflanzenwissenschaften,Master Oenologie/Weinwirtschaft,Master Transition Management,Master Sustainable Transition*,Master Agrar- und Ressourcenökonomie*,Master Getränketechnologie*,Master Informationstechnologie in den Agrar- und Umweltwissenschaften*,Master Nachhaltige Ernährungswirtschaft,Master Sustainable Transition,Master Nutztierwissenschaften*
22,Fachsemester,1 (%),36.0,36.0,,31.0,32.0,21.0,19.0,40.0,...,,,,,,,,,,
23,Fachsemester,2 (%),2.0,,,,,9.0,19.0,20.0,...,,,,,,,,,,
24,Fachsemester,3 (%),33.0,28.0,,30.0,35.0,34.0,26.0,20.0,...,,,,,,,,,,
25,Fachsemester,4 (%),,1.0,,1.0,,13.0,10.0,10.0,...,,,,,,,,,,
26,Fachsemester,5 (%),20.0,24.0,,25.0,26.0,16.0,16.0,10.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792,Fachsemester,12 (%),,,,,,,,,...,,,,,,,,,,
793,Fachsemester,13 (%),3,,,,,,,,...,,,,,,,,,,
794,Fachsemester,14 und mehr (%),,,,,,,,,...,,,,,,,,,,
795,Fachsemester,Gesamt (%),100,100.0,100.0,100.0,100.0,100.0,,,...,100.0,,100.0,,,,,100.0,100.0,


In [313]:
df = final_df[final_df.columns[~final_df.columns.str.contains(r'\*')]] # drop * in the data
df.columns = df.columns.str.replace('*', '', regex=False)# just to be sure
df = df.rename(columns={'Master Getränketechnologi':'Master Getränketechnologie'})# fix a typo
clean_columns = {}
for col in df.columns:
    clean_name = col.strip()  # get rid of other human typos
    if clean_name in clean_columns:
        clean_columns[clean_name].append(col)
    else:
        clean_columns[clean_name] = [col]

# smack together the similar col names 
merged_df = df[['Jahr']].copy()
for clean_name, original_cols in clean_columns.items():
    if len(original_cols) > 1:
        merged_df[clean_name] = df[original_cols].sum(axis=1)
    else:
        merged_df[clean_name] = df[original_cols[0]]
df = merged_df.fillna(value=0) # use if needed for plotting

In [314]:
df = df[df['Variable'].isin(['Fachsemester'])]
df = df[df['Variable'] == 'Fachsemester'].drop(columns='Variable')
df = df.groupby(['Jahr', 'Category']).sum().reset_index()
df = df[df['Category'] != 'Gesamt (%)']

In [315]:
df['Bachelor Ökotrophologie'] = df['Bachelor Ökotrophologie'].astype(float)
df['Bachelor Agrarwissenschaften'] = df['Bachelor Agrarwissenschaften'].replace(' ',0)
df['Bachelor Agrarwissenschaften'] = df['Bachelor Agrarwissenschaften'].astype(float)


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [316]:
df

Unnamed: 0,Jahr,Category,Bachelor Agrarwissenschaften,Bachelor Ernährungswissenschaften,Bachelor Nachwachsende Rohstoffe und Bioressourcen,Bachelor Ökotrophologie,Bachelor Umwelt und globaler Wandel,Master Ernährungswissenschaften,Master Agrarökonomie und Betriebsmanagement,Master Nutztierwissenschaften,...,Master Transition Management,Master Ökotrophologie,Master Agrobiotechnologie,Master Insect Biotechnology and Bioresources,Bachelor Umweltmanagement,Master Ernährungsökonomie,Master Nutzpflanzenwissenschaften,Master Oenologie/Weinwirtschaft,Master Nachhaltige Ernährungswirtschaft,Master Sustainable Transition
0,2013,1 (%),36.0,36.0,0.0,31.0,32.0,21.0,40.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2013,10 (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2013,11 (%),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2013,12 (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2013,13 (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,2024,6 (%),15.0,7.0,6.0,2.0,6.0,3.0,0.0,0.0,...,0.0,17.0,7.0,10.0,0.0,0.0,0.0,0.0,17.0,0.0
187,2024,7 (%),12.0,9.0,17.0,11.0,29.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0
188,2024,8 (%),3.0,1.0,6.0,4.0,6.0,0.0,0.0,0.0,...,0.0,0.0,7.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
189,2024,9 (%),6.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0


In [317]:
# Spalten für Bachelor und Master extrahieren
bachelor_columns = [col for col in df.columns if 'Bachelor' in col]
master_columns = [col for col in df.columns if 'Master' in col]

# Melde Bachelor und Master in separaten DataFrames an
df_bachelor = pd.melt(df, id_vars=['Jahr', 'Category'], value_vars=bachelor_columns,
                      var_name='Studiengang', value_name='Bachelorg Wert')
df_bachelor['Typ'] = 'Bachelor'

df_master = pd.melt(df, id_vars=['Jahr', 'Category'], value_vars=master_columns,
                    var_name='Studiengang', value_name='Master Wert')
df_master['Typ'] = 'Master'

# Kombinieren der DataFrames für Bachelor und Master
df_combined = pd.concat([df_bachelor, df_master])

# Sortieren nach Jahr und Category
df_combined = df_combined.sort_values(by=['Jahr', 'Category', 'Typ'])


In [318]:
df_combined = df_combined.groupby(['Jahr', 'Category']).sum().reset_index().drop(columns=['Studiengang', 'Typ'])
df = df_combined[df_combined['Category'] != 'Anzahl']
df

Unnamed: 0,Jahr,Category,Bachelorg Wert,Master Wert
0,2013,1 (%),135.0,380.0
1,2013,10 (%),0.0,0.0
2,2013,11 (%),1.0,0.0
3,2013,12 (%),0.0,0.0
4,2013,13 (%),0.0,0.0
...,...,...,...,...
174,2024,5 (%),118.0,180.0
175,2024,6 (%),36.0,59.0
176,2024,7 (%),78.0,17.0
177,2024,8 (%),20.0,17.0


In [319]:


# Umwandeln der Kategorie von '1 (%)', '2 (%)', etc. in numerische Werte (Entfernen von " (%)")
df['Category_num'] = df['Category'].str.extract('(\d+)').astype(int)

# Sortieren nach Jahr und numerischer Kategorie
df_sorted = df.sort_values(by=['Jahr', 'Category_num'])

# Entfernen der Hilfsspalte 'Category_num'
df_sorted = df_sorted.drop(columns=['Category_num'])





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [320]:
bachelor_df = df_sorted[["Jahr", "Category", "Bachelorg Wert"]].rename(columns={"Bachelorg Wert": "Wert"})
bachelor_df["Studiengang"] = "Bachelor"

master_df = df_sorted[["Jahr", "Category", "Master Wert"]].rename(columns={"Master Wert": "Wert"})
master_df["Studiengang"] = "Master"

# Beide DataFrames zusammenführen
result = pd.concat([bachelor_df, master_df])

# Neu anordnen der Spalten
result = result[["Jahr", "Category", "Studiengang", "Wert"]]

# Sortieren nach Jahr und Category, falls gewünscht
df_sorted = result.sort_values(by=["Jahr", "Category"]).reset_index(drop=True)


In [321]:
df_sorted

Unnamed: 0,Jahr,Category,Studiengang,Wert
0,2013,1 (%),Bachelor,135.0
1,2013,1 (%),Master,380.0
2,2013,10 (%),Bachelor,0.0
3,2013,10 (%),Master,0.0
4,2013,11 (%),Bachelor,1.0
...,...,...,...,...
331,2024,7 (%),Master,17.0
332,2024,8 (%),Bachelor,20.0
333,2024,8 (%),Master,17.0
334,2024,9 (%),Bachelor,10.0


In [322]:
# Die Kategorien in aufsteigender Reihenfolge festlegen
category_order = ["1 (%)", "2 (%)", "3 (%)", "4 (%)", "5 (%)", "6 (%)", "7 (%)", "8 (%)", "9 (%)", "10 (%)", "11 (%)", "12 (%)", "13 (%)", "14 und mehr (%)"]


In [324]:
import pandas as pd
import plotly.express as px


# Template ins Plotly laden
pio.templates["infoviz"] = infoviz_template

# DataFrame erstellen (diese Zeile geht davon aus, dass df_sorted existiert)
df = pd.DataFrame(df_sorted)

# Die Kategorien in absteigender Reihenfolge festlegen
category_order = ["14 und mehr (%)", "13 (%)", "12 (%)", "11 (%)", "10 (%)", "9 (%)", "8 (%)", "7 (%)", "6 (%)", "5 (%)", "4 (%)", "3 (%)", "2 (%)", "1 (%)"]

df['Category'] = df['Category'].str.replace(' (%)', '', regex=False)

# Bubble-Plot erstellen und die Reihenfolge der Kategorien explizit umkehren
fig = px.scatter(df,
                 x="Jahr", 
                 y="Category", 
                 color="Studiengang", 
                 size="Wert", 
                 hover_name="Studiengang",  # Hover-Information
                 title="Verteilung von </b><span style='color:#73c6e9;'>Bachelor</span> <b> und </b><span style='color:#cc5b6e;'>Master</span> <b> über die Jahre",
                 labels={"Jahr": "Jahr", "Category": "Fachsemester", "Wert": "Wert"},
                 size_max=60,  # Maximale Blasengröße
                 category_orders={"Category": category_order})  # Reihenfolge der Kategorien umkehren

# Größe des Plots anpassen (Höhe und Breite)
fig.update_layout(
    template="infoviz",  # Dein Template anwenden
    height=1200,  # Höhe des Plots
    width=1400,   # Breite des Plots
     showlegend=False,
)

# Plot anzeigen
fig.show()

In [299]:
df_sorted = df_sorted[df_sorted['Jahr'] == '2024']
df_sorted

Unnamed: 0,Jahr,Category,Studiengang,Wert
308,2024,1 (%),Bachelor,122.0
309,2024,1 (%),Master,229.0
310,2024,10 (%),Bachelor,7.0
311,2024,10 (%),Master,0.0
312,2024,11 (%),Bachelor,9.0
313,2024,11 (%),Master,15.0
314,2024,12 (%),Bachelor,0.0
315,2024,12 (%),Master,0.0
316,2024,13 (%),Bachelor,3.0
317,2024,13 (%),Master,0.0


In [308]:
infoviz_template = dict(
    layout=dict(
        template="plotly_white",
        title=dict(
            font=dict(size=20, family="Arial", weight="bold", color="black"),
            xanchor="left",  
            xref="paper",
            x=0,
            subtitle=dict(
                text="Vergleich der Absoluten Anzahl der Personen nach Abschluss",
                font=dict(color="gray", size=13),
            ),
        ),
        xaxis=dict(
            showgrid=False,
            zerolinecolor="lightgrey",
            tickfont=dict(color="grey", size=12),
            title_font=dict(color="grey", weight="bold", size=13),
            title_standoff=15,
            ticklabelstandoff=10,
            ticklabelposition="outside bottom"
        ),
        yaxis=dict(
            showgrid=True, gridcolor="lightgrey",
            zerolinecolor="lightgrey",
            tickfont=dict(color="grey", size=12),
            title_font=dict(color="grey", weight="bold", size=13),
            title_standoff=15,
            ticklabelstandoff=10,
            ticklabelposition="outside left"
        ),
    )
)
pio.templates["infoviz"] = infoviz_template
# DataFrame erstellen (diese Zeile geht davon aus, dass df_sorted existiert)
df = pd.DataFrame(df_sorted)

# Entferne das "%" aus der 'Category' Spalte
df['Category'] = df['Category'].str.replace(' (%)', '', regex=False)

# Die Kategorien in absteigender Reihenfolge festlegen
category_order = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14 und mehr"]

# Bachelor und Master auf der Y-Achse trennen (mit Bachelor oben und Master unten)
df['Y-Wert'] = df['Studiengang'].map({'Bachelor': 1, 'Master': -1})  # Bachelor auf 1 (oben) und Master auf -1 (unten) setzen

# Bubble-Plot erstellen und die Reihenfolge der Kategorien explizit umkehren
fig = px.scatter(df,
                 x="Category",  # Category auf die X-Achse setzen
                 y="Y-Wert",  # Y-Wert für die Platzierung von Bachelor und Master
                 color="Studiengang", 
                 size="Wert", 
                 hover_name="Studiengang",  # Hover-Information
                 title="Verhältnis der Fachsemester in </b><span style='color:#73c6e9;'>Bachelor</span> <b> und </b><span style='color:#cc5b6e;'>Master</span> <b> für das Jahr 2024",
                 labels={"Jahr": "Jahr", "Category": "Kategorie", "Wert": "Wert"},
                 size_max=60,  # Maximale Blasengröße
                 category_orders={"Category": category_order},# Reihenfolge der Kategorien umkehren
                 color_discrete_map={"Bachelor": "#73c6e9", "Master": "#cc5b6e"})  


# Layout anpassen und Template anwenden
fig.update_layout(
    template="infoviz",  # Template anwenden
    height=400,  # Höhe des Plots (jetzt horizontal)
    width=1400,  # Breite des Plots
    yaxis_title="Abschluss",
    xaxis_title="Fachsemester",
    yaxis=dict(
        tickvals=[1, -1],
        ticktext=["Bachelor", "Master"],  # Anzeige bleibt gleich
        showgrid=True,  # Rasterlinien
        zerolinecolor="lightgrey",
       
    ),
    showlegend=False
)
fig.update_traces(marker=dict(opacity=1))
# Plot anzeigen
fig.show()


In [306]:
import plotly.express as px
import pandas as pd

# DataFrame erstellen (diese Zeile geht davon aus, dass df_sorted existiert)
df = pd.DataFrame(df_sorted)

# Entferne das "%" aus der 'Category' Spalte
df['Category'] = df['Category'].str.replace(' (%)', '', regex=False)

# Die Kategorien in absteigender Reihenfolge festlegen
category_order = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14 und mehr"]

# Bachelor und Master als separate Spalten behandeln
df['Y-Wert'] = df['Studiengang'].map({'Bachelor': 1, 'Master': -1})  # Bachelor oben, Master unten

# Gruppiertes Balkendiagramm erstellen
fig = px.bar(df,
             x="Category",  # Kategorie auf der X-Achse
             y="Wert",  # Wert auf der Y-Achse
             color="Studiengang",  # Unterscheidung zwischen Bachelor und Master
             barmode="group",  # Gruppiertes Balkendiagramm
             title="Verhältnis der Fachsemester in </b><span style='color:#73c6e9;'>Bachelor</span> <b> und </b><span style='color:#cc5b6e;'>Master</span> <b> für das Jahr 2024",
             labels={"Category": "Fachsemester", "Wert": "Anzahl der Personen"},
             category_orders={"Category": category_order},  # Reihenfolge der Kategorien
             color_discrete_map={"Bachelor": "#73c6e9", "Master": "#cc5b6e"})  # Farben für Bachelor und Master

# Layout anpassen
fig.update_layout(
    template="infoviz",  # Dein benutzerdefiniertes Template
    height=500,  # Höhe des Plots
    width=1200,  # Breite des Plots
    yaxis_title="Anzahl der Personen",
    xaxis_title="Fachsemester",
    xaxis=dict(
        ticklabelstandoff=10,
        ticklabelposition="outside bottom",
    ),
    showlegend=False,  # Legende einblenden
)

# Plot anzeigen
fig.show()
