# Fachsemester

## Import

In [17]:
import pandas as pd
import plotly.io as pio
import plotly.express as px

## Template

In [18]:
infoviz_template = dict(
    layout=dict(
        template="plotly_white",
        title=dict(
            font=dict(size=20, family="Arial", weight="bold", color="black"),
            y=0.91, 
            x=0.05, 
            xanchor="left", 
        ),
        xaxis=dict(
            showgrid=False,
            zerolinecolor="lightgrey",
            tickfont=dict(color="grey", size=12),
            title_font=dict(color="grey", weight="bold", size=13),
            title_standoff=15,
            ticklabelposition="outside bottom"
        ),
        yaxis=dict(
            showgrid=True, gridcolor="lightgrey",
            zerolinecolor="lightgrey",
            tickfont=dict(color="grey", size=12),
            title_font=dict(color="grey", weight="bold", size=13),
            title_standoff=15,
            ticklabelposition="outside left"
        ),
    )
)
pio.templates["infoviz"] = infoviz_template

## Processing

In [19]:
# Define list of years as strings
years = [str(year) for year in range(2013, 2025)]

# Read all sheets into a dictionary
df = pd.read_excel("data.xlsx", sheet_name=years)

data_list = []

# Add year column and collect data
for year, sheet_df in df.items():
    sheet_df["Jahr"] = year
    data_list.append(sheet_df)

# Concatenate all years into one DataFrame
final_df = pd.concat(data_list, ignore_index=True)

In [20]:
final_df[final_df['Variable'] == 'Fachsemester'] 

Unnamed: 0,Variable,Category,Bachelor Agrarwissenschaften,Bachelor Ernährungswissenschaften,Bachelor Nachwachsende Rohstoffe und Bioressourcen,Bachelor Ökotrophologie,Bachelor Umwelt und globaler Wandel,Master Ernährungswissenschaften,Master Ernährungsökonomie*,Master Agrarökonomie und Betriebsmanagement,...,Master Nutzpflanzenwissenschaften,Master Oenologie/Weinwirtschaft,Master Transition Management,Master Sustainable Transition*,Master Agrar- und Ressourcenökonomie*,Master Getränketechnologie*,Master Informationstechnologie in den Agrar- und Umweltwissenschaften*,Master Nachhaltige Ernährungswirtschaft,Master Sustainable Transition,Master Nutztierwissenschaften*
22,Fachsemester,1 (%),36.0,36.0,,31.0,32.0,21.0,19.0,40.0,...,,,,,,,,,,
23,Fachsemester,2 (%),2.0,,,,,9.0,19.0,20.0,...,,,,,,,,,,
24,Fachsemester,3 (%),33.0,28.0,,30.0,35.0,34.0,26.0,20.0,...,,,,,,,,,,
25,Fachsemester,4 (%),,1.0,,1.0,,13.0,10.0,10.0,...,,,,,,,,,,
26,Fachsemester,5 (%),20.0,24.0,,25.0,26.0,16.0,16.0,10.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792,Fachsemester,12 (%),,,,,,,,,...,,,,,,,,,,
793,Fachsemester,13 (%),3,,,,,,,,...,,,,,,,,,,
794,Fachsemester,14 und mehr (%),,,,,,,,,...,,,,,,,,,,
795,Fachsemester,Gesamt (%),100,100.0,100.0,100.0,100.0,100.0,,,...,100.0,,100.0,,,,,100.0,100.0,


In [21]:
df = final_df[final_df.columns[~final_df.columns.str.contains(r'\*')]] # drop * in the data
df.columns = df.columns.str.replace('*', '', regex=False)# just to be sure
df = df.rename(columns={'Master Getränketechnologi':'Master Getränketechnologie'})# fix a typo
clean_columns = {}
for col in df.columns:
    clean_name = col.strip()  # get rid of other human typos
    if clean_name in clean_columns:
        clean_columns[clean_name].append(col)
    else:
        clean_columns[clean_name] = [col]

# smack together the similar col names 
merged_df = df[['Jahr']].copy()
for clean_name, original_cols in clean_columns.items():
    if len(original_cols) > 1:
        merged_df[clean_name] = df[original_cols].sum(axis=1)
    else:
        merged_df[clean_name] = df[original_cols[0]]
df = merged_df.fillna(value=0) # use if needed for plotting

In [22]:
df = df[df['Variable'].isin(['Fachsemester'])]
df = df[df['Variable'] == 'Fachsemester'].drop(columns='Variable')
df = df.groupby(['Jahr', 'Category']).sum().reset_index()
df = df[df['Category'] != 'Gesamt (%)']

In [23]:
df['Bachelor Ökotrophologie'] = df['Bachelor Ökotrophologie'].astype(float)
df['Bachelor Agrarwissenschaften'] = df['Bachelor Agrarwissenschaften'].replace(' ',0)
df['Bachelor Agrarwissenschaften'] = df['Bachelor Agrarwissenschaften'].astype(float)


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [24]:
# Separate Bachelor and Master columns
bachelor_columns = [col for col in df.columns if "Bachelor" in col]
master_columns = [col for col in df.columns if "Master" in col]

# Reshape Bachelor data
df_bachelor = pd.melt(
    df,
    id_vars=["Jahr", "Category"],
    value_vars=bachelor_columns,
    var_name="Studiengang",
    value_name="Bachelorg Wert"
)
df_bachelor["Typ"] = "Bachelor"

# Reshape Master data
df_master = pd.melt(
    df,
    id_vars=["Jahr", "Category"],
    value_vars=master_columns,
    var_name="Studiengang",
    value_name="Master Wert"
)
df_master["Typ"] = "Master"

# Combine both
df_combined = pd.concat([df_bachelor, df_master])

# Sort for clarity
df_combined = df_combined.sort_values(by=["Jahr", "Category", "Typ"])

In [25]:
df_combined = df_combined.groupby(['Jahr', 'Category']).sum().reset_index().drop(columns=['Studiengang', 'Typ'])
df = df_combined[df_combined['Category'] != 'Anzahl']

In [26]:
# Extract numeric part from 'Category' (e.g. '1 (%)' → 1)
df["Category_num"] = df["Category"].str.extract(r"(\d+)").astype(int)

# Sort by year and extracted numeric category
df_sorted = df.sort_values(by=["Jahr", "Category_num"])

# Drop helper column
df_sorted = df_sorted.drop(columns=["Category_num"])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [27]:
# Prepare Bachelor DataFrame
bachelor_df = df_sorted[["Jahr", "Category", "Bachelorg Wert"]].rename(
    columns={"Bachelorg Wert": "Wert"}
)
bachelor_df["Studiengang"] = "Bachelor"

# Prepare Master DataFrame
master_df = df_sorted[["Jahr", "Category", "Master Wert"]].rename(
    columns={"Master Wert": "Wert"}
)
master_df["Studiengang"] = "Master"

# Combine both
result = pd.concat([bachelor_df, master_df])

# Reorder columns
result = result[["Jahr", "Category", "Studiengang", "Wert"]]

# Final sorting
df_sorted = result.sort_values(by=["Jahr", "Category"]).reset_index(drop=True)

In [28]:
category_order = ["1 (%)", "2 (%)", "3 (%)", "4 (%)", "5 (%)", "6 (%)", "7 (%)", "8 (%)", "9 (%)", "10 (%)", "11 (%)", "12 (%)", "13 (%)", "14 und mehr (%)"]

In [29]:

pio.templates["infoviz"] = infoviz_template

# Prepare DataFrame
df = pd.DataFrame(df_sorted)

# Define custom category order (from highest to lowest semester)
category_order = [
    "14 und mehr (%)", "13 (%)", "12 (%)", "11 (%)", "10 (%)", "9 (%)",
    "8 (%)", "7 (%)", "6 (%)", "5 (%)", "4 (%)", "3 (%)", "2 (%)", "1 (%)"
]

# Clean up category names
df["Category"] = df["Category"].str.replace(" (%)", "", regex=False)

# Create scatter plot
fig = px.scatter(
    df,
    x="Jahr",
    y="Category",
    color="Studiengang",
    size="Wert",
    hover_name="Studiengang",
    title="Verteilung von </b><span style='color:#73c6e9;'>Bachelor</span> "
          "<b> und </b><span style='color:#cc5b6e;'>Master</span> <b> über die Jahre",
    labels={"Jahr": "Jahr", "Category": "Fachsemester", "Wert": "Wert"},
    size_max=60,
    category_orders={"Category": category_order}
)

# Layout configuration
fig.update_layout(
    template="infoviz",
    height=1200,
    width=1400,
    showlegend=False
)

# Hinweis: Halbkreise (B.A. / M.A.) waren geplant, aber scheiterten an Plotly-Grenzen
fig.show()
fig.write_image("Plots/pdf/fachsemester1.pdf")

In [30]:
df_sorted = df_sorted[df_sorted['Jahr'] == '2024']

In [31]:
infoviz_template = dict(
    layout=dict(
        template="plotly_white",
        title=dict(
            font=dict(size=20, family="Arial", weight="bold", color="black"),
            xanchor="left",  
            xref="paper",
            x=0,
            subtitle=dict(
                text="Vergleich der Absoluten Anzahl der Personen nach Abschluss",
                font=dict(color="gray", size=13),
            ),
        ),
        xaxis=dict(
            showgrid=False,
            zerolinecolor="lightgrey",
            tickfont=dict(color="grey", size=12),
            title_font=dict(color="grey", weight="bold", size=13),
            title_standoff=15,
            ticklabelstandoff=10,
            ticklabelposition="outside bottom"
        ),
        yaxis=dict(
            showgrid=True, gridcolor="lightgrey",
            zerolinecolor="lightgrey",
            tickfont=dict(color="grey", size=12),
            title_font=dict(color="grey", weight="bold", size=13),
            title_standoff=15,
            ticklabelstandoff=10,
            ticklabelposition="outside left"
        ),
    )
)
pio.templates["infoviz"] = infoviz_template

df = pd.DataFrame(df_sorted)

df['Category'] = df['Category'].str.replace(' (%)', '', regex=False)

category_order = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14 und mehr"]

df['Y-Wert'] = df['Studiengang'].map({'Bachelor': 1, 'Master': -1})

In [32]:
fig = px.scatter(df,
                 x="Category",
                 y="Y-Wert",
                 color="Studiengang", 
                 size="Wert", 
                 hover_name="Studiengang",
                 title="Verhältnis der Fachsemester in </b><span style='color:#73c6e9;'>Bachelor</span> <b> und </b><span style='color:#cc5b6e;'>Master</span> <b> für das Jahr 2024",
                 labels={"Jahr": "Jahr", "Category": "Kategorie", "Wert": "Wert"},
                 size_max=60,
                 category_orders={"Category": category_order},
                 color_discrete_map={"Bachelor": "#73c6e9", "Master": "#cc5b6e"})  



fig.update_layout(
    template="infoviz", 
    height=400,
    width=1400,
    yaxis_title="Abschluss",
    xaxis_title="Fachsemester",
    yaxis=dict(
        tickvals=[1, -1],
        ticktext=["Bachelor", "Master"],
        showgrid=True,
        zerolinecolor="lightgrey",
       
    ),
    showlegend=False
)
fig.update_traces(marker=dict(opacity=1))

fig.show()
fig.write_image("Plots/pdf/fachsemester2.pdf")

In [33]:
df = pd.DataFrame(df_sorted)

df['Category'] = df['Category'].str.replace(' (%)', '', regex=False)

category_order = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14 und mehr"]

df['Y-Wert'] = df['Studiengang'].map({'Bachelor': 1, 'Master': -1})

fig = px.bar(df,
             x="Category",
             y="Wert",
             color="Studiengang",
             barmode="group",
             title="Verhältnis der Fachsemester in </b><span style='color:#73c6e9;'>Bachelor</span> <b> und </b><span style='color:#cc5b6e;'>Master</span> <b> für das Jahr 2024",
             labels={"Category": "Fachsemester", "Wert": "Anzahl der Personen"},
             category_orders={"Category": category_order},
             color_discrete_map={"Bachelor": "#73c6e9", "Master": "#cc5b6e"})

fig.update_layout(
    template="infoviz",
    height=500, 
    width=1200,
    yaxis_title="Anzahl der Personen",
    xaxis_title="Fachsemester",
    xaxis=dict(
        ticklabelstandoff=10,
        ticklabelposition="outside bottom",
    ),
    showlegend=False,
)

fig.show()
fig.write_image("Plots/pdf/fachsemester3.pdf")