In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Project root: the directory one level above the resolved current working directory.
PROJECT_ROOT = Path.cwd().resolve().parents[0]

In [3]:
# Load the source table; the path is anchored on PROJECT_ROOT rather than on
# wherever the kernel happens to be running.
df = pd.read_csv(PROJECT_ROOT.joinpath("data/srm_data_2007_2023_mt.csv"))

In [6]:
# Build `final_df`: one row per Region-Year holding the total MWh and, for each
# Activity, its share of that total expressed as a percentage (0-100).

# Work on a copy so the column assignment below acts on this cell's own frame.
df = df.copy()

# Ensure numeric: values that cannot be parsed become NaN, which the sums
# below skip. Without this, a text-typed MWh column would be concatenated or
# rejected by .sum() instead of being added up.
df["MWh"] = pd.to_numeric(df["MWh"], errors="coerce")

# Aggregate MWh by Region-Year-Activity (one row per combination).
agg = (
    df.groupby(["Region", "Year", "Activity"], as_index=False)["MWh"]
      .sum()
)

# Total MWh per Region-Year, i.e. the denominator of the percentages.
totals = (
    agg.groupby(["Region", "Year"], as_index=False)["MWh"]
       .sum()
       .rename(columns={"MWh": "Total Mwh"})
)

# Attach the Region-Year total to every activity row. The total travels under
# a temporary column name here; the final "Total Mwh" column is merged back in
# after the pivot.
merged = agg.merge(
    totals.rename(columns={"Total Mwh": "Total_Mwh_tmp"}),
    on=["Region", "Year"],
    how="left"
)

# Percentage of the Region-Year total per activity; a Region-Year whose total
# is not positive gets 0.0 everywhere instead of a division by zero.
merged["pct"] = np.where(
    merged["Total_Mwh_tmp"] > 0,
    (merged["MWh"]) / merged["Total_Mwh_tmp"] * 100,
    0.0
)

# Pivot to one column per Activity. Each Region-Year-Activity occurs once in
# `merged`, so pivot_table's default mean aggregation returns the value itself;
# activities absent from a Region-Year are filled with 0.0.
pct_wide = (
    merged.pivot_table(
        index=["Region", "Year"],
        columns="Activity",
        values="pct",
        fill_value=0.0
    )
    .reset_index()
)

# Bring the totals back next to the percentages.
final_df = pct_wide.merge(totals, on=["Region", "Year"], how="left")

# Column order: identifiers first, then the total, then the activity shares.
activity_cols = [c for c in final_df.columns if c not in ["Region", "Year", "Total Mwh"]]
final_df = final_df[["Region", "Year", "Total Mwh"] + activity_cols]

In [7]:
# Display the wide table: Region, Year, Total Mwh, then one percentage column per Activity.
final_df

Unnamed: 0,Region,Year,Total Mwh,Administratif,Agricole,Industriel,Résidentiel,Tertiaire
0,Béni Mellal-Khénifra,2007,172438.261,4.724003,34.366767,52.091625,0.000000,8.817605
1,Béni Mellal-Khénifra,2008,178040.167,4.621690,34.524347,52.142916,0.000000,8.711047
2,Béni Mellal-Khénifra,2009,186983.313,5.865952,38.668881,45.434694,0.034390,9.996083
3,Béni Mellal-Khénifra,2010,203141.832,6.312180,37.351734,45.126579,0.087352,11.122155
4,Béni Mellal-Khénifra,2011,236212.811,6.180273,37.488597,45.141477,0.080996,11.108657
...,...,...,...,...,...,...,...,...
165,Tanger-Tétouan-Al Hoceïma,2019,229265.115,4.824103,30.207958,53.799863,0.614356,10.553720
166,Tanger-Tétouan-Al Hoceïma,2020,226040.931,5.569148,31.113741,52.959151,0.849371,9.508588
167,Tanger-Tétouan-Al Hoceïma,2021,235484.660,6.088722,30.509938,51.885043,0.943654,10.572644
168,Tanger-Tétouan-Al Hoceïma,2022,251039.050,6.687757,28.790900,52.238610,1.229323,11.053411


In [8]:
# Persist the percentage table next to the source data, without the row index.
final_df.to_csv(
    PROJECT_ROOT.joinpath("data/srm_data_2007_2023_mt_percentages.csv"),
    index=False,
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

def _add_region_title_page(pdf, region, figsize):
    """Append a text-only separator page announcing `region` to the open `pdf`."""
    fig = plt.figure(figsize=figsize)
    try:
        ax = fig.add_subplot(111)
        ax.axis("off")
        ax.text(
            0.5, 0.55, str(region),
            ha="center", va="center", fontsize=28, fontweight="bold"
        )
        ax.text(
            0.5, 0.40, "Activity Distribution by Year",
            ha="center", va="center", fontsize=16
        )
        pdf.savefig(fig, bbox_inches="tight")
    finally:
        # Close even when drawing or saving fails, so figures do not pile up
        # in pyplot's figure registry.
        plt.close(fig)


def _add_activity_pie_page(pdf, region, row, activity_cols, color_map, figsize):
    """Append one pie-chart page for a single Region-Year `row` to the open `pdf`."""
    shares = row[activity_cols].to_numpy(dtype=float)
    # A percentage that is missing (or could not be coerced to a number) is
    # drawn as a 0 % share; NaN values cannot be laid out as wedges.
    shares = np.where(np.isnan(shares), 0.0, shares)

    legend_labels = [f"{act} ({val:.1f}%)" for act, val in zip(activity_cols, shares)]
    colors = [color_map[act] for act in activity_cols]

    fig, ax = plt.subplots(figsize=figsize)
    try:
        # Shrink the axes box so the legend can sit outside it, on the right.
        ax.set_position([0.06, 0.10, 0.62, 0.80])

        if np.any(shares != 0):
            wedges, _ = ax.pie(shares, startangle=90, labels=None, colors=colors)
            ax.legend(
                wedges,
                legend_labels,
                title="Activities",
                loc="center left",
                bbox_to_anchor=(1.0, 0.5),
                frameon=False,
            )
        else:
            # All shares are zero (e.g. a Region-Year with no consumption):
            # a pie cannot be normalised, so show a placeholder page instead.
            ax.axis("off")
            ax.text(
                0.5, 0.5, "No consumption recorded",
                ha="center", va="center", fontsize=16, transform=ax.transAxes
            )

        # Title with region, year and total MWh
        total_mwh = row["Total Mwh"]
        year = row["Year"]
        year_label = int(year) if pd.notnull(year) else year
        ax.set_title(
            f"Activity distribution — {region} ({year_label})\n"
            f"Total consumption: {total_mwh:,.0f} MWh",
            fontsize=14
        )

        pdf.savefig(fig, bbox_inches="tight")
    finally:
        plt.close(fig)


def export_activity_pies_to_pdf(
    final_df: pd.DataFrame,
    pdf_path: str,
    title_slides: bool = True,
    figsize=(12, 6.75),  # ~16:9 slide proportion
):
    """
    Create a PDF with one pie-chart slide per row of `final_df`.
    Optionally insert a title slide before each Region section.

    Pages are ordered by Region, then Year. Every activity keeps the same
    colour on every page. The input frame is not modified: numeric coercion
    is applied to an internal copy.

    Expects columns:
      - "Region", "Year", "Total Mwh"
      - one column per activity containing percentage values (0–100)

    Parameters
    ----------
    final_df : pd.DataFrame
        Wide dataframe with columns as described above. Rows whose Region is
        missing are not rendered (pandas groupby drops NaN keys by default).
    pdf_path : str
        Output PDF file path.
    title_slides : bool
        If True, insert a title page for each Region to separate sections.
    figsize : tuple
        Figure size in inches (width, height), tuned for slide aspect.

    Returns
    -------
    None
        The result is the PDF written to `pdf_path`.

    Raises
    ------
    ValueError
        If a required column is missing or there is no activity column.
        Errors raised by matplotlib while drawing (for instance on negative
        percentages) propagate unchanged.
    """
    id_cols = ["Region", "Year", "Total Mwh"]
    missing = set(id_cols) - set(final_df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Activity columns = everything except Region/Year/Total Mwh
    activity_cols = [c for c in final_df.columns if c not in id_cols]
    if not activity_cols:
        raise ValueError("No activity columns found. Make sure activity percentage columns exist.")

    # Ensure numeric on a private copy so the caller's dataframe is left untouched.
    pages = final_df.copy()
    for col in activity_cols + ["Total Mwh", "Year"]:
        pages[col] = pd.to_numeric(pages[col], errors="coerce")

    # Build consistent colors across all pages for all activities.
    # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
    # Matplotlib 3.7 and later removed.
    cmap = plt.get_cmap("tab20", max(len(activity_cols), 3))
    color_map = {act: cmap(i % cmap.N) for i, act in enumerate(activity_cols)}

    # Stable sort by Region, then Year; with sort=False below, groups and their
    # rows come out in exactly this order, so no per-region re-sort is needed.
    pages = pages.sort_values(["Region", "Year"], kind="mergesort")

    with PdfPages(pdf_path) as pdf:
        for region, region_rows in pages.groupby("Region", sort=False):
            if title_slides:
                _add_region_title_page(pdf, region, figsize)

            # One slide per row (year) for this region
            for _, row in region_rows.iterrows():
                _add_activity_pie_page(pdf, region, row, activity_cols, color_map, figsize)

# ---------------------------
# Example usage: render `final_df` to a PDF, with a title page before each region.
export_activity_pies_to_pdf(
    final_df=final_df,
    pdf_path="activity_pies_by_region_year_mt.pdf",
    title_slides=True,
)


  cmap = plt.cm.get_cmap("tab20", max(n_acts, 3))
