In [3]:
import os
from pathlib import Path
import pandas as pd

In [9]:
# import sys 
# # del sys.modules["onee.utils"]
# del sys.modules["onee.data.loader"]

In [7]:
from onee.utils import get_move_in_year, plot_time_evolution, get_move_out_year

In [8]:
# Project root: the parent of the (resolved) directory the kernel is running in.
PROJECT_ROOT = Path.cwd().resolve().parents[0]

In [10]:
from onee.data.loader import DataLoader

In [11]:
# Path of the SQLite database file, located under the project root.
db_path = PROJECT_ROOT / "data/all_data.db"
import sqlite3
# Open a connection to that database.
# NOTE(review): `db` is neither used nor closed in the cells visible here —
# confirm it is still needed (the loader below receives `db_path`, not `db`).
db = sqlite3.connect(db_path)

In [12]:
# Build the project's data loader, handing it the project root directory.
data_loader = DataLoader(PROJECT_ROOT)

In [None]:
# Load the data from the SQLite file; the loader returns two objects, unpacked
# here as `df` (used by the feature computation below) and `df_features`.
df, df_features = data_loader.load_cd_data(
        db_path=db_path,
    )

In [14]:
# Inspect which columns the loaded frame exposes.
df.columns

Index(['region', 'partenaire', 'contrat', 'annee', 'mois', 'activite',
       'consommation_kwh', 'puissance_facturee', 'puissance_appelee',
       'date_emmenagement', 'date_demenagement'],
      dtype='object')

In [15]:
def compute_contract_features(df, entity_col="activite", end_year=None, future_just_started=0):
    """
    Compute yearly contract counts and power totals per entity, split by contract age.

    For every (entity, year) pair found in ``df`` the function counts the contracts
    that are active that year and sums their ``puissance_facturee`` and
    ``puissance_appelee``, both overall and per age bucket:

    * ``just_started``           -- contract age 0 (year == start year)
    * ``two_years_old``          -- contract age 1
    * ``three_years_old``        -- contract age 2
    * ``more_than_3_years_old``  -- contract age > 2

    A contract is active in a year when ``start_year <= year <= finish_year`` and
    it has at least one row in ``df`` for that year. Start/finish years come from
    the imported helpers ``get_move_in_year`` / ``get_move_out_year``; contracts
    for which either helper returns a missing value are ignored.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe containing contract data. Must contain the columns
        ``contrat``, ``annee``, ``puissance_facturee``, ``puissance_appelee`` and
        ``entity_col`` (plus whatever the two year helpers read).
    entity_col : str, default="activite"
        Column name to group by (e.g., "activite", "region")
    end_year : int, optional
        If provided and greater than the last year in ``df``, one extrapolated row
        per entity is appended for every year up to and including ``end_year``
    future_just_started : int, default=0
        Number of new contracts assumed to start in each extrapolated year

    Returns:
    --------
    pd.DataFrame
        One row per (entity, year) with the columns ``entity_col``, ``annee``,
        ``total_active_contrats``, the four age-bucket counts, and for each power
        column a ``<power>_total`` plus one ``<power>_<bucket>`` column.
    """
    age_buckets = ("just_started", "two_years_old", "three_years_old", "more_than_3_years_old")
    power_cols = ("puissance_facturee", "puissance_appelee")

    # Yearly power totals per contract (input rows may be finer-grained than a year).
    contract_puissance = (
        df.groupby(["contrat", "annee"])
        .agg({
            "puissance_facturee": "sum",
            "puissance_appelee": "sum",
            entity_col: "first",
        })
        .reset_index()
    )

    # Start/finish year of every contract, computed once.
    # The groups are iterated explicitly instead of using `groupby.apply`: apply
    # operating on the grouping column is deprecated in pandas (it emitted a
    # warning per call) and newer versions no longer pass `contrat` to the callback.
    lifespan_records = [
        {
            "contrat": contrat,
            "start_year": get_move_in_year(contract_rows),
            "finish_year": get_move_out_year(contract_rows),
            entity_col: contract_rows[entity_col].iloc[0],
        }
        for contrat, contract_rows in df.groupby("contrat")
    ]
    contract_years = pd.DataFrame(
        lifespan_records,
        columns=["contrat", "start_year", "finish_year", entity_col],
    )

    # Remove contracts with incomplete info, then make the years integral.
    contract_years = (
        contract_years
        .dropna(subset=["start_year", "finish_year"])
        .astype({"start_year": int, "finish_year": int})
    )

    # One row per (contract, year) carrying the contract's lifespan and yearly power.
    # Yearly rows whose entity differs from the contract's first entity do not match
    # the merge key and are therefore left out.
    contract_data = contract_years.merge(contract_puissance, on=["contrat", entity_col], how="left")

    def summarize_year(entity_value, annee):
        """Build the feature row of one entity for one observed year."""
        is_active = (
            (contract_data[entity_col] == entity_value)
            & (contract_data["start_year"] <= annee)
            & (contract_data["finish_year"] >= annee)
            & (contract_data["annee"] == annee)
        )
        active = contract_data[is_active]

        # Age 0 means the contract started this very year.
        age = annee - active["start_year"]
        in_bucket = {
            "just_started": age == 0,
            "two_years_old": age == 1,
            "three_years_old": age == 2,
            "more_than_3_years_old": age > 2,
        }

        row = {
            entity_col: entity_value,
            "annee": annee,
            "total_active_contrats": len(active),
        }
        for bucket in age_buckets:
            row[bucket] = in_bucket[bucket].sum()
        for power in power_cols:
            row[f"{power}_total"] = active[power].sum()
            for bucket in age_buckets:
                row[f"{power}_{bucket}"] = active.loc[in_bucket[bucket], power].sum()
        return row

    def forecast_year(previous, entity_value, annee):
        """
        Extrapolate one year ahead from `previous` (the row of year `annee - 1`).

        Every age bucket moves up by one year; contracts whose finish year is
        `annee - 1` are removed from the bucket they were in when they finished.
        """
        ended = contract_years[
            (contract_years[entity_col] == entity_value)
            & (contract_years["finish_year"] == annee - 1)
        ]
        # Age of each ended contract during its final year.
        age_at_end = (annee - 1) - ended["start_year"]
        ended_count = {
            "just_started": (age_at_end == 0).sum(),
            "two_years_old": (age_at_end == 1).sum(),
            "three_years_old": (age_at_end == 2).sum(),
            "more_than_3_years_old": (age_at_end > 2).sum(),
        }

        row = {entity_col: entity_value, "annee": annee}

        # New contracts are a caller-supplied assumption.
        row["just_started"] = future_just_started
        # Shift the counts one bucket up, minus the contracts that ended.
        row["two_years_old"] = previous["just_started"] - ended_count["just_started"]
        row["three_years_old"] = previous["two_years_old"] - ended_count["two_years_old"]
        row["more_than_3_years_old"] = (
            previous["more_than_3_years_old"]
            + previous["three_years_old"]
            - ended_count["three_years_old"]
            - ended_count["more_than_3_years_old"]
        )
        row["total_active_contrats"] = sum(row[bucket] for bucket in age_buckets)

        # Power is shifted the same way.
        # NOTE(review): unlike the counts, the power of ended contracts is NOT
        # subtracted, and contracts added through `future_just_started` carry zero
        # power -- confirm this is the intended forecasting model.
        for power in power_cols:
            row[f"{power}_just_started"] = 0
            row[f"{power}_two_years_old"] = previous[f"{power}_just_started"]
            row[f"{power}_three_years_old"] = previous[f"{power}_two_years_old"]
            row[f"{power}_more_than_3_years_old"] = (
                previous[f"{power}_more_than_3_years_old"]
                + previous[f"{power}_three_years_old"]
            )
            row[f"{power}_total"] = sum(row[f"{power}_{bucket}"] for bucket in age_buckets)

        return row

    # Every entity is evaluated for every year present in the data.
    entities = df[entity_col].unique()
    years = df["annee"].unique()
    ordered_years = sorted(years)
    max_year_in_data = int(years.max())

    result_df = pd.DataFrame([
        summarize_year(entity, annee)
        for entity in entities
        for annee in ordered_years
    ])

    # Optional extrapolation beyond the last observed year.
    if end_year is not None and end_year > max_year_in_data:
        future_rows = []

        for entity in entities:
            entity_data = result_df[result_df[entity_col] == entity].sort_values("annee")

            # Happens for a missing (NaN) entity, which never compares equal.
            if len(entity_data) == 0:
                continue

            # Seed the recursion with the last observed year of this entity.
            last_values = entity_data.iloc[-1].to_dict()

            for future_year in range(max_year_in_data + 1, end_year + 1):
                last_values = forecast_year(last_values, entity, future_year)
                future_rows.append(last_values)

        if future_rows:
            result_df = pd.concat([result_df, pd.DataFrame(future_rows)], ignore_index=True)

    return result_df


In [16]:

# Usage examples (each call recomputes the features from `df`):
# Per activity, observed years only:
result_activite = compute_contract_features(df, entity_col="activite")

# Per activity, with extrapolated rows up to and including 2030:
result_activite_forecast = compute_contract_features(df, entity_col="activite", end_year=2030)

# Per region, extrapolated to 2030 assuming 5 new contracts start each future year:
result_region_forecast = compute_contract_features(df, entity_col="region", end_year=2030, future_just_started=5)

  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({
  .apply(lambda g: pd.Series({


In [25]:
# List the distinct activity labels present in the forecast table.
result_activite_forecast["activite"].unique()

array(["PRODUCTION ET DISTTRIBUTION D'ELECTRICITE, DE GAZ ET DE CHALEUR",
       'INDUSTRIE AUTOMOBILE', 'INDUSTRIE CHIMIQUE',
       'EXTRACTION, EXPLOITATION ET ENRICHISSEMENT DE MINERAIS METTALLIQUES',
       "CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",
       "FABRICATION D'AUTRES PRODUITS MINERAUX NON METALLIQUES",
       'INDUSTRIE TEXTILE', 'METALLURGIE', 'ADMINISTRATION PUBLIQUE',
       'BATIMENT ET TRAVAUX PUBLICS', 'industries alimentaires',
       'AUTRES INDUSTRIES EXTRACTIVES',
       'SERVIVES AUXILIAIRES DES TRANSPORTS', 'TRANSPORTS TERESSTRES',
       'AGRICULTURE, CHASSE, SERVICES ANNEXES', 'TRAVAIL DES METAUX',
       'COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE',
       "FABRICATION D'EQUIPEMENTS DE RADIO, TELEVISION ET COMMUNICATION"],
      dtype=object)

In [24]:
# Show all rows of the forecast table for a single activity.
result_activite_forecast[result_activite_forecast["activite"] == "INDUSTRIE AUTOMOBILE"]

Unnamed: 0,activite,annee,total_active_contrats,just_started,two_years_old,three_years_old,more_than_3_years_old,puissance_facturee_total,puissance_facturee_just_started,puissance_facturee_two_years_old,puissance_facturee_three_years_old,puissance_facturee_more_than_3_years_old,puissance_appelee_total,puissance_appelee_just_started,puissance_appelee_two_years_old,puissance_appelee_three_years_old,puissance_appelee_more_than_3_years_old
11,INDUSTRIE AUTOMOBILE,2013,1,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,INDUSTRIE AUTOMOBILE,2014,1,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,INDUSTRIE AUTOMOBILE,2015,1,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,INDUSTRIE AUTOMOBILE,2016,1,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,INDUSTRIE AUTOMOBILE,2017,1,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,INDUSTRIE AUTOMOBILE,2018,1,0,0,0,1,252000.0,0.0,0.0,0.0,252000.0,246598.2,0.0,0.0,0.0,246598.2
17,INDUSTRIE AUTOMOBILE,2019,1,0,0,0,1,252000.0,0.0,0.0,0.0,252000.0,248660.6,0.0,0.0,0.0,248660.6
18,INDUSTRIE AUTOMOBILE,2020,2,1,0,0,1,283500.0,31500.0,0.0,0.0,252000.0,298386.4,68216.4,0.0,0.0,230170.0
19,INDUSTRIE AUTOMOBILE,2021,3,1,1,0,1,366000.0,0.0,114000.0,0.0,252000.0,336744.0,17.0,97910.4,0.0,238816.6
20,INDUSTRIE AUTOMOBILE,2022,3,0,1,1,1,473400.0,0.0,111000.0,110400.0,252000.0,493842.4,0.0,150060.2,98162.8,245619.4


In [None]:
def get_new_year_values(past_values):
    """
    Shift the contract age buckets of `past_values` forward by one year.

    The total is carried over unchanged, each age bucket takes the value of the
    bucket below it, and the oldest bucket also absorbs the previous
    three-year-old contracts. No "just_started" entry is produced.

    Parameters:
    -----------
    past_values : mapping
        Must provide "total_active_contrats", "just_started", "two_years_old",
        "three_years_old" and "more_than_3_years_old".

    Returns:
    --------
    dict
        Keys "total_active_contrats", "two_years_old", "three_years_old",
        "more_than_3_years_old".
    """
    return {
        "total_active_contrats": past_values["total_active_contrats"],
        "two_years_old": past_values["just_started"],
        "three_years_old": past_values["two_years_old"],
        "more_than_3_years_old": (
            past_values["more_than_3_years_old"] + past_values["three_years_old"]
        ),
    }

In [121]:
# Export the feature table to CSV under the project's data/dev directory.
# With the default arguments the frame's index is written as the first column.
# NOTE(review): `result` is not assigned in any cell visible in this notebook —
# confirm which frame is meant so the cell survives Restart & Run All.
result.to_csv(PROJECT_ROOT / "data/dev/cd_active_contrats_features.csv")

In [123]:
import sqlite3
from contextlib import closing

# Store the feature table in the project's SQLite database, replacing any
# existing "Active_Contrats_Features" table (the frame's index is not written).
# `closing` releases the connection once the write is done, even if it fails;
# previously the connection was left open.
# NOTE(review): `result` is not assigned in any cell visible in this notebook —
# confirm which frame is meant so the cell survives Restart & Run All.
with closing(sqlite3.connect(PROJECT_ROOT / "data/all_data.db")) as db_conn:
    rows_written = result.to_sql("Active_Contrats_Features", db_conn, if_exists="replace", index=False)

# Display what `to_sql` returned (the number of rows written), as the cell did before.
rows_written

209