In [1]:
import os
from pathlib import Path
import pandas as pd

In [98]:
import sys

# Drop any cached copy of onee.utils so that the import in the next cell
# re-executes the module and picks up its latest edits.
# pop() with a default is used instead of `del`: on a fresh kernel the module
# has not been imported yet and `del sys.modules[...]` would raise KeyError.
sys.modules.pop("onee.utils", None)

In [99]:
from onee.utils import get_move_in_year, plot_time_evolution, get_move_out_year

In [3]:
# Project root = parent of the (symlink-resolved) current working directory.
# NOTE(review): this presumes the kernel is started from a direct sub-folder of
# the project (e.g. a notebooks/ directory) — confirm.
PROJECT_ROOT = Path.cwd().resolve().parents[0]

In [19]:
import sqlite3

# Open the project's SQLite database; `db` is the connection the next cell
# reads from, so it is intentionally left open here.
# Note: sqlite3.connect() creates an empty database file when the path does not
# exist, so a wrong PROJECT_ROOT only shows up as a "no such table" error on the
# first query.
db_path = PROJECT_ROOT / "data/all_data.db"
db = sqlite3.connect(db_path)

In [20]:
# Load the whole CD table into memory; every later cell works on this frame.
df = pd.read_sql_query(sql="SELECT * FROM CD", con=db)

In [114]:
# Display the column names of the loaded CD table.
# NOTE(review): this cell (In [114]) was last executed after the rename cell
# further down (In [28]); on a top-to-bottom run it shows the names as stored in
# the database, before any renaming.
df.columns

Index(['Region', 'activite', 'Secteur', 'Partenaire', 'Niveau_de_tension',
       'contrat', 'Nom_du_contrat', 'Type_d_heure', 'consommation_kwh',
       'Puissance_facturee', 'Puissance_appelee', 'mois', 'annee',
       'Date_emmenagement', 'Date_demenagement'],
      dtype='object')

In [117]:
# Average number of distinct contracts per distinct activity.
# nunique() ignores missing values while len(unique()) counts NaN as a value,
# so the two sides only agree in their treatment if "activite" has no nulls.
df["contrat"].nunique()/len(df["activite"].unique())

8.473684210526315

In [28]:
# Normalise the column names used by the rest of the notebook. Keys that are
# not present in the frame are ignored by rename(), so this cell is a no-op
# when the table already uses the target names.
# The result is re-bound instead of using inplace=True so the cell stays
# idempotent and does not mutate a frame other cells may already have displayed.
df = df.rename(columns={
    "Numero_de_contrat": "contrat",
    "Activite": "activite",
    "Consommation_Kwh": "consommation_kwh",
    "month": "mois",
    "year": "annee",
})

In [118]:
# Yearly totals of billed / called power for each contract, together with the
# first non-null activity label seen in that contract-year.
contract_puissance = (
    df.groupby(["contrat", "annee"])
    .agg({
        "Puissance_facturee": "sum",
        "Puissance_appelee": "sum",
        "activite": "first"
    })
    .reset_index()
)

# Start and end years for each contract (computed once).
# The groups are iterated explicitly instead of going through groupby.apply:
# each helper still receives the full per-contract frame (grouping column
# included), but pandas no longer emits the "apply operated on the grouping
# columns" deprecation warning and no intermediate Series is built per group.
# `columns=` pins the layout so the frame has the expected columns even when
# `df` is empty.
contract_years = pd.DataFrame(
    [
        {
            "contrat": contrat,
            "start_year": get_move_in_year(contract_rows),
            "finish_year": get_move_out_year(contract_rows),
            "activite": contract_rows["activite"].iloc[0],
        }
        for contrat, contract_rows in df.groupby("contrat")
    ],
    columns=["contrat", "start_year", "finish_year", "activite"],
)

# Remove contracts with incomplete info, then store the years as integers.
# Chaining dropna/astype (rather than assigning columns on the dropna result)
# avoids SettingWithCopyWarning.
contract_years = (
    contract_years
    .dropna(subset=["start_year", "finish_year"])
    .astype({"start_year": int, "finish_year": int})
)

# Attach the yearly power figures to each contract's start/end years.
# NOTE(review): "activite" is part of the join key, so a contract-year whose
# first activity label differs from the contract's first row is not matched —
# confirm that a contract never changes activity.
contract_data = contract_years.merge(contract_puissance, on=["contrat", "activite"], how="left")

def add_features_optimized(activite, annee, contract_data):
    """Summarise the contracts of one activity that are active in a given year.

    A row of ``contract_data`` is counted when its ``activite`` equals
    ``activite``, its ``annee`` equals ``annee`` and ``annee`` lies within
    ``[start_year, finish_year]`` (both bounds inclusive). Each selected row is
    placed in a cohort according to ``annee - start_year``:

    * ``just_started``           -> 0
    * ``two_years_old``          -> 1
    * ``three_years_old``        -> 2
    * ``more_than_3_years_old``  -> greater than 2

    Parameters
    ----------
    activite : value compared against the ``activite`` column.
    annee : year compared against ``annee``, ``start_year`` and ``finish_year``.
    contract_data : pandas.DataFrame
        Must provide the columns ``activite``, ``annee``, ``start_year``,
        ``finish_year``, ``Puissance_facturee`` and ``Puissance_appelee``.

    Returns
    -------
    pandas.Series
        In order: ``activite``, ``annee``, ``total_active_contrats``, the four
        cohort counts, then ``puissance_facturee_total`` followed by
        ``puissance_facturee_<cohort>`` for each cohort, then the same five
        entries with the ``puissance_appelee`` prefix.
    """
    is_active = (
        contract_data["activite"].eq(activite)
        & contract_data["start_year"].le(annee)
        & contract_data["finish_year"].ge(annee)
        & contract_data["annee"].eq(annee)
    )
    active = contract_data.loc[is_active]

    # Number of years elapsed since each selected contract started.
    age = annee - active["start_year"]

    # Insertion order matters: it fixes the order of the output entries.
    cohorts = {
        "just_started": age == 0,
        "two_years_old": age == 1,
        "three_years_old": age == 2,
        "more_than_3_years_old": age > 2,
    }

    features = {
        "activite": activite,
        "annee": annee,
        "total_active_contrats": len(active),
    }

    # Contract counts per cohort.
    for label, members in cohorts.items():
        features[label] = members.sum()

    # Billed power first, then called power: overall total followed by the
    # per-cohort totals.
    power_columns = (
        ("puissance_facturee", "Puissance_facturee"),
        ("puissance_appelee", "Puissance_appelee"),
    )
    for prefix, column in power_columns:
        features[f"{prefix}_total"] = active[column].sum()
        for label, members in cohorts.items():
            features[f"{prefix}_{label}"] = active.loc[members, column].sum()

    return pd.Series(features)

# One feature row per (activite, annee) pair: activities in order of first
# appearance in df, years in ascending order within each activity.
activites = df["activite"].unique()
years = df["annee"].unique()

result = pd.DataFrame(
    add_features_optimized(activite, annee, contract_data)
    for activite in activites
    for annee in sorted(years)
)

  .apply(lambda g: pd.Series({


In [119]:
# Display the distinct activity labels, in order of first appearance.
df["activite"].unique()

array(["PRODUCTION ET DISTTRIBUTION D'ELECTRICITE, DE GAZ ET DE CHALEUR",
       'INDUSTRIE AUTOMOBILE', 'INDUSTRIE CHIMIQUE',
       'EXTRACTION, EXPLOITATION ET ENRICHISSEMENT DE MINERAIS METTALLIQUES',
       "CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",
       "FABRICATION D'AUTRES PRODUITS MINERAUX NON METALLIQUES",
       'INDUSTRIE DU PAPIER ET DU CARTON', 'INDUSTRIE TEXTILE',
       'METALLURGIE', 'ADMINISTRATION PUBLIQUE',
       'BATIMENT ET TRAVAUX PUBLICS', 'industries alimentaires',
       'AUTRES INDUSTRIES EXTRACTIVES',
       'SERVIVES AUXILIAIRES DES TRANSPORTS', 'TRANSPORTS TERESSTRES',
       'AGRICULTURE, CHASSE, SERVICES ANNEXES', 'TRAVAIL DES METAUX',
       'COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE',
       "FABRICATION D'EQUIPEMENTS DE RADIO, TELEVISION ET COMMUNICATION"],
      dtype=object)

In [120]:
# Spot-check the computed features for a single activity, one row per year.
result[result["activite"] == "COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE"]

Unnamed: 0,activite,annee,total_active_contrats,just_started,two_years_old,three_years_old,more_than_3_years_old,puissance_facturee_total,puissance_facturee_just_started,puissance_facturee_two_years_old,puissance_facturee_three_years_old,puissance_facturee_more_than_3_years_old,puissance_appelee_total,puissance_appelee_just_started,puissance_appelee_two_years_old,puissance_appelee_three_years_old,puissance_appelee_more_than_3_years_old
187,"COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE",2013,1,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
188,"COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE",2014,1,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
189,"COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE",2015,1,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
190,"COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE",2016,1,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191,"COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE",2017,1,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
192,"COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE",2018,1,0,0,0,1,57600.0,0.0,0.0,0.0,57600.0,51732.0,0.0,0.0,0.0,51732.0
193,"COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE",2019,1,0,0,0,1,57600.0,0.0,0.0,0.0,57600.0,50080.0,0.0,0.0,0.0,50080.0
194,"COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE",2020,1,0,0,0,1,57600.0,0.0,0.0,0.0,57600.0,42340.0,0.0,0.0,0.0,42340.0
195,"COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE",2021,1,0,0,0,1,57600.0,0.0,0.0,0.0,57600.0,39160.0,0.0,0.0,0.0,39160.0
196,"COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE",2022,1,0,0,0,1,57600.0,0.0,0.0,0.0,57600.0,39708.0,0.0,0.0,0.0,39708.0


In [None]:
def get_new_year_values(past_values, nbr_new_contrats):
    """Roll the contract cohort counts forward by one year.

    Every cohort moves up one step: last year's ``just_started`` becomes
    ``two_years_old``, ``two_years_old`` becomes ``three_years_old``, and
    ``three_years_old`` is absorbed into ``more_than_3_years_old``. The new
    contracts form the ``just_started`` cohort. No departures are modelled:
    the total only grows by ``nbr_new_contrats``.

    Parameters
    ----------
    past_values : mapping
        Previous year's values; must provide ``total_active_contrats``,
        ``just_started``, ``two_years_old``, ``three_years_old`` and
        ``more_than_3_years_old``. It is not modified.
    nbr_new_contrats : number
        Contracts starting in the new year.

    Returns
    -------
    dict
        The same five keys for the new year.
    """
    return {
        "total_active_contrats": past_values["total_active_contrats"] + nbr_new_contrats,
        "just_started": nbr_new_contrats,
        "two_years_old": past_values["just_started"],
        "three_years_old": past_values["two_years_old"],
        "more_than_3_years_old": (
            past_values["more_than_3_years_old"] + past_values["three_years_old"]
        ),
    }

In [121]:
# Save the feature table as CSV under data/dev (the directory must already exist).
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default, whereas the SQL export of the same table passes index=False —
# confirm whether the CSV should carry that column.
result.to_csv(PROJECT_ROOT / "data/dev/cd_active_contrats_features.csv")

In [123]:
import sqlite3

# Store the feature table in the project database, replacing any previous
# version of the "Active_Contrats_Features" table.
# The connection is closed in `finally` so the database file handle is released
# even when the write fails; previously it was left open for the lifetime of
# the kernel.
db_conn = sqlite3.connect(PROJECT_ROOT / "data/all_data.db")
try:
    rows_written = result.to_sql(
        "Active_Contrats_Features", db_conn, if_exists="replace", index=False
    )
finally:
    db_conn.close()

# Keep the value returned by to_sql as the cell output, as before.
rows_written

209