In [1]:
import os
from pathlib import Path
import pandas as pd

In [2]:
from onee.utils import get_move_in_year, plot_time_evolution, get_move_out_year

In [3]:
# Project root is taken as the parent of the kernel's working directory
# (presumably the notebook lives one folder below the root - confirm).
PROJECT_ROOT = Path.cwd().resolve().parents[0]

In [4]:
# Raw input table; later cells read its "activite", "annee" and "contrat" columns.
df = pd.read_csv(PROJECT_ROOT.joinpath("data/cd_data_2013_2023.csv"))

In [None]:
def add_features(group, df):
    """Count the rows of one (activite, annee) group by contract age.

    Designed to be called through ``groupby(["activite", "annee"]).apply``:
    the group keys are read from ``group.name``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single (activite, annee) group. Only its ``contrat`` column
        and its ``name`` attribute (the ``(activite, annee)`` pair) are used.
    df : pandas.DataFrame
        Full table; every contract's complete set of rows is selected from it
        via its ``contrat`` column and handed to ``get_move_in_year`` /
        ``get_move_out_year``.

    Returns
    -------
    pandas.Series
        ``activite``, ``annee``, ``total_active_contrats`` and the four age
        buckets ``just_started`` (age 0), ``two_years_old`` (age 1),
        ``three_years_old`` (age 2) and ``more_than_3_years_old`` (age > 2),
        where age is ``annee - start_year``.

    Notes
    -----
    Counters are incremented once per *row* of ``group``, so a contract that
    appears on several rows of the same group is counted several times.
    NOTE(review): if one count per contract is wanted, iterate over
    ``group["contrat"].unique()`` instead - confirm the intended semantics.
    """
    activite, annee = group.name  # get the group keys

    # Initialize counters
    total_active = 0
    just_started = 0
    two_years_old = 0
    three_years_old = 0
    more_than_3_years_old = 0

    # (start_year, finish_year) per contract id. Avoids re-scanning the whole
    # frame and re-running both helpers when the same contract repeats in the
    # group. Assumes the helpers are deterministic for a given sub-frame.
    years_by_contract = {}

    for contrat in group["contrat"]:
        years = years_by_contract.get(contrat)
        if years is None:
            history = df[df["contrat"] == contrat]
            years = (get_move_in_year(history), get_move_out_year(history))
            years_by_contract[contrat] = years
        start_year, finish_year = years

        if start_year is None or finish_year is None:
            continue  # skip contracts with incomplete info

        if not (start_year <= annee <= finish_year):
            continue  # not active during this year

        total_active += 1

        age = annee - start_year
        if age == 0:
            just_started += 1
        elif age == 1:
            two_years_old += 1
        elif age == 2:
            three_years_old += 1
        elif age > 2:
            more_than_3_years_old += 1

    return pd.Series({
        "activite": activite,
        "annee": annee,
        "total_active_contrats": total_active,
        "just_started": just_started,
        "two_years_old": two_years_old,
        "three_years_old": three_years_old,
        "more_than_3_years_old": more_than_3_years_old
    })


# One row of contract-age counts per (activite, annee) pair.
# Only "contrat" is selected after the groupby: it is the sole column
# add_features reads from each group (the keys come from group.name), and
# selecting it explicitly keeps the grouping columns out of the applied frame,
# which pandas deprecates passing implicitly.
result = (
    df.groupby(["activite", "annee"])[["contrat"]]
      .apply(add_features, df=df)
      .reset_index(drop=True)  # group keys are already columns of each returned row
)

  .apply(lambda g: add_features(g, df=df))


In [11]:
# Distinct activity labels present in the feature table, in order of appearance.
pd.unique(result["activite"])

array(['ADMINISTRATION PUBLIQUE', 'AGRICULTURE, CHASSE, SERVICES ANNEXES',
       'AUTRES INDUSTRIES EXTRACTIVES', 'BATIMENT ET TRAVAUX PUBLICS',
       "CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",
       'COKEFACTION, RAFFINAGE, INDUSTRIE NUCLEAIRE',
       'EXTRACTION, EXPLOITATION ET ENRICHISSEMENT DE MINERAIS METTALLIQUES',
       "FABRICATION D'AUTRES PRODUITS MINERAUX NON METALLIQUES",
       "FABRICATION D'EQUIPEMENTS DE RADIO, TELEVISION ET COMMUNICATION",
       'INDUSTRIE AUTOMOBILE', 'INDUSTRIE CHIMIQUE',
       'INDUSTRIE DU PAPIER ET DU CARTON', 'INDUSTRIE TEXTILE',
       'METALLURGIE',
       "PRODUCTION ET DISTTRIBUTION D'ELECTRICITE, DE GAZ ET DE CHALEUR",
       'SERVIVES AUXILIAIRES DES TRANSPORTS', 'TRANSPORTS TERESSTRES',
       'TRAVAIL DES METAUX', 'industries alimentaires'], dtype=object)

In [19]:
# Inspect the yearly counters of a single activity.
# NOTE(review): the stored output of this cell lists "Unnamed: 0" and "value"
# columns, which add_features does not produce - re-run on a fresh kernel.
result.loc[result["activite"].eq("CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU")]

Unnamed: 0,activite,annee,value,just_started,two_years_old,three_years_old,more_than_3_years_old
44,"CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",2013,24,24,0,0,0
45,"CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",2014,24,0,24,0,0
46,"CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",2015,24,0,0,24,0
47,"CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",2016,24,0,0,0,24
48,"CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",2017,36,12,0,0,24
49,"CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",2018,36,0,12,0,24
50,"CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",2019,36,0,0,12,24
51,"CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",2020,36,0,0,0,36
52,"CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",2021,60,24,0,0,36
53,"CAPTAGE, TRAITEMENT ET DISTRIBUTION D'EAU",2022,60,0,24,0,36


In [None]:
def get_new_year_values(past_values, nbr_new_contrats):
    """Shift the contract-age counters forward by one year.

    Parameters
    ----------
    past_values : mapping
        Previous year's counters, indexed by ``total_active_contrats``,
        ``just_started``, ``two_years_old``, ``three_years_old`` and
        ``more_than_3_years_old``.
    nbr_new_contrats : number
        Contracts starting in the new year.

    Returns
    -------
    dict
        Counters for the new year: every age bucket moves up one step, the
        oldest bucket absorbs the previous ``three_years_old`` and the total
        grows by ``nbr_new_contrats`` (no contract is removed).
    """
    prev = past_values
    return {
        "total_active_contrats": prev["total_active_contrats"] + nbr_new_contrats,
        "just_started": nbr_new_contrats,
        "two_years_old": prev["just_started"],
        "three_years_old": prev["two_years_old"],
        "more_than_3_years_old": prev["more_than_3_years_old"] + prev["three_years_old"],
    }

In [27]:
# `result` only carries a positional index (it is built with
# reset_index(drop=True)), so it is not written out: otherwise it comes back
# as an "Unnamed: 0" column when the file is reloaded. This also matches the
# index=False used for the SQLite export of the same table.
result.to_csv(PROJECT_ROOT / "data/cd_active_contrats_features.csv", index=False)

In [29]:
import sqlite3

# Mirror the feature table into the project's SQLite database, replacing any
# previous "active_contrats_features" table. The connection is left open
# under the name db_conn.
db_conn = sqlite3.connect(PROJECT_ROOT.joinpath("data/cd_database_2013_2023.db"))
result.to_sql(
    name="active_contrats_features",
    con=db_conn,
    if_exists="replace",
    index=False,
)

209