In [1]:
import os
from datetime import datetime
from functools import lru_cache, wraps

import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm

def rooter(path : str = 'EnergyBot') -> None:
    """Change the working directory to the root of the project.

    The root is computed by cutting the current working directory at the
    first occurrence of ``path`` and appending ``path`` again, so calling
    this from any sub-directory of ``.../<path>/`` moves back up to
    ``.../<path>``.

    Args:
        path: Name of the project root directory to look for in the current
            working directory.

    Raises:
        Exception: If the computed directory cannot be entered (for example
            when the notebook is not run from inside the project). The
            underlying error is attached as ``__cause__``.
    """
    try:
        os.chdir(os.path.join(os.getcwd().split(path)[0], path))
    # Only filesystem errors and an empty separator are expected here; a bare
    # ``except`` would also swallow KeyboardInterrupt / SystemExit.
    except (OSError, ValueError) as err:
        raise Exception("Please run this script from the inside of the project") from err

# Switch to the project root before the project-local import below
# (presumably so that the `consumption_prediction` package resolves — confirm).
rooter()

from  consumption_prediction.src.data.conso_ps_profil import consommation_PS_PROFIL, data_coeff_profil, aggregats_consommation

In [3]:
!pip install xlrd

Collecting xlrd
  Using cached xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
Installing collected packages: xlrd
Successfully installed xlrd-2.0.1


In [4]:
# Load the consumption series for the pair (12, "RES2"); in the output below
# these values appear in the PUISSANCE_SOUSCRITE and SOUS_PROFIL columns.
df_conso = consommation_PS_PROFIL(12, "RES2")
# Preview the first rows only.
df_conso.head()

Unnamed: 0,HORODATE,SOUS_PROFIL,CONSOMMATION,PUISSANCE_SOUSCRITE
0,2021-01-01,RES2,28.782577,12
1,2021-01-02,RES2,30.729357,12
2,2021-01-03,RES2,30.595572,12
3,2021-01-04,RES2,30.686863,12
4,2021-01-05,RES2,30.557003,12


In [5]:
# Length of the `paysage` vector drawn for each sample and checked by check_inputs.
N_paysage = 4
# One random integer weight in [1, 9] per `paysage` component; generate_conso
# takes the dot product of a sample's `paysage` with these weights.
# NOTE(review): no random seed is set, so the weights change on every run.
paysage_average = np.random.randint(1, 10, size=N_paysage)
# Multiplier applied to `ecolo_score` in generate_conso.
ecolo_score_pond = 2

# A dictionary of type Dict[id_profil_ps, [profil, ps]]
# NOTE(review): id 15 is not used (14 is followed by 16) — confirm whether intentional.
dict_profil_ps = {
                    1: ['RES1', 3], 2: ['RES1', 6], 3: ['RES11', 9],
                    4: ['RES11', 12],  5: ['RES11', 15], 6: ['RES11', 18],
                    7: ['RES11', 24], 8: ['RES11', 30], 9: ['RES11', 36],
                    10: ['RES2', 3], 11: ['RES2', 6], 12: ['RES2', 9],
                    13: ['RES2', 12], 14: ['RES2', 15], 16: ['RES2', 18],
                    17: ['RES2', 24], 18: ['RES2', 30], 19: ['RES2', 36]
                 }

In [6]:
def check_inputs(func):
    """Decorator that validates the arguments of a consumption generator.

    The wrapped function must accept
    ``(consommation, ecolo_score, workday_occupation, paysage, nb_habitant)``.
    Before delegating to ``func`` the wrapper checks that:

    * ``ecolo_score`` is within [0, 10];
    * ``workday_occupation`` is within [0, 7];
    * ``paysage`` has exactly ``N_paysage`` elements (module-level constant);
    * ``nb_habitant`` is not negative.

    Args:
        func: The function to protect with input validation.

    Returns:
        A wrapper with the same five-parameter signature that returns
        whatever ``func`` returns.

    Raises:
        ValueError: From the wrapper, if any of the checks above fails.
    """
    # Preserve func's __name__/__doc__ so the decorated function stays
    # introspectable (help(), tracebacks, notebook tooltips).
    @wraps(func)
    def wrapper(consommation, ecolo_score, workday_occupation, paysage, nb_habitant):
        if ecolo_score < 0 or ecolo_score > 10:
            raise ValueError("Le score écologique doit être compris entre 0 et 10")
        if workday_occupation < 0 or workday_occupation > 7:
            raise ValueError("Le taux d'occupation doit être compris entre 0 et 7")
        if len(paysage) != N_paysage:
            raise ValueError("Le paysage doit être un vecteur de taille {}".format(N_paysage))
        if nb_habitant < 0:
            raise ValueError("Le nombre d'habitant doit être positif")
        return func(consommation, ecolo_score, workday_occupation, paysage, nb_habitant)
    return wrapper



@check_inputs
def generate_conso(consommation, ecolo_score, workday_occupation, paysage, nb_habitant) -> np.ndarray:
    """Scale a base consumption by a household-dependent coefficient and add noise.

    The coefficient is::

        1 + (ecolo_score_pond * ecolo_score
             + workday_occupation ** 2
             + (paysage @ paysage_average) ** 2
             + nb_habitant ** 1.5) / 100

    where ``ecolo_score_pond`` and ``paysage_average`` are module-level values.

    Args:
        consommation: Base consumption to scale (generate_one_sample passes
            the ``.values`` array of a DataFrame column).
        ecolo_score: Score multiplied by ``ecolo_score_pond``.
        workday_occupation: Value contributing through its square.
        paysage: Vector multiplied (dot product) with ``paysage_average``.
        nb_habitant: Value contributing through its 3/2 power.

    Returns:
        ``consommation * coeff`` plus Gaussian noise. The noise is a single
        draw of shape ``(1,)`` (mean 0, standard deviation 0.1) that is
        broadcast, so every element receives the same offset and a scalar
        ``consommation`` yields a one-element array.
    """
    paysage_tot = paysage @ paysage_average
    contribution_ecolo = ecolo_score_pond * ecolo_score
    contribution_workday = workday_occupation ** 2
    contribution_paysage = paysage_tot ** 2
    contribution_nb_habitant = nb_habitant ** (3/2)
    coeff = 1 + (contribution_ecolo + contribution_workday + contribution_paysage + contribution_nb_habitant) / 100
    return consommation * coeff + np.random.normal(0, 0.1, size=1)


def aggregate_Xy(X, y):
    """Assemble features and targets into one labelled DataFrame.

    ``y`` is reshaped into a single column and appended to the right of
    ``X``. The columns are named ``ecolo_score``, ``workday_occupation``,
    ``paysage_0`` ... ``paysage_{N_paysage - 1}``, ``nb_habitant`` and
    ``conso``.
    """
    stacked = np.concatenate([X, y.reshape(-1, 1)], axis=1)
    table = pd.DataFrame(stacked)

    labels = ["ecolo_score", "workday_occupation"]
    labels.extend(f"paysage_{idx}" for idx in range(N_paysage))
    labels.extend(["nb_habitant", "conso"])

    table.columns = labels
    return table

@lru_cache
def aggregats_consommation_cached(ps, profil):
    """Memoized wrapper around ``aggregats_consommation``.

    ``functools.lru_cache`` keys the cache on the ``(ps, profil)`` arguments,
    so both must be hashable; a repeated call with the same pair returns the
    cached result (while it stays in the cache) instead of calling
    ``aggregats_consommation`` again.
    """
    return aggregats_consommation(ps, profil)

def generate_one_sample(a:int):
    """Draw one synthetic household and its generated consumption.

    A (profil, ps) pair is picked at random from ``dict_profil_ps``, the
    ``CONSOMMATION_MOYENNE_JOURNALIERE`` values for that pair are fetched
    through the cached aggregate lookup, and random household features are
    drawn and fed to ``generate_conso``.

    Args:
        a: Unused; present so the function can be mapped over a range of
            sample indices.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the array
        ``[ecolo_score, workday_occupation, *paysage, nb_habitant]`` and
        ``y`` is the generated consumption: a 0-d array when the generator
        returned a single value, otherwise the full array of values.
    """
    id_profil_ps = np.random.choice([*dict_profil_ps.keys()])
    profil, ps = dict_profil_ps[id_profil_ps]
    conso_base = aggregats_consommation_cached(ps, profil)['CONSOMMATION_MOYENNE_JOURNALIERE'].values

    ecolo_score = np.random.randint(11)            # integer in 0..10
    workday_occupation = np.random.randint(8)      # integer in 0..7
    paysage = np.random.randint(2, size=N_paysage) # binary vector
    nb_habitant = np.random.randint(low=1, high=9) # integer in 1..8

    conso = generate_conso(conso_base, ecolo_score, workday_occupation, paysage, nb_habitant)
    # Collapse a one-element result to a scalar so that y stacks into a 1-D array.
    if len(conso) == 1:
        conso = conso[0]

    # Build X and y unconditionally: they used to be assigned only inside the
    # branch above, which made the return fail with UnboundLocalError whenever
    # the generator produced more than one value.
    X = np.array([ecolo_score, workday_occupation, *paysage, nb_habitant])
    y = np.array(conso)
    return X, y

# Generate X and y in parallel with a thread pool
from concurrent.futures import ThreadPoolExecutor




def generate_Xy(N_samples:int=500, n_workers:int=20):
    """Generate ``N_samples`` synthetic samples using a thread pool.

    Args:
        N_samples: Number of samples to generate.
        n_workers: Maximum number of worker threads (defaults to the
            previously hard-coded value of 20).

    Returns:
        A tuple ``(X, y)`` of NumPy arrays built from the per-sample features
        and targets returned by ``generate_one_sample``, in submission order.
    """
    X, y = [], []
    # The context manager shuts the pool down (and joins its threads) even if
    # a worker raises; the executor was previously never shut down.
    with ThreadPoolExecutor(n_workers) as executor:
        # NOTE(review): tqdm wraps the input range, and Executor.map collects
        # its iterables up front, so the bar tracks task submission rather
        # than task completion.
        for X_, y_ in executor.map(generate_one_sample, tqdm(range(N_samples))):
            X.append(X_)
            y.append(y_)
    return np.array(X), np.array(y)

# Draw 1000 synthetic samples: X holds the features, y the generated consumption.
X, y = generate_Xy(1000)

100%|██████████| 1000/1000 [00:00<00:00, 3304.68it/s]


In [7]:
# Append y as a last column to X so that each row is one complete sample.
Xy = np.concatenate([X,y.reshape((len(y), 1))], axis=1)
print(Xy.shape)
# Write the labelled dataset to data.csv in the current working directory.
# NOTE(review): "worday" looks like a typo for "workday", and these labels differ
# from the ones used by aggregate_Xy; renaming them would change the CSV header,
# so confirm with whatever reads this file before fixing.
pd.DataFrame(Xy, columns=["ecolo score", "worday", *[f"paysage_{i}" for i in range(N_paysage)], 'nb_habitant', 'target']).to_csv("data.csv")

(1000, 8)


In [10]:
# Inspect the generated targets.
# NOTE(review): the recorded output holds 10 values while the generation cell
# above requests 1000 samples, so it appears to come from a different run —
# re-run the notebook top to bottom before relying on it.
y

array([ 92.77041175, 427.52010941,  73.79189435, 303.65523161,
       169.73693907, 384.92857444,  22.55830036,  97.2216933 ,
         8.25133377, 214.19877443])

In [24]:
# List the ids defined in dict_profil_ps (the recorded output shows no id 15).
[*dict_profil_ps.keys()]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19]