In [0]:
pip install mlflow

In [0]:
%run ./Common/DB_Utils

In [0]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np

import os
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest

## Caricamento dei dati in PySpark e conversione in dataframe Pandas

In [0]:
%scala
// Copy three source database tables so the following Python cell can query
// them through spark.sql under the names "log", "group" and "building".
// NOTE(review): CopyTableFromDB is not defined in this notebook; presumably it
// is provided by ./Common/DB_Utils (loaded via %run above) - confirm.
// The first argument looks like the source table name and the second the
// name it is exposed under in Spark - verify against the helper.
CopyTableFromDB("dbo.Log", "log")
CopyTableFromDB("[dbo].[Group]", "group")
CopyTableFromDB("dbo.Building", "building")

In [0]:
# Pull each Spark table into a pandas DataFrame (toPandas collects the whole
# table onto the driver).
# `group` is back-quoted because GROUP is an SQL keyword: it is reserved when
# Spark runs in ANSI mode, where the unquoted name would fail to parse.
df_log = spark.sql('select * from log').toPandas()
df_group = spark.sql('select * from `group`').toPandas()
df_building = spark.sql('select * from building').toPandas()

In [0]:
# Preview the first rows of the raw log table
df_log.head()

Unnamed: 0,Data,IdBuilding,IndividualAddress,GroupAddress,TelegramType,Value,ValueType,Description
0,2019-02-22 12:27:53.323333,1,3.1.6,2/2/0,write,-33429.7,W,Produzione totale fotovoltaico
1,2019-02-22 12:28:56.400000,1,3.1.201,2/0/3,write,35.68625,%,C1/0/M102 Umidità
2,2019-02-22 12:29:37.176666,1,3.2.202,2/0/0,write,400.96,ppm,C1/0/M101 CO2
3,2019-02-22 12:30:21.933333,1,3.3.201,2/1/8,write,36.86272,%,C1/0/U105 Umidità
4,2019-02-22 12:31:23.480000,1,3.3.202,2/1/0,write,536.96,ppm,C1/0/U101 C02


In [0]:
# Preview the first rows of the group table (provides the Id matched to each log Description)
df_group.head()

Unnamed: 0,Id,IdBuilding,GroupAddress,DataPoint,ValueType,Description
0,1,1,2/0/0,9,ppm,C1/0/M101 CO2
1,2,1,2/0/1,9,ppm,C1/0/M102 CO2
2,3,1,2/1/0,9,ppm,C1/0/U101 C02
3,4,1,2/1/1,9,ppm,C1/0/U105 C02
4,5,1,2/0/4,9,C°,C1/0/M101 TEMP


In [0]:
# Preview the first rows of the building table
df_building.head()

Unnamed: 0,Id,Nome,Ip,TabStorico
0,1,Edificio1,10.2.1.142,LogEdificioUno
1,2,Edificio2,10.2.3.254,LogEdificioDue
2,5,Edificio3,10.2.3.22,LogEdificioTre
3,6,Villa,10.2.3.14,LogVilla


## Definizione delle funzioni principali per il preprocessing e il training del modello

In [0]:
def signals_selection(logs, groups):
    """Select the log records that will be preprocessed and used for training.

    Keeps only the readings whose ``ValueType`` is one of the supported units
    of measure (this discards boolean-type readings), attaches the signal
    ``Id`` from ``groups`` by matching on ``Description``, and discards every
    row that lacks a ``Data``, ``Id`` or ``Value``. ``Data`` is then parsed as
    datetime and ``Value`` as a number (a decimal comma is accepted).

    Returns a new DataFrame with a fresh 0..n-1 index.
    """
    # Units of measure to keep; rows with any other ValueType are filtered out.
    measures_to_keep = ['ppm', 'C°', '%', 'W', 'Wh']
    # NOTE: a further filter excluding the 'Daikin Active Power Total' and
    # 'Consumo enel di E1 + Villa' descriptions was disabled (commented out)
    # in the original code and is not applied here either.

    unit_mask = logs['ValueType'].isin(measures_to_keep)
    kept = logs[unit_mask]

    # Left join against the group table to obtain the Id of each reading;
    # readings with no match get a missing Id and are dropped just below.
    id_lookup = groups[['Description', 'Id']]
    joined = pd.merge(kept, id_lookup, on='Description', how='left')
    complete = joined.dropna(subset=['Data', 'Id', 'Value'])
    complete = complete.reset_index(drop=True)

    # Parse the timestamp column.
    complete['Data'] = pd.to_datetime(complete['Data'])

    # Normalise a decimal comma to a dot, then convert the values to numbers.
    value_text = complete['Value'].to_numpy().astype(str)
    complete['Value'] = pd.to_numeric(np.char.replace(value_text, ',', '.'))

    return complete

In [0]:
def preprocessing(sel_logs):
    """Build the model feature table from the selected log records.

    Derives the calendar month and the day of the week from the ``Data``
    column, then drops the columns that are not used to train the model.

    The input frame is left untouched. Assigning the new columns directly on
    ``sel_logs`` would modify the caller's DataFrame and emit
    ``SettingWithCopyWarning`` when a filtered slice is passed in, so the
    result is built on a new frame instead.

    Parameters
    ----------
    sel_logs : pandas.DataFrame
        Must contain ``Data`` and every column listed in ``features_to_drop``;
        a missing column raises ``KeyError`` from ``DataFrame.drop``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns plus ``Month`` (1-12) and
        ``Weekday`` (0 = Monday ... 6 = Sunday).
    """
    # Month and day of the week of each reading, taken positionally so the
    # caller's (possibly non-contiguous) index does not matter.
    timestamps = pd.DatetimeIndex(sel_logs['Data'])

    # Columns that are not needed to train the model.
    features_to_drop = ['Data', 'IdBuilding', 'IndividualAddress', 'GroupAddress', 'TelegramType', 'ValueType', 'Description']

    return (
        sel_logs
        .assign(Month=timestamps.month, Weekday=timestamps.weekday)
        .drop(columns=features_to_drop)
    )

In [0]:
def forest_train_save(train_logs, model_name, num_est=100, cnt_rate=0.01):
    """Train an IsolationForest inside an MLflow run and register the model.

    Parameters
    ----------
    train_logs : array-like or pandas.DataFrame
        Training data passed straight to ``IsolationForest.fit``.
    model_name : str
        Used both as the MLflow run name and as the registered model name.
    num_est : int, default 100
        Number of trees (``n_estimators``).
    cnt_rate : float, default 0.01
        Contamination rate (``contamination``).

    Returns
    -------
    IsolationForest
        The fitted estimator. Callers assign the result of this function, so
        returning the model (instead of an implicit ``None``) makes that
        assignment meaningful.
    """
    with mlflow.start_run(run_name=model_name):
        # The `behaviour` argument is intentionally not passed: it was
        # deprecated in scikit-learn 0.22 and removed in 0.24, where supplying
        # it raises TypeError. Omitting it is equivalent to the former
        # behaviour='deprecated' default.
        # random_state is fixed so repeated trainings are reproducible.
        iso_forest = IsolationForest(
            n_estimators=num_est,
            contamination=cnt_rate,
            random_state=19,
        ).fit(train_logs)

        # Record the hyper-parameters of this run.
        mlflow.log_params({"n_estimators": num_est, "contamination_rate": cnt_rate})

        # Log the fitted model as a run artifact and register it under model_name.
        mlflow.sklearn.log_model(
            sk_model=iso_forest,
            artifact_path="sklearn-model",
            registered_model_name=model_name,
        )

    return iso_forest

## Run del training dei modelli

In [0]:
# Filter and type-convert the raw log with the help of the group table,
# then preview the first rows of the result
sel_logs = signals_selection(df_log, df_group)
sel_logs.head()

Unnamed: 0,Data,IdBuilding,IndividualAddress,GroupAddress,TelegramType,Value,ValueType,Description,Id
0,2019-02-22 12:27:53.323333,1,3.1.6,2/2/0,write,-33429.7,W,Produzione totale fotovoltaico,13.0
1,2019-02-22 12:28:56.400000,1,3.1.201,2/0/3,write,35.68625,%,C1/0/M102 Umidità,10.0
2,2019-02-22 12:29:37.176666,1,3.2.202,2/0/0,write,400.96,ppm,C1/0/M101 CO2,1.0
3,2019-02-22 12:30:21.933333,1,3.3.201,2/1/8,write,36.86272,%,C1/0/U105 Umidità,12.0
4,2019-02-22 12:31:23.480000,1,3.3.202,2/1/0,write,536.96,ppm,C1/0/U101 C02,3.0


In [0]:
# Train the CO2 model (readings measured in ppm).
# .copy() hands preprocessing an independent frame instead of a filtered slice
# of sel_logs, so any column assignment done there works on its own data.
train_logs = preprocessing(sel_logs[sel_logs['ValueType'] == 'ppm'].copy())
forest_model = forest_train_save(train_logs, model_name="team1_CO2")

In [0]:
# Train the humidity model (readings measured in %).
# .copy() hands preprocessing an independent frame instead of a filtered slice
# of sel_logs, so any column assignment done there works on its own data.
train_logs = preprocessing(sel_logs[sel_logs['ValueType'] == '%'].copy())
forest_model = forest_train_save(train_logs, model_name="team1_umidita")

In [0]:
# Train the temperature model (readings measured in C°).
# .copy() hands preprocessing an independent frame instead of a filtered slice
# of sel_logs, so any column assignment done there works on its own data.
train_logs = preprocessing(sel_logs[sel_logs['ValueType'] == 'C°'].copy())
forest_model = forest_train_save(train_logs, model_name="team1_temperatura")

In [0]:
# Train the power model (readings measured in W).
# .copy() hands preprocessing an independent frame instead of a filtered slice
# of sel_logs, so any column assignment done there works on its own data.
train_logs = preprocessing(sel_logs[sel_logs['ValueType'] == 'W'].copy())
forest_model = forest_train_save(train_logs, model_name="team1_W")

In [0]:
# Train the energy model (readings measured in Wh).
# .copy() hands preprocessing an independent frame instead of a filtered slice
# of sel_logs, so any column assignment done there works on its own data.
train_logs = preprocessing(sel_logs[sel_logs['ValueType'] == 'Wh'].copy())
forest_model = forest_train_save(train_logs, model_name="team1_Wh")