In [85]:
import azureml.core
from azureml.core import Workspace

#Chargement de l'espace de travail
ws = Workspace.from_config()

In [86]:
#On récupère un dossier crée préalablement où l'on inscrira le script d'entrainement
dossier_entrainement = 'train_folder'

In [87]:
%%writefile $dossier_entrainement/training.py
#Cette cellule sera enregistré comme un fichier python dans le dossier d'entrainement

#Script d'entrainement

#Importation des librairies ou methodes nécessaires

#Pour Azure ML
from azureml.core import Run

#Librairies classiques python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Librairies / méthodes ML/DL
from keras.models import Sequential
from keras import optimizers
from keras.layers import Dense,LSTM,Dropout,RepeatVector,TimeDistributed

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Librairies / méthodes pour sauvegarde du modèle en ONNX
import onnx
import onnxruntime as rt
import keras2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import DoubleTensorType

#On commence l'experiment
run = Run.get_context()

#Creation du dossier outputs qui upload automatiquement les fichiers créés dans l'expérience
os.makedirs('outputs', exist_ok=True)

#Chargement des données
X_train = run.input_datasets['dataset1'].to_pandas_dataframe()
X_train = X_train.iloc[:57600,:]
X_train = X_train.values 

#Standardisation (seule étape de preprocessing ici)
#Mise en pipeline pour la convertion en ONNX
pipeline = Pipeline([('scaler', StandardScaler())])
pipeline.fit(X_train)
X_train_std = pipeline.transform(X_train)

#Enregistrement de la pipeline en ONNX
model_onnx = convert_sklearn(pipeline,"pipe",initial_types=[("input", DoubleTensorType([60, 4]))])
onnx.save_model(model_onnx, "outputs/pipeline_std.onnx")

#Creation d'un jeu de validation
X_train_std,X_valid_std = train_test_split(X_train_std, test_size = 0.2, random_state = 42)

#Transformation en sérire temporelle de 60 périodes
def mise_en_serie_temporelle(X):
    periode = 60
    X_final = []
    for i in range(len(X)-periode-1):
        t = []
        for j in range(1,periode+1):
            t.append(X[[(i+j+1)], :])
        X_final.append(t)
    X_final = np.array(X_final)
    X_final = X_final.reshape(X_final.shape[0],periode,4)
    return X_final

X_train_std = mise_en_serie_temporelle(X_train_std)
X_valid_std = mise_en_serie_temporelle(X_valid_std)

#Modèle lstm autoencodeur
timesteps = 60
n_features = 4

epochs = 50
batch = 512
d = 0.3

#Definition du modele
model = Sequential()
model.add(LSTM(180,  activation = 'relu', input_shape=(timesteps,n_features), return_sequences = True))
model.add(Dropout(d))
model.add(LSTM(120,   activation = 'relu', return_sequences = False))
model.add(Dropout(d))
model.add(RepeatVector(n = timesteps))
model.add(LSTM(120,   activation = 'relu', return_sequences=True))
model.add(Dropout(d))
model.add(LSTM(180,  activation = 'relu', return_sequences=True))
model.add(TimeDistributed(Dense(n_features)))

#Compilation du modele
adam = optimizers.Adam()
model.compile(loss='mae', optimizer=adam)

#Entrainement du modele
history = model.fit(X_train_std, X_train_std, validation_data=(X_valid_std,X_valid_std),
                     epochs=epochs, 
                     batch_size=batch)

#Enregistrement du modele
model.save("outputs/model_final.hdf5")
#onnx_model = keras2onnx.convert_keras(model, model.name)
#onnx.save_model(onnx_model, "outputs/lstmautoencodeur.onnx")

#Plot de l'erreur dans l'experiment
history = history.history
figure,axe = plt.subplots(figsize = (15,8), dpi = 100)
axe.plot(history['loss'],'b', label = 'Train_data', linewidth = 5)
axe.plot(history['val_loss'],'r', label = 'Validation_data', linewidth = 5)
axe.set_title('Model_loss', fontsize = 16)
axe.set_ylabel('Erreur_MAE')
axe.set_xlabel('Epoch')
axe.legend()
run.log_image("Loss_MAE", plot=plt)

#Prediction
X_pred = model.predict(X_train_std)

#Cacul de l'erreur pour chaque série temporelle de la base d'entrainement
def erreur(a,b):
    err = 0
    for i in range(a.shape[0]):
        for j in range(a.shape[1]):
            err = err + abs(a[i][j] - b[i][j])
    err = err / (a.shape[0] * a.shape[1])        
    return err

L_train = []
for i in range (len(X_train_std)):
    L_train.append(erreur(X_train_std[i],X_pred[i]))


#Plot des erreurs des series temporelles de la base d'entrainement
#Pour déterminer le seuil d'anomalie
plt.figure(figsize=(15,8), dpi = 100)
plt.title('Distribution_de_l_erreur_training_data', fontsize = 16)
sns.distplot(L_train, bins = 20, kde = True, color = 'red');
plt.xlim([0.5,1.0])
run.log_image("Plot_erreur_train_data", plot=plt)

#Sur les nouvelles données

#Chargement des nouvelles donnees
X_test = run.input_datasets['dataset2'].to_pandas_dataframe()
X_test = X_test.iloc[:28000,:]
X_test = X_test.values 

#Transformation des donnees
X_test_std = pipeline.transform(X_test)
X_test_std = mise_en_serie_temporelle(X_test_std)

#Prediction
X_pred2 = model.predict(X_test_std)

#Calcul de l'erreur
L_test = []
for i in range (len(X_pred2)):
    L_test.append(erreur(X_test_std[i],X_pred2[i]))

seuil = []
for i in range (len(L_test)):
    seuil.append(0.9)

#Plot anomalie
figure,axe = plt.subplots(figsize = (15,8), dpi = 100)
axe.plot(L_test,'b', label = 'Erreur_test_data', linewidth = 2)
axe.plot(seuil,'r', label = 'Seuil', linewidth = 2)
axe.set_title('Prediction_test_data', fontsize = 16)
axe.set_ylabel('Erreur')
axe.legend()
plt.ylim([0,10])
run.log_image("Prediction_test_data", plot=plt)

run.complete()

Overwriting train_folder/training.py


In [88]:
#On recupere la cible de calcul (crée préalablement)
from azureml.core.compute import ComputeInstance

compute_name = "Big-Compute"
training_compute = ComputeInstance(workspace=ws, name=compute_name)

In [89]:
#On recupere l'environnement / (code de creation de l'environnement en commentaire)
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

training_env = Environment.get(name = "training-environnement",workspace=ws)


'''
#Code de creation de l'environnement avec ajout des packages utilisés
train_packages = CondaDependencies.create(conda_packages=['scikit-learn','matplotlib','seaborn'],
                                          pip_packages=['azureml-defaults', 'azureml-dataprep[pandas]',
                                                        'tensorflow==2.1.0','keras==2.3.1',
                                                        'keras2onnx','skl2onnx','onnx','onnxruntime'])

training_env.python.conda_dependencies = train_packages
'''

"\n#Code de creation de l'environnement avec ajout des packages utilisés\ntrain_packages = CondaDependencies.create(conda_packages=['scikit-learn','matplotlib','seaborn'],\n                                          pip_packages=['azureml-defaults', 'azureml-dataprep[pandas]',\n                                                        'tensorflow==2.1.0','keras==2.3.1',\n                                                        'keras2onnx','skl2onnx','onnx','onnxruntime'])\n\ntraining_env.python.conda_dependencies = train_packages\n"

In [90]:
from azureml.core import ScriptRunConfig
from azureml.core import Experiment
from azureml.widgets import RunDetails

iRobot_ds1 = ws.datasets.get("Dataset_iRobot_without_anomaly")
iRobot_ds2 = ws.datasets.get("Dataset_iRobot_with_anomaly")

#Creation du ScriptRunConfig
config = ScriptRunConfig(source_directory = dossier_entrainement,
                        script='training.py',
                        arguments=[iRobot_ds1.as_named_input('dataset1'),
                                   iRobot_ds2.as_named_input('dataset2')],
                        compute_target = training_compute,
                        environment = training_env)

#Creation de l'experience
experiment = Experiment(workspace = ws, name = 'robot_training')

#Execution de l'experience
run = experiment.submit(config = config)
RunDetails(run).show()
run.wait_for_completion()

Submitting /mnt/batch/tasks/shared/LS_root/mounts/clusters/big-compute/code/Users/teywaoziol/train_folder directory for run. The size of the directory >= 25 MB, so it can take a few minutes.


_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…



{'runId': 'robot_training_1609251514_200785f2',
 'target': 'Big-Compute',
 'status': 'Completed',
 'startTimeUtc': '2020-12-29T14:18:47.601153Z',
 'endTimeUtc': '2020-12-29T16:16:09.483259Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'a6fc0f33-3975-436f-93b1-b5b8fb6c5c0b',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '9c138d19-f07b-4856-9d30-3a3d38e85c89'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'dataset1', 'mechanism': 'Direct'}}, {'dataset': {'id': '88339177-4e65-42ea-8e14-f4f0bc9063d3'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'dataset2', 'mechanism': 'Direct'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'training.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['DatasetConsumptionConfig:dataset1',
   'DatasetConsumptionConfig:dataset2'],
  'sourceDirectoryDataStore': None,
  

In [91]:
#Enregistrement de l'environnement la première fois où il est executé
#training_env.register(workspace=ws)