In [1]:
import pandas as pd
import numpy as np
import warnings
from math import sqrt
warnings.filterwarnings('ignore')
from azureml.core.run import Run
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.model import Model
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.train.automl import AutoMLConfig
import pickle
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

In [2]:
from azureml.core import Workspace, Dataset

subscription_id = 'xxxxxxxxx'
resource_group = 'xxxxxx'
workspace_name = 'xxxxxx'

workspace = Workspace(subscription_id, resource_group, workspace_name)

In [3]:
# Importing pre-processed dataset
dataset = Dataset.get_by_name(workspace, name='processed_weather_data_portofTurku')
print(dataset.name, dataset.version)

processed_weather_data_portofTurku 1


In [4]:
df = dataset.to_pandas_dataframe()

In [5]:
df.head()

Unnamed: 0,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Current_weather_condition,Future_weather_condition
0,2006-04-01 02:00:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,1,1
1,2006-04-01 03:00:00,"Port of Turku, Finland",9.222222,0.85,13.9587,258,14.9569,1016.66,1,1
2,2006-04-01 04:00:00,"Port of Turku, Finland",7.733333,0.95,12.3648,259,9.982,1016.72,1,1
3,2006-04-01 05:00:00,"Port of Turku, Finland",8.772222,0.89,14.1519,260,9.982,1016.84,1,1
4,2006-04-01 06:00:00,"Port of Turku, Finland",10.822222,0.82,11.3183,259,9.982,1017.37,1,1


# Spliting Pre-Processed data into Training and Validation datasets

In [6]:
# Validation set is used later to evaluate model performance post training. 

In [7]:
df_training = df.iloc[:77160]

In [8]:
df_training.shape

(77160, 10)

In [9]:
df_validation = df.drop(df_training.index)

In [10]:
df_validation.shape

(19289, 10)

# Registering Training and Validation data to the datastore on the workspace. 

In [11]:
!mkdir Data

mkdir: cannot create directory ‘Data’: File exists


In [12]:
df_training.to_csv('Data/training_data.csv',index=False)

In [13]:
df_validation.to_csv('Data/validation_data.csv',index=False)

In [14]:
datastore = workspace.get_default_datastore()

In [15]:
datastore.upload(src_dir='Data', target_path='data')

"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 2 files
Target already exists. Skipping upload for data/training_data.csv
Target already exists. Skipping upload for data/validation_data.csv
Uploaded 0 files


$AZUREML_DATAREFERENCE_25868046c0d743d7a43dee39f1ee4b9c

In [16]:
training_dataset = Dataset.Tabular.from_delimited_files(datastore.path('data/training_data.csv'))

In [17]:
validation_dataset = Dataset.Tabular.from_delimited_files(datastore.path('data/validation_data.csv'))

In [18]:
training_ds = training_dataset.register(workspace=workspace,
                                 name='training_dataset',
                                 description='Dataset to use for ML training')

In [19]:
validation_ds = validation_dataset.register(workspace=workspace,
                                 name='validation_dataset',
                                 description='Dataset for validation ML models')

# Data ingestion step - Training dataset

In [20]:
dataset = Dataset.get_by_name(workspace, name='training_dataset')
print(dataset.name, dataset.version)

training_dataset 1


In [21]:
df = dataset.to_pandas_dataframe()

In [22]:
df.head()

Unnamed: 0,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Current_weather_condition,Future_weather_condition
0,2006-04-01 02:00:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,1,1
1,2006-04-01 03:00:00,"Port of Turku, Finland",9.222222,0.85,13.9587,258,14.9569,1016.66,1,1
2,2006-04-01 04:00:00,"Port of Turku, Finland",7.733333,0.95,12.3648,259,9.982,1016.72,1,1
3,2006-04-01 05:00:00,"Port of Turku, Finland",8.772222,0.89,14.1519,260,9.982,1016.84,1,1
4,2006-04-01 06:00:00,"Port of Turku, Finland",10.822222,0.82,11.3183,259,9.982,1017.37,1,1


In [23]:
df.shape

(77160, 10)

#### Feature Selection and scaling

In [24]:
X = df[['Temperature_C', 'Humidity', 'Wind_speed_kmph', 'Wind_bearing_degrees', 'Visibility_km', 'Pressure_millibars', 'Current_weather_condition']].values
y = df['Future_weather_condition'].values
y

array([1, 1, 1, ..., 1, 1, 1])

In [25]:
# Splitting the Training dataset into Train and Test set for ML training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [26]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [27]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Model training and Testing Step

## 1. Support Vector Machine

In [28]:
myexperiment = Experiment(workspace, "support-vector-machine")

In [29]:
#from sklearn.svm import SVC
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [30]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

In [31]:
svc = svm.SVC()

In [32]:
# initialize a run in Azureml
run = myexperiment.start_logging()


run.log("dataset name", dataset.name)
run.log("dataset Version", dataset.version)

In [33]:
svc_grid = GridSearchCV(svc, parameters)

In [34]:
%%time
svc_grid.fit(X_train, y_train)

CPU times: user 13min 18s, sys: 1.1 s, total: 13min 19s
Wall time: 13min 19s


GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})

In [35]:
svc_grid.get_params(deep=True)

{'cv': None,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__break_ties': False,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(),
 'n_jobs': None,
 'param_grid': {'kernel': ('linear', 'rbf'), 'C': [1, 10]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

In [36]:
from sklearn.svm import SVC

In [37]:
svc = SVC(C=svc_grid.get_params(deep=True)['estimator__C'], kernel=svc_grid.get_params(deep=True)['estimator__kernel'])

In [38]:
svc.fit(X_train, y_train)

SVC()

In [39]:
# Logging training parameters to AzureML and MLFlow experiments
run.log("C", svc_grid.get_params(deep=True)['estimator__C'])
run.log("Kernel", svc_grid.get_params(deep=True)['estimator__kernel'])

In [40]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [41]:
predicted_svc = svc.predict(X_test)

In [42]:
acc = accuracy_score(y_test, predicted_svc)

In [43]:
fscore = f1_score(y_test, predicted_svc, average="macro")
precision = precision_score(y_test, predicted_svc, average="macro")
recall = recall_score(y_test, predicted_svc, average="macro")

In [44]:
# Log to AzureML and MLflow
run.log("Test_accuracy", acc)
run.log("Precision", precision)
run.log("Recall", recall)
run.log("F-Score", fscore)

In [45]:
run.complete()
print ("run id:", run.id)

run id: d5a1cf60-edc8-4758-90f2-df68aadaf85c


In [46]:
run.get_metrics()

{'dataset name': 'training_dataset',
 'dataset Version': 1,
 'C': 1.0,
 'Kernel': 'rbf',
 'Test_accuracy': 0.9519180922757906,
 'Precision': 0.8869828453699851,
 'Recall': 0.8859050416892464,
 'F-Score': 0.8864428755463128}

# Model Packaging Step

pickle file or onnx

In [47]:
# Convert into SVC model into ONNX format file
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
initial_type = [('float_input', FloatTensorType([None, 6]))]
onx = convert_sklearn(svc, initial_types=initial_type)
with open("outputs/svc.onnx", "wb") as f:
    f.write(onx.SerializeToString())

The maximum opset needed by this model is only 1.


# Model Registering Step

In [48]:
# Register Model on AzureML WS
model = Model.register(model_path = './outputs/svc.onnx', # this points to a local file 
                       model_name = "support-vector-classifier", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version, 'hyparameter-C': '1', 'testdata-accuracy': '0.9519'}, 
                       model_framework='pandas==0.23.4',
                       description = "Support vector classifier to predict weather at port of Turku",
                       workspace = workspace)

print('Name:', model.name)
print('Version:', model.version)

Registering model support-vector-classifier
Name: support-vector-classifier
Version: 1


# Save model artefacts

In [49]:
import pickle

with open('./outputs/scaler.pkl', 'wb') as scaler_pkl:
    pickle.dump(sc, scaler_pkl)

In [50]:
# Register Model on AzureML WS
scaler = Model.register(model_path = './outputs/scaler.pkl', # this points to a local file 
                       model_name = "scaler", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version}, 
                       model_framework='pandas==0.23.4',
                       description = "Scaler used for scaling incoming inference data",
                       workspace = workspace)

print('Name:', scaler.name)
print('Version:', scaler.version)

Registering model scaler
Name: scaler
Version: 1
