In [5]:
#Set up Azure!
#And Imports
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core import Workspace, Environment, Experiment, Dataset, ScriptRunConfig

#Set up ze workspace
config_path = '../utils/config.json'
tenant_id = '72f988bf-86f1-41af-91ab-2d7cd011db47' #outputted post 'az login'
interactive_auth = InteractiveLoginAuthentication(tenant_id=tenant_id) #Create login object
ws = Workspace.from_config(path=config_path, auth=interactive_auth) #link ze workspace

#Set up ze environment
# - obtain environment.yml from 'conda env export > environment.yml
env_name = 'SampleEnv'
env_path = '../utils/environment.yml'
env = Environment.from_conda_specification(name=env_name, file_path=env_path)

#Set up ze experiment
dataset_path = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'
ds = Dataset.File.from_files(dataset_path)

#Set up ze run
src_dir = '../src'
src_name = 'azure_isolation_forest.py' #This is where the model src will go
compute_name = 'WorkspaceCompute'
arguments = ['--data-path', ds.as_mount()]
src = ScriptRunConfig(source_directory=src_dir, script=src_name, compute_target=compute_name, environment=env, arguments=arguments)

Note, we have launched a browser for you to login. For old experience with device code, use "az login --use-device-code"
Performing interactive authentication. Please follow the instructions on the terminal.
You have logged in. Now let us find all the subscriptions to which you have access...
Note, we have launched a browser for you to login. For old experience with device code, use "az login --use-device-code"
Interactive authentication successfully completed.
Performing interactive authentication. Please follow the instructions on the terminal.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.


AuthenticationException: AuthenticationException:
	Message: Could not retrieve user token. Please run 'az login'
	InnerException It is required that you pass in a value for the "algorithms" argument when calling decode().
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "inner_error": {
            "code": "Authentication"
        },
        "message": "Could not retrieve user token. Please run 'az login'"
    }
}

In [4]:
import mlflow
import argparse
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# set constants
DEBUG = True
RAND_STATE = 0
TEST_SIZE = 1/4
np.random.seed(RAND_STATE)  # for consistency

# helper functions
def process_data(df):
    '''
    takes raw dataframe and 1) subsets 1% and 2) filters it for top3 labels
    '''
    # subsetting
    if(DEBUG): print(f'Processing {df.head()}')
    if(DEBUG): print(f'Variables {df.columns}')
    if(DEBUG): print(f'Subsetting data of shape {df.shape}')
    subset_size = round(df.shape[0] * 0.01)  # 1% of size
    if(DEBUG): print(f'Subsetting with size {subset_size}')
    valid_idxs = np.random.choice(df.index, size=subset_size, replace=False)
    if(DEBUG): print(f'Subsetting with {subset_size} barcodes')
    mask = df.index.isin(valid_idxs)
    if(DEBUG): print(f'Subsetting with mask with {sum(mask)} positives')
    df = df[mask]
    if(DEBUG): print(f'Subsetted data to shape {df.shape}')
    # filter labels
    labels = df[41]  # get labels, column name is given
    label_counts = labels.value_counts()  # n-obs / label
    mask = label_counts > df.shape[0] * 0.05  # label must match >5% of the data
    valid_labels = label_counts.index[mask]  # get passing labels
    if(DEBUG): print(f'Passing labels = {valid_labels.tolist()}')
    df = df[df[41].isin(valid_labels)]  # subset
    
    return(df)

def split_data(df):
    '''
    splits the data based on normal or anomaly
    '''
    # prepare constants
    labels = df[41]  # for rapid_reusing
    # split by normal vs. anomaly observation
    mask_normal, mask_anomaly = labels=='normal.', labels!='normal.'
    if(DEBUG): print(f'Splitting data with {sum(mask_normal)} NORMAL and {sum(mask_anomaly)} ANOMALY')
    # only select for numerical columns
    df = df.select_dtypes(['number'])
    if(DEBUG): print(f'Data post numerical variable filtering of shape {df.shape}')
    # split data for training (normal)
    X_train, X_test_norm, y_train, y_test_norm = train_test_split(df[mask_normal], labels[mask_normal],
                                                                  shuffle=True, test_size=TEST_SIZE,
                                                                  random_state=RAND_STATE, stratify=labels[mask_normal])
    # split data for testing (anomaly)
    X_test_anom, y_test_anom = df[mask_anomaly], labels[mask_anomaly]
    
    return (X_train, y_train), (X_test_norm, y_test_norm), (X_test_anom, y_test_anom)

def compute_f1(model, data, pos_label):
    '''
    computes an f1 score, pos_label 1=normal, -1=anomaly
    '''
    X, y = data  # unpack data
    true_labels = [pos_label] * X.shape[0]
    predicted_labels = model.predict(X)
    score = f1_score(true_labels, predicted_labels, pos_label=pos_label)
    
    return score

# main method
if __name__ == "__main__":
    # read arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-path', type=str, help='path to the dataset')
    args = parser.parse_args()
    
    # process data
    df = pd.read_csv(args.data_path, index_col=None, header=None)  # read it
    df = process_data(df)

    # split data
    train, test_norm, test_anom = split_data(df)
    X_train, y_train = train  # unpack training data
    
    # train model
    model = IsolationForest(random_state=RAND_STATE)
    model.fit(X_train)
    
    # score model
    mlflow.log_metric('F1-Score Training Normal', compute_f1(model, train, 1))
    mlflow.log_metric('F1-Score Testing Normal', compute_f1(model, test_norm, 1))
    mlflow.log_metric('F1-Score Testing Anomaly', compute_f1(model, test_anom, -1))

usage: ipykernel_launcher [-h] [--data-path DATA_PATH]
ipykernel_launcher: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme="hmac-sha256" --Session.key=b"314df599-7a2b-4036-85d2-d7fb6dea557d" --shell=9002 --transport="tcp" --iopub=9004 --f=C:\Users\Pip51\AppData\Local\Temp\tmp-1512869nCnmu3B2s1.json


SystemExit: 2