In [None]:
# Verify latest version of azure-ai-ml package is installed. If not - pip install azure-ai-ml
pip show azure-ai-ml

In [None]:
# Connect to workspace
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient

try:
    credential = DefaultAzureCredential()
    # Check if given credential can get token successfully
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential does not work.
    # Report the reason so the interactive login does not come as a surprise
    # and the underlying failure is not silently lost.
    print("DefaultAzureCredential failed ({}); falling back to InteractiveBrowserCredential".format(ex))
    credential = InteractiveBrowserCredential()

In [None]:
# Get a handle to workspace
# The client is built with MLClient.from_config, using the credential object
# created in the previous cell.
# NOTE(review): presumably this reads the workspace details from a config.json
# file reachable from the working directory - confirm.
ml_client = MLClient.from_config(credential=credential)

In [None]:
# Create src folder for the script files
import os

# Folder (relative to the working directory) that holds the pipeline scripts
script_folder = 'src'
# exist_ok=True: no error is raised when the folder is already there
os.makedirs(name=script_folder, exist_ok=True)
print('{} {}'.format(script_folder, 'folder created'))

In [None]:
# Create script to prepare the data

%%writefile $script_folder/prep-data.py
# Import libraries
import argparse
import pandas as pd
import numpy as np
from langdetect import detect_langs, DetectorFactory
from nltk import word_tokenize
from pathlib import Path

def main(args):
    """Run the data preparation steps and write the result as a CSV file.

    Args:
        args: Parsed command-line arguments. ``args.input_data`` is the path
            of the CSV file to read and ``args.output_data`` is the folder
            that receives ``customer-support-tickets.csv``.
    """
    # Read data
    df = get_data(args.input_data)

    cleaned_data = clean_data(df)

    feature_engineer_data = feature_engineer(cleaned_data)

    # Make sure the output folder exists so the write below cannot fail on a
    # missing directory (e.g. when the script is run outside a pipeline job)
    output_dir = Path(args.output_data)
    output_dir.mkdir(parents=True, exist_ok=True)

    # to_csv returns None when a path is given, so its result is not kept
    feature_engineer_data.to_csv(output_dir / 'customer-support-tickets.csv', index=False)

# Function that reads the data
def get_data(path):
    """Load the CSV file at ``path`` into a DataFrame and print its row count."""
    tickets = pd.read_csv(path)

    # Report how many rows are about to be prepared
    print('Preparing {} rows of data'.format(len(tickets)))

    return tickets

# Function that removes missing values
def clean_data(df):
    """Return a copy of the 'body', 'type' and 'language' columns without missing values.

    Rows with a missing value in any of the three columns are dropped and the
    index is renumbered from zero. The input frame is left untouched.
    """
    keep_columns = ['body', 'type', 'language']

    # Subset first, then drop incomplete rows and renumber the index
    return df[keep_columns].copy().dropna().reset_index(drop=True)

# Function to feature engineer data
def feature_engineer(df):
    """Keep the English tickets and add a word-count feature.

    The language of every ticket is re-detected from its 'body' text, only
    rows detected as 'en' are kept, 'body'/'type' are renamed to
    'text'/'label', and a 'len_words' column holding the number of word
    tokens per text is added.

    Args:
        df: DataFrame with at least the columns 'body' and 'type'. It is not
            modified; a new frame is returned.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the English
        tickets and the additional 'len_words' column.
    """
    # Fixed seed so that language detection gives the same result on every run
    DetectorFactory.seed = 9

    # detect_langs returns candidates ordered from most to least probable;
    # take the language code of the first one instead of parsing its repr
    languages = []
    for text in df['body']:
        candidates = detect_langs(text)
        languages.append(candidates[0].lang if candidates else 'unknown')

    # assign() builds a new frame, so the caller's DataFrame is left untouched
    it_ticks = (
        df.assign(language=languages)
        .loc[lambda frame: frame['language'] == 'en']
        .reset_index(drop=True)
        .rename(columns={'body': 'text', 'type': 'label'})
    )

    # Create new len_words column: number of word tokens per ticket text
    it_ticks['len_words'] = [len(word_tokenize(text)) for text in it_ticks['text']]

    return it_ticks

def parse_args():
    """Parse the command line and return the resulting namespace.

    Recognised options are ``--input_data`` and ``--output_data``, both
    strings; an option that is not supplied is left as ``None``.
    """
    cli = argparse.ArgumentParser()

    # Both options are plain strings; argparse derives the attribute name
    # (input_data / output_data) from the option string
    for option in ('--input_data', '--output_data'):
        cli.add_argument(option, type=str)

    return cli.parse_args()

# Run script
if __name__ == '__main__':
    # Visual separator that frames this script's output in the job logs
    separator = '*' * 60

    print('\n\n')
    print(separator)

    # Parse the command line and run the preparation
    main(parse_args())

    print(separator)
    print('\n\n')

In [None]:
# Create script to train the model

%%writefile $script_folder/train-model.py
# Import libraries
import mlflow
import glob
import argparse
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def main(args):
    """Train and evaluate the ticket classifier.

    Args:
        args: Parsed command-line arguments. ``args.training_data`` is the
            folder with the prepared CSV files and ``args.reg_rate`` is the
            regularization rate passed to the pipeline.
    """
    # Enable autologging
    mlflow.autolog()

    # Read data
    df = get_data(args.training_data)

    # Split data
    X_train, X_test, y_train, y_test = split_data(df)

    # Create pipeline to train model
    # (the function is named create_pipeline; the misspelled name raised NameError)
    pipeline = create_pipeline(args.reg_rate)

    # Train and evaluate model
    model = train_model(pipeline, X_train, X_test, y_train, y_test)

    eval_model(model, X_test, y_test)

# Function that reads the data
def get_data(data_path):
    """Read every CSV file in ``data_path`` into a single DataFrame.

    Args:
        data_path: Folder that contains one or more ``*.csv`` files.

    Returns:
        The concatenation of all CSV files, read in sorted file-name order.

    Raises:
        ValueError: If the folder contains no CSV files.
    """
    # Local import: the script has no module-level import of os
    import os

    # os.path.join uses the separator of the current platform; a hard-coded
    # backslash matches nothing on Linux. Sorting makes the row order (and
    # therefore the seeded train/test split) reproducible.
    all_files = sorted(glob.glob(os.path.join(data_path, '*.csv')))
    if not all_files:
        raise ValueError('No CSV files found in {}'.format(data_path))

    df = pd.concat((pd.read_csv(f) for f in all_files), sort=False)

    return df

# Function that splits the data
def split_data(df):
    """Split the tickets into train and test parts.

    The features are the 'text' and 'len_words' columns and the target is the
    'label' column; 20% of the rows go to the test set, with a fixed random
    state.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    print('Splitting data...')
    features = df[['text', 'len_words']]
    target = df['label']

    parts = train_test_split(features, target, test_size=0.2, random_state=9)

    return tuple(parts)

# Function to create pipeline
def create_pipeline(reg_rate):
    """Build the (unfitted) text classification pipeline.

    The pipeline vectorizes the 'text' column with TF-IDF (unigrams and
    bigrams, case preserved), passes 'len_words' through unchanged, scales the
    combined features with MaxAbsScaler and classifies them with logistic
    regression.

    Args:
        reg_rate: Regularization rate; the classifier uses ``C = 1 / reg_rate``.
            Must be greater than zero.

    Returns:
        The assembled pipeline object.

    Raises:
        ValueError: If ``reg_rate`` is not greater than zero.
    """
    # Fail early with a clear message instead of a ZeroDivisionError below
    if reg_rate <= 0:
        raise ValueError('reg_rate must be greater than 0, got {}'.format(reg_rate))

    preprocessor = ColumnTransformer(
        transformers=[
            ('vect', TfidfVectorizer(lowercase=False, ngram_range=(1, 2)), 'text'),
            ('len', 'passthrough', ['len_words'])
        ]
    )

    mlflow.log_param('Regularization rate', reg_rate)
    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('scaler', MaxAbsScaler()),
        # The class is LogisticRegression; the misspelled name raised NameError
        ('logreg', LogisticRegression(C=1/reg_rate, max_iter=5000, random_state=9))
    ])

    return pipeline

# Function that trains the model
def train_model(model, X_train, X_test, y_train, y_test, model_output=None):
    """Fit the model on the training data and save it in MLflow format.

    Args:
        model: Estimator or pipeline exposing ``fit``.
        X_train: Training features.
        X_test: Test features (not used here; kept for the existing call signature).
        y_train: Training labels.
        y_test: Test labels (not used here; kept for the existing call signature).
        model_output: Path the fitted model is saved to. When omitted, the
            path is taken from the module-level ``args.model_output`` that is
            set when the file runs as a script.

    Returns:
        The fitted model.
    """
    print('Training model...')
    model = model.fit(X_train, y_train)

    # Prefer the explicit argument so the function also works when it is
    # called without the script's global command-line namespace
    if model_output is None:
        model_output = args.model_output

    mlflow.sklearn.save_model(model, model_output)

    return model

# Function that evaluates the model
def eval_model(model, X_test, y_test):
    """Print accuracy, confusion matrix and classification report for the test set."""
    predictions = model.predict(X_test)

    # Accuracy, shown as a percentage with one decimal
    accuracy = accuracy_score(y_test, predictions)
    print('LogisticRegression Accuracy score: {:.1%}'.format(accuracy))

    # Compute both summaries first, then print them
    matrix = confusion_matrix(y_test, predictions)
    report = classification_report(y_test, predictions)
    for template, value in (
        ('\nConfusion Matrix:\n{}', matrix),
        ('\nClassification Report:\n{}', report),
    ):
        print(template.format(value))

def parse_args():
    """Parse the command line and return the resulting namespace.

    Recognised options are ``--training_data`` (str), ``--reg_rate`` (float,
    defaults to 1.0) and ``--model_output`` (str). String options that are
    not supplied are left as ``None``.
    """
    cli = argparse.ArgumentParser()

    # argparse derives the attribute names from the option strings
    cli.add_argument('--training_data', type=str)
    cli.add_argument('--reg_rate', type=float, default=1.0)
    cli.add_argument('--model_output', type=str)

    return cli.parse_args()

# Run script
if __name__ == '__main__':
    # Visual separator that frames this script's output in the job logs
    separator = '*' * 60

    print('\n\n')
    print(separator)

    # Keep args as a module-level name: train_model reads args.model_output
    args = parse_args()
    main(args)

    print(separator)
    print('\n\n')


In [None]:
%%writefile prep-data.yml
# Define the components - create YAML for each component you want to run as a pipeline step.
# (This note sits below %%writefile because a cell magic must be the first line
# of its cell; it is therefore written into the YAML file as a comment.)
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: prep_data
display_name: Prepare training data
version: 1
type: command
inputs:
  input_data:
    type: uri_file
outputs:
  output_data:
    type: uri_folder
code: ./src
# NOTE(review): prep-data.py imports langdetect and nltk - confirm that this
# environment provides both packages (and the tokenizer data nltk needs).
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
command: >-
  python prep-data.py
  --input_data ${{inputs.input_data}}
  --output_data ${{outputs.output_data}}

In [None]:
%%writefile train-model.yml
# Command component that runs src/train-model.py as a pipeline step.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: train_model
display_name: Train a logistic regression model
version: 1
type: command
inputs:
  # Folder with the prepared CSV files; passed to the script as --training_data
  training_data: 
    type: uri_folder
  # Regularization rate; passed to the script as --reg_rate
  reg_rate:
    type: number
    default: 1.0
outputs:
  # Location the script saves the trained model to; passed as --model_output
  model_output:
    type: mlflow_model
# Folder whose contents are made available to the command below
code: ./src
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
# The ${{...}} placeholders refer to the inputs and outputs declared above
command: >-
  python train-model.py 
  --training_data ${{inputs.training_data}}
  --reg_rate ${{inputs.reg_rate}} 
  --model_output ${{outputs.model_output}} 

In [None]:
# Load the components
from azure.ai.ml import load_component
# Prefix prepended to the component YAML paths below; it is empty here, so the
# paths stay relative ('./...')
parent_dir = ''

# Build one component object per YAML definition file
prep_data = load_component(source=parent_dir + './prep-data.yml')
train_logistic_regression = load_component(source=parent_dir + './train-model.yml')

In [None]:
# Build the pipeline
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.dsl import pipeline

@pipeline
def tickets_classification(pipeline_job_input):
    # Two-step pipeline: prepare the data, then train a model on the prepared output.
    # NOTE(review): the SDK may derive step names from the local variable names
    # below - confirm before renaming them.

    # Step 1: run the prep_data component on the pipeline input
    clean_data = prep_data(input_data=pipeline_job_input)
    # Step 2: feed the output of step 1 into the training component
    train_model = train_logistic_regression(training_data=clean_data.outputs.output_data)

    # Expose the output of each step as a pipeline-level output
    return {
        'pipeline_job_transformed_data': clean_data.outputs.output_data,
        'pipeline_job_trained_model': train_model.outputs.model_output,
    }

# Build the pipeline job with a uri_file input
# NOTE(review): the path presumably refers to version 1 of a data asset named
# 'tickets-data' registered in the workspace - confirm it exists.
pipeline_job = tickets_classification(Input(type=AssetTypes.URI_FILE, path='azureml:tickets-data:1'))

In [None]:
# Print the pipeline job as built, before the settings in the next cell are applied
print(pipeline_job)

In [None]:
# Change the output mode
# Both pipeline-level outputs are switched to 'upload' mode
pipeline_job.outputs.pipeline_job_transformed_data.mode = 'upload'
pipeline_job.outputs.pipeline_job_trained_model.mode = 'upload'
# Set pipeline level compute
# NOTE(review): assumes a compute target named 'aml-cluster' exists in the workspace - confirm
pipeline_job.settings.default_compute = 'aml-cluster'
# Set pipeline level datastore
pipeline_job.settings.default_datastore = 'workspaceblobstore'

# Print the pipeline job again to review the changes
print(pipeline_job)

In [None]:
# Submit the pipeline job to workspace
# NOTE: pipeline_job is rebound to the value returned by create_or_update, so
# after this cell the name no longer refers to the locally built pipeline
# object; re-run the pipeline-building cells before submitting again.
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job, experiment_name='pipeline_tickets'
)
# Last expression of the cell, so the notebook displays the returned job
pipeline_job