# Auto Claims Solution

## Training Pipeline

**Imports**

In [1]:
import os
import azureml.core
from azureml.core import Dataset
from azureml.core import Workspace

import requests
import json

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.30.0 to work with dp100ws


**Connect to default datastore**

In [2]:
default_ds = ws.get_default_datastore()

**Create folders for the solutions**

In [41]:
# Create a folder for the pipeline step files
pipeline_folder = 'auto_claims_pipeline'
service_folder = 'auto_claims_service'

os.makedirs(pipeline_folder, exist_ok=True)
os.makedirs(service_folder, exist_ok=True)

**Load the dataset**

In [4]:
df = ws.datasets.get('auto claims').to_pandas_dataframe()
df[:3]

Unnamed: 0,ID,KIDSDRIV,BIRTH,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,...,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,63581743,0,16MAR39,60,0,11,"$67,349",No,$0,z_No,...,Minivan,yes,"$4,461",2,No,3,$0,18,0,Highly Urban/ Urban
1,132761049,0,21JAN56,43,0,11,"$91,449",No,"$257,252",z_No,...,Minivan,yes,$0,0,No,0,$0,1,0,Highly Urban/ Urban
2,921317019,0,18NOV51,48,0,11,"$52,881",No,$0,z_No,...,Van,yes,$0,0,No,2,$0,10,0,Highly Urban/ Urban


**Write Preprocessing Script File in the pipeline folder**

In [125]:
%%writefile $pipeline_folder/preprocessing.py

import numpy as np
import pandas as pd

def format_data(
    dataframe, 
    currency_columns=['HOME_VAL','BLUEBOOK','OLDCLAIM','CLM_AMT','INCOME'],
    to_clean = ['EDUCATION','OCCUPATION','MSTATUS','GENDER','CAR_TYPE','URBANICITY']
    ):

    df = dataframe.copy()
    for c in currency_columns:
        if c in df.columns:
            df[c] = df[c].str.replace('$','').str.replace(',','').astype(float)

    for col in to_clean:
        if col in df.columns:
            df[col] = df[col].str.replace('z_','').str.replace('<','')

    return df


def clean_categoricals(dataframe):
    df = dataframe.copy()

    categorical_map = {
        'PARENT1': {'No': 0, 'Yes': 1}, 
        'MSTATUS': {'Yes': 0, 'No': 1},
        'GENDER': {'F': 0, 'M': 1}, 
        'EDUCATION': {'High School': 0, 'Bachelors': 1, 'Masters': 2, 'PhD': 3}, 
        'CAR_USE': {'Private': 0, 'Commercial': 1}, 
        'CAR_TYPE': {'SUV': 0, 'Minivan': 1, 'Pickup': 2, 'Sports Car': 3, 'Van': 4, 'Panel Truck': 5}, 
        'RED_CAR': {'no': 0, 'yes': 1},
        'REVOKED': {'No': 0, 'Yes': 1}, 
        'URBANICITY': {'Highly Urban/ Urban': 0, 'Highly Rural/ Rural': 1},
        'OCCUPATION': {'Blue Collar': 0, 'Clerical': 1, 'Professional': 2, 'Manager': 3, 'Lawyer': 4, 'Student': 5, 'Home Maker': 6, 'Doctor': 7}
    }

    for k,v in categorical_map.items():
        df[k] = df[k].replace(v)

    return df

def clean_numericals(dataframe, numerical_columns=['AGE', 'INCOME','BLUEBOOK','TIF','OLDCLAIM','HOME_VAL']):
    df = dataframe.copy()

    for c in numerical_columns:
        df[c] = df[c].apply(lambda x: np.log(x) + 1).replace([np.inf,-np.inf],1)

    return df

def preprocessing(data,inference=False):
    columns = ['ID','KIDSDRIV','BIRTH','AGE','HOMEKIDS','YOJ','INCOME','PARENT1','HOME_VAL','MSTATUS','GENDER','EDUCATION','OCCUPATION','TRAVTIME','CAR_USE','BLUEBOOK','TIF','CAR_TYPE','RED_CAR','OLDCLAIM','CLM_FREQ','REVOKED','MVR_PTS','CLM_AMT','CAR_AGE','CLAIM_FLAG','URBANICITY']
    to_drop = ['ID','YOJ','BIRTH']
    targets = ['CLAIM_FLAG','CLM_FREQ','CLM_AMT']

    if inference:
        columns = [x for x in columns if x not in targets]

    if isinstance(data,np.ndarray):
        data = pd.DataFrame(data,columns=columns)

    data = data.drop(to_drop,axis=1)
    data = format_data(dataframe=data)
    data = clean_categoricals(dataframe=data).astype(float)
    data = clean_numericals(dataframe=data)



    return data



Overwriting auto_claims_pipeline/preprocessing.py


**Copy the preprocessing script in the service folder**

In [126]:
!cp $pipeline_folder/preprocessing.py $service_folder/preprocessing.py

**Use the preprocessing function iniside the data preparation step**

In [127]:
%%writefile $experiment_folder/prep_data.py
# Import libraries
import os
import argparse
import pandas as pd
from azureml.core import Run
from preprocessing import preprocessing

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# Get the experiment run context
run = Run.get_context()

# load the data (passed as an input dataset)
print("Loading Data...")
df = run.input_datasets['raw_data'].to_pandas_dataframe()

df = preprocessing(df)

# Log raw row count
row_count = (len(df))
run.log('raw_rows', row_count)

# remove nulls
df = df.dropna()

# Log processed rows
row_count = (len(df))
run.log('processed_rows', row_count)

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
df.to_csv(save_path, index=False, header=True)

# End the run
run.complete()

Overwriting auto_claims_pipeline/prep_data.py


**Train and register models on the processed data**

In [128]:
%%writefile $experiment_folder/train_model.py
# Import libraries
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, PoissonRegressor, GammaRegressor
from sklearn import metrics
import matplotlib.pyplot as plt

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-data", type=str, dest='training_data', help='training data')
args = parser.parse_args()
training_data = args.training_data

# Get the experiment run context
run = Run.get_context()

# load the prepared data file in the training folder
print("Loading Data...")
file_path = os.path.join(training_data,'data.csv')
df = pd.read_csv(file_path)

lr_target = 'CLAIM_FLAG'
po_target = 'CLM_FREQ'
gm_target = 'CLM_AMT'

target_cols = [lr_target, po_target, gm_target]

# Split data into training set and test set
train, test = train_test_split(df, test_size=0.33, random_state=123, stratify=df[lr_target])

train_no_claims = train[train[gm_target] > 0].copy()


lr_model = LogisticRegression()
lr_model.fit(train.drop(target_cols,axis=1),train[lr_target])

po_model = PoissonRegressor()
po_model.fit(train_no_claims.drop(target_cols,axis=1),train_no_claims[po_target])

gm_model = GammaRegressor()
gm_model.fit(train_no_claims.drop(target_cols,axis=1),train_no_claims[gm_target])

lr_pred = lr_model.predict_proba(test.drop(target_cols,axis=1))[:,1]
po_pred = lr_model.predict(test.drop(target_cols,axis=1))
gm_pred = lr_model.predict(test.drop(target_cols,axis=1))


loss_cost = (lr_pred * po_pred) * gm_pred

rmse = metrics.mean_squared_error(test[gm_target],loss_cost,squared=False)

run.log('RMSE', np.float(rmse))


model = dict(
    lr_model=lr_model,
    po_model=po_model,
    gm_model=gm_model
)

model_file = os.path.join('outputs', 'zip_model.pkl')
joblib.dump(value=model, filename=model_file)

# Register the model
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'auto_claims_zip_model',
               tags={'Training context':'Pipeline'},
               properties={'RMSE': np.float(rmse)})


run.complete()

Overwriting auto_claims_pipeline/train_model.py


**Connect to a compute cluster or create one if does not exists**

In [129]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "dp100cc"

try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

Found existing cluster, use it.


**Write the environment file for the training pipeline**

In [130]:
%%writefile $pipeline_folder/experiment_env.yml
name: experiment_env
dependencies:
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  - azureml-defaults
  - pyarrow

Overwriting auto_claims_pipeline/experiment_env.yml


**Create environment using the conda yaml**

In [131]:
from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration

# Create a Python environment for the experiment (from a .yml file)
experiment_env = Environment.from_conda_specification("experiment_env", pipeline_folder + "/experiment_env.yml")

# Register the environment 
experiment_env.register(workspace=ws)
registered_env = Environment.get(ws, 'experiment_env')

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above. 
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

Run configuration created.


**Confiigure the pipeline steps**

In [132]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

# Get the training dataset
auto_claims_dataset = ws.datasets.get("auto claims")

# Create an OutputFileDatasetConfig (temporary Data Reference) for data passed from step 1 to step 2
prepped_data = OutputFileDatasetConfig("prepped_data")

# Step 1, Run the data prep script
prep_step = PythonScriptStep(name = "Prepare Data",
                                source_directory = pipeline_folder,
                                script_name = "prep_data.py",
                                arguments = ['--input-data', auto_claims_dataset.as_named_input('raw_data'),
                                             '--prepped-data', prepped_data],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

# Step 2, run the training script
train_step = PythonScriptStep(name = "Train and Register Model",
                                source_directory = pipeline_folder,
                                script_name = "train_model.py",
                                arguments = ['--training-data', prepped_data.as_input()],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

print("Pipeline steps defined")

Pipeline steps defined


**Upload and run the pipeline**

In [133]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline
pipeline_steps = [prep_step, train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'auto-claims-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Prepare Data [28b2524c][16a51b59-fd08-4b55-aa12-9145410ab855], (This step will run and generate new outputs)Created step Train and Register Model [5a16bbd6][118b1f4d-5c32-457e-abfa-959035e7aec6], (This step will run and generate new outputs)

Submitted PipelineRun f3abba4d-ed34-46c7-af13-b138ff1ae88b
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/f3abba4d-ed34-46c7-af13-b138ff1ae88b?wsid=/subscriptions/d1cbef34-81e9-4f99-9cf3-8d36f1255e04/resourcegroups/dp100/workspaces/dp100ws&tid=37219092-bab7-4671-af41-d7ded0486bc7
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: f3abba4d-ed34-46c7-af13-b138ff1ae88b
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/f3abba4d-ed34-46c7-af13-b138ff1ae88b?wsid=/subscriptions/d1cbef34-81e9-4f99-9cf3-8d36f1255e04/resourcegroups/dp100/workspaces/dp100ws&tid=37219092-bab7-4671-af41-d7ded0486bc7
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 715cb5ea-43a5-46ae-b91c-02fa0d9e9440
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/715cb5ea-43a5-46ae-b91c-02fa0d9e9440?wsid=/subscriptions/d1cbef34-81e9-4f99-9cf3-8d36f1255e04/resourcegroups/dp100/workspaces/dp100ws&tid=37219092-bab7-4671-af41-d7ded0486bc7
StepRun( Prepare Data ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_dc8ea1a209373cc114cd479b7048bc51bc6824226f418b47bc28290d1087ceda_d.txt
2021-06-26T22:02:33Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/dp100ws/azureml/715cb5ea-43a5-46ae-b91c-02fa0d9e9440/mounts/workspaceblobstore
2021-06-26

'Finished'

**Pipeline run outputs and metrics**

In [134]:
for run in pipeline_run.get_children():
    print(run.name, ':')
    metrics = run.get_metrics()
    for metric_name in metrics:
        print('\t',metric_name, ":", metrics[metric_name])

Train and Register Model :
	 RMSE : 4527.777539508901
Prepare Data :
	 raw_rows : 10302
	 processed_rows : 8092


**Get latest registered model**

In [135]:
model = ws.models['auto_claims_zip_model']
print(model.name, 'version', model.version)

auto_claims_zip_model version 5


**Create prediction functions for the service**

In [151]:
%%writefile $service_folder/score_auto_claims.py
import json
import joblib
import numpy as np
import pandas as pd
from azureml.core.model import Model
from preprocessing import preprocessing

# Called when the service is loaded
def init():
    global model
    # Get the path to the deployed model file and load it
    model_path = Model.get_model_path('auto_claims_zip_model')
    model = joblib.load(model_path)

# Called when a request is received
def run(raw_data):
    # Get the input data as a numpy array
    data = np.array(json.loads(raw_data)['data'])
    # Get a prediction from the model
    data = preprocessing(data,inference=True).dropna()

    def predict(models,data):
        out = dict(
            lr_pred = models['lr_model'].predict_proba(data)[:,1],
            po_pred = models['po_model'].predict(data),
            gm_pred = models['gm_model'].predict(data)
        )
        out['loss_cost'] = (out['lr_pred'] * out['po_pred']) * out['gm_pred']

        for k in out.keys():
            out[k] = out[k].tolist()
            
        return out

    predictions = predict(model,data)

    return json.dumps(predictions)

Overwriting auto_claims_service/score_auto_claims.py


**Create conda environment file for the service**

In [152]:
%%writefile $service_folder/auto_claims_env.yml
name: inference_env
dependencies:
- python=3.6.2
- scikit-learn
- pip
- pip:
  - azureml-defaults


Overwriting auto_claims_service/auto_claims_env.yml


**Configure and Deploy Service**

In [153]:
from azureml.core.webservice import AciWebservice
from azureml.core.model import InferenceConfig
from azureml.core import Model
# Configure the scoring environment
inference_config = InferenceConfig(
                                   source_directory=service_folder,
                                   runtime='python',
                                   entry_script='score_auto_claims.py',
                                   conda_file='auto_claims_env.yml')

deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1)

service_name = "auto-claims-service"

service = Model.deploy(ws, service_name, [model], inference_config, deployment_config)

service.wait_for_deployment(True)
print(service.state)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-06-26 22:10:47+00:00 Creating Container Registry if not exists.
2021-06-26 22:10:47+00:00 Registering the environment.
2021-06-26 22:10:48+00:00 Use the existing image.
2021-06-26 22:10:48+00:00 Generating deployment configuration.
2021-06-26 22:10:49+00:00 Submitting deployment to compute.
2021-06-26 22:10:53+00:00 Checking the status of deployment auto-claims-service..
2021-06-26 22:13:23+00:00 Checking the status of inference endpoint auto-claims-service.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


**Take a random sample from the initial data and run it through the service**

In [155]:
sample_data = df[df.columns[~df.columns.isin(['CLM_AMT','CLAIM_FLAG','CLM_FREQ'])]].sample(5)
endpoint_url = service.scoring_uri
print('Endpoint:',endpoint_url)
input_json = json.dumps({"data": sample_data.values.tolist()})

# Set the content type
headers = { 'Content-Type':'application/json' }

response = requests.post(endpoint_url, input_json, headers = headers)
if response.status_code == 200:
    preds = pd.read_json(response.json())
    sample_pred = sample_data.copy().reset_index(drop=True)
    sample_pred[preds.columns.tolist()] = preds
    print(sample_pred)
else:
    print(response.text,response.status_code)

Endpoint: http://e76a45a3-317c-4859-ac97-bd750939bb38.centralus.azurecontainer.io/score
          ID KIDSDRIV    BIRTH AGE HOMEKIDS   YOJ    INCOME PARENT1  HOME_VAL  \
0  695733470        0  02MAY57  42        2  None   $80,054     Yes        $0   
1  563884496        0  01OCT66  33        2    14   $36,351      No  $193,200   
2  711084452        0  24JUN54  45        2     9      None     Yes        $0   
3   57346072        0  13MAY57  42        0    10  $101,636      No        $0   
4  356077226        0  21NOV55  43        0    11   $75,232      No        $0   

  MSTATUS  ... RED_CAR OLDCLAIM REVOKED MVR_PTS CAR_AGE  \
0    z_No  ...      no       $0      No       1      10   
1     Yes  ...     yes   $3,647      No       3      13   
2    z_No  ...     yes   $1,055      No       1       1   
3    z_No  ...      no   $1,452      No       7      11   
4    z_No  ...     yes       $0      No       5       7   

              URBANICITY   lr_pred   po_pred      gm_pred    loss_cost

**Remove Service**

In [156]:
service.delete()