In [1]:
import pandas as pd
import requests

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import mlflow
import numpy as np
from sklearn.linear_model import LogisticRegression
from azureml.core import Workspace

# Show the result of every top-level expression in a cell, not only the last one.
# Later cells rely on this to display several values from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Lift pandas' display limits (columns, rows, cell width) by setting each one to None.
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [2]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

# Identifiers of the Azure ML workspace that holds the dataset.
# NOTE(review): these are hard-coded in the notebook; consider reading them
# from a config file or environment variables before sharing it.
subscription_id = 'c1661420-7c3a-4941-9a01-7666935ca8c1'
resource_group = 'CREDIT-CARD-PROJECT'
workspace_name = 'creditcardproject'

workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Look up the dataset registered as 'ccp_data' and load it into a pandas DataFrame.
dataset = Dataset.get_by_name(workspace, name='ccp_data')
df = dataset.to_pandas_dataframe()

In [3]:
# Preview the first five rows of the loaded dataset.
df.head(n=5)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,1,30.83,0.0,1,1,Industrials,White,1.25,1,1,1,0,ByBirth,202,0,1
1,0,58.67,4.46,1,1,Materials,Black,3.04,1,1,6,0,ByBirth,43,560,1
2,0,24.5,0.5,1,1,Materials,Black,1.5,1,0,0,0,ByBirth,280,824,1
3,1,27.83,1.54,1,1,Industrials,White,3.75,1,1,5,1,ByBirth,100,3,1
4,1,20.17,5.625,1,1,Industrials,White,1.71,1,0,0,0,ByOtherMeans,120,0,1


In [4]:
# List the column names of the loaded dataset.
df.columns

Index(['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'Industry',
       'Ethnicity', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore',
       'DriversLicense', 'Citizen', 'ZipCode', 'Income', 'Approved'],
      dtype='object')

In [5]:
# Print the frequency of each category for the text-valued columns,
# followed by the same blank-line spacing between the tables.
cols = ['Industry', 'Ethnicity', 'Citizen']

for column_name in cols:
    category_counts = df[column_name].value_counts()
    print(category_counts)
    print('\n')

Energy                   146
Materials                 78
Industrials               64
ConsumerDiscretionary     59
ConsumerStaples           54
Healthcare                53
Financials                51
InformationTechnology     41
Utilities                 38
CommunicationServices     38
Real Estate               30
Education                 25
Research                  10
Transport                  3
Name: Industry, dtype: int64


White     408
Black     138
Asian      59
Latino     57
Other      28
Name: Ethnicity, dtype: int64


ByBirth         625
ByOtherMeans     57
Temporary         8
Name: Citizen, dtype: int64




In [6]:
# Columns left out of the modelling frame.
exclude_cols = ['Industry', 'Ethnicity', 'Citizen', 'ZipCode']
df1 = df.drop(columns=exclude_cols)
df1.columns

Index(['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'YearsEmployed',
       'PriorDefault', 'Employed', 'CreditScore', 'DriversLicense', 'Income',
       'Approved'],
      dtype='object')

In [7]:
# Preview the first five rows after dropping the excluded columns.
df1.head(n=5)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Income,Approved
0,1,30.83,0.0,1,1,1.25,1,1,1,0,0,1
1,0,58.67,4.46,1,1,3.04,1,1,6,0,560,1
2,0,24.5,0.5,1,1,1.5,1,0,0,0,824,1
3,1,27.83,1.54,1,1,3.75,1,1,5,1,3,1
4,1,20.17,5.625,1,1,1.71,1,0,0,0,0,1


In [8]:
# Separate the features (everything except 'Approved') from the target column.
X = df1.drop(columns='Approved')
y = df1['Approved']

In [9]:
# Fit the scaler on the training rows only, so that the min/max of the held-out
# rows do not leak into preprocessing (the original fitted on every row).
# train_test_split is deterministic for a given number of rows and random_state,
# so splitting the row positions here with the same test_size/random_state as the
# later split cell selects exactly the rows that end up in X_train.
# NOTE: keep test_size/random_state below in sync with the split cell.
X = np.asarray(X)
train_rows, held_out_rows = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=493
)

sc = MinMaxScaler(feature_range=(0, 1))
sc.fit(X[train_rows])
# Held-out rows may now scale slightly outside [0, 1]; that is expected.
X = sc.transform(X)

In [10]:
# connect to your workspace via Workspace.from_config()
# NOTE(review): a Workspace handle was already created earlier as `workspace`
# from explicit identifiers; confirm both refer to the same workspace.
ws = Workspace.from_config()

In [11]:
# name of the experiment; this cell only defines the name, it is passed to
# mlflow.set_experiment in the next cell
experiment_name = "credit-card-project"

In [12]:
# Point MLflow at the workspace's tracking URI, select the experiment,
# and turn on autologging.
tracking_uri = ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)
mlflow.autolog()

<Experiment: artifact_location='', experiment_id='99f2a1e4-7ca5-4def-8700-82471b9bdaa7', lifecycle_stage='active', name='credit-card-project', tags={}>

2022/05/26 04:41:28 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/05/26 04:41:28 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2022/05/26 04:41:28 INFO mlflow.pyspark.ml: No SparkSession detected. Autologging will log pyspark.ml models contained in the default allowlist. To specify a custom allowlist, initialize a SparkSession prior to calling mlflow.pyspark.ml.autolog() and specify the path to your allowlist file via the spark.mlflow.pysparkml.autolog.logModelAllowlistFile conf.
2022/05/26 04:41:28 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.


In [13]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=493,
)

In [14]:
# set up the Logistic regression model
# `reg` is the regularisation strength; the estimator takes its inverse as C
reg = 0.5
clf = LogisticRegression(
    C=1.0 / reg,
    solver="liblinear",
    multi_class="auto",
    random_state=42,
)

In [15]:
# train the model inside an MLflow run (autologging was enabled above);
# `run` is kept because its run id is used afterwards to locate the logged model
with mlflow.start_run() as run:
    clf.fit(X_train, y_train)

LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

Model before and after ...

In [16]:
# Register the model artifact of the training run under the name 'credit_card_model'.
model_uri = f"runs:/{run.info.run_id}/model"
model = mlflow.register_model(model_uri=model_uri, name="credit_card_model")

Registered model 'credit_card_model' already exists. Creating a new version of this model...
2022/05/26 04:41:48 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: credit_card_model, version 6
Created version '6' of model 'credit_card_model'.


In [17]:
# create environment for the deploy
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.webservice import AciWebservice

In [18]:
# get a curated environment
# fetch version 1 of the named environment from the workspace
env = Environment.get(
    workspace=ws, 
    name="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu",
    version=1
)
# request the latest inferencing stack for this environment
env.inferencing_stack_version='latest'

In [19]:
# Deployment config for the ACI web service: 1 CPU core, 1 GB of memory,
# plus descriptive tags and a description.
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    tags=dict(data="credit_card", method="sklearn"),
    description="Predict credit card approval with sklearn",
)

In [20]:
%%time
import uuid
from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
from azureml.core.model import Model

# get the registered model by name
# NOTE(review): no version is given, so this presumably resolves to the latest
# registered version rather than explicitly the one registered above; verify.
# This also rebinds `model`, which previously held the mlflow.register_model result.
model = Model(ws, "credit_card_model")

# create an inference config i.e. the scoring script and environment
inference_config = InferenceConfig(entry_script="score.py", environment=env)

# deploy the service
# the name gets a 4-character suffix taken from a random UUID
service_name = "sklearn-credit-card-svc-" + str(uuid.uuid4())[:4]
service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)

# wait for the deployment to finish, printing progress output
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2022-05-26 04:41:51+00:00 Creating Container Registry if not exists.
2022-05-26 04:41:51+00:00 Registering the environment.
2022-05-26 04:41:52+00:00 Use the existing image.
2022-05-26 04:41:52+00:00 Generating deployment configuration.
2022-05-26 04:41:53+00:00 Submitting deployment to compute.
2022-05-26 04:41:57+00:00 Checking the status of deployment sklearn-credit-card-svc-9067..
2022-05-26 04:44:04+00:00 Checking the status of inference endpoint sklearn-credit-card-svc-9067.
Succeeded
ACI service creation operation finished, operation "Succeeded"
CPU times: user 816 ms, sys: 118 ms, total: 934 ms
Wall time: 2min 47s


In [21]:
# row number within X_test of the sample that is sent to the service
sample_index = 68

In [22]:
import json

# Build the request body with json.dumps instead of concatenating str(list(...)):
# the text form of NumPy scalars is not guaranteed to be valid JSON, whereas
# tolist() converts the row to plain Python floats first.
input_data = json.dumps({"data": [X_test[sample_index].tolist()]})

In [23]:
# declare the request body as JSON
headers = {"Content-Type": "application/json"}

In [24]:
# Send the payload to the deployed scoring endpoint.
# requests waits indefinitely by default, so set a timeout to avoid hanging the notebook.
resp = requests.post(service.scoring_uri, data=input_data, headers=headers, timeout=60)
# Fail loudly on an HTTP error instead of treating an error body as a prediction.
resp.raise_for_status()

In [25]:
print("POST to url", service.scoring_uri)
# X_test is a NumPy array indexed by position, but y_test is a Series that keeps
# the original (shuffled) row labels. y_test[sample_index] would look up the row
# *labelled* sample_index, so use .iloc to get the label of the row actually sent.
print("label:", y_test.iloc[sample_index])
print("prediction:", resp.text)

POST to url http://821c7046-a27c-4f21-a226-fd4aa530bc43.centralus.azurecontainer.io/score
label: 1
prediction: [1]


In [26]:
# NOTE(review): repeats the df1 preview shown earlier in the notebook; candidate for removal.
df1.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Income,Approved
0,1,30.83,0.0,1,1,1.25,1,1,1,0,0,1
1,0,58.67,4.46,1,1,3.04,1,1,6,0,560,1
2,0,24.5,0.5,1,1,1.5,1,0,0,0,824,1
3,1,27.83,1.54,1,1,3.75,1,1,5,1,3,1
4,1,20.17,5.625,1,1,1.71,1,0,0,0,0,1


In [27]:
# show the exact request body that was posted to the service
input_data

'{"data": [[1.0, 0.5237593984962405, 0.007321428571428571, 0.0, 0.0, 0.008771929824561403, 1.0, 1.0, 0.16417910447761194, 0.0, 0.02732]]}'

In [28]:
# show the URL the request was posted to
service.scoring_uri

'http://821c7046-a27c-4f21-a226-fd4aa530bc43.centralus.azurecontainer.io/score'

In [29]:
# Show only a preview: display.max_rows is set to None at the top of the notebook,
# so displaying the bare frame would dump every test row into the output.
pd.DataFrame(X_test).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,0.035038,0.011964,1.0,1.0,0.0,0.0,1.0,0.014925,0.0,0.00126
1,1.0,0.159098,0.236607,0.0,0.0,0.192982,1.0,0.0,0.0,1.0,0.0
2,1.0,0.211729,0.142857,0.0,0.0,0.201754,1.0,1.0,0.029851,1.0,0.0
3,1.0,0.142857,0.450893,1.0,1.0,0.004386,0.0,1.0,0.029851,0.0,0.05552
4,1.0,0.194286,0.096786,0.0,0.0,0.184211,1.0,1.0,0.014925,0.0,0.0
5,1.0,0.412331,0.044643,0.0,0.0,0.008772,0.0,0.0,0.0,0.0,0.00195
6,1.0,0.165414,0.019286,1.0,1.0,0.035088,0.0,0.0,0.0,1.0,1e-05
7,1.0,0.085263,0.232143,1.0,1.0,0.051228,1.0,1.0,0.104478,0.0,0.02954
8,1.0,0.275639,0.142857,0.0,0.0,0.052632,0.0,0.0,0.0,1.0,0.0
9,1.0,0.510075,0.089286,1.0,1.0,0.087719,1.0,1.0,0.179104,1.0,0.0251


In [30]:
# Show only a preview: display.max_rows is set to None at the top of the notebook,
# so displaying the bare frame would dump every test label into the output.
pd.DataFrame(y_test).head(10)

Unnamed: 0,Approved
294,0
211,1
104,0
284,0
214,1
394,0
393,0
68,1
663,0
126,1
