In [3]:
pip install --user witwidget

Collecting witwidget
  Using cached witwidget-1.8.1-py3-none-any.whl (1.5 MB)
Collecting ipywidgets>=7.0.0
  Using cached ipywidgets-7.7.1-py2.py3-none-any.whl (123 kB)
Collecting widgetsnbextension~=3.6.0
  Using cached widgetsnbextension-3.6.1-py2.py3-none-any.whl (1.6 MB)
Collecting jupyterlab-widgets>=1.0.0
  Using cached jupyterlab_widgets-1.1.1-py3-none-any.whl (245 kB)
Installing collected packages: jupyterlab-widgets, widgetsnbextension, ipywidgets, witwidget
Successfully installed ipywidgets-7.7.1 jupyterlab-widgets-1.1.1 widgetsnbextension-3.6.1 witwidget-1.8.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
from witwidget.notebook.visualization import WitConfigBuilder
from witwidget.notebook.visualization import WitWidget
from sklearn.linear_model import LogisticRegression
import google.cloud.aiplatform as aiplatform


In [7]:
%%bigquery 
SELECT
  age,
  workclass,
  marital_status,
  education_num,
  occupation,
  hours_per_week,
  income_bracket
FROM
  `bigquery-public-data.ml_datasets.census_adult_income`
LIMIT
  100;

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 795.13query/s] 
Downloading: 100%|██████████| 100/100 [00:01<00:00, 82.54rows/s] 


Unnamed: 0,age,workclass,marital_status,education_num,occupation,hours_per_week,income_bracket
0,39,Private,Married-civ-spouse,5,Other-service,34,<=50K
1,72,Private,Married-civ-spouse,5,Exec-managerial,48,>50K
2,45,Private,Married-civ-spouse,5,Machine-op-inspct,40,>50K
3,31,Private,Married-civ-spouse,5,Exec-managerial,40,<=50K
4,55,Private,Married-civ-spouse,5,Tech-support,23,<=50K
...,...,...,...,...,...,...,...
95,30,Local-gov,Married-civ-spouse,8,Adm-clerical,40,<=50K
96,53,?,Married-civ-spouse,8,?,40,<=50K
97,52,?,Married-civ-spouse,8,?,40,>50K
98,45,Private,Married-civ-spouse,2,Other-service,25,<=50K


In [8]:
%%bigquery
CREATE OR REPLACE VIEW
  `census.input_view` AS
SELECT
  age,
  workclass,
  marital_status,
  education_num,
  occupation,
  hours_per_week,
  income_bracket,
  CASE
    WHEN MOD(functional_weight, 10) < 8 THEN 'training'
    WHEN MOD(functional_weight, 10) = 8 THEN 'evaluation'
    WHEN MOD(functional_weight, 10) = 9 THEN 'prediction'
  END AS dataframe
FROM
  `bigquery-public-data.ml_datasets.census_adult_income`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 776.00query/s] 


In [9]:
%%bigquery
CREATE OR REPLACE MODEL
  `census.census_model`
OPTIONS
  ( model_type='LOGISTIC_REG',
    auto_class_weights=TRUE,
    input_label_cols=['income_bracket']
  ) AS
SELECT
  * EXCEPT(dataframe)
FROM
  `census.input_view`
WHERE
  dataframe = 'training'
    

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1333.50query/s]                        


In [11]:
%%bigquery input_df
SELECT * EXCEPT (dataframe) FROM `census.input_view` 
WHERE dataframe = 'training'

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 710.78query/s]                         
Downloading: 100%|██████████| 26193/26193 [00:01<00:00, 22529.09rows/s]


In [12]:
input_df.head()

Unnamed: 0,age,workclass,marital_status,education_num,occupation,hours_per_week,income_bracket
0,77,Private,Married-civ-spouse,5,Priv-house-serv,10,<=50K
1,26,Private,Married-civ-spouse,8,Priv-house-serv,40,<=50K
2,53,Private,Married-civ-spouse,3,Priv-house-serv,10,<=50K
3,55,Private,Married-civ-spouse,9,Priv-house-serv,30,<=50K
4,46,Private,Married-civ-spouse,9,Priv-house-serv,40,<=50K


In [13]:
transform_df = pd.get_dummies(input_df,
                     columns = ['workclass', 'marital_status', 'occupation', 'education_num'])
transform_df

Unnamed: 0,age,hours_per_week,income_bracket,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,...,education_num_7,education_num_8,education_num_9,education_num_10,education_num_11,education_num_12,education_num_13,education_num_14,education_num_15,education_num_16
0,77,10,<=50K,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,26,40,<=50K,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
2,53,10,<=50K,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,55,30,<=50K,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4,46,40,<=50K,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26188,32,40,<=50K,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
26189,39,40,<=50K,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
26190,32,55,<=50K,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
26191,27,40,<=50K,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
X = transform_df.drop('income_bracket', axis=1)
y = transform_df["income_bracket"]

In [16]:
X = X[['hours_per_week', 'age']]

In [17]:
X.head()

Unnamed: 0,hours_per_week,age
0,10,77
1,40,26
2,10,53
3,30,55
4,40,46


In [18]:
y.head()

0     <=50K
1     <=50K
2     <=50K
3     <=50K
4     <=50K
Name: income_bracket, dtype: object

In [19]:
clf = LogisticRegression(random_state=0).fit(X, y)

In [20]:
clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [21]:
clf.feature_names = list(X.columns.values)

In [22]:
clf.feature_names

['hours_per_week', 'age']

In [23]:
clf.feature_names_in_

array(['hours_per_week', 'age'], dtype=object)

In [24]:
clf.predict(X[:1])

array([' <=50K'], dtype=object)

In [25]:
import joblib
joblib.dump(clf, 'model.joblib')

['model.joblib']

In [26]:
new_clf = joblib.load('model.joblib')
clf.predict(X[:1])

array([' <=50K'], dtype=object)

In [None]:
!gsutil cp model.joblib gs://felipe-sandbox/logistic_regression/

In [7]:
%%writefile explanation-metadata.json

{
  "inputs": {
    "features": {"index_feature_mapping": ["hours_per_week","age"], "encoding": "BAG_OF_FEATURES"}
  },
  "outputs": {
    "income_bracket": {
    }
  }
}

Overwriting explanation-metadata.json


In [6]:
!gsutil ls gs://felipe-sandbox/logistic_regression

gs://felipe-sandbox/logistic_regression/model.joblib


In [8]:
%%bash
gcloud ai models upload \
  --region=us-central1 \
  --display-name=logistic_regression_xai_v3 \
  --container-image-uri=us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest \
  --artifact-uri=gs://felipe-sandbox/logistic_regression \
  --explanation-method=sampled-shapley \
  --explanation-path-count=10 \
  --explanation-metadata-file=explanation-metadata.json

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
ERROR: (gcloud.ai.models.upload) Failed to parse value(s) in protobuf [GoogleCloudAiplatformV1ExplanationMetadata]:
  GoogleCloudAiplatformV1ExplanationMetadata.inputs[features].index_feature_mapping


CalledProcessError: Command 'b'gcloud ai models upload \\\n  --region=us-central1 \\\n  --display-name=logistic_regression_xai_v3 \\\n  --container-image-uri=us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest \\\n  --artifact-uri=gs://felipe-sandbox/logistic_regression \\\n  --explanation-method=sampled-shapley \\\n  --explanation-path-count=10 \\\n  --explanation-metadata-file=explanation-metadata.json\n'' returned non-zero exit status 1.

In [8]:
import google.cloud.aiplatform as aip
aip.init(project="felipe-sandbox", staging_bucket="gs://felipe-sandbox")


In [9]:
REGION="us-central1"

In [10]:

XAI = "shapley"  # [ shapley, ig, xrai ]

if XAI == "shapley":
    PARAMETERS = {"sampled_shapley_attribution": {"path_count": 10}}
elif XAI == "ig":
    PARAMETERS = {"integrated_gradients_attribution": {"step_count": 50}}
elif XAI == "xrai":
    PARAMETERS = {"xrai_attribution": {"step_count": 50}}

parameters = aip.explain.ExplanationParameters(PARAMETERS)

COLUMNS = [
    "hours_per_week",
    "age"
]
metadata = aip.explain.ExplanationMetadata(
    inputs={
        "features": {"index_feature_mapping": COLUMNS, "encoding": "BAG_OF_FEATURES"}
    },
    outputs={"income_bracket": {}},
)

MODEL_DIR = "felipe-sandbox" + "/model"

DEPLOY_VERSION = "sklearn-cpu.0-23"
DEPLOY_IMAGE = "{}-docker.pkg.dev/vertex-ai/prediction/{}:latest".format(
    REGION.split("-")[0], DEPLOY_VERSION
)

In [11]:
model = aip.Model.upload(
    display_name="logistic_regression_v4",
    artifact_uri="gs://felipe-sandbox/logistic_regression",
    serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest",
    explanation_parameters=parameters,
    explanation_metadata=metadata,
    sync=False,
)

model.wait()

INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/581970904807/locations/us-central1/models/5460742091535548416/operations/8263605656909512704


E0601 15:45:38.042885700      84 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/581970904807/locations/us-central1/models/5460742091535548416
INFO:google.cloud.aiplatform.models:To use this Model in another session:
INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/581970904807/locations/us-central1/models/5460742091535548416')


In [30]:
model.list

<bound method Model.list of <class 'google.cloud.aiplatform.models.Model'>>

In [14]:
DEPLOYED_NAME = "logistic-regression-endpoint-v4" 

TRAFFIC_SPLIT = {"0": 100}

endpoint = model.deploy(
    deployed_model_display_name=DEPLOYED_NAME,
    traffic_split=TRAFFIC_SPLIT,
    machine_type="n1-standard-4",
    min_replica_count=1,
    max_replica_count=1,
)

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/581970904807/locations/us-central1/endpoints/7459254008599805952/operations/1284152134392086528
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/581970904807/locations/us-central1/endpoints/7459254008599805952
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/581970904807/locations/us-central1/endpoints/7459254008599805952')
INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/581970904807/locations/us-central1/endpoints/7459254008599805952
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/581970904807/locations/us-central1/endpoints/7459254008599805952/operations/1668084002625421312
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/58197090480

In [35]:
INSTANCE = [
    10,10
]
instances = [INSTANCE]

In [36]:
endpoint.predict(instances=instances)

Prediction(predictions=[' <=50K'], deployed_model_id='6808870890537746432', explanations=None)

In [37]:
prediction = endpoint.explain(instances=instances)
print(prediction)

InvalidArgument: 400 {"error": "Unable to explain the requested instance(s) because: unsupported operand type(s) for -: 'str' and 'str'"}

In [44]:
help(endpoint.explain)

Help on method explain in module google.cloud.aiplatform.models:

explain(instances: List[Dict], parameters: Union[Dict, NoneType] = None, deployed_model_id: Union[str, NoneType] = None) -> google.cloud.aiplatform.models.Prediction method of google.cloud.aiplatform.models.Endpoint instance
    Make a prediction with explanations against this Endpoint.
    
    Example usage:
        response = my_endpoint.explain(instances=[...])
        my_explanations = response.explanations
    
    Args:
        instances (List):
            Required. The instances that are the input to the
            prediction call. A DeployedModel may have an upper limit
            on the number of instances it supports per request, and
            when it is exceeded the prediction call errors in case
            of AutoML Models, or, in case of customer created
            Models, the behaviour is as documented by that Model.
            The schema of any single instance may be specified via
            Endp

In [43]:
endpoint.explain(instances=[{"age":10,"hours_per_week":10}])

FailedPrecondition: 400 "Explainability failed with exception: 'no_name_input'"


In [None]:
gcloud ai endpoints create \
  --region=us-central1 \
  --display-name=logistic_regression_xai_v2

In [33]:
num_wit_examples = 500
test_examples = np.hstack((X[:num_wit_examples],y.values.reshape(-1,1)[:num_wit_examples].reshape(-1,1)))
test_examples[0:3]

array([[77, 10, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, ' <=50K'],
       [26, 40, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, ' <=50K'],
       [53, 10, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, ' <=50K']], dtype=object)

In [39]:
num_data = 500
tool_height = 500
config_builder = (WitConfigBuilder(test_examples.tolist(), X.columns.tolist() + ['income_bracket'])
  .set_custom_predict_fn(clf.predict_proba)
  .set_target_feature('income_bracket'))
WitWidget(config_builder, height=tool_height)

WitWidget(config={'model_type': 'classification', 'label_vocab': [], 'feature_names': ['age', 'hours_per_week'…

In [38]:
np.array(X.loc[0].values.flatten().tolist())

array([77, 10,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [5]:
%%writefile input.json

{
  "instances" : [
      [4.8, 3]
  ]
}

Overwriting input.json


In [6]:
!gcloud ai endpoints explain 8246258043482800128 \
  --region=us-central1 \
  --json-request=input.json

Using endpoint [https://us-central1-prediction-aiplatform.googleapis.com/]
[1;31mERROR:[0m (gcloud.ai.endpoints.explain) INVALID_ARGUMENT: {"error": "Unable to explain the requested instance(s) because: unsupported operand type(s) for -: 'str' and 'str'"}


In [8]:
def explain_tabular_sample(
    project: str, location: str, endpoint_id: str, instance_dict: dict
):

    aiplatform.init(project=project, location=location)

    endpoint = aiplatform.Endpoint(endpoint_id)

    response = endpoint.explain(instances=[instance_dict], parameters={})

    for explanation in response.explanations:
        print(" explanation")
        # Feature attributions.
        attributions = explanation.attributions
        for attribution in attributions:
            print("  attribution")
            print("   baseline_output_value:", attribution.baseline_output_value)
            print("   instance_output_value:", attribution.instance_output_value)
            print("   output_display_name:", attribution.output_display_name)
            print("   approximation_error:", attribution.approximation_error)
            print("   output_name:", attribution.output_name)
            output_index = attribution.output_index
            for output_index in output_index:
                print("   output_index:", output_index)

    for prediction in response.predictions:
        print(prediction)

In [9]:
instances = [[77, 10]]

In [10]:
from google.cloud import aiplatform


In [11]:
explain_tabular_sample("felipe-sandbox", "us-central1", "7246881138671616000", instances)

InvalidArgument: 400 {"error": "Unable to explain the requested instance(s) because: Nameless inputs are allowed only if there is a single input in the explanation metadata."}

In [22]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import joblib

class IrisClassifier:
    def __init__(self):
        self.X, self.y = load_iris(return_X_y=True)
        self.clf = self.train_model()
        self.iris_type = {
            0: 'setosa',
            1: 'versicolor',
            2: 'virginica'
        }

    def train_model(self) -> LogisticRegression:
        return LogisticRegression(solver='lbfgs',
                                  max_iter=1000,
                                  multi_class='multinomial').fit(self.X, self.y)

    def predict(self, features: dict):
        
        X = [features['sepal_length'], features['sepal_width'], features['petal_length'], features['petal_width']]
        print(X)
        prediction = self.clf.predict_proba([X])
        return {'class': self.iris_type[np.argmax(prediction)],
                'probability': round(max(prediction[0]), 2)}

model = IrisClassifier()
joblib.dump(model, "model.joblib")

['model.joblib']

In [23]:
model.predict(features={"sepal_length": 4.8,"sepal_width": 3,"petal_length": 1.4,"petal_width": 0.3})

[4.8, 3, 1.4, 0.3]


{'class': 'setosa', 'probability': 0.97}