In [1]:
from time import time
start=time()

In [2]:
import os 
assert os.path.exists("/opt/deeplearning/metadata/env_version"), "Should be running in Notebook"

In [3]:
#! pip install --user --upgrade google-cloud-aiplatform
#! pip install --user  xgboost==1.2
#! pip install --user  witwidget
#!pip install --user wheel
#! pip install --user  pandas



In [4]:
csvfile='fraud-header-withy.csv'
gs_csvfile=f'gs://joshuafraud/{csvfile}' 
#!gsutil cp $gs_csvfile .

In [5]:
import pandas as pd

import xgboost as xgb

import numpy as np

import collections

import witwidget

import datetime
import json

from google.cloud import aiplatform

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.utils import shuffle
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder

In [26]:
REGION = "us-west1"  
 
shell_output = !gcloud config get-value project
PROJECT_ID = shell_output[0]
assert PROJECT_ID

ERROR! Session/line number was not unique in database. History logging moved to new session 5


In [27]:
proj = PROJECT_ID.replace("-", "")
date_s= datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
STUDY_DISPLAY_NAME = f"{proj}_study_{date_s}"
ENDPOINT = REGION + "-aiplatform.googleapis.com"
PARENT = f"projects/{PROJECT_ID}/locations/{REGION}"

print("ENDPOINT", ENDPOINT,"REGION", REGION, "PARENT", PARENT,"PROJECT_ID", PROJECT_ID)

ENDPOINT us-west1-aiplatform.googleapis.com REGION us-west1 PARENT projects/joshua-playground/locations/us-west1 PROJECT_ID joshua-playground


In [28]:
 COLUMN_NAMES =  {
    "isFraud": np.int8,
    "type_CASH_IN": np.int8,
    "type_CASH_OUT": np.int8,
    "type_DEBIT": np.int8,
    "type_PAYMENT": np.int8,
    "type_TRANSFER": np.int8,
    "amount_nml": np.float64,
    "oldBalanceOrigin_nml": np.float64,
    "newBalanceOrigin_nml": np.float64,
    "oldBalanceDestination_nml": np.float64,
    "newBalanceDestination_nml": np.float64,
    "peakHours":np.int8
}

In [29]:
data = pd.read_csv(
  csvfile,
  index_col=None,
  dtype=COLUMN_NAMES
)
data = data.dropna()
data = shuffle(data, random_state=2)

data.head()

Unnamed: 0,isFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount_nml,oldBalanceOrigin_nml,newBalanceOrigin_nml,oldBalanceDestination_nml,newBalanceDestination_nml,peakHours
4801897,0,0,1,0,0,0,0.070289,-0.28214,-0.292442,-0.323814,-0.272905,0
2779475,0,0,1,0,0,0,0.324777,-0.284135,-0.292442,-0.323814,-0.231079,0
784465,0,0,1,0,0,0,-0.105303,-0.276696,-0.292442,-0.323814,-0.319018,0
5635648,0,0,1,0,0,0,-0.211367,-0.288381,-0.292442,-0.323814,-0.307404,0
301298,0,1,0,0,0,0,-0.101613,5.590381,5.555191,-0.095958,0.011459,0


In [43]:
labels = data['isFraud'].values
data = data.drop(columns=['isFraud'])
data.head()

In [44]:
print("length of data", len(data))

length of data 6362620


In [18]:
# eta is learning rate
param_eta = {"parameter_id": "eta", "double_value_spec": {"min_value": 0, "max_value": 1}}
param_min_child_weight = {"parameter_id": "min_child_weight", "double_value_spec": {"min_value": 0, "max_value": 3}}
param_alpha = {"parameter_id": "alpha", "double_value_spec": {"min_value": 0, "max_value": 3}}
param_max_depth = {"parameter_id": "max_depth", "integer_value_spec": {"min_value": 3, "max_value": 9}}

balanced_accuracy = {"metric_id": "balanced_accuracy", "goal": "MAXIMIZE"}
parameter_defs=[  param_eta,param_min_child_weight,param_alpha,param_max_depth ]
study = {
    "display_name": STUDY_DISPLAY_NAME,
    "study_spec": {
        "algorithm": "ALGORITHM_UNSPECIFIED",
        "parameters": parameter_defs,
        "metrics": [balanced_accuracy],
    },
}

In [19]:
int_params = { p["parameter_id"] for p in parameter_defs if "integer_value_spec" in p  }

In [20]:
vizier_client = aiplatform.gapic.VizierServiceClient(client_options=dict(api_endpoint=ENDPOINT))
study = vizier_client.create_study(parent=PARENT, study=study)

In [21]:
def create_metric(trial_id, params, data, labels):
    print(round(time()-start,1), f"seconds elapsed; Trial #{trial_id.split('/')[-1]}:" , params )

    x,y = data.values, labels
    x_train, x_test, y_train, y_test = train_test_split(x,y)

    model = xgb.XGBClassifier(objective='reg:logistic',   **params)

    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred.round())
    print("balanced_accuracy", balanced_accuracy)
    
    balanced_accuracy_metric = {"metric_id": "balanced_accuracy", "value": balanced_accuracy}
 
    return [balanced_accuracy_metric]

In [22]:
def trial_params_to_dict_with_rounded_ints(trial_params):
    ret = {}
    for p in trial_params:
        if p.parameter_id in int_params:
            ret[p.parameter_id] = int(round(p.value))
        else:
            ret[p.parameter_id]=p.value
    return ret      

In [39]:
def do_trial(trial_id, client_id, suggestions_per_request, study_name):
    suggest_response = vizier_client.suggest_trials(
        {
            "parent": study_name,
            "suggestion_count": suggestions_per_request,
            "client_id": client_id,
        }
    )

    for suggested_trial in suggest_response.result().trials:
        trial_id = suggested_trial.name.split("/")[-1]
        trial = vizier_client.get_trial({"name": suggested_trial.name})

        if trial.state in ["COMPLETED", "INFEASIBLE"]:
            continue
        
        parameters = trial_params_to_dict_with_rounded_ints(trial.parameters) 
        
        tries = 0
        while tries <3:
            tries +=1
            try:
                vizier_client.add_trial_measurement(
                        {
                            "trial_name": suggested_trial.name,
                            "measurement": {
                                "metrics": create_metric(suggested_trial.name, parameters, data, labels)
                            },
                        }
                    )
            except Exception as e:
                print("Try number", tries, "failed", e)
                time.sleep(2**tries)#exp backoff
                
        response = vizier_client.complete_trial(
                {"name": suggested_trial.name, "trial_infeasible": False} )
        
        
    return trial_id

In [48]:
client_id = "client1"  
suggestions_per_request = 5
trials_to_do = 5

print("client_id:", client_id, ", suggestion_count_per_request:", suggestions_per_request, ", trials_to_do:", trials_to_do)

client_id: client1 , suggestion_count_per_request: 5 , trials_to_do: 5


In [49]:
print("Before running trials", round(time()-start,1), "seconds elapsed")

Before running trials 4976.1 seconds elapsed


In [50]:
trial_id = 0

# trial_id can be string, parsed from the suggested_trial.name, or else an int returned from do_trial()
while int(trial_id) < trials_to_do:
    trial_id = do_trial(trial_id, client_id,suggestions_per_request, study.name) 


4979.6 seconds elapsed; Trial #10: {'alpha': 3.0, 'eta': 1.0, 'max_depth': 3, 'min_child_weight': 3.0}
balanced_accuracy 0.9069552988882963
5323.5 seconds elapsed; Trial #10: {'alpha': 3.0, 'eta': 1.0, 'max_depth': 3, 'min_child_weight': 3.0}
balanced_accuracy 0.9034509865420439
5654.2 seconds elapsed; Trial #10: {'alpha': 3.0, 'eta': 1.0, 'max_depth': 3, 'min_child_weight': 3.0}
balanced_accuracy 0.9102959615625328
5986.0 seconds elapsed; Trial #11: {'alpha': 3.0, 'eta': 0.45282161038241964, 'max_depth': 8, 'min_child_weight': 2.1206757293045646}
balanced_accuracy 0.9359411452930418
6623.9 seconds elapsed; Trial #11: {'alpha': 3.0, 'eta': 0.45282161038241964, 'max_depth': 8, 'min_child_weight': 2.1206757293045646}
balanced_accuracy 0.9252351065966394
7308.6 seconds elapsed; Trial #11: {'alpha': 3.0, 'eta': 0.45282161038241964, 'max_depth': 8, 'min_child_weight': 2.1206757293045646}
balanced_accuracy 0.9194562323247502
8016.8 seconds elapsed; Trial #12: {'alpha': 0.0, 'eta': 0.0, 'max_

In [51]:
optimal_trials = vizier_client.list_optimal_trials({"parent": study.name})

print(f"optimal_trials: {optimal_trials}")

optimal_trials: optimal_trials {
  name: "projects/401966870909/locations/us-west1/studies/3589342127350/trials/1"
  state: SUCCEEDED
  parameters {
    parameter_id: "alpha"
    value {
      number_value: 0.15097435890285424
    }
  }
  parameters {
    parameter_id: "eta"
    value {
      number_value: 0.9611885092660706
    }
  }
  parameters {
    parameter_id: "max_depth"
    value {
      number_value: 8.0
    }
  }
  parameters {
    parameter_id: "min_child_weight"
    value {
      number_value: 0.46708410711467974
    }
  }
  final_measurement {
    metrics {
      metric_id: "balanced_accuracy"
      value: 1.0
    }
  }
  measurements {
    metrics {
      metric_id: "balanced_accuracy"
      value: 1.0
    }
  }
  start_time {
    seconds: 1640009394
  }
  end_time {
    seconds: 1640009804
  }
  client_id: "client1"
}
optimal_trials {
  name: "projects/401966870909/locations/us-west1/studies/3589342127350/trials/2"
  state: SUCCEEDED
  parameters {
    parameter_id: "al

In [52]:
#vizier_client.delete_study({"name": study.name})