### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [None]:
# Pin the oracle-ads (ADS) package to version 2.5.9; -U lets pip replace whichever version is already installed.
# NOTE(review): if ads was already imported in this kernel, a kernel restart is presumably needed for the new version to load - confirm.
!pip install -U oracle-ads==2.5.9

In [None]:
#import all needed libraries
import ads
import json
import logging
import oci
import os
import random
import shutil
import string
import tempfile
import uuid
import warnings
from os import path

from ads.catalog.model import ModelCatalog
from ads.common.model import ADSModel
from ads.automl.driver import AutoML
from ads.model.framework.automl_model import AutoMLModel
from ads.automl.provider import OracleAutoMLProvider
from ads.dataset.label_encoder import DataFrameLabelEncoder
from ads.dataset.factory import DatasetFactory
from oci.data_science import models
from ads.model.deployment import ModelDeployer, ModelDeploymentProperties
import numpy as np
import pandas as pd

# Keep notebook output quiet: the root logger reports ERROR and above, and
# Python warnings are suppressed.
logging.basicConfig(level=logging.ERROR, format='%(levelname)s:%(message)s')
warnings.filterwarnings('ignore')

# Apply the same ERROR threshold to each named logger used by this notebook.
for _logger_name in ('ads', 'ADS', 'ODSC-ModelDeployment'):
    logging.getLogger(_logger_name).setLevel(level=logging.ERROR)

In [None]:
# Download the Pokemon CSV (row 0 is the header), wrap it in an ADS dataset and
# mark the 'Legendary' column as the prediction target.
URL = "https://www2.cs.arizona.edu/classes/cs120/fall17/ASSIGNMENTS/assg02/Pokemon.csv"
raw_frame = pd.read_csv(URL, header=0)
ds = DatasetFactory.open(raw_frame).set_target('Legendary')
# Preview the first rows of the dataset.
ds.head()

In [None]:
# Plot the 'Legendary' target column to check whether its classes are imbalanced.
ds.plot("Legendary").show_in_notebook(figsize=(4,4))

In [None]:
# Balance the target classes with up-sampling; the up-sampled dataset replaces ds.
ds = ds.up_sample()

In [None]:
# Plot the 'Legendary' target again to confirm the classes are now balanced.
ds.plot("Legendary").show_in_notebook(figsize=(4,4))

In [None]:
# One-hot encode the categorical 'Type_1' and 'Type_2' columns and drop the
# '#' and 'Name' columns, then rebuild an ADS dataset targeting 'Legendary'.
#
# The dataset is converted to pandas once, and the dummy columns are produced
# by a single get_dummies call that replaces each encoded column in place
# (row for row). Attaching the dummies with index-based merges instead would
# multiply rows whenever the frame's index contains duplicate labels.
# NOTE(review): whether the up-sampled frame can carry duplicate index labels
# is not visible here - confirm; this form is safe either way.
df = ds.to_pandas()

# drop_first=True omits the first level of each encoded column; the generated
# columns are named '<column>_<level>' (e.g. prefix 'Type_1', 'Type_2').
df_ohe = pd.get_dummies(
    df.drop(columns=['#', 'Name']),
    columns=['Type_1', 'Type_2'],
    drop_first=True,
)

ds_ohe = DatasetFactory.open(df_ohe).set_target('Legendary')
ds_ohe.head()

In [None]:
# Split the encoded dataset into train and test partitions (test_size=0.15).
train, test = ds_ohe.train_test_split(test_size=0.15)

In [None]:
# Run the Oracle AutoML engine to tune a model automatically, restricting the
# search to the RandomForestClassifier algorithm.
# NOTE(review): n_jobs=-1 presumably means "use all available cores" and
# time_budget is presumably expressed in seconds - confirm against the ADS docs.
ml_engine = OracleAutoMLProvider(n_jobs=-1, loglevel=logging.ERROR)

oracle_automl = AutoML(train, provider=ml_engine)
# train() returns the tuned model together with a baseline model.
model, baseline = oracle_automl.train(model_list=['RandomForestClassifier'], 
                                      random_state = 42, time_budget = 500)

In [None]:
# Sanity-check the trained model by predicting on the first 10 rows of the test features.
model.predict(test.X.iloc[:10])

In [None]:
# Build an evaluator for the trained model on the test partition (with the
# training partition supplied as training_data) and render it in the notebook.
from ads.evaluations.evaluator import ADSEvaluator
evaluator = ADSEvaluator(test, models=[model], training_data=train)

evaluator.show_in_notebook()

In [None]:
# Choose the local directory that will store the model artifact and wrap the
# trained estimator in an AutoMLModel bound to that directory.
artifact_dir = '/home/datascience/Deeper Model Deployment Logs Analysis/Model'
print(f"Model artifact director: {artifact_dir}")
automl_model = AutoMLModel(estimator=model, artifact_dir=artifact_dir)

In [None]:
# Generate the model artifact. The same conda environment is declared for both
# training and inference, and the training partition supplies the X/y samples.
# NOTE(review): force_overwrite=True presumably allows files already present in
# artifact_dir to be replaced - confirm against the ADS docs.
conda_env = 'generalml_p37_cpu_v1'

automl_model.prepare(inference_conda_env=conda_env,
                     training_conda_env=conda_env,
                     use_case_type='binary_classification',
                     X_sample=train.X,
                     y_sample=train.y,
                     force_overwrite = True)

In [None]:
# Print the current contents of score.py from the artifact directory so it can
# be reviewed before it is replaced.
score_path = path.join(artifact_dir, "score.py")
with open(score_path, 'r') as score_file:
    print(score_file.read())

In [None]:
# Replace score.py in the artifact directory with a customised inference script.
# The text of `score` is written verbatim to <artifact_dir>/score.py. The script:
#   * load_model()     - unpickles model.pkl from the script's own directory
#                        (result cached with lru_cache) after initialising the
#                        automl logger/engine;
#   * pre_inference()  - returns the input unchanged;
#   * post_inference() - converts the prediction to a plain list via .tolist();
#   * predict()        - parses a JSON string or dict into a DataFrame, runs the
#                        model, logs the predictions with logging.info and
#                        returns {'prediction': [...]}.
# NOTE(review): `model=load_model()` is a default argument, so the model is
# loaded when score.py is imported rather than on the first predict() call.
score = '''
import json
import os
import sys
from cloudpickle import cloudpickle
from functools import lru_cache
import logging
import sys
import automl


model_name = 'model.pkl'


"""
   Inference script. This script is used for prediction by scoring server when schema is known.
"""

def init_automl_logger():
    logger = logging.getLogger("automl")
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.ERROR)
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    automl.init(engine="local", engine_opts={"n_jobs": 1}, logger=logger)


@lru_cache(maxsize=10)
def load_model(model_file_name=model_name):
    """
    Loads model from the serialized format

    Returns
    -------
    model:  a model instance on which predict API can be invoked
    """
    init_automl_logger()
    model_dir = os.path.dirname(os.path.realpath(__file__))
    if model_dir not in sys.path:
        sys.path.insert(0, model_dir)
    contents = os.listdir(model_dir)
    if model_file_name in contents:
        print(f'Start loading {model_file_name} from model directory {model_dir} ...')
        with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), model_file_name), "rb") as file:
            loaded_model = cloudpickle.load(file)

        print("Model is successfully loaded.")
        return loaded_model
    else:
        raise Exception(f'{model_file_name} is not found in model directory {model_dir}')


def pre_inference(data):
    """
    Preprocess data

    Parameters
    ----------
    data: Data format as expected by the predict API of the core estimator.

    Returns
    -------
    data: Data format after any processing.

    """
    return data

def post_inference(yhat):
    """
    Post-process the model results

    Parameters
    ----------
    yhat: Data format after calling model.predict.

    Returns
    -------
    yhat: Data format after any processing.

    """
    return yhat.tolist()

def predict(data, model=load_model()):
    """
    Returns prediction given the model and data to predict

    Parameters
    ----------
    model: Model instance returned by load_model API
    data: Data format as expected by the predict API of the core estimator. For eg. in case of sckit models it could be numpy array/List of list/Pandas DataFrame

    Returns
    -------
    predictions: Output from scoring server
        Format: {'prediction': output from model.predict method}

    """
    from pandas import read_json, DataFrame
    from io import StringIO
    X = read_json(StringIO(data)) if isinstance(data, str) else DataFrame.from_dict(data)
    features = pre_inference(X)
    yhat = post_inference(
        model.predict(features)
    )
    logging.info(yhat)
    return {'prediction': yhat}
'''
# Overwrite the existing score.py with the script text above.
with open(path.join(artifact_dir, "score.py"), 'w') as f:
    f.write(score)

In [None]:
# Display the model's taxonomy metadata as a DataFrame.
automl_model.metadata_taxonomy.to_dataframe()

In [None]:
# Display the model's status summary.
automl_model.summary_status()

In [None]:
# Verify the model's prediction on the first five rows of the test features.
automl_model.verify(test.X[0:5])

In [None]:
# Display the model's runtime information.
automl_model.runtime_info

In [None]:
# Save the model to the Model Catalog; the returned id is used later to deploy it.
model_id = automl_model.save(display_name='Deeper Analysis using AutoML Model')

In [None]:
# Create an OCI log group containing an access log and a predict log, which
# will receive the model deployment logs.
from ads.common.oci_logging import OCILogGroup

# A random UUID suffix is appended to the log group name and to each log name.
log_group_name = f"ModelDeployment-Demo-{uuid.uuid4()}"
access_log_name = f"ModelDeployment-Demo-Access_Log-{uuid.uuid4()}"
predict_log_name = f"ModelDeployment-Demo-Predict_Log-{uuid.uuid4()}"

# The log group comes first: both logs are created through it.
log_group = OCILogGroup(display_name=log_group_name).create()
log_group_ocid = log_group.id
print(f"Log group OCID: {log_group_ocid}")

# Access log inside the log group.
access_log = log_group.create_log(access_log_name)
access_log_ocid = access_log.id
print(f"Access log OCID: {access_log_ocid}")

# Predict log inside the same log group.
predict_log = log_group.create_log(predict_log_name)
predict_log_ocid = predict_log.id
print(f"Predict log OCID: {predict_log_ocid}")

In [None]:
%%time
# Describe the deployment with ModelDeploymentProperties: the saved model id, a
# display name, the project and compartment OCIDs read from the notebook
# session's environment variables, the logging configuration and the instance
# configuration.
# NOTE(review): with_logging_configuration presumably takes (access log group,
# access log, predict log group, predict log); both logs live in the same log
# group here - confirm the parameter order against the ADS docs.
compartment_id = os.environ['NB_SESSION_COMPARTMENT_OCID']
project_id = os.environ['PROJECT_OCID']
model_deployment_properties = ModelDeploymentProperties(
    model_id
).with_prop(
    'display_name', "Model Deployment Demo using ADS"
).with_prop(
    "project_id", project_id
).with_prop(
    "compartment_id", compartment_id
).with_logging_configuration(
    log_group_ocid, access_log_ocid, log_group_ocid, predict_log_ocid
).with_instance_configuration(
    config={"INSTANCE_SHAPE":"VM.Standard2.1", "INSTANCE_COUNT":"1",'bandwidth_mbps':10}
)

# Deploy the model with the properties above.
# NOTE(review): max_wait_time is presumably in seconds (2700 = 45 minutes) - confirm.
deployer = ModelDeployer()
deployment = deployer.deploy(
    model_deployment_properties,
    max_wait_time = 2700
)

# Report the deployment id and its state.
deployment_id = deployment.model_deployment_id
print(f"Deployment {deployment_id} is {deployment.state.name}")

In [None]:
# Smoke-test the model deployment with the first row of the training features serialized as JSON.
deployment.predict(train.X[0:1].to_json())

In [None]:
# Invoke the model deployment repeatedly, one test row per request with a
# random pause between requests, to generate access and predict logs.
from time import sleep
import random

random.seed(42)  # fixed seed so the sequence of pauses is reproducible

# Send at most 221 requests, but never index past the end of the test set:
# slicing beyond the last row yields an empty frame, which would be sent as an
# empty payload.
n_requests = min(221, len(test.X))
for i in range(n_requests):
    w = random.randrange(0, 60, 2)  # even number of seconds in [0, 58]
    try:
        # .iloc makes the positional, single-row slice explicit.
        deployment.predict(test.X.iloc[i:i+1].to_json())
        print('Model Deployment was invoke successfully.')
    except ValueError as e:
        # NOTE(review): only ValueError is treated as a recoverable failure;
        # any other exception type still stops the loop - confirm that is intended.
        print(f'Failed to invoke the Model Deployment with the following error: {e}')
    # No pause is needed after the final request.
    if i < n_requests - 1:
        sleep(w)