### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"

In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, confusion_matrix, make_scorer, f1_score
from sklearn.model_selection import train_test_split

import automlx
from automlx import init

In [3]:
# ウォレットファイルを使用する場合は、`wallet_location` に zip ファイルのパスを指定する。
connection_parameters = {
    "user_name": "oml_user",
    "password": "",
    "service_name": "adwml_high",
    "wallet_location": "/home/datascience/Wallet_ADWML.zip",
}

In [4]:
# ADWのテーブルからデータを抽出し、データセットを作成する。
import ads

df = pd.DataFrame.ads.read_sql(
    "SELECT * FROM oml_user.トレニング",
    connection_parameters=connection_parameters,
)

In [5]:
# 特徴列の抽出「ターゲット列の削除」
X = df.drop(columns=['顧客優先順位'])

# ターゲット列の抽出
y = df['顧客優先順位']

from sklearn.model_selection import train_test_split

# データセットを分割し、80%をトレーニング用、20%をテスト用にする。
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 分割データセット情報を表示する。
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# トレーニングセットデータの一部を表示する。
print(X_train)
print(y_train)

X_train shape: (63968, 9)
X_test shape: (15992, 9)
y_train shape: (63968,)
y_test shape: (15992,)
        クレジット利用率  クレジット履歴年齢      毎月の投資額        月次残高          年収          月額手当  \
34383  26.765401        390  376.073072  765.039178  105997.470   8711.122500   
33548  39.652274        335  753.362868  498.082941  143252.760  12131.730000   
21982  32.046885        188  170.334070  426.535562   54420.450   4681.037500   
16871  35.969331        234  238.824131  689.996171  108491.280   9272.940000   
18817  26.818588        347   86.765946  405.472766   53003.180   4275.931667   
...          ...        ...         ...         ...         ...           ...   
6265   31.075220        394  103.522443  456.461896   43592.450   3758.704167   
54886  25.287418        150   72.941844  312.800973    9917.025   1099.418750   
76820  39.873482        170   77.310889  309.023358   42216.930   3524.077500   
860    33.471395        245  758.070487  229.212547  100433.580   8074.465000   
15795  36.4

In [6]:
init(engine='local')

In [7]:
# automlxの自動機械学習機能を利用して、分類のモデルを作成する。
est1 = automlx.Pipeline(task='classification')
est1.fit(X_train, y_train)

[2024-07-23 02:40:27,831] [automlx.interface] Dataset shape: (63968,9)
[2024-07-23 02:40:27,920] [automlx.data_transform] Running preprocessing. Number of features: 10
[2024-07-23 02:40:28,189] [automlx.data_transform] Preprocessing completed. Took 0.268 secs
[2024-07-23 02:40:28,258] [automlx.process] Running Model Generation
[2024-07-23 02:40:28,302] [automlx] Provided model (TorchMLPClassifier) is not supported.Supported models are: ['CatBoostClassifier', 'LGBMClassifier', 'SVC', 'AdaBoostClassifier', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'GradientBoostingClassifier', 'KNeighborsClassifier', 'LinearSVC', 'LogisticRegressionClassifier', 'RandomForestClassifier', 'XGBClassifier']
[2024-07-23 02:40:28,302] [automlx.process] KNeighborsClassifier is disabled. The KNeighborsClassifier model is only recommended for datasets with less than 10000 samples and 1000 features.
[2024-07-23 02:40:28,303] [automlx.process] SVC is disabled. The SVC model is only recommended

<automlx._interface.classifier.AutoClassifier at 0x7f0f49cec460>

In [8]:
# モデルの精度を算出する。
y_proba = est1.predict_proba(X_test)
score_default = roc_auc_score(y_test, y_proba, multi_class='ovr')

print(f'Score on test data : {score_default}')

Score on test data : 0.910922612973947


<summary><font size="5">ここから、自動機械学習で作成したモデルをOCIにデプロイします。</font></summary>

In [9]:
import ads
import logging
import os
import random
import tempfile
import warnings

from ads.common.model_metadata import UseCaseType
from ads.model.generic_model import GenericModel
from shutil import rmtree

logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.ERROR)
warnings.filterwarnings("ignore")

In [10]:
ads.set_auth(auth="resource_principal")

In [11]:
artifact_dir = tempfile.mkdtemp()
print(f"Model artifact director: {artifact_dir}")
generic_model = GenericModel(estimator=est1, artifact_dir=artifact_dir)

Model artifact director: /tmp/tmp_vv0358q
[2024-07-23 03:00:49,588] [ads.common] In the future model input will be serialized by `cloudpickle` by default. Currently, model input are serialized into a dictionary containing serialized input data and original data type information.Set `model_input_serializer="cloudpickle"` to use cloudpickle model input serializer.


In [12]:
conda_env = "automlx242_p38_cpu_x86_64_v1"

generic_model.prepare(
    inference_conda_env=conda_env,
    training_conda_env=conda_env,
    use_case_type=UseCaseType.MULTINOMIAL_CLASSIFICATION,
    X_sample=X_test,
    y_sample=y_test,
    force_overwrite=True
)

                                                                                                                                                                                                      ?, ?it/s]

algorithm: null
artifact_dir:
  /tmp/tmp_vv0358q:
  - - output_schema.json
    - .model-ignore
    - model.pkl
    - runtime.yaml
    - score.py
    - input_schema.json
framework: null
model_deployment_id: null
model_id: null

In [13]:
os.listdir(artifact_dir)

['output_schema.json',
 '.model-ignore',
 'model.pkl',
 'runtime.yaml',
 'score.py',
 'input_schema.json']

In [14]:
generic_model.verify(X_test.iloc[:2], auto_serialize_data=True)

Start loading model.pkl from model directory /tmp/tmp_vv0358q ...
Model is successfully loaded.


{'prediction': ['低い', '高い']}

In [15]:
# モデルをカテゴリに保存する。
model_id = generic_model.save(display_name="OACとData Science Cloudでの顧客優先順位判断1.0")

Start loading model.pkl from model directory /tmp/tmp_vv0358q ...
Model is successfully loaded.
['output_schema.json', '.model-ignore', 'model.pkl', 'runtime.yaml', 'score.py', 'input_schema.json']


loop1:   0%|          | 0/4 [00:00<?, ?it/s]

[2024-07-23 03:04:34,159] [ads.model.datascience_model] the JSON object must be str, bytes or bytearray, not Schema
[2024-07-23 03:04:34,160] [ads.model.datascience_model] the JSON object must be str, bytes or bytearray, not Schema


In [16]:
# モデルをOCIにデプロイする。
deploy = generic_model.deploy(display_name="OACとData Science Cloudでの顧客優先順位判断1.0")

Model Deployment OCID: ocid1.datasciencemodeldeployment.oc1.ap-tokyo-1.amaaaaaaak7gbriatbefqwjzkgz6xvv24qeposxmxbt7tpejwdukjmizwh3a


Creating model deployment:   0%|          | [00:00<?, ?it/s]

In [17]:
# デプロイの結果を確認する。
generic_model.summary_status()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Actions Needed
Step,Status,Details,Unnamed: 3_level_1
initiate,Done,Initiated the model,
prepare(),Done,Generated runtime.yaml,
prepare(),Done,Generated score.py,
prepare(),Done,Serialized model,
prepare(),Done,"Populated metadata(Custom, Taxonomy and Provenance)",
verify(),Done,Local tested .predict from score.py,
save(),Done,Conducted Introspect Test,
save(),Done,Uploaded artifact to model catalog,
deploy(),ACTIVE,Deployed the model,
predict(),Available,Called deployment predict endpoint,


In [18]:
# デプロイしたモデルをテストする。
generic_model.predict(X_test.iloc[:2], auto_serialize_data=True)

{'prediction': ['低い', '高い']}

<summary><font size="5">ここから、デプロイされたAutoMLモデルを呼び出し、顧客の優先順位を予測します。優先順位の高い顧客を先に訪問します。</font></summary>

In [22]:
connection_parameters = {
    "user_name": "oml_user",
    "password": "",
    "service_name": "adwml_high",
    "wallet_location": "/home/datascience/Wallet_ADWML.zip",
}

In [23]:
import ads
import pandas as pd

# モデルで予測する顧客のデータをADWのテーブルから抽出する。
df = pd.DataFrame.ads.read_sql(
    "SELECT * FROM oml_user.テスト",
    connection_parameters=connection_parameters,
)

In [24]:
X_NewCustomer = df.drop(columns=['顧客優先順位'])

y_NewCustomer = df['顧客優先順位']

In [25]:
# 予測する顧客の特徴量データを確認する。
X_NewCustomer

Unnamed: 0,クレジット利用率,クレジット履歴年齢,毎月の投資額,月次残高,年収,月額手当,変更したクレジット限度額,未払い債務,月額合計EMI
0,33.703590,274,93.844445,819.864338,88088.070,7248.672500,8.27,195.23,51.158467
1,29.115004,345,153.143286,388.327790,56125.500,4875.125000,17.89,370.22,81.822857
2,24.352594,165,43.781298,365.958624,32198.230,2862.185833,18.27,857.14,126.478662
3,23.153949,279,81.378283,302.989946,12246.215,1181.517917,9.32,641.98,13.783563
4,28.711566,355,331.949599,161.202087,31040.140,2474.678333,13.77,1288.66,44.316148
...,...,...,...,...,...,...,...,...,...
19995,27.163649,382,71.054239,457.854743,31741.570,2858.130833,7.78,1231.17,16.904101
19996,29.655650,256,1271.089453,49.736069,172534.160,14344.846667,5.99,1343.50,393.659144
19997,33.066420,377,174.002546,344.186454,28762.680,2281.890000,15.23,1391.77,0.000000
19998,30.934892,142,147.426314,221.598191,20592.970,1843.080833,19.78,1555.94,85.283579


In [27]:
from ads.model.generic_model import GenericModel

generic_model_NewCustomer = GenericModel.from_model_deployment(
    "ocid1.datasciencemodeldeployment.oc1.ap-tokyo-1.amaaaaaaak7gbriatbefqwjzkgz6xvv24qeposxmxbt7tpejwdukjmizwh3a",
    model_file_name="model.joblib",
    artifact_dir="ocid1.datasciencemodel.oc1.ap-tokyo-1.amaaaaaaak7gbriarp7artmflsrul22nn4pfrkzvywuyxodbmdldbuhzwp3q.zip",
    force_overwrite=True,
)

[2024-07-23 04:20:37,836] [ads.model.datascience_model] the JSON object must be str, bytes or bytearray, not Schema
[2024-07-23 04:20:37,837] [ads.model.datascience_model] the JSON object must be str, bytes or bytearray, not Schema


loop1:   0%|          | 0/4 [00:00<?, ?it/s]

[2024-07-23 04:21:39,317] [ads.common] In the future model input will be serialized by `cloudpickle` by default. Currently, model input are serialized into a dictionary containing serialized input data and original data type information.Set `model_input_serializer="cloudpickle"` to use cloudpickle model input serializer.


In [28]:
# 顧客の特徴データで優先順位を予測する。
predictions = generic_model_NewCustomer.predict(X_NewCustomer, auto_serialize_data=True)

# 予測結果をDataFrameに変換する。
pred_df = pd.DataFrame(predictions)

# `X_NewCustomer`に予測結果を追加する。
X_NewCustomer_with_predictions = X_NewCustomer.copy()
X_NewCustomer_with_predictions['予測した顧客優先順位'] = pred_df['prediction']

# 新しいデータフレームを表示する。
print(X_NewCustomer_with_predictions)

        クレジット利用率  クレジット履歴年齢       毎月の投資額        月次残高          年収  \
0      33.703590        274    93.844445  819.864338   88088.070   
1      29.115004        345   153.143286  388.327790   56125.500   
2      24.352594        165    43.781298  365.958624   32198.230   
3      23.153949        279    81.378283  302.989946   12246.215   
4      28.711566        355   331.949599  161.202087   31040.140   
...          ...        ...          ...         ...         ...   
19995  27.163649        382    71.054239  457.854743   31741.570   
19996  29.655650        256  1271.089453   49.736069  172534.160   
19997  33.066420        377   174.002546  344.186454   28762.680   
19998  30.934892        142   147.426314  221.598191   20592.970   
19999  35.682471        247    74.056233  461.459350   33049.870   

               月額手当  変更したクレジット限度額    未払い債務     月額合計EMI 予測した顧客優先順位  
0       7248.672500          8.27   195.23   51.158467          中  
1       4875.125000         17.89   370.22   81

In [36]:
# 予測した結果をADWに保存する。
X_NewCustomer_with_predictions['顧客ID'] = X_NewCustomer_with_predictions.index + 1

X_NewCustomer_with_predictions.ads.to_sql(
    "予測結果",
    connection_parameters=connection_parameters, # Should contain wallet location if you are connecting to ADB
    if_exists="replace"
)

In [71]:
from sklearn.metrics import precision_score
import numpy as np

# dictから予測ラベルを抽出する
predictions_dict = generic_model_NewCustomer.predict(X_NewCustomer, auto_serialize_data=True)
predictions = predictions_dict.get('prediction', [])

# 予測結果の長さを出力する
print(f"予測結果の数: {len(predictions)}")

# 予測結果を NumPy に変換する
predictions = np.array(predictions)

# 実際の値
y_NewCustomer = df['顧客優先順位'].values

# y_NewCustomer が一維配列であることを確認する
y_NewCustomer = np.ravel(y_NewCustomer)

# 実際の値の長さを出力する
print(f"実際値の数: {len(y_NewCustomer)}")

# サンプル数が一致していることを確認する
if len(predictions) == len(y_NewCustomer):
    # 精度を計算する（実際値と予測値の形式が一致している必要がある）
    # 予測値が数字でない場合（例: '低い', '中', '高い'）、数字形式に変換する必要がある
    # 仮定: '低い': 0, '中': 1, '高い': 2
    label_map = {'低い': 0, '中': 1, '高い': 2}
    y_NewCustomer_numeric = np.array([label_map[label] for label in y_NewCustomer])
    predictions_numeric = np.array([label_map[label] for label in predictions])

    # 精度を計算する
    precision = precision_score(y_NewCustomer_numeric, predictions_numeric, average='weighted')

    # 精度を出力する
    print("精度スコア:", precision)
else:
    print("エラー: 予測値と実際の値のサンプル数が異なります。")


予測結果の数: 20000
実際値の数: 20000
精度スコア: 0.7956540828640349
