### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import pandas as pd
fraud3 = pd.read_csv('fraud3.csv')

In [2]:
import numpy as np
fraud3 = np.array(fraud3)

In [3]:
!pip install ray

Collecting ray
  Using cached ray-2.47.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (20 kB)
Collecting msgpack<2.0.0,>=1.0.0 (from ray)
  Using cached msgpack-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting protobuf!=3.19.5,>=3.15.3 (from ray)
  Using cached protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Using cached ray-2.47.0-cp311-cp311-manylinux2014_x86_64.whl (68.9 MB)
Using cached msgpack-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (429 kB)
Using cached protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl (321 kB)
Installing collected packages: protobuf, msgpack, ray
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [ray][32m2/3[0m [ray]
[1A[2KSuccessfully installed msgpack-1.1.1 protobuf-6.31.1 ray-2.47.0


In [4]:
!pip install -U "ray[train]"

Collecting tensorboardX>=1.9 (from ray[train])
  Using cached tensorboardx-2.6.4-py3-none-any.whl.metadata (6.2 kB)
Collecting pyarrow>=9.0.0 (from ray[train])
  Using cached pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Using cached pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (42.3 MB)
Using cached tensorboardx-2.6.4-py3-none-any.whl (87 kB)
Installing collected packages: tensorboardX, pyarrow
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [pyarrow]m1/2[0m [pyarrow]
[1A[2KSuccessfully installed pyarrow-20.0.0 tensorboardX-2.6.4


In [5]:
from ray.train import ScalingConfig

In [8]:
# Single worker with a CPU
scaling_config = ScalingConfig(num_workers=4, use_gpu=False)

In [9]:
class K_Means(object):
    # Initialize input values n_clusters and max_iter   
    def __init__(self, n_clusters, max_iter):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    # Function that assigns points to a cluster
    def assign_points_to_cluster(self, X):
        # Label points according to the minimum euclidean distance
        self.labels_ = [self._nearest(self.cluster_centers_, x) for x in X]
        # Map labels to data points
        indices=[]
        for j in range(self.n_clusters):
            cluster=[]
            for i, l in enumerate(self.labels_):
                if l==j: cluster.append(i)
            indices.append(cluster)
        X_by_cluster = [X[i] for i in indices]
        return X_by_cluster
    
    # Function that randomly selects initial centroids
    def initial_centroid(self, X):
        initial = np.random.permutation(X.shape[0])[:self.n_clusters]
        return X[initial]

    # Function that updates centroids and repeats 
    # assign_points_to_cluster until convergence or max_iter is reached
    def train_fun(self, X):  
        # initialize centroids      
        self.cluster_centers_ = self.initial_centroid(X)
        # process of assigning points to clusters until convergence or until max_iter is reached
        for i in range(self.max_iter):
            X_by_cluster = self.assign_points_to_cluster(X)
            # calculate the new centers 
            new_centers=[c.sum(axis=0)/len(c) for c in X_by_cluster]
            new_centers = [arr.tolist() for arr in new_centers]
            old_centers=self.cluster_centers_
            # if the new centroid are the same as the old centroids then the algorithm has converged
            if np.all(new_centers == old_centers): 
                self.number_of_iter=i
                break;
            else: 
                # set self.cluster_centers_ as new centers 
                self.cluster_centers_ = new_centers
        self.number_of_iter=i
        return self
    
    # Function that calculates the minimum euclidean distance
    def _nearest(self, clusters, x):
        return np.argmin([self._distance(x, c) for c in clusters])
    
    # Function to calculate euclidean distance between two points
    def _distance(self, a, b):
        return np.sqrt(((a - b)**2).sum())

    # Function that returns predicted clusters for each point
    def predict(self, X):
        return self.labels_

In [17]:
from ray.train.torch import TorchTrainer
import time
kmeans = K_Means(n_clusters=2, max_iter=300)
trainer = TorchTrainer(kmeans.train_fun, scaling_config=scaling_config)
debut = time.time()
trainer.fit()
fin = time.time()
print(f'temps {fin-debut:.4f}')

2025-06-16 14:28:52,863	INFO worker.py:1917 -- Started a local Ray instance.
2025-06-16 14:28:53,866	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `<FrameworkTrainer>(...)`.
2025-06-16 14:28:53,869	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2025-06-16 14:28:54 (running for 00:00:00.11)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-06-16 14:28:59 (running for 00:00:05.13)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-06-16 14:29:04 (running for 00:00:10.15)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Cur



== Status ==
Current time: 2025-06-16 14:29:54 (running for 00:01:00.36)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-06-16 14:29:59 (running for 00:01:05.38)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-06-16 14:30:04 (running for 00:01:10.41)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Cur



== Status ==
Current time: 2025-06-16 14:30:54 (running for 00:02:00.61)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-06-16 14:30:59 (running for 00:02:05.63)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-06-16 14:31:04 (running for 00:02:10.65)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Cur



== Status ==
Current time: 2025-06-16 14:31:54 (running for 00:03:00.86)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-06-16 14:31:59 (running for 00:03:05.88)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-06-16 14:32:04 (running for 00:03:10.91)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Cur

2025-06-16 14:32:34,617	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/datascience/ray_results/TorchTrainer_2025-06-16_14-28-50' in 0.0024s.
2025-06-16 14:32:34,618	INFO tune.py:1041 -- Total run time: 220.75 seconds (220.73 seconds for the tuning loop).
Resume training with: <FrameworkTrainer>.restore(path="/home/datascience/ray_results/TorchTrainer_2025-06-16_14-28-50", ...)
- TorchTrainer_3ac50_00000: FileNotFoundError('Could not fetch metrics for TorchTrainer_3ac50_00000: both result.json and progress.csv were not found at /home/datascience/ray_results/TorchTrainer_2025-06-16_14-28-50/TorchTrainer_3ac50_00000_0_2025-06-16_14-28-53')


== Status ==
Current time: 2025-06-16 14:32:34 (running for 00:03:40.73)
Using FIFO scheduling algorithm.
Logical resource usage: 0/3 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_14-28-50_607097_415/artifacts/2025-06-16_14-28-53/TorchTrainer_2025-06-16_14-28-50/driver_artifacts
Number of trials: 1/1 (1 PENDING)


temps 224.0276


In [8]:
ray.shutdown()

In [9]:
import ray
import joblib
from ray.util.joblib import register_ray
register_ray()
ray.init(num_cpus = 4, include_dashboard=True, dashboard_host="0.0.0.0", dashboard_port=8265)

2025-06-16 20:17:10,584	INFO worker.py:1908 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://10.1.30.200:8265 [39m[22m


0,1
Python version:,3.11.9
Ray version:,2.47.0
Dashboard:,http://10.1.30.200:8265


In [10]:
import IPython.display
IPython.display.IFrame(src="http://localhost:8265", width="100%", height="600px")

In [11]:
from IPython.display import display, HTML

display(HTML("<a href='http://localhost:8265' target='_blank'>Open Ray Dashboard</a>"))


In [13]:
import socket
ip = socket.gethostbyname(socket.gethostname())
print(f"Try: http://172.19.0.2:8265")


Try: http://172.19.0.2:8265


In [14]:
IPython.display.IFrame(src=f"http://172.19.0.2:8265", width="100%", height="600px")


In [32]:
ray.available_resources()

{'object_store_memory': 8476622438.0,
 'CPU': 4.0,
 'node:__internal_head__': 1.0,
 'node:10.1.57.93': 1.0,
 'memory': 19778785690.0}

In [12]:
import socket
ip = socket.gethostbyname(socket.gethostname())
print("Private IP:", ip)

Private IP: 172.19.0.2


In [28]:
debut = time.time()
with joblib.parallel_backend('ray'):
    kmeans.train_fun(fraud3)
fin = time.time()
print(f'temps {fin-debut:.4f}')

temps 84.7770


In [36]:
import timeit
def run_kmeans():
    with joblib.parallel_backend('ray'):
        kmeans.train_fun(fraud3)

# Time it using timeit (default is 1 million loops, so set number=1 or more reasonably)
duration = timeit.timeit(run_kmeans, number=1)
print(f"Elapsed time (via timeit): {duration:.4f} seconds")

Elapsed time (via timeit): 60.3603 seconds


In [29]:
len(fraud3)

250000

In [7]:
pip install "ray[default]"  # [default] pulls in the dashboard dependencies


Collecting aiohttp_cors (from ray[default])
  Using cached aiohttp_cors-0.8.1-py3-none-any.whl.metadata (20 kB)
Collecting colorful (from ray[default])
  Using cached colorful-0.5.6-py2.py3-none-any.whl.metadata (16 kB)
Collecting py-spy>=0.2.0 (from ray[default])
  Using cached py_spy-0.4.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (16 kB)
Collecting grpcio>=1.42.0 (from ray[default])
  Using cached grpcio-1.73.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting opencensus (from ray[default])
  Using cached opencensus-0.11.4-py2.py3-none-any.whl.metadata (12 kB)
Collecting opentelemetry-sdk (from ray[default])
  Using cached opentelemetry_sdk-1.34.1-py3-none-any.whl.metadata (1.6 kB)
Collecting opentelemetry-exporter-prometheus (from ray[default])
  Using cached opentelemetry_exporter_prometheus-0.55b1-py3-none-any.whl.metadata (1.9 kB)
Collecting opentelemetry-proto (from ray[default])
  Using cached opentelemetry_proto-1.3

In [42]:
!curl ifconfig.me


150.230.128.189

In [51]:
!ssh -i SHA256:axZLXoDe0h3lp7VCuLaCFfmhzpZpjAHL8uV+TWjuSvo datascience@1796be48b5c1 opc@150.230.128.189



^C


In [47]:
!ssh-keygen -t rsa -b 4096 -f ~/.ssh/my_key.pem


Generating public/private rsa key pair.
^C


In [53]:
!ssh-rsa SHA256:axZLXoDe0h3lp7VCuLaCFfmhzpZpjAHL8uV+TWjuSvo datascience@1796be48b5c1


/bin/bash: ssh-rsa: command not found


In [54]:
!ssh -i ~/.ssh/my_key.pem opc@150.230.128.189

^C


In [55]:
!whoami

datascience
