### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
!pip install ray kmeans-pytorch pandas torch

Collecting kmeans-pytorch
  Downloading kmeans_pytorch-0.3-py3-none-any.whl.metadata (1.6 kB)
Downloading kmeans_pytorch-0.3-py3-none-any.whl (4.4 kB)
Installing collected packages: kmeans-pytorch
Successfully installed kmeans-pytorch-0.3


In [2]:
!pip install ray
!pip install -U "ray[train]"



In [2]:
pip install ray==2.24.0

Note: you may need to restart the kernel to use updated packages.


In [4]:
import ray
print(ray.__version__)

2.24.0


In [10]:
import numpy as np
import ray

@ray.remote
class Worker:
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters

    def assign_and_update(self, X_chunk, centroids):
        labels = [np.argmin([np.linalg.norm(x - c) for c in centroids]) for x in X_chunk]
        new_centroids = [np.mean([x for x, l in zip(X_chunk, labels) if l == i], axis=0) 
                         if any(l == i for l in labels) else centroids[i] for i in range(self.n_clusters)]
        return new_centroids, labels

class K_Means_Distributed:
    def __init__(self, n_clusters, max_iter, num_workers):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.num_workers = num_workers

    def fit(self, X):
        ray.init(ignore_reinit_error=True)
        workers = [Worker.remote(self.n_clusters) for _ in range(self.num_workers)]

        self.cluster_centers_ = X[np.random.permutation(X.shape[0])[:self.n_clusters]]

        for _ in range(self.max_iter):
            chunks = np.array_split(X, self.num_workers)
            futures = [w.assign_and_update.remote(chunk, self.cluster_centers_) for w, chunk in zip(workers, chunks)]
            results = ray.get(futures)

            all_centroids = np.array([c for c_list, _ in results for c in c_list])
            self.cluster_centers_ = np.array([
                np.mean(all_centroids[i::self.n_clusters], axis=0) for i in range(self.n_clusters)
            ])

        ray.shutdown()
        return self

    def predict(self, X):
        return [np.argmin([np.linalg.norm(x - c) for c in self.cluster_centers_]) for x in X]


In [7]:
fraud3 = pd.read_csv('fraud3.csv')
fraud3 = np.array(fraud3)

In [13]:
ray.init(num_cpus= 4)
n_clusters = 2
max_iter = 300
num_workers = 4

# Create the model
model = K_Means_Distributed(n_clusters=n_clusters, max_iter=max_iter, num_workers=num_workers)
debut = time.time()
model.fit(fraud3)
print(f'time = {time.time()-debut:.4f} secondes')


2025-06-19 15:34:16,755	INFO worker.py:1753 -- Started a local Ray instance.
2025-06-19 15:34:17,684	INFO worker.py:1586 -- Calling ray.init() again after it has already been called.


time = 581.6056 secondes


In [14]:
ray.shutdown()

In [15]:
class K_Means(object):
    # Initialize input values n_clusters and max_iter   
    def __init__(self, n_clusters, max_iter):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    # Function that assigns points to a cluster
    def assign_points_to_cluster(self, X):
        # Label points according to the minimum euclidean distance
        self.labels_ = [self._nearest(self.cluster_centers_, x) for x in X]
        # Map labels to data points
        indices=[]
        for j in range(self.n_clusters):
            cluster=[]
            for i, l in enumerate(self.labels_):
                if l==j: cluster.append(i)
            indices.append(cluster)
        X_by_cluster = [X[i] for i in indices]
        return X_by_cluster
    
    # Function that randomly selects initial centroids
    def initial_centroid(self, X):
        initial = np.random.permutation(X.shape[0])[:self.n_clusters]
        return X[initial]

    # Function that updates centroids and repeats 
    # assign_points_to_cluster until convergence or max_iter is reached
    def train_fun(self, X):  
        # initialize centroids      
        self.cluster_centers_ = self.initial_centroid(X)
        # process of assigning points to clusters until convergence or until max_iter is reached
        for i in range(self.max_iter):
            X_by_cluster = self.assign_points_to_cluster(X)
            # calculate the new centers 
            new_centers=[c.sum(axis=0)/len(c) for c in X_by_cluster]
            new_centers = [arr.tolist() for arr in new_centers]
            old_centers=self.cluster_centers_
            # if the new centroid are the same as the old centroids then the algorithm has converged
            if np.all(new_centers == old_centers): 
                self.number_of_iter=i
                break;
            else: 
                # set self.cluster_centers_ as new centers 
                self.cluster_centers_ = new_centers
        self.number_of_iter=i
        print(f'number of iterations{i}')
        return self
    
    # Function that calculates the minimum euclidean distance
    def _nearest(self, clusters, x):
        return np.argmin([self._distance(x, c) for c in clusters])
    
    # Function to calculate euclidean distance between two points
    def _distance(self, a, b):
        return np.sqrt(((a - b)**2).sum())

    # Function that returns predicted clusters for each point
    def predict(self, X):
        return self.labels_

In [16]:
model = K_Means(2,300)
model.train_fun(fraud3)

number of iterations22


<__main__.K_Means at 0x7f3339f0e290>

In [20]:
from sklearn.metrics import silhouette_score

# Compute silhouette score
score = silhouette_score(fraud3, predicted_labels)
print(f"Silhouette Score: {score:.4f}")

Silhouette Score: 0.5297


In [18]:
from sklearn.metrics import davies_bouldin_score
# Get predictions (cluster labels)
predicted_labels = model.predict(fraud3)
dbi_score = davies_bouldin_score(fraud3, predicted_labels)
print(f"Davies-Bouldin Index: {dbi_score:.4f}")

Davies-Bouldin Index: 0.6706


In [19]:
from sklearn.metrics import calinski_harabasz_score
ch_score = calinski_harabasz_score(fraud3, predicted_labels)
print(f"Calinski-Harabasz Index: {ch_score:.4f}")

Calinski-Harabasz Index: 482392.5654
