### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [36]:
import pandas as pd
fraud3 = pd.read_csv('fraud3 (1).csv')

In [39]:
fraud3.shape

(250000, 29)

In [38]:
fraud3 = fraud3.drop(['Unnamed: 0'], axis = 1)

In [40]:
fraud3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   amt                      250000 non-null  float64
 1   gender                   250000 non-null  int64  
 2   zip                      250000 non-null  int64  
 3   lat                      250000 non-null  float64
 4   long                     250000 non-null  float64
 5   merch_lat                250000 non-null  float64
 6   merch_long               250000 non-null  float64
 7   is_fraud                 250000 non-null  int64  
 8   year                     250000 non-null  int64  
 9   month                    250000 non-null  int64  
 10  day                      250000 non-null  int64  
 11  hour                     250000 non-null  int64  
 12  minute                   250000 non-null  int64  
 13  sec                      250000 non-null  int64  
 14  age 

In [41]:
x_train = fraud3.drop(['is_fraud'], axis = 1)
y_train = (fraud3['is_fraud'])

In [42]:
print(x_train.shape)
print(y_train.shape)

(250000, 28)
(250000,)


In [8]:
test = pd.read_csv('test.csv')

In [43]:
test.shape

(296675, 29)

In [44]:
test = test.drop(['Unnamed: 0'], axis = 1)

KeyError: "['Unnamed: 0'] not found in axis"

In [46]:
test.shape

(296675, 29)

In [47]:
test3 = test.iloc[0:50000,:]

In [48]:
test3.shape

(50000, 29)

In [50]:
x_test = test3.drop(['is_fraud'], axis = 1)
y_test = (test3['is_fraud'])

In [51]:
print(x_test.shape)
print(y_test.shape)

(50000, 28)
(50000,)


In [17]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [52]:
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

In [53]:
import time
import statistics
training = []
prediction = []
model = KNeighborsClassifier(n_neighbors=5)
for i in range(10):
    print(f"round: {i+1}/10")
    debut = time.time()
    model.fit(x_train,y_train)
    fin = time.time()
    print(f'training = {fin-debut:.4f} secondes')
    training.append(fin-debut)
    begin = time.time()
    y_pred = model.predict(x_test)
    end = time.time()
    print(f'prediction = {end-begin:.4f} secondes')
    prediction.append(end-begin)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    
print(f'length training = {len(training)}')
print(f'mean training time = {statistics.mean(training):.4f} secondes')
print(f'length prediction = {len(prediction)}')
print(f'mean prediction time = {statistics.mean(prediction):.4f} secondes')

round: 1/10
training = 0.0280 secondes
prediction = 28.3611 secondes
Accuracy: 99.32%
round: 2/10
training = 0.0234 secondes
prediction = 27.5110 secondes
Accuracy: 99.32%
round: 3/10
training = 0.0234 secondes
prediction = 27.1530 secondes
Accuracy: 99.32%
round: 4/10
training = 0.0233 secondes
prediction = 28.4403 secondes
Accuracy: 99.32%
round: 5/10
training = 0.0236 secondes
prediction = 29.0555 secondes
Accuracy: 99.32%
round: 6/10
training = 0.0236 secondes
prediction = 28.2445 secondes
Accuracy: 99.32%
round: 7/10
training = 0.0235 secondes
prediction = 29.7167 secondes
Accuracy: 99.32%
round: 8/10
training = 0.0235 secondes
prediction = 29.6446 secondes
Accuracy: 99.32%
round: 9/10
training = 0.0234 secondes
prediction = 29.4370 secondes
Accuracy: 99.32%
round: 10/10
training = 0.0234 secondes
prediction = 28.5472 secondes
Accuracy: 99.32%
length training = 10
mean training time = 0.0239 secondes
length prediction = 10
mean prediction time = 28.6111 secondes


In [54]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from multiprocessing import Pool
import time

# This function will run in each process
def knn_predict_partition(args):
    x_train_part, y_train_part, x_test = args
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(x_train_part, y_train_part)
    predictions = knn.predict(x_test)  # Shape: (num_test_samples,)
    return predictions

def run_parallel_knn_with_pool(x_train, y_train, x_test, y_test, num_partitions):
    # Split training data
    x_train_parts = np.array_split(x_train, num_partitions)
    y_train_parts = np.array_split(y_train, num_partitions)

    # Prepare argument tuples
    args_list = [(x_train_parts[i], y_train_parts[i], x_test) for i in range(num_partitions)]

    start = time.time()

    with Pool(processes=num_partitions) as pool:
        # Each process predicts for the entire test set
        predictions_list = pool.map(knn_predict_partition, args_list)

    pred_time = time.time()
    print(f"Prediction Time: {pred_time - start:.4f} seconds")

    # Convert to shape: (num_test_samples, num_partitions)
    predictions_array = np.array(predictions_list).T

    # Majority voting
    final_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=predictions_array)

    end = time.time()
    print(f"Total Time: {end - start:.4f} seconds")

    # Evaluation
    accuracy = accuracy_score(y_test, final_predictions)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    return final_predictions


In [55]:
num_partitions = 4  # Set this based on your CPU cores
final_preds = run_parallel_knn_with_pool(x_train, y_train, x_test, y_test, num_partitions)

KeyboardInterrupt: 

In [39]:
!pip install ray

Collecting ray
  Using cached ray-2.47.1-cp311-cp311-manylinux2014_x86_64.whl.metadata (20 kB)
Collecting msgpack<2.0.0,>=1.0.0 (from ray)
  Using cached msgpack-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting protobuf!=3.19.5,>=3.15.3 (from ray)
  Using cached protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Using cached ray-2.47.1-cp311-cp311-manylinux2014_x86_64.whl (68.9 MB)
Using cached msgpack-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (429 kB)
Using cached protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl (321 kB)
Installing collected packages: protobuf, msgpack, ray
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [ray][32m2/3[0m [ray]
[1A[2KSuccessfully installed msgpack-1.1.1 protobuf-6.31.1 ray-2.47.1


In [60]:
import warnings
import time
import numpy as np
import ray
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

warnings.simplefilter(action='ignore', category=FutureWarning)

ray.init(num_cpus=4)
# Number of classifiers (partitions)
num_partitions = 4 # Number of cores used
total_times = []
# Define a Ray remote function for KNN prediction
@ray.remote
def knn_predict(x_train_part, y_train_part, x_test):
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(x_train_part, y_train_part)  # Just stores the data (training phase)
    return knn.predict(x_test)  # Predict labels for test set

# Main execution
def run_parallel_knn(x_train, y_train, x_test, y_test):
    # Split training data into partitions
    x_train_refs = [ray.put(part) for part in np.array_split(x_train, num_partitions)]
    y_train_refs = [ray.put(part) for part in np.array_split(y_train, num_partitions)]
    x_test_ref = ray.put(x_test)
    
    start = time.time()
    
    # Parallel execution with Ray - Submit all tasks to Ray
    prediction_refs = [
        knn_predict.remote(x_train_ref, y_train_ref, x_test_ref) 
        for x_train_ref, y_train_ref in zip(x_train_refs, y_train_refs)
    ]
    fin = time.time()
    print(f"prediction Time: {fin-start:.4f} seconds")
    # Get all predictions at once - more efficient than incremental collection
    debut = time.time()
    predictions = ray.get(prediction_refs)
    finito = time.time()
    print(f"Collect Time: {finito-debut:.4f} seconds")
    
    # Convert predictions list into array shape (num_test_samples, num_partitions) for majority voting
    predictions = np.array(predictions).T  # Transpose so each row corresponds to a test sample
    
    # Majority voting for final prediction
    final_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=predictions)
    
    end = time.time()
    total_time = end-start
    print(f"Total Time: {end-start:.4f} seconds")
    total_times.append(total_time)
    # Evaluate performance
    accuracy = accuracy_score(y_test, final_predictions)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    
    return final_predictions

    

2025-06-26 22:09:10,162	INFO worker.py:1917 -- Started a local Ray instance.


In [61]:
raytime = []
for i in range(10):
    print(f"round: {i+1}/10")
    debut = time.time()
    final_predictions = run_parallel_knn(x_train, y_train, x_test, y_test)
    fin = time.time()
    print(f'timing = {fin-debut:.4f} secondes')
    raytime.append(fin-debut)
print(f'mean time = {statistics.mean(raytime)} secondes')
    

round: 1/10
prediction Time: 0.0024 seconds
Collect Time: 29.2962 seconds
Total Time: 29.3972 seconds
Accuracy: 99.35%
timing = 29.5440 secondes
round: 2/10
prediction Time: 0.0008 seconds
Collect Time: 28.1011 seconds
Total Time: 28.2066 seconds
Accuracy: 99.35%
timing = 28.3495 secondes
round: 3/10
prediction Time: 0.0008 seconds
Collect Time: 28.1044 seconds
Total Time: 28.2026 seconds
Accuracy: 99.35%
timing = 28.3129 secondes
round: 4/10
prediction Time: 0.0009 seconds
Collect Time: 27.8752 seconds
Total Time: 27.9785 seconds
Accuracy: 99.35%
timing = 28.0923 secondes
round: 5/10
prediction Time: 0.0009 seconds
Collect Time: 27.8815 seconds
Total Time: 27.9806 seconds
Accuracy: 99.35%
timing = 28.0893 secondes
round: 6/10
prediction Time: 0.0009 seconds
Collect Time: 27.8022 seconds
Total Time: 27.9011 seconds
Accuracy: 99.35%
timing = 28.0075 secondes
round: 7/10
prediction Time: 0.0009 seconds
Collect Time: 29.1437 seconds
Total Time: 29.2424 seconds
Accuracy: 99.35%
timing = 29

In [62]:
ray.shutdown()

In [23]:
import statistics
print(f'mean time = {statistics.mean(raytime)} secondes')

mean time = 300.2875088691711 secondes
