### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
# Without a timeout, requests waits indefinitely when the notebook session has
# no outbound route; fail fast with an exception instead of hanging the cell.
response = requests.get("https://oracle.com", timeout=10)
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
# Print each session-level environment variable in turn; a missing variable
# raises KeyError at the point it is looked up.
for env_key in (
    "NB_SESSION_COMPARTMENT_OCID",
    "PROJECT_OCID",
    "USER_OCID",
    "TENANCY_OCID",
    "NB_REGION",
):
    print(os.environ[env_key])
```
</details>

In [48]:
!pip install parsl
!pip install torch



In [47]:
import pandas as pd
# Load the data later split into x_train / y_train from a CSV in the working directory.
fraud3 = pd.read_csv('fraud3 (1).csv')

In [49]:
# Display (rows, columns) of the training frame.
fraud3.shape

(250000, 30)

In [50]:
# Remove the 'Unnamed: 0' column (presumably a saved row index - confirm against the CSV).
fraud3 = fraud3.drop(columns=['Unnamed: 0'])

In [51]:
# Separate the 'is_fraud' label column from the remaining feature columns.
y_train = fraud3['is_fraud']
x_train = fraud3.drop(columns=['is_fraud'])

In [52]:
# Sanity check: features and labels must have the same number of rows.
print(x_train.shape, y_train.shape, sep='\n')

(250000, 28)
(250000,)


In [53]:
# Load the held-out data used for evaluation from a CSV in the working directory.
test = pd.read_csv('test.csv')

In [54]:
# Remove the 'Unnamed: 0' column (presumably a saved row index - confirm against the CSV).
test = test.drop(columns=['Unnamed: 0'])

In [55]:
# Display (rows, columns) of the full test frame.
test.shape

(296675, 29)

In [56]:
# Keep only the first 50,000 rows (all columns) of the test frame.
test3 = test.iloc[:50000]

In [57]:
# Display (rows, columns) of the 50,000-row test subset.
test3.shape

(50000, 29)

In [58]:
# Separate the 'is_fraud' label column from the remaining feature columns.
y_test = test3['is_fraud']
x_test = test3.drop(columns=['is_fraud'])

In [59]:
# Sanity check: features and labels must have the same number of rows.
print(x_test.shape, y_test.shape, sep='\n')

(50000, 28)
(50000,)


In [1]:
!pip install parsl



In [2]:
!pip install torch



In [62]:
import os
import numpy as np
import torch
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import parsl
from parsl import python_app
from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor

# Turn pandas objects into NumPy arrays; anything without ``to_numpy``
# (e.g. data that is already an array) is left as it is.
if hasattr(x_train, 'to_numpy'):
    x_train = x_train.to_numpy()
if hasattr(y_train, 'to_numpy'):
    y_train = y_train.to_numpy()
if hasattr(x_test, 'to_numpy'):
    x_test = x_test.to_numpy()
if hasattr(y_test, 'to_numpy'):
    y_test = y_test.to_numpy()

# Drop any Parsl configuration loaded earlier in the session, then load a
# thread-based executor labelled "threads" with at most 4 threads.
parsl.clear()
config = Config(
    strategy=None,
    executors=[ThreadPoolExecutor(label="threads", max_threads=4)],
)
parsl.load(config)

@python_app(executors=['threads'])
def train_and_predict(x_train_subset, y_train_subset, x_test):
    """Fit a 5-nearest-neighbour classifier on one training shard and label ``x_test``.

    Runs as a Parsl app on the ``threads`` executor, so calling it returns a
    future; the predicted labels are obtained through ``.result()``.

    Args:
        x_train_subset: Feature rows of the shard to train on.
        y_train_subset: Labels matching ``x_train_subset`` row for row.
        x_test: Feature rows to predict labels for.

    Returns:
        The classifier's predictions for ``x_test``.
    """
    classifier = KNeighborsClassifier(n_neighbors=5).fit(x_train_subset, y_train_subset)
    return classifier.predict(x_test)

def run_knn_parsl(x_train, y_train, x_test, y_test, num_workers):
    """Train one KNN model per training shard in parallel and majority-vote the results.

    The training data is split into ``num_workers`` contiguous shards with
    ``np.array_split``. Each shard is handed to ``train_and_predict``, so every
    model sees only its own shard but predicts the complete ``x_test``. The
    final label of a test row is the label predicted by the most models.

    Ties go to the smallest label. With an even ``num_workers`` and binary
    labels, a split vote therefore resolves to the lower class.

    Args:
        x_train: Training features, splittable along the first axis.
        y_train: Training labels aligned with ``x_train``.
        x_test: Test features passed unchanged to every model.
        y_test: True test labels, used only for the printed accuracy.
        num_workers: Number of shards, and so the number of models trained.

    Returns:
        1-D array holding the voted label for every row of ``x_test``.
    """
    x_train_parts = np.array_split(x_train, num_workers)
    y_train_parts = np.array_split(y_train, num_workers)

    # Submit every shard before waiting on any result so the apps can overlap.
    futures = [
        train_and_predict(x_part, y_part, x_test)
        for x_part, y_part in zip(x_train_parts, y_train_parts)
    ]
    results = [fut.result() for fut in futures]

    # Majority voting
    # Vectorised over all test rows instead of one Python call per row. Labels
    # are first mapped to dense codes via np.unique, so the vote is not limited
    # to the non-negative integers that np.bincount requires.
    stacked = np.asarray(results)  # shape: (num_workers, n_test)
    classes, inverse = np.unique(stacked, return_inverse=True)
    # The shape of ``inverse`` differs between NumPy versions; normalise it.
    inverse = inverse.reshape(stacked.shape)
    votes = np.stack([(inverse == code).sum(axis=0) for code in range(classes.size)])
    # argmax returns the first maximum; classes are sorted, so ties resolve to
    # the smallest label, the same as bincount().argmax() did.
    final_predictions = classes[votes.argmax(axis=0)]

    accuracy = accuracy_score(y_test, final_predictions)
    print(f"[Parsl Majority Voting] Accuracy: {accuracy * 100:.2f}%")

    return final_predictions

In [None]:
import time
import statistics
import logging
# Drop Parsl log records below WARNING so they do not interleave with the timings.
logging.getLogger('parsl').setLevel(logging.WARNING)
# Elapsed seconds of every round; averaged after the loop.
parsltime = []
for i in range(10):
    print(f'---------------------------------------------- round {i+1}/10 --------------------------------------------------------------')
    # perf_counter() is monotonic and high resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    debut = time.perf_counter()
    # Each round covers the full pipeline: sharded training, prediction,
    # voting and the accuracy computation inside run_knn_parsl.
    pred = run_knn_parsl(x_train, y_train, x_test, y_test, 4)
    fin = time.perf_counter()
    print(f'prediction time = {fin - debut:.2f} seconds')
    parsltime.append(fin-debut)
print (f'mean prediction time = {statistics.mean(parsltime)} secondes')

---------------------------------------------- round 1/10 --------------------------------------------------------------
[Parsl Majority Voting] Accuracy: 99.35%
prediction time = 41.72 seconds
---------------------------------------------- round 2/10 --------------------------------------------------------------
[Parsl Majority Voting] Accuracy: 99.35%
prediction time = 42.32 seconds
---------------------------------------------- round 3/10 --------------------------------------------------------------
[Parsl Majority Voting] Accuracy: 99.35%
prediction time = 41.68 seconds
---------------------------------------------- round 4/10 --------------------------------------------------------------
[Parsl Majority Voting] Accuracy: 99.35%
prediction time = 41.95 seconds
---------------------------------------------- round 5/10 --------------------------------------------------------------
[Parsl Majority Voting] Accuracy: 99.35%
prediction time = 41.72 seconds
------------------------------