In [1]:
import ray
if ray.is_initialized:
    ray.shutdown()
ray.init(address='ray://localhost:10001', ignore_reinit_error=True)
CHUNK_SIZE = 1024
REPLICATION_FACTOR = 3

2025-05-13 15:39:17,768	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: ignore_reinit_error, log_to_driver
    Ray: 2.44.1
    Python: 3.11.11
This process on Ray Client was started with:
    Ray: 2.44.1
    Python: 3.11.5



In [2]:
@ray.remote
class StorageNode:
    def __init__(self, node_id):
        self.node_id = node_id
        self.chunks = {}
        self.alive = True

    def store_chunk(self, artifact_id, chunk_idx, data):
        if not self.alive:
            raise RuntimeError(f"Node {self.node_id} is down")
        self.chunks.setdefault(artifact_id, {})[chunk_idx] = data
        return True

    def get_chunk(self, artifact_id, chunk_idx):
        return self.chunks.get(artifact_id, {}).get(chunk_idx)

    def delete_artifact(self, artifact_id):
        self.chunks.pop(artifact_id, None)
        return True

    def list_chunks(self):
        return {aid: list(m.keys()) for aid, m in self.chunks.items()}

    def set_status(self, alive: bool):
        self.alive = alive
        return self.alive

    def is_alive(self):
        return self.alive




In [3]:
@ray.remote
class NameNode:
    def __init__(self):
        self.metadata = {}
        self.locations = {}
        self.nodes = {}

    def register_node(self, node_id, actor_handle):
        self.nodes[node_id] = actor_handle
        return True

    def unregister_node(self, node_id):
        self.nodes.pop(node_id, None)
        return True

    def put_artifact(self, artifact_id, name, data):
        chunks = [data[i:i+CHUNK_SIZE] for i in range(0, len(data), CHUNK_SIZE)]
        self.metadata[artifact_id] = {'name': name, 'chunks': len(chunks)}
        self.locations[artifact_id] = {}
        node_ids = list(self.nodes.keys())
        for idx, chunk in enumerate(chunks):
            replicas = [node_ids[(idx + r) % len(node_ids)] for r in range(REPLICATION_FACTOR)]
            self.locations[artifact_id][idx] = replicas
            for n_id in replicas:
                self.nodes[n_id].store_chunk.remote(artifact_id, idx, chunk)
        return True

    def get_artifact(self, artifact_id):
        if artifact_id not in self.metadata:
            raise KeyError("Artifact not found")
        total = self.metadata[artifact_id]['chunks']
        result = []
        for idx in range(total):
            for n_id in self.locations[artifact_id][idx]:
                node = self.nodes[n_id]
                if ray.get(node.is_alive.remote()):
                    data = ray.get(node.get_chunk.remote(artifact_id, idx))
                    result.append(data)
                    break
            else:
                raise RuntimeError(f"No alive replica for chunk {idx}")
        return self.metadata[artifact_id]['name'], ''.join(result)

    def update_artifact(self, artifact_id, new_data):
        name = self.metadata[artifact_id]['name']
        self.delete_artifact(artifact_id)
        return self.put_artifact(artifact_id, name, new_data)

    def delete_artifact(self, artifact_id):
        for idx, nodes in self.locations.get(artifact_id, {}).items():
            for n_id in nodes:
                if n_id in self.nodes:
                    self.nodes[n_id].delete_artifact.remote(artifact_id)
        self.metadata.pop(artifact_id, None)
        self.locations.pop(artifact_id, None)
        return True

    def list_metadata(self):
        return self.metadata

    def list_locations(self):
        return self.locations


In [12]:
name_node = NameNode.remote()
nodes = {}
for i in range(5):
    nid = f"node-{i}"
    node = StorageNode.remote(nid)
    nodes[nid] = node
    ray.get(name_node.register_node.remote(nid, node))

node_with_options = StorageNode.options(name="node-special", num_cpus=0.3).remote("node-special")
nodes['node-special'] = node_with_options
ray.get(name_node.register_node.remote("node-special", node_with_options))

# PUT
long_data = 'A' * 5000
ray.get(name_node.put_artifact.remote('art1', 'TestArtifact', long_data))
print('Metadata after PUT:', ray.get(name_node.list_metadata.remote()))
print('Locations after PUT:', ray.get(name_node.list_locations.remote()))

# GET
name, content = ray.get(name_node.get_artifact.remote('art1'))
print(f"GET: {name}, length={len(content)}")

# UPDATE
new_data = 'B' * 3000
ray.get(name_node.update_artifact.remote('art1', new_data))
print('Metadata after UPDATE:', ray.get(name_node.list_metadata.remote()))

# DELETE
ray.get(name_node.delete_artifact.remote('art1'))
print('Metadata after DELETE:', ray.get(name_node.list_metadata.remote()))

# Failover
ray.get(nodes['node-2'].set_status.remote(False))
ray.get(name_node.put_artifact.remote('art2', 'Artifact2', 'C'*2048))
print('Locations with failure:', ray.get(name_node.list_locations.remote()))
name2, content2 = ray.get(name_node.get_artifact.remote('art2'))
print(f"GET after failure: {name2}, length={len(content2)}")



Metadata after PUT: {'art1': {'name': 'TestArtifact', 'chunks': 5}}
Locations after PUT: {'art1': {0: ['node-0', 'node-1', 'node-2'], 1: ['node-1', 'node-2', 'node-3'], 2: ['node-2', 'node-3', 'node-4'], 3: ['node-3', 'node-4', 'node-special'], 4: ['node-4', 'node-special', 'node-0']}}


RayTaskError(OutOfMemoryError): [36mray::NameNode.get_artifact()[39m (pid=687, ip=172.63.0.3, actor_id=6b47d04db87bc42b32d4b85901000000, repr=<__main__.NameNode object at 0x7f4a28db8b90>)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Dominik\ComputerScienceStudiesAGH\term_VI\DISTRIBUTED_SYSTEMS\lab_5\lab5-ray\Lib\site-packages\ray\util\tracing\tracing_helper.py", line 463, in _resume_span
  File "C:\Users\Dominik\AppData\Local\Temp\ipykernel_26056\3690584755.py", line 36, in get_artifact
           ^^^^^^^^^^^^^^^^^^^
           ^^^^^^^^^^^^^^^^^^^^^
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ray.exceptions.OutOfMemoryError: Task was killed due to the node running low on memory.
Memory on the node (IP: 172.63.0.2, ID: ea270dbab81b9c7f5da6ad7ba916018d9fe795c223fe57dffc3f7e98) where the task (task ID: ffffffffffffffffe84026cf006ea2204420f3ba01000000, name=StorageNode.__init__, pid=1310, memory used=0.05GB) was running was 1.91GB / 2.00GB (0.956745), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: c62b83f78a5b92cb95c480e2a14a746f6b5995fec5b9e08ee8fd286b) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.63.0.2`. To see the logs of the worker, use `ray logs worker-c62b83f78a5b92cb95c480e2a14a746f6b5995fec5b9e08ee8fd286b*out -ip 172.63.0.2. Top 10 memory users:
PID	MEM(GB)	COMMAND
82	0.38	/home/ray/anaconda3/bin/python3.11 /home/ray/anaconda3/lib/python3.11/site-packages/ray/dashboard/da...
389	0.32	/home/ray/anaconda3/bin/python3.11 -m ray.util.client.server --address=172.63.0.2:6379 --host=0.0.0....
81	0.27	/home/ray/anaconda3/bin/python3.11 -m ray.util.client.server --address=172.63.0.2:6379 --host=0.0.0....
22	0.20	/home/ray/anaconda3/lib/python3.11/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/...
218	0.19	/home/ray/anaconda3/lib/python3.11/site-packages/ray/core/src/ray/raylet/raylet --raylet_socket_name...
219	0.14	/home/ray/anaconda3/bin/python3.11 -u /home/ray/anaconda3/lib/python3.11/site-packages/ray/_private/...
80	0.11	/home/ray/anaconda3/bin/python3.11 -u /home/ray/anaconda3/lib/python3.11/site-packages/ray/autoscale...
294	0.08	/home/ray/anaconda3/bin/python3.11 -u /home/ray/anaconda3/lib/python3.11/site-packages/ray/dashboard...
1310	0.05	ray::StorageNode.__init__
296	0.05	/home/ray/anaconda3/bin/python3.11 -u /home/ray/anaconda3/lib/python3.11/site-packages/ray/_private/...
Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. Set max_restarts and max_task_retries to enable retry when the task crashes due to OOM. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.