# Version Control with Icechunk

In [1]:
import numpy as np
import zarr
import icechunk as ic

### Create a new Icechunk *Repository*

In [2]:
storage = ic.s3_storage(
    bucket="icechunk-test",
    prefix="dvc-webinar/version-control-demo",
)
repo = ic.Repository.create(storage)
repo

<icechunk.repository.Repository at 0x7f6ec4118250>

### Write some data and *commit*, requires opening a *writable Session*

In [3]:
session = repo.writable_session("main")
store = session.store # A Zarr Store

group = zarr.group(store, zarr_format=3)
group.attrs['title'] = "My amazing dataset!"
array = group.create_array("array1", shape=10_000_000, chunks=1_000_000, dtype='i4')
array[:] = np.arange(10_000_000)

session.status()

Groups created:
    /

Arrays created:
    /array1

User attributes updated:
    /
    /array1

Chunks updated:
    /array1:
        [0]
        [1]
        [2]
        [3]
        [4]
        [5]
        [6]
        [7]
        [8]
        [9]

In [4]:
cid = session.commit("wrote initial data")
cid # snapshot ID

'SMNR3FM3NTPGA2G97HZ0'

Making more changes requires a new session and re-opening the array.

In [5]:
session = repo.writable_session("main")
array = zarr.open_array(store=session.store, path="array1")
array

<Array <icechunk.store.IcechunkStore object at 0x7f6ec411bed0>/array1 shape=(10000000,) dtype=int32>

In [6]:
array.append(np.random.rand(2_000_000))
session.status()

Zarr metadata updated:
    /array1

Chunks updated:
    /array1:
        [10]
        [11]

In [7]:
session.discard_changes()
session.status()



In [8]:
array.append(np.random.randint(999, size=3_000_000))
session.status()

Zarr metadata updated:
    /array1

Chunks updated:
    /array1:
        [12]
        [13]
        [14]

In [9]:
session.commit("appended data")

'P21CT7704W1N2M2ABVPG'

In [10]:
repo.ancestry(branch="main")

[SnapshotInfo(id="P21CT7704W1N2M2ABVPG", parent_id="SMNR3FM3NTPGA2G97HZ0", written_at=datetime.datetime(2025,2,18,18,32,17,279791, tzinfo=datetime.timezone.utc), message="appended d..."),
 SnapshotInfo(id="SMNR3FM3NTPGA2G97HZ0", parent_id="NJZ9FCPQW8ZS0AHSQVDG", written_at=datetime.datetime(2025,2,18,18,30,32,889718, tzinfo=datetime.timezone.utc), message="wrote init..."),
 SnapshotInfo(id="NJZ9FCPQW8ZS0AHSQVDG", parent_id=None, written_at=datetime.datetime(2025,2,18,18,28,47,126396, tzinfo=datetime.timezone.utc), message="Repository...")]

### Create tags and branches

In [11]:
# create tag referencing old commit
repo.create_tag("v1.0", snapshot_id=cid)

In [12]:
# create new branch off of main
repo.create_branch("dev", snapshot_id=repo.lookup_branch("main"))

In [13]:
repo.list_branches()

{'dev', 'main'}

In [14]:
repo.list_tags()

{'v1.0'}

In [15]:
repo.diff(from_tag="v1.0", to_branch="main")

Zarr metadata updated:
    /array1

Chunks updated:
    /array1:
        [12]
        [13]
        [14]

In [16]:
repo.lookup_branch("main"), repo.lookup_branch("dev")

('P21CT7704W1N2M2ABVPG', 'P21CT7704W1N2M2ABVPG')

### Modify the `dev` branch

In [17]:
session = repo.writable_session("dev")
array = zarr.open_array(session.store, path="array1")
array[-5_000_000:] = -1
session.status()

Chunks updated:
    /array1:
        [10]
        [11]
        [12]
        [13]
        [14]

In [18]:
session.commit("set some data to -1")

'VK9629ZB7CFKTKTZM63G'

### Switch between branches using *read-only Sessions*

In [19]:
zarr.open_array(repo.readonly_session(branch="main").store, path="array1")[-1]

array(873, dtype=int32)

In [20]:
zarr.open_array(repo.readonly_session(tag="v1.0").store, path="array1")[-1]

array(9999999, dtype=int32)

In [21]:
zarr.open_array(repo.readonly_session(branch="dev").store, path="array1")[-1]

array(-1, dtype=int32)

### Bring `main` up to date

In [22]:
repo.reset_branch("main", repo.lookup_branch("dev"))

## Conflict Detection

### Concurrent updates with no conflicts

In [23]:
session_A = repo.writable_session("main")
array_A = zarr.open_array(session_A.store, path="array1")

session_B = repo.writable_session("main")
array_B = zarr.open_array(session_B.store, path="array1")

array_A[0] = 42
array_B[-1] = -42

In [26]:
session_A.commit("wrote first element")

'QVFBBHZ4RJNDHEXVPM8G'

In [27]:
session_B.commit("wrote last element")

ConflictError: Failed to commit, expected parent: Some("VK9629ZB7CFKTKTZM63G"), actual parent: Some("QVFBBHZ4RJNDHEXVPM8G")

In [28]:
session_B.rebase(ic.ConflictDetector())
session_B.commit("rebased and merged")

'JFRN89GWSGR4RY08SRZG'

In [29]:
repo.ancestry(branch="main")

[SnapshotInfo(id="JFRN89GWSGR4RY08SRZG", parent_id="QVFBBHZ4RJNDHEXVPM8G", written_at=datetime.datetime(2025,2,18,18,39,40,364178, tzinfo=datetime.timezone.utc), message="rebased an..."),
 SnapshotInfo(id="QVFBBHZ4RJNDHEXVPM8G", parent_id="VK9629ZB7CFKTKTZM63G", written_at=datetime.datetime(2025,2,18,18,38,19,605025, tzinfo=datetime.timezone.utc), message="wrote firs..."),
 SnapshotInfo(id="VK9629ZB7CFKTKTZM63G", parent_id="P21CT7704W1N2M2ABVPG", written_at=datetime.datetime(2025,2,18,18,34,39,676402, tzinfo=datetime.timezone.utc), message="set some d..."),
 SnapshotInfo(id="P21CT7704W1N2M2ABVPG", parent_id="SMNR3FM3NTPGA2G97HZ0", written_at=datetime.datetime(2025,2,18,18,32,17,279791, tzinfo=datetime.timezone.utc), message="appended d..."),
 SnapshotInfo(id="SMNR3FM3NTPGA2G97HZ0", parent_id="NJZ9FCPQW8ZS0AHSQVDG", written_at=datetime.datetime(2025,2,18,18,30,32,889718, tzinfo=datetime.timezone.utc), message="wrote init..."),
 SnapshotInfo(id="NJZ9FCPQW8ZS0AHSQVDG", parent_id=None, wri

### Concurrent updates _with_ conflicts

In [30]:
session_A = repo.writable_session("main")
array_A = zarr.open_array(session_A.store, path="array1")

session_B = repo.writable_session("main")
array_B = zarr.open_array(session_B.store, path="array1")

array_A[999_999] = 42
array_B[999_999] = -42

In [31]:
session_A.commit("wrote from A")
session_B.commit("wrote from B")

ConflictError: Failed to commit, expected parent: Some("JFRN89GWSGR4RY08SRZG"), actual parent: Some("K241WBRCD0T4Y698C6JG")

In [32]:
try:
    session_B.rebase(ic.ConflictDetector())
except ic.RebaseFailedError as e:
    print(e.conflicts)

[Conflict(ChunkDoubleUpdate, path=/array1)]


In [33]:
session_B.rebase(ic.BasicConflictSolver(on_chunk_conflict=ic.VersionSelection.UseOurs))
session_B.commit("commited from B, overwriting A")

'G4WJSNP1M4AP77EEAY6G'

In [34]:
zarr.open_array(repo.readonly_session(branch="main").store, path="array1")[999_999]

array(-42, dtype=int32)

## Clean up repo

In [35]:
repo.delete_branch("dev")
repo.delete_tag("v1.0")

In [36]:
anc = repo.ancestry(branch="main")
anc

[SnapshotInfo(id="G4WJSNP1M4AP77EEAY6G", parent_id="K241WBRCD0T4Y698C6JG", written_at=datetime.datetime(2025,2,18,18,41,31,934556, tzinfo=datetime.timezone.utc), message="commited f..."),
 SnapshotInfo(id="K241WBRCD0T4Y698C6JG", parent_id="JFRN89GWSGR4RY08SRZG", written_at=datetime.datetime(2025,2,18,18,40,42,983156, tzinfo=datetime.timezone.utc), message="wrote from..."),
 SnapshotInfo(id="JFRN89GWSGR4RY08SRZG", parent_id="QVFBBHZ4RJNDHEXVPM8G", written_at=datetime.datetime(2025,2,18,18,39,40,364178, tzinfo=datetime.timezone.utc), message="rebased an..."),
 SnapshotInfo(id="QVFBBHZ4RJNDHEXVPM8G", parent_id="VK9629ZB7CFKTKTZM63G", written_at=datetime.datetime(2025,2,18,18,38,19,605025, tzinfo=datetime.timezone.utc), message="wrote firs..."),
 SnapshotInfo(id="VK9629ZB7CFKTKTZM63G", parent_id="P21CT7704W1N2M2ABVPG", written_at=datetime.datetime(2025,2,18,18,34,39,676402, tzinfo=datetime.timezone.utc), message="set some d..."),
 SnapshotInfo(id="P21CT7704W1N2M2ABVPG", parent_id="SMNR3FM3

In [37]:
latest_time = anc[0].written_at
latest_time

datetime.datetime(2025, 2, 18, 18, 41, 31, 934556, tzinfo=datetime.timezone.utc)

In [38]:
exp = repo.expire_snapshots(anc[0].written_at)
exp

{'JFRN89GWSGR4RY08SRZG',
 'K241WBRCD0T4Y698C6JG',
 'P21CT7704W1N2M2ABVPG',
 'QVFBBHZ4RJNDHEXVPM8G',
 'SMNR3FM3NTPGA2G97HZ0',
 'VK9629ZB7CFKTKTZM63G'}

In [39]:
repo.ancestry(branch="main")

[SnapshotInfo(id="G4WJSNP1M4AP77EEAY6G", parent_id="NJZ9FCPQW8ZS0AHSQVDG", written_at=datetime.datetime(2025,2,18,18,41,31,934556, tzinfo=datetime.timezone.utc), message="commited f..."),
 SnapshotInfo(id="NJZ9FCPQW8ZS0AHSQVDG", parent_id=None, written_at=datetime.datetime(2025,2,18,18,28,47,126396, tzinfo=datetime.timezone.utc), message="Repository...")]

In [40]:
gc = repo.garbage_collect(anc[0].written_at)
gc.snapshots_deleted, gc.chunks_deleted

(6, 6)

In [41]:
! aws s3 ls --recursive s3://icechunk-test/dvc-webinar/version-control-demo

2025-02-18 18:29:18    2716732 dvc-webinar/version-control-demo/chunks/3ERM7XNT3CHXGPK65VQ0
2025-02-18 18:29:18    2763398 dvc-webinar/version-control-demo/chunks/61ME72TR8QH2R2ND7CX0
2025-02-18 18:29:18    2761085 dvc-webinar/version-control-demo/chunks/8VASPD5S9R5PRYZT9HEG
2025-02-18 18:29:18    2725613 dvc-webinar/version-control-demo/chunks/E12CE25CQ80X27AD7N0G
2025-02-18 18:29:18    2765134 dvc-webinar/version-control-demo/chunks/GGXJ91G93CRADND4YAKG
2025-02-18 18:29:18    2739797 dvc-webinar/version-control-demo/chunks/HEVN87Z735EX4W7QG6MG
2025-02-18 18:29:18    2762001 dvc-webinar/version-control-demo/chunks/HK81PN8P2SBC3BJ1G0PG
2025-02-18 18:40:21    2687620 dvc-webinar/version-control-demo/chunks/N5YY04Z03ZCHHEXE41VG
2025-02-18 18:29:18    2761795 dvc-webinar/version-control-demo/chunks/PDCFVMPHZS0AEF3CBQSG
2025-02-18 18:29:18    2733578 dvc-webinar/version-control-demo/chunks/S5XNQAHS53G45165BKD0
2025-02-18 18:41:32        383 dvc-webinar/version-control-demo/manifests/XS8EZ7

In [None]:
! aws s3 rm --recursive s3://icechunk-test/dvc-webinar/version-control-demo