# -*- coding: utf-8 -*-
from future.builtins import bytes
from iscclib.const import CHUNKING_GEAR
def chunkify(data, normal_size_big=4096, normal_size_small=40):
    """
    Split data into a list of content defined chunks.

    The first 100 chunks target the small chunk size, all subsequent
    chunks target the big chunk size (mirrors `iter_chunks`).

    :param bytes data: Data to be chunked
    :param int normal_size_big: Target chunk size after the first 100 chunks
    :param int normal_size_small: Target chunk size for the first 100 chunks
    :return list[bytes]: A list of chunked bytes
    """
    chunks = []
    while len(data) > 0:
        # Branches differed only in target size; select it, then chunk once.
        target = normal_size_small if len(chunks) < 100 else normal_size_big
        boundary = fast_cdc(bytes(data), target)
        chunks.append(data[:boundary])
        data = data[boundary:]
    return chunks
def iter_chunks(stream, normal_size_big=4096, normal_size_small=40):
    """
    A generator that yields content defined chunks from a stream of data.

    The first 100 chunks target the small chunk size (refilling the buffer
    in 650-byte reads), subsequent chunks target the big chunk size
    (refilling in 65536-byte reads).

    :param file stream: Stream of data that supports read
    :param int normal_size_big: Target chunk size after the first 100 chunks
    :param int normal_size_small: Target chunk size for the first 100 chunks
    :return Generator[bytes]: Chunked bytes
    """
    data = stream.read(1000)
    counter = 0
    while True:
        # The two original branches were identical except for these sizes.
        if counter < 100:
            target, refill = normal_size_small, 650
        else:
            target, refill = normal_size_big, 65536
        # Top up the buffer so fast_cdc can reach its max chunk size.
        if len(data) < refill:
            data = data + stream.read(refill)
        if len(data) == 0:
            break
        boundary = fast_cdc(bytes(data), target)
        yield data[:boundary]
        data = data[boundary:]
        counter += 1
def fast_cdc(data, normal_size=4096):
    """
    Return the index of the first chunk boundary in the given data.

    Implements the FastCDC content defined chunking algorithm with
    normalized chunking as proposed by Wen Xia:
    https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf

    A sparse mask (more bits) is applied before the target size and a
    dense mask (fewer bits) after it, biasing chunk sizes toward the
    target.

    :param bytes data: Data to be searched for the next chunkpoint
    :param int normal_size: Target chunk size (20, 40, 80) or (2048, 4096, 8192)
    :return int: Index into bytes for chunkpoint
    """
    # Mask pair and size bounds depend on the small/big chunking regime.
    if normal_size < 1000:
        mask_s, mask_l = 0x016118, 0x00a0b1
        min_size, max_size = 20, 640
    else:
        mask_s, mask_l = 0x0003590703530000, 0x0000d90003530000
        min_size, max_size = 2048, 65536

    length = len(data)
    if length <= min_size:
        # Too little data to split: the whole buffer is one chunk.
        return length
    length = min(length, max_size)
    normal_size = min(normal_size, length)

    fingerprint = 0
    # Phase 1: harder-to-match mask up to the target size.
    for pos in range(min_size, normal_size):
        fingerprint = (fingerprint << 1) + CHUNKING_GEAR[data[pos]]
        if not fingerprint & mask_s:
            return pos
    # Phase 2: easier-to-match mask up to the max size.
    for pos in range(normal_size, length):
        fingerprint = (fingerprint << 1) + CHUNKING_GEAR[data[pos]]
        if not fingerprint & mask_l:
            return pos
    # No boundary found: cut at max size (or end of data).
    return length