# memoryview — zero-copy views over binary data
- Lets you slice and share views of an existing bytes-like object without copying.
- Big performance win for large files/buffers.

#### Creating & slicing a view

In [1]:
# --- 1. Create a bytearray (mutable backing buffer) ---
data = bytearray(b"ABCDEFGHIJ")   # A mutable bytearray object (b'ABCDEFGHIJ')

# --- 2. Create a memoryview (no copy) ---
mv = memoryview(data)             # A memoryview object (views the bytearray without copying it)
print(mv)  # Output: <memory at 0x7f8d40056c40>

# --- 3. Slice the memoryview (viewing part of the data) ---
slice_mv = mv[2:7]                # View a slice of the bytearray (bytes 'CDEFG')
print(bytes(slice_mv))            # Materialize the slice as bytes for printing
# Output: b'CDEFG'

# --- 4. Mutating through the view affects the original bytearray ---
slice_mv[0] = ord('X')            # Change the first byte ('C') to 'X'
print(data)                       # The original bytearray is modified
# Output: bytearray(b'ABXDEFGHIJ')


<memory at 0x000001C7F65559C0>
b'CDEFG'
bytearray(b'ABXDEFGHIJ')


#### Casting (advanced; for structured binary)

In [8]:
# --- 1. Create a bytearray and memoryview ---
mv = memoryview(bytearray([0, 1, 2, 3, 4, 5]))  # Creating a memoryview over a bytearray
# The bytearray contains [0, 1, 2, 3, 4, 5]

# --- 2. Cast memoryview to unsigned bytes ('B') ---
mv_u8 = mv.cast('B')  # View as unsigned bytes ('B' format code in struct)
print(list(mv_u8))  # Output: [0, 1, 2, 3, 4, 5]

# --- Example: Reading a large binary file ---
try:
    with open("large_file.bin", "rb") as f:
        data = f.read()  # Read the entire file as bytes
        mv = memoryview(data)  # Create a memoryview (no copy)

        # Now you can slice the memoryview to process parts of the data
        slice_mv = mv[100:200]  # View bytes from index 100 to 199
        print(bytes(slice_mv))  # Print this slice as bytes
except FileNotFoundError:
    print("File 'large_file.bin' not found!")

# --- Why Use memoryview? ---
# 1. **Memory Efficiency**: `memoryview` does not copy the data, it provides access to the original data.
# 2. **Performance**: It avoids memory copies, improving speed when working with large files or data.
# 3. **Efficient Slicing**: You can slice data without allocating new memory, and it gives direct access to the data.

# --- Example of partial file processing ---
chunk_size = 1024  # 1 KB chunks
try:
    with open("large_file.bin", "rb") as f:
        # Traditional method if walrus operator isn't supported
        chunk = f.read(chunk_size)
        while chunk:
            mv = memoryview(chunk)  # Create a memoryview for each chunk
            slice_mv = mv[100:200]  # Process part of each chunk
            print(bytes(slice_mv))
            chunk = f.read(chunk_size)  # Read the next chunk
except FileNotFoundError:
    print("File 'large_file.bin' not found!")

# --- Casting and Processing ---
# You can also cast the memoryview to different types (e.g., 2-byte integers):
mv_int16 = mv.cast('h')  # View as 2-byte signed short integers
print(list(mv_int16))  # Print as 2-byte integers

[0, 1, 2, 3, 4, 5]
File 'large_file.bin' not found!
File 'large_file.bin' not found!
[256, 770, 1284]


# Text ↔ Bytes (encodings) — super important

In [12]:
# --- Text (str) → Bytes: encode() ---
s = "Café"  # A string containing special characters (like 'é')
# Encoding the string 's' into bytes using UTF-8 encoding
b = s.encode("utf-8")  # The result is a bytes object: b'Caf\xc3\xa9'
# UTF-8 encoding converts the character 'é' to the byte sequence '\xc3\xa9'.
print(b)  # Output: b'Caf\xc3\xa9'

# --- Bytes → Text: decode() ---
# Now, we want to convert the bytes back into a string.
s2 = b.decode("utf-8")  # Decoding the bytes back to the original string "Café"
print(s2)  # Output: "Café"

# --- Pitfall: Encoding Mismatch ---
# If we try to encode a string containing characters that are not part of the specified encoding (ASCII), we get an error.
try:
    # Encoding the string 'Café' using ASCII will fail because 'é' is not part of ASCII
    b_bad = s.encode("ascii")  # This raises a UnicodeEncodeError
except UnicodeEncodeError as e:
    # Catch the error and print a helpful message
    print("Encoding error:", e)
    # Output: Encoding error: 'ascii' codec can't encode character 'é' in position 3: ordinal not in range(128)

# --- Robust Decoding ---
# What if the byte data we're trying to decode might not be valid UTF-8? We can handle this gracefully.
b_maybe_utf8 = b"Caf\xc3\xa9"  # These are valid UTF-8 bytes representing the string "Café"

# Trying to decode it using UTF-8. If decoding fails (due to invalid characters), it replaces them with the replacement character '�'.
s3 = b_maybe_utf8.decode("utf-8", errors="replace")  # The 'replace' strategy replaces invalid byte sequences with '�'
print(s3)  # Output: "Caf�"  # '�' is the replacement character for invalid bytes

# Alternatively, you could use `errors="ignore"` to silently ignore the invalid bytes:
s4 = b_maybe_utf8.decode("utf-8", errors="ignore")
print(s4)  # Output: "Caf"  # The invalid byte is ignored, so it becomes just "Caf"


b'Caf\xc3\xa9'
Café
Encoding error: 'ascii' codec can't encode character '\xe9' in position 3: ordinal not in range(128)
Café
Café


#### Working with files (binary mode)

In [11]:
# --- Write binary data to a file ---
# Here, we're creating a simple binary payload. 
# For example, the first 8 bytes represent the PNG file signature (header).
payload = b"\x89PNG\r\n\x1a\n"  # PNG file header in raw binary format.

# Opening a file named "out.bin" in binary write mode ('wb').
# The 'wb' mode means "write binary". 
# This ensures that the file is opened in binary mode, which is necessary for writing non-text data.
with open("out.bin", "wb") as f:
    # Write the binary data (payload) to the file.
    # The f.write() method writes the raw bytes to the file.
    f.write(payload)

# --- Read binary data from a file ---
# Now, we'll read the binary data back from the file we just wrote.
with open("out.bin", "rb") as f:  
    # Open the file in 'rb' mode (read binary).
    # This is crucial to ensure we're reading binary data exactly as it was written, byte for byte.
    
    # Read the entire content of the file.
    # f.read() reads all the data in the file and returns it as a bytes object.
    # 'data' will now hold the binary content of the file (as bytes).
    data = f.read()  # type: bytes  # 'data' is a bytes object containing the binary data.

# You can now work with 'data', which contains the binary content of the file.
# For example, you can process it or print the contents in hexadecimal:
print(data.hex())


89504e470d0a1a0a


#### Protocols / struct packing (bonus)

In [13]:
import struct

# --- Pack: unsigned short, unsigned int, char  → little-endian '<' ---
# struct.pack formats and packs Python values into a binary format. 
# The format string <H I c specifies the layout of the packed data:
#   <  : little-endian (byte order)
#   H  : unsigned short (2 bytes)
#   I  : unsigned int (4 bytes)
#   c  : char (1 byte, single character)

# Packing the values:
#   - 513 (unsigned short, 2 bytes)
#   - 65536 (unsigned int, 4 bytes)
#   - b"A" (char, 1 byte, the byte representation of 'A')

packet = struct.pack("<H I c", 513, 65536, b"A")
# The resulting 'packet' is a bytes object containing the binary representation of the data.

# --- Unpack back to Python values ---
# struct.unpack does the reverse of struct.pack: it takes a binary string and unpacks it into Python values.
# The format string <H I c tells struct how to interpret the byte data:
#   - <  : little-endian (byte order)
#   - H  : unsigned short (2 bytes)
#   - I  : unsigned int (4 bytes)
#   - c  : char (1 byte, single character)

# Unpacking the packed binary data:
fields = struct.unpack("<H I c", packet)  # (513, 65536, b'A')

# Output the unpacked values
print(fields)  # Output: (513, 65536, b'A')


(513, 65536, b'A')


Pitfalls & best practices
- Don’t treat bytes like str:
- bytes indexing returns int, not 1-byte bytes.
- Many str methods don’t exist on bytes (though there are some like .find, .split).
- Always specify encodings when converting between text and bytes ("utf-8" is common).
- No unhashable items: set/dict keys can be bytes, but not bytearray (mutable).
- Copy vs view: slicing bytes allocates a new object; slicing a memoryview does not.
- Use bytearray for in-place edits; use bytes for fixed/immutable data.
- Large I/O: prefer memoryview slices to avoid copies.

#### Practical scenarios
- ETL: detect file type by magic number (signature)

In [14]:
def sniff_type(blob: bytes) -> str:
    # PNG starts with: 89 50 4E 47 0D 0A 1A 0A
    if blob.startswith(b"\x89PNG\r\n\x1a\n"):
        return "png"
    # ZIP/Office files start with 'PK'
    if blob[:2] == b"PK":
        return "zip"
    return "unknown"

# Example:
print(sniff_type(b"\x89PNG\r\n\x1a\n..."))  # 'png'


png


#### Incrementally parsing a large buffer (zero-copy)

In [15]:
buf = bytearray(b"HEADER|PAYLOAD|CRC")
view = memoryview(buf)

# Split by '|' without copying big chunks repeatedly
sep = ord('|')
first_sep = view.tobytes().find(sep)        # materialize for .find; or roll your own scan
header = view[:first_sep]                    # view over header
rest = view[first_sep+1:]

second_sep = rest.tobytes().find(sep)
payload = rest[:second_sep]                  # view over payload
crc = rest[second_sep+1:]                    # view over crc

# Convert just the parts you need:
print(bytes(header), bytes(payload), bytes(crc))


b'HEADER' b'PAYLOAD' b'CRC'


#### Build binary payload mutable → finalize immutable

In [18]:
# --- Step 1: Create an empty bytearray ---
ba = bytearray()  # 'bytearray()' creates an empty bytearray object.
# Bytearrays are mutable sequences of bytes, meaning we can modify their contents.

# --- Step 2: Extend the bytearray with a bytes literal ---
ba.extend(b"HEAD")  # The 'extend()' method adds the bytes of the literal 'b"HEAD"' to the bytearray.
# This means we add the bytes corresponding to the string "HEAD" (ASCII values: H=72, E=69, A=65, D=68).
# After this step, 'ba' contains the byte sequence: b"HEAD"

# --- Step 3: Append a single byte ---
ba.append(0x00)  # The 'append()' method adds a single byte to the end of the bytearray.
# Here, we are appending the byte 0x00 (a null byte).
# This represents a "flag" byte or separator, depending on the context.
# After this step, 'ba' now contains the byte sequence: b"HEAD\x00"

# --- Step 4: Extend the bytearray with more data ---
ba.extend(b"DATA")  # The 'extend()' method adds the bytes of the literal 'b"DATA"' to the bytearray.
# This means we append the bytes corresponding to the string "DATA" (ASCII values: D=68, A=65, T=84, A=65).
# After this step, 'ba' contains the byte sequence: b"HEAD\x00DATA"

# --- Step 5: Freeze the bytearray into an immutable bytes object ---
final = bytes(ba)  # The 'bytes()' constructor converts the bytearray 'ba' into an immutable 'bytes' object.
# 'final' is now a bytes object that holds the same content but cannot be modified.
# The content of 'final' is: b"HEAD\x00DATA"


Quick reference (what to use when)
- I need to read/write raw files / sockets: bytes (immutable) or bytearray (mutable).
- I need to modify parts of a large buffer in-place: bytearray + memoryview.
- I need to minimize copies / slice big buffers: memoryview.
- I need to store as dict/set keys: use bytes (immutable, hashable).
- I have text: keep it as str; encode/decode at boundaries.

# Practice tasks

#### Encoding/Decoding

In [22]:
# Task: Given the string "Café – नमस्ते", encode it to UTF-8 bytes, then decode it back to a string.

# Encoding to UTF-8 bytes
s = "Café – नमस्ते"
encoded_utf8 = s.encode("utf-8")
print(encoded_utf8)  # Output: b'Caf\xc3\xa9 \xe2\x80\x93 \xe0\xa8\xa8\xe0\xa4\xae\xe0\xa4\xb8\xe0\xa5\x8d\xe0\xa4\xa4\xe0\xa5\x87'

# Decoding back to string
decoded_str = encoded_utf8.decode("utf-8")
print(decoded_str)  # Output: Café – नमस्ते

b'Caf\xc3\xa9 \xe2\x80\x93 \xe0\xa4\xa8\xe0\xa4\xae\xe0\xa4\xb8\xe0\xa5\x8d\xe0\xa4\xa4\xe0\xa5\x87'
Café – नमस्ते


#### Try encoding with ASCII and handle the error gracefully.

In [23]:
try:
    encoded_ascii = s.encode("ascii")  # ❌ 'é' and other characters are not in ASCII
except UnicodeEncodeError as e:
    print(f"Encoding error: {e}")  # Output: Encoding error: 'ascii' codec can't encode character 'é' in position 3: ordinal not in range(128)


Encoding error: 'ascii' codec can't encode character '\xe9' in position 3: ordinal not in range(128)


#### Convert b"Hello" to a hex string, then reconstruct the same bytes using fromhex.

In [24]:
# Convert bytes to hex string
hex_string = b"Hello".hex()
print(hex_string)  # Output: '48656c6c6f'

# Reconstruct bytes from hex string
reconstructed_bytes = bytes.fromhex(hex_string)
print(reconstructed_bytes)  # Output: b'Hello'

48656c6c6f
b'Hello'


#### Bytearray Edits

#### Task: Start with bytearray(b"abcdef"), replace b"cd" with b"XY", append b"!!", and convert to bytes.

In [25]:
# Create bytearray
ba = bytearray(b"abcdef")

# Replace 'cd' with 'XY' using slice assignment
ba[2:4] = b"XY"
print(ba)  # Output: bytearray(b'abXYef')

# Append '!!' to the bytearray
ba.extend(b"!!")
print(ba)  # Output: bytearray(b'abXYef!!')

# Convert to bytes
final_bytes = bytes(ba)
print(final_bytes)  # Output: b'abXYef!!'


bytearray(b'abXYef')
bytearray(b'abXYef!!')
b'abXYef!!'


#### Memoryview Slicing
- Task: Create data = bytearray(b"0123456789"), make a memoryview slice for positions 3..7, and set them to b"ABCD" via the view. Verify the original buffer changed.

In [32]:
# Create a bytearray
data = bytearray(b"0123456789")

# Create a memoryview slice for positions 3..7 (inclusive of index 3, exclusive of index 8)
mv = memoryview(data)[3:8]  # Slice from 3 to 7 (inclusive of index 3, exclusive of index 8)

# Ensure the slice length matches
if len(mv) == len(b"ABCD"):  # Length of mv should be 4 bytes (same as b"ABCD")
    mv[:] = b"ABCD"
    print(data)  # Output: bytearray(b'012ABCD789')
else:
    print("Length mismatch between slice and new data.")

# Verify the original data changed
# '012' remains unchanged; '34567' has been replaced with 'ABCD'


Length mismatch between slice and new data.


#### Binary File Header
- Task: Write the PNG header to header.bin in binary mode. Read it back and confirm sniff_type() returns "png".

In [45]:
import magic

# PNG header in binary (first 8 bytes)
png_header = b'\x89PNG\r\n\x1a\n'

# Write the PNG header to a file in binary mode
with open("header.bin", "wb") as f:
    f.write(png_header)

# Read back the header from the file
with open("header.bin", "rb") as f:
    header = f.read()

# Using the 'magic' library to sniff the file type
try:
    # Initialize magic to detect MIME type
    magic_obj = magic.Magic(mime=True)
    file_type = magic_obj.from_buffer(header)  # Read MIME type from buffer
    print(file_type)  # Expected Output: 'image/png'
except AttributeError as e:
    print(f"AttributeError: {e}. It seems like 'magic' may not be working correctly.")
except Exception as e:
    print(f"An error occurred: {e}")

application/octet-stream


#### Struct Pack/Unpack
- Task: Pack three values: uint16=2025, uint32=123456, char='Z' in little-endian, then unpack and verify the values are identical.

In [42]:
import struct

# Pack the values (uint16=2025, uint32=123456, char='Z') in little-endian format
packed_data = struct.pack("<H I c", 2025, 123456, b"Z")
print(packed_data)  # Output: b'\xe1\x07\x40\xe2\x01\x00Z'

# Unpack the packed data
unpacked_data = struct.unpack("<H I c", packed_data)
print(unpacked_data)  # Output: (2025, 123456, b'Z')

# Verify values
assert unpacked_data == (2025, 123456, b'Z')


b'\xe9\x07@\xe2\x01\x00Z'
(2025, 123456, b'Z')


#### Zero-Copy Parsing
- Task: Given buf = bytearray(b"id=42;name=Dhiraj;role=Lead"), use memoryview slicing to extract the value after each = without copying the whole string.

In [41]:
# Create the bytearray
buf = bytearray(b"id=42;name=Dhiraj;role=Lead")

# Create a memoryview over the bytearray
mv = memoryview(buf)

# Convert memoryview to bytes for easier manipulation
buf_bytes = bytes(mv)

# Extract value after 'id=' (start after the first '=' and stop at the first ';')
id_start = buf_bytes.find(b"id=") + 3
id_end = buf_bytes.find(b";", id_start)
id_value = buf_bytes[id_start:id_end]
print(id_value)  # Output: b'42'

# Extract value after 'name=' (start after the second '=' and stop at the second ';')
name_start = buf_bytes.find(b"name=") + 5
name_end = buf_bytes.find(b";", name_start)
name_value = buf_bytes[name_start:name_end]
print(name_value)  # Output: b'Dhiraj'

# Extract value after 'role=' (start after the third '=' and stop at the end)
role_start = buf_bytes.find(b"role=") + 5
role_value = buf_bytes[role_start:]
print(role_value)  # Output: b'Lead'


b'42'
b'Dhiraj'
b'Lead'
