### Neuralink Challenge - Part 2 

- Download the validation dataset
- Evaluate how well the lookup table performs on it

In [1]:
# 1. Download challenge data
import requests, zipfile, io
# Fail fast: the timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() surfaces HTTP errors here instead of letting an error page
# reach zipfile as a confusing BadZipFile.
res_zip = requests.get("https://content.neuralink.com/compression-challenge/data.zip", timeout=60)
res_zip.raise_for_status()
# The archive is held in memory and unpacked into the current working directory.
zip_file = io.BytesIO(res_zip.content)
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall("./")

In [2]:
# Sanity check: count the directory entries that now exist under ./data
!ls data | wc -l

743


In [3]:
import os
import hashlib
import wave
import numpy as np
import sqlite3

# Function to generate a truncated SHA-256 hash for a given segment
def generate_short_hash(segment, hash_size_bytes):
    """Return the leading bytes of a segment's SHA-256 digest as a hex string.

    Args:
        segment: Bytes-like object to hash.
        hash_size_bytes: Number of leading digest bytes to keep.

    Returns:
        Hex string with two characters per kept digest byte.
    """
    full_digest = hashlib.sha256(segment).digest()
    truncated = full_digest[:hash_size_bytes]
    return truncated.hex()

# Function to pack samples into appropriate byte segments
def pack_samples(samples, bit_depth):
    """Pack a run of samples into one 64-bit word and return its leading bytes.

    Sample ``i`` is shifted left by ``bit_depth * (len(samples) - i - 1)`` bits,
    so the first sample lands in the most significant position of the packed
    value. The word is serialized with ``np.uint64.tobytes()`` (the machine's
    native byte order) and cut to ``ceil(bit_depth * len(samples) / 8)`` bytes.

    Samples are NOT masked to ``bit_depth`` bits: a value wider than
    ``bit_depth`` bleeds into its neighbour's bit range. This is kept as-is so
    that packed bytes (and therefore hashes) stay identical to earlier output.

    Args:
        samples: Sequence of non-negative integer samples.
        bit_depth: Number of bits allotted to each sample.

    Returns:
        ``bytes`` of length ``(bit_depth * len(samples) + 7) // 8``.

    Raises:
        ValueError: If ``bit_depth * len(samples)`` exceeds 64 bits; the
            accumulator would otherwise silently drop the overflowing samples.
    """
    n_samples = len(samples)
    total_bits = bit_depth * n_samples
    if total_bits > 64:
        raise ValueError(
            f"cannot pack {n_samples} samples of {bit_depth} bits into 64 bits"
        )

    packed_value = np.uint64(0)
    for i, sample in enumerate(samples):
        shift = np.uint64(bit_depth * (n_samples - i - 1))
        packed_value |= np.uint64(sample) << shift

    num_bytes = (total_bits + 7) // 8
    return packed_value.tobytes()[:num_bytes]

# Function to load the lookup table from the SQLite database
def load_lookup_table(db_name):
    """Load every row of the ``lookup`` table into a dict.

    Args:
        db_name: Path of the SQLite database file to open.

    Returns:
        Dict mapping each row's ``hash`` column to its ``segment`` column.
        If a hash occurs more than once, the last row read wins.

    Raises:
        sqlite3.Error: If the query fails (e.g. the table does not exist).
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.execute('SELECT hash, segment FROM lookup')
        # Stream rows straight into the dict rather than materializing a
        # fetchall() list first; the table can hold millions of rows.
        return {hash_key: segment for hash_key, segment in cursor}
    finally:
        # Close the connection even when the query raises.
        conn.close()

# Function to encode the wav data using the lookup table
def encode_wav_data_exhaustive(wav_data, bit_depth, segment_size, lookup_table, hash_size_bytes):
    """Hash consecutive fixed-size segments and check each against the table.

    The data is cut into ``len(wav_data) // segment_size`` full segments;
    trailing samples that do not fill a segment are ignored. A segment counts
    as found when its truncated hash is a key of ``lookup_table`` -- the stored
    value is not compared with the segment itself.

    Args:
        wav_data: Sliceable sequence of samples.
        bit_depth: Bits per sample, forwarded to ``pack_samples``.
        segment_size: Number of samples per segment.
        lookup_table: Container of known hash keys.
        hash_size_bytes: Truncated hash length, forwarded to ``generate_short_hash``.

    Returns:
        Tuple ``(encoded_data, segment_found)``: the hash keys of the segments
        that were found, and one boolean per full segment.
    """
    full_segment_count = len(wav_data) // segment_size
    matched_keys = []
    found_flags = []

    for seg_idx in range(full_segment_count):
        start = seg_idx * segment_size
        packed = pack_samples(wav_data[start:start + segment_size], bit_depth)
        key = generate_short_hash(packed, hash_size_bytes)
        is_known = key in lookup_table
        found_flags.append(is_known)
        if is_known:
            matched_keys.append(key)

    return matched_keys, found_flags

# Function to calculate compression ratio
def calculate_compression_ratio(original_data, encoded_data, bit_depth, hash_size_bytes):
    """Compute original size over encoded size, both measured in bits.

    The original size is ``len(original_data) * bit_depth``; the encoded size is
    ``len(encoded_data) * hash_size_bytes * 8``.

    Args:
        original_data: Sized collection of the original samples.
        encoded_data: Sized collection of emitted hash keys.
        bit_depth: Bits counted per original sample.
        hash_size_bytes: Bytes counted per emitted hash key.

    Returns:
        The ratio as a float, or ``0.0`` when the encoded size is zero (nothing
        was encoded, so the ratio is undefined and would otherwise raise
        ``ZeroDivisionError``).
    """
    original_size_bits = len(original_data) * bit_depth
    encoded_size_bits = len(encoded_data) * hash_size_bytes * 8  # Length of hash in bits
    if encoded_size_bits == 0:
        return 0.0
    return original_size_bits / encoded_size_bits

# Function to process and score the uploaded WAV files
def process_uploaded_files(directory, bit_depth, segment_size, lookup_table, hash_size_bytes):
    """Score every ``.wav`` file in a directory against the lookup table.

    Each file's frames are read in full, viewed as unsigned 16-bit integers,
    encoded with ``encode_wav_data_exhaustive`` and scored. One result dict is
    printed per file as it completes.

    Args:
        directory: Directory to scan; only names ending in ``.wav`` are used.
        bit_depth: Bits per sample used for packing and for the size estimate.
        segment_size: Samples per lookup segment.
        lookup_table: Container of known hash keys.
        hash_size_bytes: Truncated hash length in bytes.

    Returns:
        List of dicts with keys ``file_name``, ``compression_ratio`` and
        ``percentage_found``, in ``os.listdir`` order. A file with no full
        segment, or with no matched segment, gets ``0.0`` for the affected
        score(s) instead of raising ``ZeroDivisionError``.
    """
    file_scores = []
    for file_name in os.listdir(directory):
        if not file_name.endswith('.wav'):
            continue

        file_path = os.path.join(directory, file_name)
        # The context manager closes the file even if reading fails.
        with wave.open(file_path, 'r') as wav:
            frames = wav.readframes(wav.getnframes())
        # NOTE(review): assumes 16-bit sample width; the header is not checked.
        wav_data = np.frombuffer(frames, dtype=np.uint16)  # 16-bit samples packed in 16-bit space

        encoded_data, segment_found = encode_wav_data_exhaustive(wav_data, bit_depth, segment_size, lookup_table, hash_size_bytes)

        # The ratio is undefined when nothing was encoded; report 0.0.
        if encoded_data:
            compression_ratio = calculate_compression_ratio(wav_data, encoded_data, bit_depth, hash_size_bytes)
        else:
            compression_ratio = 0.0

        # A file shorter than one segment has no segments to count.
        if segment_found:
            percentage_found = sum(segment_found) / len(segment_found) * 100
        else:
            percentage_found = 0.0

        file_scores.append({
            'file_name': file_name,
            'compression_ratio': compression_ratio,
            'percentage_found': percentage_found
        })
        print(file_scores[-1])
    return file_scores

# Function to get the count of rows in the lookup table
def count_rows_in_lookup_table(db_name):
    """Return the number of rows in the ``lookup`` table.

    Args:
        db_name: Path of the SQLite database file to open.

    Returns:
        The value of ``SELECT COUNT(*) FROM lookup``.

    Raises:
        sqlite3.Error: If the query fails (e.g. the table does not exist).
    """
    conn = sqlite3.connect(db_name)
    try:
        (count,) = conn.execute('SELECT COUNT(*) FROM lookup').fetchone()
        return count
    finally:
        # Close the connection even when the query raises.
        conn.close()



In [4]:
# Parameters
bit_depth = 15       # Bits allotted to each sample when packing a segment
segment_size = 2     # Samples per lookup segment
hash_size_bytes = 3  # Truncated SHA-256 hash size in bytes (3 bytes = 24 bits)
# The database filename appears to encode the three parameters above, so it is
# derived from them rather than typed by hand: changing a parameter can then
# never silently load a table built for different settings. With the values
# above this evaluates to 'nl_bit15_seg2_hash3.db'.
db_name = f'nl_bit{bit_depth}_seg{segment_size}_hash{hash_size_bytes}.db'
directory = 'data'  # Directory containing the uploaded WAV files

# Load the lookup table
lookup_table = load_lookup_table(db_name)


In [5]:

# Report how many entries the lookup table database holds
row_count = count_rows_in_lookup_table(db_name)
print(f'Total number of rows in the lookup table: {row_count}')

# Score every WAV file in the data directory (one dict is printed per file)
file_scores_uploaded = process_uploaded_files(
    directory=directory,
    bit_depth=bit_depth,
    segment_size=segment_size,
    lookup_table=lookup_table,
    hash_size_bytes=hash_size_bytes,
)

# Print a formatted summary line for each scored file
for score in file_scores_uploaded:
    print(f"File: {score['file_name']}, Compression Ratio: {score['compression_ratio']:.2f}, Segments Found: {score['percentage_found']:.2f}%")


Total number of rows in the lookup table: 16777216
{'file_name': '102b47d9-371e-412a-8995-0dc6115ab2bb.wav', 'compression_ratio': 1.2500126648969585, 'percentage_found': 100.0}
{'file_name': '2eef5d4d-93d1-4c0e-9d23-0989abaa34d0.wav', 'compression_ratio': 1.2500126620745542, 'percentage_found': 100.0}
{'file_name': 'fa2c5efb-cc0d-4292-ab99-91d345cf17d9.wav', 'compression_ratio': 1.25, 'percentage_found': 100.0}
{'file_name': '0458e9fc-6403-427e-afec-6a659104399a.wav', 'compression_ratio': 1.2500126636138915, 'percentage_found': 100.0}
{'file_name': '30dee5fe-ded7-4978-9480-e40155e7b060.wav', 'compression_ratio': 1.25, 'percentage_found': 100.0}
{'file_name': '3953a230-d130-40e9-9dc4-068dda9bcef1.wav', 'compression_ratio': 1.25, 'percentage_found': 100.0}
{'file_name': '760ba446-aae7-4136-922c-9351c97504b8.wav', 'compression_ratio': 1.25, 'percentage_found': 100.0}
{'file_name': '2b1627e1-85a5-4155-ba52-4400e036b034.wav', 'compression_ratio': 1.25, 'percentage_found': 100.0}
{'file_name