# Purpose

The purpose of this script is to get the embeddings out of the tfrecords file and into a useful format for graphing/manipulation

Sources include:
https://www.tensorflow.org/tutorials/load_data/tfrecord

We know that the output of the embedded, post-processed model is a "SequenceExample in a similar format as the features released in AudioSet. Each row of the batch of embeddings corresponds to roughly a second of audio (96 10ms frames), and the rows are written as a sequence of bytes-valued features, where each feature value contains the 128 bytes of the whitened quantized embedding."
    
So we have byte-valued features, likely requiring tf.train.Feature uses tf.train.BytesList.

In [31]:
import tensorflow as tf
import numpy as np
import IPython.display as display
filename = '../intermediate_data/181204-203002-437599-806141979.tfrecords'

# TensorFlow method
Getting the raw dataset

In [30]:
filenames = '../intermediate_data/181204-203002-437599-806141979.tfrecords'
raw_dataset = tf.data.TFRecordDataset(filenames)

for raw_record in raw_dataset.take(3): #this pulls the first record
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)




Parsing the data and returning the raw TFRecord format




In [None]:
Attempting to read using numpy

In [9]:
result = {}
# example.features.feature is the dictionary
for key, feature in raw_dataset.items():
    # The values are the Feature objects which contain a `kind` which contains:
    # one of three fields: bytes_list, float_list, int64_list
    kind = feature.WhichOneof('kind')
    result[key] = np.array(getattr(feature, kind).value)

result

AttributeError: 'TFRecordDatasetV2' object has no attribute 'items'

In [None]:
GPT Answer

In [12]:
import tensorflow as tf

file_paths = ["../intermediate_data/181204-203002-437599-806141979.tfrecords"]
dataset = tf.data.TFRecordDataset(file_paths)
dataset

<TFRecordDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [20]:
def parse_function(serialized_example):
    feature_description = {
        'feature_embedding': tf.io.FixedLenFeature([], tf.bytes_list)
    }
    example = tf.io.parse_single_example(serialized_example, feature_description)
    return example

In [21]:
parsed_dataset = dataset.map(parse_function)
parsed_dataset

AttributeError: in user code:

    File "C:\Users\ekrol\AppData\Local\Temp\ipykernel_5564\3349476032.py", line 2, in parse_function  *
        feature_description = {

    AttributeError: module 'tensorflow' has no attribute 'bytes_list'


# GPT Method

In [32]:
import tensorflow as tf

tfrecords_file_path = filename

dataset = tf.data.TFRecordDataset(tfrecords_file_path)

for raw_record in dataset.take(1):
    # Print the raw serialized record
    print("Raw Record:", raw_record.numpy())

    # Parse the serialized record without decoding
    context_feature_description = {}
    sequence_feature_description = {}

    context, sequence = tf.io.parse_single_sequence_example(
        raw_record,
        context_features=context_feature_description,
        sequence_features=sequence_feature_description
    )

    # Print the parsed context and sequence features
    print("Parsed Context:", context)
    print("Parsed Sequence:", sequence)

Raw Record: b'\x12\x9b\xfd.\n\x97\xfd.\n\x0faudio_embedding\x12\x82\xfd.\n\x86\x01\n\x83\x01\n\x80\x01\x9a=Xh\x80_Q\x99\xb2\xd8\x82N,<\x1e?m\xaaD\x9f\x87L\xb6\x13\xa2vs\xbb~\x9cp)\xc1U\xb7\xa9\xa8\x00*\x00f\xff\xe1\x06\xa8\x8c\xd9kL\xef\x91\xff\xff\xff\x96\x97bYR1{\xff9e\x00\x85\xae\xff\x00\xaa\x06\xf5\xb5\x00WO{\xee\xff4\x87@l5r\xff\x0b\xa3z\x8d\xff\xba\x00\x00L<F\xff\xad[Wk\x82\x96E\xef,%\xf8m@\x00\x00\xfe\xff\xe8\x9f(w\xc8fL\x8c\xff\x8f\'\x00\r\n\x86\x01\n\x83\x01\n\x80\x01\x95:nzjcf\x99\xae\xe1\x81C4C(=\x045a\xa1\x98l\xdd:\x94\xa4\xd9\xa1\x7f\xcdKu\xa4cV\xd8\x8b\x04\x00\x00W~\xec\xd0~\xb0\x9e]h\xc2R\xce\xffWH\xa67}cM\x88\x9b7\x84\x00\x8c\xe0\xd4\x00\xc8\x88\xff\x80\x00\x9b\x94\x88{\xadD\xa5\x93>\x9aN\xff\x13\xb0c\xd7\xff\xd3\n\x83\x87t\xb1\xff\xb4*Um\x97\x8f~\xf1^p6\x83%k\x00x\xdb\x96\x1eq9\xefs)\xab\xff\x9dT\x00\x00\n\x86\x01\n\x83\x01\n\x80\x01\x971jeR`u\x9e\xcd\x93mg:xE[\x1dbT\x92`\x9b\x94\x7f\x9c\xae\x95\xab~\xe3V\xa2\x92\x97e\x8b\xbf\x176\x07\x93\x8d\xff\x9c\x8el\xa2\x88\xbej\

ValueError: Both context_features and sequence_features are None, but at least one should have values.