In [1]:
import os
import json
import re
import datetime
import concurrent.futures
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

2023-05-16 12:48:49.205857: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Quick and dirty download of the sample data. The leading `!` makes IPython
# execute the command directly against the CLI (rather than through the
# Python interpreter). The download is skipped when the directory already exists.
# NOTE(review): the bucket is spelled `a20_dropbox` (digit zero) while the shard
# files used later are named `a2o_...` — confirm the bucket name is intended.
EMBEDDINGS_DIR = "point_one_percent_embeddings"
if not os.path.exists(EMBEDDINGS_DIR): 
    !gsutil -m cp -r "gs://a20_dropbox/point_one_percent_embeddings" .

In [3]:
# Define the feature description for parsing the TFRecordDataset.
# Each key names a field that is read from every serialized example.
feature_description = {
    # scalar float32 (printed as 5640.0 for the sample record);
    # presumably seconds into the source audio file — TODO confirm
    'timestamp_s': tf.io.FixedLenFeature([], tf.float32),
    # scalar string: path of the source audio file
    'filename': tf.io.FixedLenFeature([], tf.string),
    # scalar string holding a serialized tensor; parse_tfrecord decodes it
    # with tf.io.parse_tensor
    'embedding': tf.io.FixedLenFeature([], tf.string),
    # three int64 values describing the embedding's shape
    'embedding_shape': tf.io.FixedLenFeature([3], tf.int64)
}

# Decode one serialized example from the TFRecordDataset into its four fields
def parse_tfrecord(example_proto):
    """Parse a single serialized example using ``feature_description``.

    Args:
        example_proto: one serialized example, as yielded by a
            ``tf.data.TFRecordDataset``.

    Returns:
        A tuple ``(timestamp_s, filename, embedding, embedding_shape)``.
        ``embedding`` is deserialized from its byte string into a float32
        tensor; the other three values are returned as parsed.
    """
    parsed = tf.io.parse_single_example(example_proto, feature_description)

    # The embedding is stored as a byte string (serialized tensor);
    # restore it as an array of float32.
    embedding_tensor = tf.io.parse_tensor(parsed["embedding"], out_type=tf.float32)

    return (
        parsed["timestamp_s"],
        parsed["filename"],
        embedding_tensor,
        parsed["embedding_shape"],
    )

In [4]:
# Parse a single dataset file as an example and print its first record.
# The path is built from EMBEDDINGS_DIR so the directory name is defined in
# exactly one place (the resulting string is unchanged).
dataset_file = f"{EMBEDDINGS_DIR}/a2o_sample_embeddings-00000-of-00007"
raw_dataset = tf.data.TFRecordDataset(dataset_file)
dataset = raw_dataset.map(parse_tfrecord)

# take(1) restricts the loop to the first parsed record
for timestamp_s, filename, embedding, embedding_shape in dataset.take(1):
    print("Timestamp: ", timestamp_s)
    print("Filename: ", filename)
    print("Embedding: ", embedding)
    print("Embedding shape: ", embedding_shape)


Timestamp:  tf.Tensor(5640.0, shape=(), dtype=float32)
Filename:  tf.Tensor(b'site_0062/20200308T220000+0800_Uunguu-Indigenous-Protected-Area-Wunambal-Gaambera-Wet-A_192000.flac', shape=(), dtype=string)
Embedding:  tf.Tensor(
[[[ 1.2194262e-01 -5.2180082e-02  5.2327113e-03 ... -2.1131415e-02
   -2.7775053e-02 -1.7569216e-02]]

 [[ 8.3896384e-02  1.5045750e-01  4.5462926e-03 ... -6.7515844e-03
    1.8597849e-02 -2.3018550e-02]]

 [[ 1.0783694e-01  1.3868606e-01  2.0184470e-02 ... -8.1872819e-03
    2.5923029e-02 -2.4738500e-02]]

 ...

 [[ 5.9446793e-02  2.2982250e-01 -1.7121695e-04 ...  1.0457955e-02
   -7.0175850e-03 -1.4521907e-02]]

 [[ 1.1026376e-01 -3.6579993e-02  3.6871318e-02 ... -4.6614930e-03
    3.0870575e-02 -3.2219391e-03]]

 [[ 1.8443571e-01 -4.1682284e-02  2.3081422e-02 ... -8.0404608e-03
   -3.4469940e-02 -2.2840943e-02]]], shape=(12, 1, 1280), dtype=float32)
Embedding shape:  tf.Tensor([  12    1 1280], shape=(3,), dtype=int64)


2023-05-16 12:48:53.166974: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


In [5]:
# Pattern for the audio paths stored in the TFRecords, e.g.
# "site_0062/20200308T220000+0800_<site-name>-Wet-A_192000.flac".
# The site name is matched with a single character class rather than a nested
# quantifier, so a filename that does not match fails fast instead of
# backtracking exponentially. The "." before "flac" is escaped and the
# sequence id must contain at least one digit.
_A2O_FILENAME_RE = re.compile(
    r"site_(?P<site_id>\d{4})/"
    r"(?P<datetime>\d{8}T\d{6})"
    r"(?P<timezone>[+-]\d{4}|Z)"
    r"_(?P<site_name>[\w-]*)"
    r"-(?P<subsite_name>(?:Wet|Dry)-(?:A|B))"
    r"_(?P<file_seq_id>\d+)\.flac"
)


def _parse_a2o_filename(filename):
    """Extract the metadata fields encoded in an audio file path.

    Args:
        filename: decoded (str) path of the audio file.

    Returns:
        dict with keys ``file_timestamp`` (int, POSIX seconds), ``site_id``
        (int), ``site_name`` (str), ``subsite_name`` (str), ``file_seq_id``
        (int) and ``filename`` (str, the input unchanged).

    Raises:
        ValueError: if ``filename`` does not match the expected pattern.
    """
    match = _A2O_FILENAME_RE.search(filename)
    if match is None:
        raise ValueError(
            f"Filename does not match the expected A2O pattern: {filename!r}"
        )

    # Some files have just "Z" as timezone; normalise it to an explicit offset
    timezone = match["timezone"]
    if timezone == "Z":
        timezone = "+0000"
    recorded_at = datetime.datetime.strptime(
        f"{match['datetime']}{timezone}", "%Y%m%dT%H%M%S%z"
    )

    return {
        "file_timestamp": int(recorded_at.timestamp()),
        "site_id": int(match["site_id"]),
        "site_name": match["site_name"],
        "subsite_name": match["subsite_name"],
        "file_seq_id": int(match["file_seq_id"]),
        "filename": filename,
    }


# convert extracted metadata to insertable data records
def prep_data_from_file(dataset_filename, embedding_hop_s=5):
    """Read one TFRecord file and flatten it into one record per embedding.

    Every TFRecord entry is parsed with ``parse_tfrecord``. Its embedding is a
    3D array (dims [12, 1, 1280] in the sample data); the first dimension is
    iterated and the single channel (second dimension) is extracted, so each
    embedding vector becomes its own record.

    Args:
        dataset_filename: path of the TFRecord file to read.
        embedding_hop_s: seconds between consecutive embeddings of one
            TFRecord entry; used to compute each record's ``offset``.
            Defaults to 5, the value previously hard-coded.

    Returns:
        list of dicts with keys ``embedding``, ``file_timestamp``, ``offset``,
        ``site_id``, ``site_name``, ``subsite_name``, ``file_seq_id`` and
        ``filename`` (in that order).

    Raises:
        ValueError: if a stored filename does not match the expected pattern.
    """
    data = []
    raw_dataset = tf.data.TFRecordDataset(dataset_filename)
    count = 0
    for timestamp_s, filename, embedding, embedding_shape in raw_dataset.map(parse_tfrecord).as_numpy_iterator():
        # Decode and parse the filename once per TFRecord entry, not once
        # per embedding.
        meta = _parse_a2o_filename(filename.decode("utf-8"))

        # Add each embedding along the first dimension as its own record.
        # Key order is kept stable because it determines the column order
        # when the records are loaded into a DataFrame.
        for i, _embedding in enumerate(embedding[:, 0]):
            data.append({
                "embedding": _embedding,
                "file_timestamp": meta["file_timestamp"],
                "offset": int(timestamp_s + (embedding_hop_s * i)),
                "site_id": meta["site_id"],
                "site_name": meta["site_name"],
                "subsite_name": meta["subsite_name"],
                "file_seq_id": meta["file_seq_id"],
                "filename": meta["filename"],
            })
        count += 1
    # `count` is the number of TFRecord entries, not the number of flattened records
    print(f"Processed data file: {dataset_filename}, found {count} records")
    return data

# Example: build the records for the first shard and preview the first five
data = prep_data_from_file(
    "point_one_percent_embeddings/a2o_sample_embeddings-00000-of-00007"
)
data[:5]

2023-05-16 12:48:53.221562: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00000-of-00007, found 24903 records


[{'embedding': array([ 0.12194262, -0.05218008,  0.00523271, ..., -0.02113141,
         -0.02777505, -0.01756922], dtype=float32),
  'file_timestamp': 1583676000,
  'offset': 5640,
  'site_id': 62,
  'site_name': 'Uunguu-Indigenous-Protected-Area-Wunambal-Gaambera',
  'subsite_name': 'Wet-A',
  'file_seq_id': 192000,
  'filename': 'site_0062/20200308T220000+0800_Uunguu-Indigenous-Protected-Area-Wunambal-Gaambera-Wet-A_192000.flac'},
 {'embedding': array([ 0.08389638,  0.1504575 ,  0.00454629, ..., -0.00675158,
          0.01859785, -0.02301855], dtype=float32),
  'file_timestamp': 1583676000,
  'offset': 5645,
  'site_id': 62,
  'site_name': 'Uunguu-Indigenous-Protected-Area-Wunambal-Gaambera',
  'subsite_name': 'Wet-A',
  'file_seq_id': 192000,
  'filename': 'site_0062/20200308T220000+0800_Uunguu-Indigenous-Protected-Area-Wunambal-Gaambera-Wet-A_192000.flac'},
 {'embedding': array([ 0.10783694,  0.13868606,  0.02018447, ..., -0.00818728,
          0.02592303, -0.0247385 ], dtype=float

In [6]:
%%time
# Generate insertable records from all files in the 0.1 percent sample.
# Shard names follow "<prefix>-<index>-of-<total>" with both numbers
# zero-padded to five digits, so the index is formatted rather than
# concatenated onto a fixed "0000" prefix (which only works for < 10 shards).
NUM_SHARDS = 7

# Flatten the per-file lists of records into a single list as they are
# produced; files are still processed in shard order.
data = [
    record
    for shard_index in range(NUM_SHARDS)
    for record in prep_data_from_file(
        f"{EMBEDDINGS_DIR}/a2o_sample_embeddings-{shard_index:05d}-of-{NUM_SHARDS:05d}"
    )
]

print(f"{len(data)} records to insert into the milvus collection")

2023-05-16 12:49:01.637069: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00000-of-00007, found 24903 records


2023-05-16 12:49:10.226445: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00001-of-00007, found 3102 records


2023-05-16 12:49:11.193622: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00002-of-00007, found 7740 records


2023-05-16 12:49:13.944556: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00003-of-00007, found 5981 records
Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00004-of-00007, found 424 records


2023-05-16 12:49:15.850773: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
2023-05-16 12:49:16.052358: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00005-of-00007, found 21629 records


2023-05-16 12:49:24.160185: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00006-of-00007, found 9041 records
872994 records to insert into the milvus collection
CPU times: user 34.3 s, sys: 7.5 s, total: 41.8 s
Wall time: 27.7 s


In [7]:
# Load the flattened records into a DataFrame (one row per record) and display it
df = pd.DataFrame(data=data)
df

Unnamed: 0,embedding,file_timestamp,offset,site_id,site_name,subsite_name,file_seq_id,filename
0,"[0.12194262, -0.05218008, 0.0052327113, 0.0393...",1583676000,5640,62,Uunguu-Indigenous-Protected-Area-Wunambal-Gaam...,Wet-A,192000,site_0062/20200308T220000+0800_Uunguu-Indigeno...
1,"[0.08389638, 0.1504575, 0.0045462926, 0.055106...",1583676000,5645,62,Uunguu-Indigenous-Protected-Area-Wunambal-Gaam...,Wet-A,192000,site_0062/20200308T220000+0800_Uunguu-Indigeno...
2,"[0.10783694, 0.13868606, 0.02018447, 0.0507583...",1583676000,5650,62,Uunguu-Indigenous-Protected-Area-Wunambal-Gaam...,Wet-A,192000,site_0062/20200308T220000+0800_Uunguu-Indigeno...
3,"[0.054465663, 0.19671515, 0.003600375, 0.02757...",1583676000,5655,62,Uunguu-Indigenous-Protected-Area-Wunambal-Gaam...,Wet-A,192000,site_0062/20200308T220000+0800_Uunguu-Indigeno...
4,"[0.10221145, 0.25671297, 0.021962637, 0.027206...",1583676000,5660,62,Uunguu-Indigenous-Protected-Area-Wunambal-Gaam...,Wet-A,192000,site_0062/20200308T220000+0800_Uunguu-Indigeno...
...,...,...,...,...,...,...,...,...
872989,"[-0.0849548, -0.15006493, 0.02749683, -0.00334...",1612980000,4295,168,Chillagoe,Dry-B,375000,site_0168/20210211T040000+1000_Chillagoe-Dry-B...
872990,"[-0.09999732, -0.19166626, 0.06264316, -0.0024...",1612980000,4300,168,Chillagoe,Dry-B,375000,site_0168/20210211T040000+1000_Chillagoe-Dry-B...
872991,"[-0.058289703, -0.14517488, 0.064751804, 0.002...",1612980000,4305,168,Chillagoe,Dry-B,375000,site_0168/20210211T040000+1000_Chillagoe-Dry-B...
872992,"[-0.09268322, -0.15292507, 0.041342225, 0.0062...",1612980000,4310,168,Chillagoe,Dry-B,375000,site_0168/20210211T040000+1000_Chillagoe-Dry-B...


In [8]:
# Number of distinct offsets per file, i.e. embeddings per file.
# Computed once and reused for every statistic below instead of repeating
# the same groupby three times.
offsets_per_file = df.groupby("filename")["offset"].nunique()

print(f"Total number of embeddings: {len(df)}, from {len(df['filename'].unique())} different files")
# Max / min / mean number of embeddings per file
print(f'Max number of embeddings per file: {offsets_per_file.max()}')
print(f'Min number of embeddings per file: {offsets_per_file.min()}')
print(f'Mean number of embeddings per file: {offsets_per_file.mean()}')

Total number of embeddings: 872994, from 942 different files
Max number of embeddings per file: 1440
Min number of embeddings per file: 12
Mean number of embeddings per file: 926.7452229299363
