In [1]:
import os
import re
import datetime 
import json
import numpy as np
import pandas as pd
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tqdm import tqdm
from utils import parse_tfrecord
import faiss 
tf.get_logger().setLevel("ERROR")
tf.autograph.set_verbosity(1)

In [2]:
# Paths of the 374 TFRecord shards, numbered 00000 through 00373.
dataset_filenames = [
    f"./one_percent_embeddings/a2o_sample_embeddings-{shard:05}-of-00374"
    for shard in range(374)
]

In [3]:
# Build one JSON metadata file per TFRecord shard. Every embedding found in a
# shard becomes its own metadata record describing the audio file it came
# from and its offset inside that file.

# Set to True to also write each shard's embedding vectors as a .npy file
# (the files that the PCA cells further down load). Off by default, so this
# cell only writes metadata.
SAVE_EMBEDDINGS = False

METADATA_DIR = "./one_percent_embeddings_metadata"
EMBEDDINGS_NUMPY_DIR = "./one_percent_embeddings_numpy"

# Offset step between consecutive embeddings of one record
# (12 embeddings per minute).
EMBEDDING_HOP_SECONDS = 5

# Compiled once. `[\w-]*` accepts the same site names as the previous
# `(?:\w*|-)*` but cannot backtrack exponentially on a non-matching filename,
# and the dot before "flac" is now escaped so it only matches a literal ".".
RECORDING_FILENAME_PATTERN = re.compile(
    r"site_(?P<site_id>\d{4})/"
    r"(?P<datetime>\d{8}T\d{6})"
    r"(?P<timezone>(?:\+\d{4})|Z)_"
    r"(?P<site_name>[\w-]*)-"
    r"(?P<subsite_name>(?:Wet|Dry)-(?:A|B))_"
    r"(?P<file_seq_id>\d*)\.flac"
)


def _parse_recording_filename(recording_path):
    """Extract the per-file metadata fields encoded in a recording path.

    Args:
        recording_path: decoded path such as
            "site_0302/20201025T110000+1100_Cape-Barren-Island-Dry-B_810300.flac".

    Returns:
        dict with the keys "file_timestamp" (int epoch seconds),
        "file_seconds_since_midnight" (int, in the file's own UTC offset),
        "site_id", "site_name", "subsite_name" (str), "file_seq_id" (int)
        and "filename" (the path itself).

    Raises:
        ValueError: if the path does not contain exactly one match of the
            expected naming scheme.
    """
    matches = RECORDING_FILENAME_PATTERN.findall(recording_path)
    if len(matches) != 1:
        raise ValueError(
            f"Expected exactly one filename match in {recording_path!r}, found {len(matches)}"
        )
    site_id, file_datetime, timezone, site_name, subsite_name, file_seq_id = matches[0]

    # Some files have just "Z" as timezone, assume UTC in this case
    if timezone == "Z":
        timezone = "+0000"
    recorded_at = datetime.datetime.strptime(f"{file_datetime}{timezone}", "%Y%m%dT%H%M%S%z")
    midnight = recorded_at.replace(hour=0, minute=0, second=0, microsecond=0)

    return {
        "file_timestamp": int(recorded_at.timestamp()),
        "file_seconds_since_midnight": (recorded_at - midnight).seconds,
        "site_id": site_id,
        "site_name": site_name,
        "subsite_name": subsite_name,
        "file_seq_id": int(file_seq_id),
        "filename": recording_path,
    }


# open() / np.save() do not create missing directories.
os.makedirs(METADATA_DIR, exist_ok=True)
if SAVE_EMBEDDINGS:
    os.makedirs(EMBEDDINGS_NUMPY_DIR, exist_ok=True)

count = 0
for dataset_filename in tqdm(dataset_filenames):
    shard_name = dataset_filename.split('/')[-1]
    # Both lists are per-shard: they are reset for every TFRecord file.
    embeddings = []
    metadata = []
    raw_dataset = tf.data.TFRecordDataset(dataset_filename)
    for timestamp_s, filename, embedding, embedding_shape in raw_dataset.map(parse_tfrecord).as_numpy_iterator():
        # Everything derived from the filename is identical for all
        # embeddings of this record, so it is computed once here.
        file_fields = _parse_recording_filename(filename.decode("utf-8"))

        # `embedding` is a 3D array with Dims [12,1,1280]
        # We loop over the first dimension to "flatten"
        # the 12 embeddings per minute
        # and extract the single channel (2nd dimension).
        # We add each of the 12 embeddings as their own record
        for i, _embedding in enumerate(embedding[:, 0]):
            if SAVE_EMBEDDINGS:
                embeddings.append(_embedding)
            count += 1
            # Key order is kept as-is; it determines the column order of the
            # DataFrame built from these records.
            metadata.append({
                "file_timestamp": file_fields["file_timestamp"],
                "file_seconds_since_midnight": file_fields["file_seconds_since_midnight"],
                "recording_offset_in_file": int(timestamp_s + (EMBEDDING_HOP_SECONDS * i)),
                "site_id": file_fields["site_id"],
                "site_name": file_fields["site_name"],
                "subsite_name": file_fields["subsite_name"],
                "file_seq_id": file_fields["file_seq_id"],
                "filename": file_fields["filename"],
            })

    with open(f"{METADATA_DIR}/{shard_name}.json", "w") as f:
        json.dump(metadata, f)
    if SAVE_EMBEDDINGS:
        np.save(f"{EMBEDDINGS_NUMPY_DIR}/{shard_name}.npy", np.asarray(embeddings))
print(f"Total number of data records: {count}")

100%|██████████████████████████████████████████████████████████████| 374/374 [08:12<00:00,  1.32s/it]

Total number of data records: 14412192





In [4]:
# Per-shard output files: JSON metadata and the matching NumPy embedding
# arrays, one pair for each of the 374 shards.
shard_names = [f"a2o_sample_embeddings-{shard:05}-of-00374" for shard in range(374)]
metadata_filenames = [
    f"./one_percent_embeddings_metadata/{name}.json" for name in shard_names
]
embedding_numpy_filenames = [
    f"./one_percent_embeddings_numpy/{name}.npy" for name in shard_names
]

In [None]:
# Train the PCA matrix on a subset of the total matrix.
# This is the fraction of each embedding file's rows that is sampled into
# the training set; 0.25 chosen arbitrarily.
pca_training_sample_size = 0.25

In [None]:
%%time
# Draw a random subset of rows from every embedding file and stack them into
# the matrix used to train the PCA transform.

# Fixed seed so that re-running the notebook yields the same training set.
PCA_SAMPLING_SEED = 0
rng = np.random.default_rng(PCA_SAMPLING_SEED)

sampled_chunks = []
for embedding_file in tqdm(embedding_numpy_filenames):
    embeddings = np.load(embedding_file)
    # Never ask for more rows than the file holds (needed for replace=False).
    sample_size = min(int(pca_training_sample_size * len(embeddings)), len(embeddings))
    if sample_size == 0:
        # Empty (or too small) file: nothing to sample from.
        continue
    # replace=False draws distinct rows, so the subset has no duplicates.
    rand_indexes = rng.choice(len(embeddings), size=sample_size, replace=False)
    sampled_chunks.append(embeddings[rand_indexes])

if not sampled_chunks:
    raise ValueError("No embeddings were sampled; check embedding_numpy_filenames and pca_training_sample_size")

# One concatenation instead of growing a Python list row by row.
# faiss expects a C-contiguous float32 matrix.
training_set = np.ascontiguousarray(np.concatenate(sampled_chunks, axis=0), dtype=np.float32)
training_set.shape

In [None]:
%%time
# Fit a PCA transform that maps the 1280-dimensional embeddings to 256
# dimensions, then persist it to disk so it can be reloaded later.
input_dim, output_dim = 1280, 256
mat = faiss.PCAMatrix(input_dim, output_dim)
mat.train(training_set)
faiss.write_VectorTransform(
    mat,
    "1280_to_256_dimensionality_reduction.pca",
)

In [None]:
%%time
# Reload the trained PCA transform and write a reduced copy of every
# embedding file next to the originals, in the "_numpy_reduced" directory.
pca_matrix = faiss.read_VectorTransform("1280_to_256_dimensionality_reduction.pca")
for embedding_file in tqdm(embedding_numpy_filenames):
    reduced_file = embedding_file.replace('_numpy', '_numpy_reduced')
    # np.save does not create missing directories, so make sure the target
    # directory exists before writing.
    os.makedirs(os.path.dirname(reduced_file) or ".", exist_ok=True)
    embeddings = np.load(embedding_file)
    reduced_embeddings = pca_matrix.apply(embeddings)
    np.save(reduced_file, reduced_embeddings)

In [6]:
# Read every per-shard JSON file back and merge the records into one flat
# list, keeping the shard order of `metadata_filenames`.
metadata = []
for metadata_path in metadata_filenames:
    with open(metadata_path, "r") as fh:
        metadata.extend(json.load(fh))

In [7]:
# One row per metadata record; the columns come from the record keys.
df = pd.DataFrame.from_records(data=metadata)

In [8]:
# Preview the first five rows of the metadata table.
df.head(n=5)

Unnamed: 0,file_timestamp,file_datetime,file_seconds_since_midnight,recording_offset_in_file,site_id,site_name,subsite_name,file_seq_id,filename
0,1603584000.0,2020-10-25T11:00:00+11:00,39600,3300.0,302,Cape-Barren-Island,Dry-B,810300,site_0302/20201025T110000+1100_Cape-Barren-Isl...
1,1603584000.0,2020-10-25T11:00:00+11:00,39600,3305.0,302,Cape-Barren-Island,Dry-B,810300,site_0302/20201025T110000+1100_Cape-Barren-Isl...
2,1603584000.0,2020-10-25T11:00:00+11:00,39600,3310.0,302,Cape-Barren-Island,Dry-B,810300,site_0302/20201025T110000+1100_Cape-Barren-Isl...
3,1603584000.0,2020-10-25T11:00:00+11:00,39600,3315.0,302,Cape-Barren-Island,Dry-B,810300,site_0302/20201025T110000+1100_Cape-Barren-Isl...
4,1603584000.0,2020-10-25T11:00:00+11:00,39600,3320.0,302,Cape-Barren-Island,Dry-B,810300,site_0302/20201025T110000+1100_Cape-Barren-Isl...


In [10]:
# Length of the shortest filename in the table.
df["filename"].str.len().min()

52