In [7]:
import numpy as np
import pandas as pd
import sklearn as sk
import faiss
import pickle
import glob
import os
import json
import gc
import uuid

In [5]:
AF_train = examples.get('AF').get('train')
AF_val = examples.get('AF').get('val')
LM_train = examples.get('LM').get('train')
LM_val = examples.get('LM').get('val')
NA_train = examples.get('NA').get('train')
NA_val = examples.get('NA').get('val')

In [13]:
extracted_examples = glob.glob('/mnt/md0/Projects/EDF/Data_Files/2022-01-14-extraction/samples/*.hdf')

In [9]:
activities = ['Fish_Presence', 'Net_Activity', 'Background']
for activity in activities:
    print(f"Starting Activity {activity}")
    NA_val_examples = glob.glob(f'/mnt/md0/Projects/EDF/Data_Files/2022-01-14-extraction/samples/val__NA__{activity}__*.hdf')
    AF_val_examples = glob.glob(f'/mnt/md0/Projects/EDF/Data_Files/2022-01-14-extraction/samples/val__AF__{activity}__*.hdf')
    LM_val_examples = glob.glob(f'/mnt/md0/Projects/EDF/Data_Files/2022-01-14-extraction/samples/val__LM__{activity}__*.hdf')
    print(f"NA: {len(NA_val_examples)} \nAF: {len(AF_val_examples)} \nLM: {len(LM_val_examples)}")

    current_val_seqs = np.zeros(((len(NA_val_examples) + len(AF_val_examples) + len(LM_val_examples))*2, 2048))

    vessel_example_len_starts = [0, len(NA_val_examples), len(AF_val_examples)]

    for idx,examples in enumerate([NA_val_examples, AF_val_examples, LM_val_examples]):
        if idx == 1:
            range_start = 4096
            range_end = 6144
        else:
            range_start = 2048
            range_end = 4096
        print(f'Starting examples {idx}')
        for i,example in enumerate(examples):
            if i % 500 == 0:
                print(f'Processing example {i}')
                gc.collect()
            seq = pd.read_hdf(example).to_numpy('float32')
            current_val_seqs[vessel_example_len_starts[idx]*2 + i*2] = seq[0,range_start:range_end]
            current_val_seqs[vessel_example_len_starts[idx]*2 + i*2+1] = seq[-1,range_start:range_end]

    np.save(f'/mnt/md0/Projects/EDF/Data_Files/2022-04-21-FAISS/{activity}_val_seqs.npy', current_val_seqs)

Starting Activity Fish_Presence
NA: 498 
AF: 396 
LM: 286
Starting examples 0
Processing example 0
Starting examples 1
Processing example 0
Starting examples 2
Processing example 0
Starting Activity Net_Activity
NA: 486 
AF: 466 
LM: 494
Starting examples 0
Processing example 0
Starting examples 1
Processing example 0
Starting examples 2
Processing example 0
Starting Activity Background
NA: 494 
AF: 497 
LM: 494
Starting examples 0
Processing example 0
Starting examples 1
Processing example 0
Starting examples 2
Processing example 0


In [4]:
print(f"NA: {len(NA_train_examples)} \nAF: {len(AF_train_examples)} \nLM: {len(LM_train_examples)}")

NA: 4788 
AF: 4959 
LM: 4966


In [4]:
fish_presence_train_seqs = np.load('/mnt/md0/Projects/EDF/Data_Files/2022-04-21-FAISS/fish_presence_train_seqs.npy')
net_activity_train_seqs = np.load('/mnt/md0/Projects/EDF/Data_Files/2022-04-21-FAISS/net_activity_train_seqs.npy')
background_train_seqs = np.load('/mnt/md0/Projects/EDF/Data_Files/2022-04-21-FAISS/background_train_seqs.npy')

In [42]:
fish_presence_val_seqs = np.load('/mnt/md0/Projects/EDF/Data_Files/2022-04-21-FAISS/Fish_Presence_val_seqs.npy')
net_activity_val_seqs = np.load('/mnt/md0/Projects/EDF/Data_Files/2022-04-21-FAISS/Net_Activity_val_seqs.npy')
background_val_seqs = np.load('/mnt/md0/Projects/EDF/Data_Files/2022-04-21-FAISS/Background_val_seqs.npy')

In [43]:
print(f"fish presence: {len(fish_presence_val_seqs)} \nnet activity: {len(net_activity_val_seqs)} \nbackground: {len(background_val_seqs)}")

fish presence: 2360 
net activity: 2892 
background: 2970


In [6]:
print(f"fish presence: {len(fish_presence_train_seqs)} \nnet activity: {len(net_activity_train_seqs)} \nbackground: {len(background_train_seqs)}")

fish presence: 27422 
net activity: 29210 
background: 29426


In [44]:
val_classes = np.vstack((np.ones((2360+2892,1)), np.zeros((2970,1))))

In [22]:
indices = [uuid.uuid1().int>>64 for x in range(len(fish_presence_train_seqs) + len(net_activity_train_seqs) + len(background_train_seqs))]

In [41]:
index_dict = {idx : clas for idx,clas in zip(indices,classes)}

In [33]:
index = faiss.IndexFlatL2(2048)
index2 = faiss.IndexIDMap(index)
index2.add_with_ids(np.vstack((fish_presence_train_seqs, net_activity_train_seqs, background_train_seqs)).astype('float32'),np.asarray(indices, dtype=np.int64))

In [45]:
k = 25

In [55]:
test_seq = fish_presence_val_seqs[0:1].astype('float32')

In [53]:
test_seq.shape

(1, 2048)

In [78]:
D,I = index2.search(background_val_seqs.astype('float32'),k)
I_uint = np.asarray(I,dtype=np.uint64) #need to convert back to uint

In [79]:
background_classes = []
for arr in I_uint:
    cls = [index_dict.get(idx) for idx in arr]
    sum = np.sum(cls)
    if sum > 12:
        background_classes.append(1)
    else:
        background_classes.append(0)

In [80]:
np.sum(np.asarray(background_classes))

983

In [11]:
for idx,train_examples in enumerate([NA_train_examples, AF_train_examples, LM_train_examples]):
    if idx == 1:
        range_start = 4096
        range_end = 6144
    else:
        range_start = 2048
        range_end = 4096
    print(f'Starting examples {idx}')
    for i,train_example in enumerate(train_examples):
        if i % 500 == 0:
            print(f'Processing example {i}')
            gc.collect()
        seq = pd.read_hdf(train_example).to_numpy('float32')
        background_train_seqs[vessel_example_len_starts[idx]*2 + i*2] = seq[0,range_start:range_end]
        background_train_seqs[vessel_example_len_starts[idx]*2 + i*2+1] = seq[-1,range_start:range_end]

Starting examples 0
Processing example 0
Processing example 500
Processing example 1000
Processing example 1500
Processing example 2000
Processing example 2500
Processing example 3000
Processing example 3500
Processing example 4000
Processing example 4500
Starting examples 1
Processing example 0
Processing example 500
Processing example 1000
Processing example 1500
Processing example 2000
Processing example 2500
Processing example 3000
Processing example 3500
Processing example 4000
Processing example 4500
Starting examples 2
Processing example 0
Processing example 500
Processing example 1000
Processing example 1500
Processing example 2000
Processing example 2500
Processing example 3000
Processing example 3500
Processing example 4000
Processing example 4500


In [10]:
gc.collect()

1052

In [76]:
first_example = True
for idx,train_examples in enumerate([NA_train_examples, AF_train_examples, LM_train_examples]):
    if idx == 1:
        range_start = 4096
        range_end = 6144
    else:
        range_start = 2048
        range_end = 4096
    print(f'Starting examples {idx}')
    for i,train_example in enumerate(train_examples):
        if i % 500 == 0:
            print(f'Processing example {i}')
        if first_example == True:
            seq = pd.read_hdf(train_example).to_numpy('float32')
            background_train_seqs = np.vstack((seq[0,range_start:range_end],seq[-1,range_start:range_end]))
            first_example = False
            continue
        seq = pd.read_hdf(train_example).to_numpy('float32')
        tmp_seq = np.vstack((seq[0,range_start:range_end],seq[-1,range_start:range_end]))
        background_train_seqs = np.vstack((background_train_seqs,tmp_seq))

Starting examples 0
Processing example 0
Processing example 500


KeyboardInterrupt: 

In [33]:
len(LM_train_fp_examples)

4472

In [51]:
gc.collect()

0

In [56]:
seq = pd.read_hdf(LM_train_fp_examples[0]).to_numpy('float32')

In [58]:
LM_train_seqs = np.vstack((seq[0,2048:4096],seq[-1,2048:4096]))

In [59]:
for train_example in LM_train_fp_examples[1:]:
    seq = pd.read_hdf(train_example).to_numpy('float32')
    tmp_seq = np.vstack((seq[0,2048:4096],seq[-1,2048:4096]))
    LM_train_seqs = np.vstack((LM_train_seqs,tmp_seq))

In [62]:
fish_presence_train_seqs = np.vstack((train_seqs, LM_train_seqs, AF_train_seqs))

In [71]:
np.save('/mnt/md0/Projects/EDF/Data_Files/2022-04-21-FAISS/fish_presence_train_seqs.npy', fish_presence_train_seqs)

In [12]:
np.save('/mnt/md0/Projects/EDF/Data_Files/2022-04-21-FAISS/background_train_seqs.npy', background_train_seqs)