In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import yaml
import os

import torch.nn as nn
import torch.nn.functional as F
import torchaudio

from mars_clip import MarsClip

%matplotlib inline

In [None]:
# Ask PyTorch's CUDA caching allocator to release any unused cached GPU memory
# before the rest of the notebook runs.
torch.cuda.empty_cache()

In [None]:
# Load the notebook settings from `detector_config.yaml` into `config`; later
# cells read the keys 'TEST_SIZE', 'RANDOM_STATE' and 'DATA_ROOT' from it.
# NOTE(review): `yaml.Loader` is PyYAML's full loader and can construct arbitrary
# Python objects from tagged YAML. If this file only holds plain scalars/maps,
# `yaml.safe_load` would be the safer choice — confirm before switching.
with open('detector_config.yaml', 'r') as ymlfile:
    config = yaml.load(ymlfile, Loader=yaml.Loader)

In [None]:
# Read the recording metadata and keep only the rows that have no missing value.
# Chaining `.dropna()` builds the cleaned frame in one expression instead of
# mutating the freshly loaded frame in place.
data = pd.read_json('../MARS-data-tagger/recordings.json').dropna()

In [None]:
# Show the loaded frame (rows containing missing values were dropped above).
data

In [None]:
# Binary target: 1 for every label other than 'no_whale', 0 for 'no_whale'.
# The comparison is vectorised instead of calling a Python lambda once per row,
# and the explicit cast keeps `y` an int64 column even when the frame is empty.
data['y'] = data['label'].ne('no_whale').astype('int64')
data.head(20)


In [None]:
# Preview the first rows of the frame (`head()` without an argument shows five).
data.head()

In [None]:
# Hold out config['TEST_SIZE'] of the rows for validation. Stratifying on `y`
# keeps the 0/1 label proportions approximately equal in both partitions, and
# config['RANDOM_STATE'] seeds the shuffle.
X_train, X_val = train_test_split(
    data,
    stratify=data['y'],
    random_state=config['RANDOM_STATE'],
    test_size=config['TEST_SIZE'],
)
# Class counts per partition, displayed as a (train, validation) pair.
tuple(part['y'].value_counts() for part in (X_train, X_val))

In [None]:
# Wrap the first training recording in a MarsClip and pull out its samples.
# NOTE(review): only the bare filename is passed here, whereas the torchaudio
# cell further down prefixes config['DATA_ROOT'] — presumably MarsClip resolves
# the data directory itself; confirm.
clip = MarsClip(X_train['filename'].iat[0])
x = clip.get_samples()

In [None]:
# Dimensions of the object returned by clip.get_samples().
x.shape

In [None]:
# Draw the clip's samples on the current axes, then render the figure.
# Only y-values are supplied, so matplotlib uses the element index for x.
plt.gca().plot(clip.get_samples())
plt.show()

In [None]:
# get_spec_img() returns three values; only the first is used here.
# NOTE(review): `sxx` is presumably the spectrogram image and the two discarded
# values its frequency and time axes — confirm against MarsClip.
sxx, _, _ = clip.get_spec_img()
# Reverse the row order before drawing, so the image's first row ends up at the
# bottom of the plot (imshow puts row 0 at the top by default).
plt.imshow(np.flipud(sxx))
plt.show()

In [None]:
class Net(nn.Module):
    """LeNet-style CNN: two (5x5 conv -> ReLU -> 2x2 max-pool) stages, then three linear layers.

    The defaults reproduce the original fixed architecture (1 input channel,
    32x32 input, 10 outputs), so ``Net()`` builds exactly the same layers as
    before. The extra parameters let the same network be sized for other
    inputs and label sets.

    NOTE(review): the target built earlier in this notebook is binary
    (``y`` is 0 or 1), so ``num_classes=2`` is probably what training needs;
    the input size of the real images is not visible here — confirm both.

    Args:
        in_channels: number of channels of the input image.
        num_classes: number of output values (logits) produced by ``fc3``.
        input_size: ``(height, width)`` of the input images; used to size ``fc1``.

    Raises:
        ValueError: if ``input_size`` is too small to survive both
            convolution/pooling stages.
    """

    def __init__(self, in_channels: int = 1, num_classes: int = 10,
                 input_size: tuple = (32, 32)):
        super().__init__()
        height, width = input_size
        # Spatial size left after both conv/pool stages; 5x5 for a 32x32 input.
        flat_h = self._stage_output(self._stage_output(height))
        flat_w = self._stage_output(self._stage_output(width))
        if flat_h < 1 or flat_w < 1:
            raise ValueError(
                f"input_size {tuple(input_size)} is too small for two "
                "5x5 convolution + 2x2 pooling stages"
            )

        # in_channels input image channels, 6 output channels, 5x5 square
        # convolution kernel
        self.conv1 = nn.Conv2d(in_channels, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * flat_h * flat_w, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    @staticmethod
    def _stage_output(size: int) -> int:
        """Return the length of one spatial axis after a conv/pool stage.

        An unpadded 5x5 convolution removes 4 pixels and the 2x2 max-pool
        halves the remainder, rounding down.
        """
        return (size - 4) // 2

    def forward(self, x):
        """Map a ``(batch, in_channels, height, width)`` tensor to ``(batch, num_classes)`` logits."""
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1)  # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the last layer: raw logits are returned.
        x = self.fc3(x)
        return x


# Instantiate with the defaults and print the layer summary.
net = Net()
print(net)

In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# torchaudio pipeline bundle for wav2vec 2.0; the cells below take both the model
# (`bundle.get_model()`) and the expected input rate (`bundle.sample_rate`) from it.
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H

In [None]:
# Build the bundle's model and move it to the selected device.
model = bundle.get_model().to(device)

In [None]:
# Full path of the first training recording inside the configured data root.
# os.path.join inserts the path separator when config['DATA_ROOT'] has no
# trailing slash, a case where plain string concatenation produced a wrong path.
filepath = os.path.join(config['DATA_ROOT'], X_train['filename'].iloc[0])
filepath

In [None]:
# Read the audio file, move the samples to the selected device, and bring them to
# the rate the bundle expects when the file was recorded at a different one.
# Note that `sample_rate` keeps the file's original rate after resampling.
waveform, sample_rate = torchaudio.load(filepath)
waveform = waveform.to(device)
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(
        waveform,
        orig_freq=sample_rate,
        new_freq=bundle.sample_rate,
    )

In [None]:
# Run feature extraction without autograd tracking (inference only).
# extract_features() returns a pair; the first element is kept as `features`
# and the second is discarded.
with torch.inference_mode():
    features, _ = model.extract_features(waveform)

In [None]:
# Full forward pass of the model on the same waveform, again without autograd
# tracking. The call returns a pair; the first element is kept as `emission`
# and the second is discarded.
with torch.inference_mode():
    emission, _ = model(waveform)