In [1]:
from msclap import CLAP
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import torch
import os

In [2]:
seen_classes = list(sorted(os.listdir("D:/DATA/UCF-101-SEEN/test_seen/")))
unseen_classes = list(sorted(os.listdir("D:/DATA/UCF-101-SEEN/test_unseen/")))
all_classes = list(sorted(seen_classes + unseen_classes))
len(all_classes), len(seen_classes), len(unseen_classes)

(51, 42, 9)

In [3]:
all_classes

['ApplyEyeMakeup',
 'ApplyLipstick',
 'Archery',
 'BabyCrawling',
 'BalanceBeam',
 'BandMarching',
 'BasketballDunk',
 'BlowDryHair',
 'BlowingCandles',
 'BodyWeightSquats',
 'Bowling',
 'BoxingPunchingBag',
 'BoxingSpeedBag',
 'BrushingTeeth',
 'CliffDiving',
 'CricketBowling',
 'CricketShot',
 'CuttingInKitchen',
 'FieldHockeyPenalty',
 'FloorGymnastics',
 'FrisbeeCatch',
 'FrontCrawl',
 'Haircut',
 'HammerThrow',
 'Hammering',
 'HandStandPushups',
 'HandstandWalking',
 'HeadMassage',
 'IceDancing',
 'Knitting',
 'LongJump',
 'MoppingFloor',
 'ParallelBars',
 'PlayingCello',
 'PlayingDaf',
 'PlayingDhol',
 'PlayingFlute',
 'PlayingSitar',
 'Rafting',
 'ShavingBeard',
 'Shotput',
 'SkyDiving',
 'SoccerPenalty',
 'StillRings',
 'SumoWrestling',
 'Surfing',
 'TableTennisShot',
 'Typing',
 'UnevenBars',
 'WallPushups',
 'WritingOnBoard']

In [11]:
seen_class_to_idx = {C: i for i, C in enumerate(seen_classes)}
unseen_class_to_idx = {C: i for i, C in enumerate(unseen_classes)}

In [12]:
all_class_to_idx = {C: i for i, C in enumerate(all_classes)}

In [5]:
class FileDataLoader(object):

    def __init__(self, root: str, task: str, batch_size: int, class_to_idx: dict[str, int], shuffle: bool = True):

        root = os.path.join(root, task)
        self.batch_size = batch_size

        self.audio_paths = []
        self.labels = []
        for C in os.listdir(root):
            class_dir = os.path.join(root, C)
            for file in os.listdir(class_dir):
                assert(file[:file.index('_')] == C)
                self.audio_paths.append(os.path.join(class_dir, file))
                self.labels.append(class_to_idx[C])

        self.indices = np.arange(len(self.audio_paths))
        if shuffle:
            np.random.shuffle(self.indices)

        self.next_ptr = 0

    def __len__(self):
        return int(np.ceil(len(self.indices) / self.batch_size))
    
    def __iter__(self):
        return self
    
    def __next__(self) -> tuple[list[str], list[int]]:
        if self.next_ptr >= len(self.indices):
            self.next_ptr = 0
            raise StopIteration
        
        batch_paths = []
        batch_labels = []
        for _ in range(self.batch_size):
            if self.next_ptr >= len(self.indices):
                break

            index = self.indices[self.next_ptr]
            self.next_ptr += 1

            batch_paths.append(self.audio_paths[index])
            batch_labels.append(self.labels[index])
        
        return batch_paths, batch_labels

In [6]:
def accuracy(output, target, topk=(1,)):
    pred = output.topk(max(topk), 1, True, True)[1].t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk]

def inference(model: CLAP, classes: list[int], loader: FileDataLoader, device: str):
    top1, top3, top5 = 0, 0, 0
    text_embeddings = model.get_text_embeddings([f"a photo of a {c}"for c in classes])

    num_ttl = 0
    for paths, labels in tqdm(loader, total=len(loader), leave=False):

        audio_embeddings = model.get_audio_embeddings(paths)
        similarity = model.compute_similarity(audio_embeddings, text_embeddings).softmax(dim=1)
        
        labels = torch.tensor(labels).to(device)
        acc1, acc3, acc5 = accuracy(similarity, labels, topk=(1, 3, 5))
        top1 += acc1
        top3 += acc3
        top5 += acc5

        num_ttl += len(paths)

    return top1 / num_ttl, top1, top3 / num_ttl, top3, top5 / num_ttl, top5, num_ttl

In [13]:
ROOT = "D:/DATA/UCF101-Waves-Raw/"

# train_loader = FileDataLoader(ROOT, "train", 4, seen_class_to_idx)
# test_seen_loader = FileDataLoader(ROOT, "test_seen", 4, seen_class_to_idx)
# test_unseen_loader = FileDataLoader(ROOT, "test_unseen", 4, unseen_class_to_idx)

train_loader = FileDataLoader(ROOT, "train", 4, all_class_to_idx)
test_seen_loader = FileDataLoader(ROOT, "test_seen", 4, all_class_to_idx)
test_unseen_loader = FileDataLoader(ROOT, "test_unseen", 4, all_class_to_idx)

In [14]:
with torch.no_grad():
    # Load and initialize CLAP
    clap_model = CLAP(version = '2023', use_cuda=True)
    # top1_acc, top1, top3_acc, top3, top5_acc, top5, num_ttl = inference(clap_model, seen_classes, train_loader, "cuda")
    top1_acc, top1, top3_acc, top3, top5_acc, top5, num_ttl = inference(clap_model, all_classes, train_loader, "cuda")
    print("train_dataset")
    print(f"Top-1 accuracy: {top1_acc:.3%}, correct: {int(top1)}/{num_ttl}")
    print(f"Top-3 accuracy: {top3_acc:.3%}, correct: {int(top3)}/{num_ttl}")
    print(f"Top-5 accuracy: {top5_acc:.3%}, correct: {int(top5)}/{num_ttl}")

                                                   

train_dataset
Top-1 accuracy: 20.162%, correct: 1793/8893
Top-3 accuracy: 36.984%, correct: 3289/8893
Top-5 accuracy: 48.960%, correct: 4354/8893




In [9]:
with torch.no_grad():
    # Load and initialize CLAP
    clap_model = CLAP(version = '2023', use_cuda=True)
    top1_acc, top1, top3_acc, top3, top5_acc, top5, num_ttl = inference(clap_model, seen_classes, test_seen_loader, "cuda")
    print("test_seen_dataset")
    print(f"Top-1 accuracy: {top1_acc:.3%}, correct: {int(top1)}/{num_ttl}")
    print(f"Top-3 accuracy: {top3_acc:.3%}, correct: {int(top3)}/{num_ttl}")
    print(f"Top-5 accuracy: {top5_acc:.3%}, correct: {int(top5)}/{num_ttl}")

                                                 

test_seen_dataset
Top-1 accuracy: 20.348%, correct: 187/919
Top-3 accuracy: 42.873%, correct: 394/919
Top-5 accuracy: 55.060%, correct: 506/919




In [16]:
with torch.no_grad():
    # Load and initialize CLAP
    clap_model = CLAP(version = '2023', use_cuda=True)
    # top1_acc, top1, top3_acc, top3, top5_acc, top5, num_ttl = inference(clap_model, unseen_classes, test_unseen_loader, "cuda")
    top1_acc, top1, top3_acc, top3, top5_acc, top5, num_ttl = inference(clap_model, all_classes, test_unseen_loader, "cuda")
    print("test_seen_dataset")
    print(f"Top-1 accuracy: {top1_acc:.3%}, correct: {int(top1)}/{num_ttl}")
    print(f"Top-3 accuracy: {top3_acc:.3%}, correct: {int(top3)}/{num_ttl}")
    print(f"Top-5 accuracy: {top5_acc:.3%}, correct: {int(top5)}/{num_ttl}")

                                                  

test_seen_dataset
Top-1 accuracy: 59.618%, correct: 1559/2615
Top-3 accuracy: 73.652%, correct: 1926/2615
Top-5 accuracy: 77.782%, correct: 2034/2615


