In [1]:

!pip install datasets transformers librosa soundfile numpy"<"2 torch torchaudio fast-tsp kmedoids pytorchvideo torchvision torchcodec



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Setup

In [3]:
%load_ext autoreload
%autoreload 2

from transformers import ClapModel, ClapFeatureExtractor, AutoTokenizer

device='mps'
model = ClapModel.from_pretrained("laion/clap-htsat-unfused", use_safetensors=True).to(device)
feature_extractor: ClapFeatureExtractor = ClapFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")


In [4]:
from typing import Generator
import torch
torch.set_grad_enabled(False)
from tqdm.auto import tqdm

def wrap(idx, total):
    if idx < 0:
        return total + idx
    elif idx >= total:
        return idx - total
    else:
        return idx
    
def get_audio_chunks(waveform, chunk_size_seconds: float=0.25, 
                     window_width: float=0, include_window: bool=True
             ) -> Generator[torch.Tensor, None, None]:
    if len(waveform.shape) != 1:
        raise ValueError("waveform should have shape [num_samples]")
    chunk_size = int(8 * ((sampling_rate * chunk_size_seconds) // 8))
    window_width_samples = int(chunk_size * window_width)
    for offset in range(window_width_samples, waveform.shape[0]-window_width_samples, chunk_size):
        if include_window:
            start = wrap(offset-window_width_samples, waveform.shape[0])
            end = wrap(offset+chunk_size+window_width_samples, waveform.shape[0])
        else:
            start = offset
            end = offset + chunk_size
        yield waveform[start:end]
    

def get_audio_features(waveform: torch.Tensor, sampling_rate):
    inputs = feature_extractor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
    #print(inputs.keys())
    audio_features = model.get_audio_features(input_features=inputs.input_features.to(device))
    return audio_features / torch.norm(audio_features, p=2, dim=1, keepdim=True)

def get_text_features(text: str):
    inputs = tokenizer(text, padding=True, return_tensors='pt')
    text_features =  model.get_text_features(input_ids = inputs.input_ids.to(device))
    return text_features / torch.norm(text_features, p=2, dim=1, keepdim=True)


# Run

In [5]:
import torchaudio
import torchaudio.transforms as T
import pickle
import os

filename = '/Users/damian/2.current/clapSlice/outputs/Dua Lipa - Be The One (Official Music Video).mp3-sorted-cs2.742857142857143-smeared-sw5-spread0.waw.mp3'
waveform, sampling_rate = torchaudio.load(filename)
chunk_beats = 4
bpm = 87.5
window_width = 0
chunk_size_seconds = chunk_beats*60/bpm

if sampling_rate != feature_extractor.sampling_rate:
    resampler = T.Resample(sampling_rate,  feature_extractor.sampling_rate, dtype=waveform.dtype)
    waveform, sampling_rate = resampler(waveform), feature_extractor.sampling_rate

print(sampling_rate, waveform.shape, chunk_size_seconds)

left_chunks_window = list(get_audio_chunks(waveform[0], chunk_size_seconds=chunk_size_seconds, window_width=window_width))[:-1]
right_chunks_window = list(get_audio_chunks(waveform[1], chunk_size_seconds=chunk_size_seconds, window_width=window_width))[:-1]
mono_chunks = [(left_chunks_window[i] + right_chunks_window[i]) / 2 for i in range(len(left_chunks_window))]

left_chunks_no_window = list(get_audio_chunks(waveform[0], chunk_size_seconds=chunk_size_seconds, window_width=window_width, include_window=False))[:-1]
right_chunks_no_window = list(get_audio_chunks(waveform[1], chunk_size_seconds=chunk_size_seconds, window_width=window_width, include_window=False))[:-1]
stereo_chunks_no_window = [torch.stack([left_chunks_no_window[index], right_chunks_no_window[index]])
    for index in range(len(left_chunks_no_window))]
features_pickle_filename = filename + f'.clap-norm-bpm{bpm}-cb{chunk_beats}-ww{window_width}.pkl'
if os.path.exists(features_pickle_filename):
    with open(features_pickle_filename, 'rb') as f:
        all_features = pickle.load(f)
else:
    all_features = torch.concat([get_audio_features(chunk, sampling_rate=sampling_rate) 
                             for chunk in tqdm(mono_chunks)])
    with open(features_pickle_filename, 'wb') as f:
        pickle.dump(all_features, f)


objc[65210]: Class AVFFrameReceiver is implemented in both /Users/damian/2.current/clapSlice/venv/lib/python3.12/site-packages/av/.dylibs/libavdevice.61.3.100.dylib (0x130e213b0) and /usr/local/Cellar/ffmpeg/6.1_1/lib/libavdevice.60.3.100.dylib (0x193770378). One of the two will be used. Which one is undefined.
objc[65210]: Class AVFAudioReceiver is implemented in both /Users/damian/2.current/clapSlice/venv/lib/python3.12/site-packages/av/.dylibs/libavdevice.61.3.100.dylib (0x130e21400) and /usr/local/Cellar/ffmpeg/6.1_1/lib/libavdevice.60.3.100.dylib (0x1937703c8). One of the two will be used. Which one is undefined.


48000 torch.Size([2, 9742544]) 2.742857142857143


## Remap

In [6]:
from clap_slice import sort_tsp

In [7]:
import pickle
original_filename = '/Users/damian/2.current/clapSlice/outputs/Dua Lipa - Be The One (Official Music Video).mp3.clap-cs2.742857142857143.pkl'

with open(original_filename, 'rb') as f:
    original_all_features = pickle.load(f)
original_all_features = original_all_features[:all_features.shape[0]]
print(original_all_features.shape)

torch.Size([73, 512])


In [24]:
sort_order = sort_tsp(original_all_features)
print(sort_order)

computing distance matrix
computing route
tensor([55, 27, 71, 53, 69, 25, 67, 23, 51, 22, 50, 66, 64, 48, 20, 49, 21, 65,
        46, 18, 47, 19,  0,  1,  2,  3,  7,  6, 10, 11,  8,  4,  9,  5, 61, 63,
        57, 59, 58, 60, 62, 56, 72, 34, 38, 36, 33, 37, 35, 39, 44, 16, 12, 40,
        13, 41, 17, 45, 15, 43, 42, 14, 32, 31, 29, 28, 30, 68, 52, 24, 26, 70,
        54])


In [25]:
from clap_slice import get_distance_matrix
remap_distance_matrix = get_distance_matrix(original_all_features, all_features)
#remap_distance_matrix.cpu()

In [26]:
torch.argmin(remap_distance_matrix, dim=1, keepdim=False)

tensor([34, 34, 34, 32, 38, 46, 38, 31, 47, 47, 27, 27, 51, 53, 59, 59, 27, 27,
         0, 16, 27, 27, 12, 27, 27, 25, 25, 27, 70, 70, 70, 70, 65, 65, 67, 63,
        65, 65, 67, 27, 52, 53, 59, 59, 27, 27,  0,  0,  6, 27, 12,  6, 27, 27,
        27, 27, 37, 37, 38, 38, 38, 38, 38, 38,  6, 27, 12, 27, 27, 27, 25, 27,
        37], device='mps:0')

In [77]:
from clap_slice.hungarian import hungarian_algorithm
assignment = hungarian_algorithm(remap_distance_matrix).int()
assignment.shape, assignment.cpu()

  0%|          | 0/73 [00:00<?, ?it/s]

stuck with 18/73 non-zero entries


(torch.Size([44, 2]),
 tensor([[ 0, 34],
         [ 4, 38],
         [ 7, 31],
         [ 9, 47],
         [ 8, 48],
         [10, 27],
         [13, 53],
         [14, 59],
         [16, 49],
         [18,  0],
         [22, 12],
         [23, 14],
         [26, 25],
         [25, 26],
         [28, 70],
         [29, 69],
         [30, 71],
         [32, 65],
         [34, 67],
         [36, 66],
         [37, 64],
         [40, 52],
         [48,  6],
         [56, 37],
         [60, 28],
         [64,  7],
         [65,  4],
         [70, 24],
         [72, 36],
         [ 6, 29],
         [12, 50],
         [46,  1],
         [47, 22],
         [51,  3],
         [55, 19],
         [ 3, 32],
         [17, 54],
         [19, 15],
         [27, 17],
         [35, 61],
         [61, 39],
         [43, 57],
         [ 5, 40],
         [50,  8]], dtype=torch.int32))

In [105]:
next = [None] * len(original_all_features)
for original, arranged in assignment:
    print(original, arranged)
    try:
        #print(arranged, assignment[:, 1], assignment[:, 1].tolist().index(arranged+1))
        arranged_f_index = assignment[:, 1].tolist().index(arranged+1)
        #print(arranged_f_index)
    except ValueError:
        continue
    next[original] = assignment[arranged_f_index, 0].item()
print(next)

#inverse_assignment = torch.argsort(assignment)
hungarian_boost = 1 + torch.zeros_like(remap_distance_matrix)
for a, b in enumerate(next):
    if b is None:
        continue
    #print(a, b)
    hungarian_boost[a, b] -= 1
    #hungarian_boost[b, a] -= 1

for a, b in assignment:
    #if a == 0:
    #    hungarian_boost[:, b] = 100
    #    hungarian_boost[b, b] = 0
    if b == 0:
        hungarian_boost[:, a] = 1
    #    #hungarian_boost[a, :] = -100
    #    hungarian_boost[a, a] = 0
    #    #hungarian_boost[]
    #hungarian_boost[b, a] += 0.1
hungarian_boost.cpu()


tensor(0, device='mps:0', dtype=torch.int32) tensor(34, device='mps:0', dtype=torch.int32)
tensor(4, device='mps:0', dtype=torch.int32) tensor(38, device='mps:0', dtype=torch.int32)
tensor(7, device='mps:0', dtype=torch.int32) tensor(31, device='mps:0', dtype=torch.int32)
tensor(9, device='mps:0', dtype=torch.int32) tensor(47, device='mps:0', dtype=torch.int32)
tensor(8, device='mps:0', dtype=torch.int32) tensor(48, device='mps:0', dtype=torch.int32)
tensor(10, device='mps:0', dtype=torch.int32) tensor(27, device='mps:0', dtype=torch.int32)
tensor(13, device='mps:0', dtype=torch.int32) tensor(53, device='mps:0', dtype=torch.int32)
tensor(14, device='mps:0', dtype=torch.int32) tensor(59, device='mps:0', dtype=torch.int32)
tensor(16, device='mps:0', dtype=torch.int32) tensor(49, device='mps:0', dtype=torch.int32)
tensor(18, device='mps:0', dtype=torch.int32) tensor(0, device='mps:0', dtype=torch.int32)
tensor(22, device='mps:0', dtype=torch.int32) tensor(12, device='mps:0', dtype=torch.i

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])

In [106]:
sort_order = sort_tsp(original_all_features, dist_matrix_offset=hungarian_boost)
print(sort_order)

computing distance matrix
computing route
tensor([ 0,  1,  2,  3,  4, 61,  5,  9,  8, 16, 12, 40, 13, 17, 45, 41, 14, 42,
        39, 35, 37, 32, 36, 34, 38, 33, 27, 71, 53, 69, 25, 10, 60,  6, 11, 24,
        52, 68, 26, 70, 54, 55, 21, 49, 65, 67, 23, 19, 47, 18, 46, 20, 48, 64,
        50, 66, 22, 51, 44, 15, 43, 30, 28, 29, 31,  7, 57, 59, 58, 62, 56, 63,
        72])


In [110]:
from torch_linear_assignment import batch_linear_assignment
assignment = batch_linear_assignment(remap_distance_matrix.unsqueeze(0).cpu())
print(assignment)

tensor([[35, 33, 34, 32, 29, 45, 30, 31, 48, 46, 47, 28, 51, 53, 60, 58, 49, 56,
          1, 22,  4,  6,  9, 14, 25, 20, 21, 16, 72, 69, 71, 70, 63, 65, 68, 62,
         66, 64, 67, 61, 52, 54, 59, 57, 50, 55,  2,  0,  8,  5, 10, 13, 27, 17,
         23, 19, 37, 42, 40, 41, 39, 44, 38, 43,  7,  3, 11, 12, 26, 15, 24, 18,
         36]])


# Video

In [1]:
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.data.encoded_video_pyav import EncodedVideoPyAV
video_path = '/Users/damian/2.current/clapSlice/outputs/Dua Lipa - Be The One (Official Music Video) (1080p_25fps_H264-128kbit_AAC).mp4'
video: EncodedVideoPyAV = EncodedVideo.from_path(video_path)


In [2]:
float(video.duration) / chunk_size_seconds

NameError: name 'chunk_size_seconds' is not defined

In [39]:
fps = 25
import av
video_output = av.open(filename + '.mp4', 'w')
stream = video_output.add_stream('h264', rate=fps)
stream.width = 1920  # Set frame width
stream.height = 1080  # Set frame height
stream.pix_fmt = 'yuv444p'   # Select yuv444p pixel format (better quality than default yuv420p).
stream.options = {'crf': '17'}  # Select low crf for high quality (the price is larger file size).

inverse_assignment = torch.argsort(assignment[0])

for target_index in tqdm(range(len(assignment[0]))):
    source_index = inverse_assignment[target_index].item()
    if assignment[0, source_index] < 0:
        continue
    chunk_start_s = source_index * chunk_size_seconds
    chunk_end_s = chunk_start_s + chunk_size_seconds
    #print(source_index, chunk_start_s, chunk_end_s)
    video_data = video.get_clip(start_sec = chunk_start_s, end_sec = chunk_end_s)
    #print(video_data['video'].shape)
    video_frames = video_data['video']
    for frame_index in tqdm(range(video_frames.shape[1]), leave=False):
        frame_data = video_frames[:, frame_index].byte().permute(1, 2, 0)
        #print(frame_data.shape, frame_data[0][0])
        frame = av.VideoFrame.from_ndarray(frame_data.numpy(), format="rgb24")
        frame.pts = None
        video_output.mux(stream.encode(frame))
        del frame
    del video_data

video_output.close()


  0%|          | 0/74 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

In [50]:
print(float(video.duration))

2252288/11025


In [42]:

assignment[0], torch.unique(assignment[0])[1:] == torch.arange(len(assignment[0])-1)

(tensor([35, 33, 34, 32, 29, 45, 30, 31, 48, 46, 47, 28, 51, 53, 60, 58, 49, 56,
          1, 22,  4,  6,  9, 14, 25, 20, 21, 16, 72, 69, 71, 70, 63, 65, 68, 62,
         66, 64, 67, 61, 52, 54, 59, 57, 50, 55,  2,  0,  8,  5, 10, 13, 27, 17,
         23, 19, 37, 42, 40, 41, 39, 44, 38, 43,  7,  3, 11, 12, 26, 15, 24, 18,
         36, -1]),
 tensor([True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True]))

In [28]:
inverse_assignment = torch.argsort(assignment[0])
inverse_assignment

tensor([47, 18, 46, 65, 20, 49, 21, 64, 48, 22, 50, 66, 67, 51, 23, 69, 27, 53,
        71, 55, 25, 26, 19, 54, 70, 24, 68, 52, 11,  4,  6,  7,  3,  1,  2,  0,
        72, 56, 62, 60, 58, 59, 57, 63, 61,  5,  9, 10,  8, 16, 44, 12, 40, 13,
        41, 45, 17, 43, 15, 42, 14, 39, 35, 32, 37, 33, 36, 38, 34, 29, 31, 30,
        28])

In [23]:
a = torch.tensor([0, 4, 2, 1, 3])
torch.argsort(a)

tensor([0, 3, 2, 4, 1])

In [12]:
clip_start_sec = 0.0 # secs
clip_duration = 2.0 # secs
video_data = video.get_clip(start_sec=clip_start_sec, end_sec=clip_start_sec + clip_duration)

In [13]:
video_data['video'].shape

torch.Size([3, 50, 1080, 1920])

In [54]:
import tracemalloc
from tqdm.auto import tqdm
try:
    tracemalloc.start()
    print("start:", torch.mps.current_allocated_memory()/(1024*1024), tracemalloc.get_traced_memory()[0]/(1024*1024))
    video_chunks = [None] * len(assignment[0])
    for source_index, target_index in enumerate(tqdm(assignment[0])):
        chunk_start_s = source_index * chunk_size_seconds
        chunk_end_s = chunk_start_s + chunk_size_seconds
        video_data = video.get_clip(start_sec = chunk_start_s, end_sec = chunk_end_s)
        video_chunks[target_index] = video_data['video'].byte()
        print("after chunk", source_index, ":", torch.mps.current_allocated_memory()/(1024*1024), tracemalloc.get_traced_memory()[0]/(1024*1024))
        del video_data
finally:
    tracemalloc.stop()


start: 587.101806640625 0.0


  0%|          | 0/74 [00:00<?, ?it/s]

after chunk 0 : 587.101806640625 0.6500930786132812
after chunk 1 : 587.101806640625 0.6857318878173828
after chunk 2 : 587.101806640625 0.7408819198608398
after chunk 3 : 587.101806640625 0.6764802932739258
after chunk 4 : 587.101806640625 0.7214689254760742
after chunk 5 : 587.101806640625 0.7545967102050781


mmco: unref short failure


after chunk 6 : 587.101806640625 0.8030595779418945
after chunk 7 : 587.101806640625 0.8352193832397461
after chunk 8 : 587.101806640625 0.7493391036987305
after chunk 9 : 587.101806640625 0.788111686706543
after chunk 10 : 587.101806640625 0.8159360885620117
after chunk 11 : 587.101806640625 0.869598388671875
after chunk 12 : 587.101806640625 0.8976469039916992
after chunk 13 : 587.101806640625 0.7922000885009766


KeyboardInterrupt: 

In [55]:
tracemalloc.get_traced_memory()

(0, 0)

In [50]:
del video_chunks

In [26]:
assignment.shape

torch.Size([1, 74])