In [1]:
# Mount to Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install libraries
!apt-get update && apt-get install -y libclang-dev
!pip install tokenizers==0.13.3
!pip install transformers==4.28.0 torch torchaudio numpy tqdm

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository

In [2]:
import os

# Paths
zip_path = "/content/drive/MyDrive/Plagiarism-Detection-System/data/processed_smp.zip"
extract_path = "/content/data"

# Decompression
if not os.path.exists(extract_path):
    print("Unzipping dataset...")
    !unzip -q "{zip_path}" -d "{extract_path}"
    print("Done!")
else:
    print("Files already extracted.")

Unzipping dataset...
Done!


In [6]:
import os
import torch
import soundfile as sf
import torchaudio.functional as F
from torch.utils.data import Dataset, DataLoader
from typing import List, Dict, Any

class AudioDataset(Dataset):
    """Dataset of tracks stored as ``tracks_dir/<track>/<version>/*.wav``.

    Each item is one track: a dict ``{'track': name, 'audios': {version: [segments]}}``
    where the segment lists of all versions are truncated to the same length.

    Args:
        tracks_dir: Directory whose non-hidden sub-directories are the tracks.
        audio_processor: Optional feature extractor. It must expose a
            ``sampling_rate`` attribute and be callable as
            ``processor(array, sampling_rate=..., return_tensors="pt")``, returning a
            mapping with an ``"input_values"`` entry. When given, every segment is
            resampled to ``sampling_rate``, downmixed to mono and passed through it.
            When omitted, raw ``(channels, samples)`` float tensors are returned.
    """

    def __init__(self, tracks_dir: str, audio_processor=None):
        self.tracks_dir = tracks_dir
        # Sorted so that the index -> track mapping is deterministic
        self.tracklist = sorted(
            t for t in os.listdir(tracks_dir)
            if os.path.isdir(os.path.join(tracks_dir, t)) and not t.startswith('.')
        )
        self.audio_processor = audio_processor
        # Sample rate of the most recently read file. Kept only for backward
        # compatibility; resampling uses the per-file rate, never this attribute.
        self.sample_rate = None

    def __len__(self) -> int:
        """Return the number of tracks."""
        return len(self.tracklist)

    def _load_waveform(self, file_path: str):
        """Read one audio file and return ``(waveform, samplerate)``.

        The waveform is a float32 tensor shaped ``(channels, samples)``.
        """
        # Read with soundfile: it returns a numpy array shaped
        # (samples, channels) or, for mono files, (samples,)
        data, samplerate = sf.read(file_path)
        self.sample_rate = samplerate

        waveform = torch.from_numpy(data).float()
        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)  # mono -> (1, samples)
        else:
            waveform = waveform.t()  # multi-channel -> (channels, samples)
        return waveform, int(samplerate)

    def _preprocess(self, waveform: torch.Tensor, samplerate: int) -> torch.Tensor:
        """Resample, downmix to mono and run the audio processor; returns a 1-D tensor."""
        target_sr = self.audio_processor.sampling_rate
        # Use the rate of THIS file: files of one version may differ in sample rate
        if samplerate != target_sr:
            waveform = F.resample(waveform, samplerate, target_sr)

        # Average the channels so the processor always receives a 1-D signal
        # (a no-op for mono input). A 2-D array would otherwise yield 2-D "segments".
        mono = waveform.mean(dim=0)

        return self.audio_processor(
            mono.numpy(),
            sampling_rate=target_sr,
            return_tensors="pt"
        )["input_values"].squeeze(0)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Load every version of track ``idx`` (see the class docstring for the layout)."""
        track_name = self.tracklist[idx]
        track_path = os.path.join(self.tracks_dir, track_name)

        versions = sorted(
            v for v in os.listdir(track_path)
            if os.path.isdir(os.path.join(track_path, v)) and not v.startswith('.')
        )

        audios = {}
        for version in versions:
            version_path = os.path.join(track_path, version)
            files = sorted(f for f in os.listdir(version_path) if f.endswith('.wav'))

            # Versions without any .wav file are left out of the result entirely
            if not files:
                continue

            segments = []
            for file in files:
                waveform, samplerate = self._load_waveform(os.path.join(version_path, file))
                if self.audio_processor:
                    waveform = self._preprocess(waveform, samplerate)
                segments.append(waveform)
            audios[version] = segments

        if audios:
            # Keep the same number of segments for every version of the track
            min_frames = min(len(segments) for segments in audios.values())
            for version in audios:
                audios[version] = audios[version][:min_frames]

        return {'track': track_name, 'audios': audios}

def audio_collate_fn(batch):
    """Collate dataset items into ``{track_name: [stacked_version_tensor, ...]}``.

    Every segment is zero-padded on the right of its last dimension up to the
    longest segment found anywhere in the batch; the padded segments of each
    version are then stacked into a single tensor.

    Args:
        batch: List of items shaped like ``{'track': str, 'audios': {version: [tensor, ...]}}``.

    Returns:
        Dict mapping each track name to a list with one stacked tensor per version.
    """
    # Longest segment across all tracks and versions of the batch (0 if there is none)
    longest = max(
        (
            seg.shape[-1]
            for item in batch
            for segs in item["audios"].values()
            for seg in segs
        ),
        default=0,
    )

    collated = {}
    for item in batch:
        collated[item["track"]] = [
            torch.stack([
                torch.nn.functional.pad(seg, (0, longest - seg.shape[-1]))
                for seg in segs
            ])
            for segs in item["audios"].values()
        ]
    return collated

def create_audio_dataloader(tracks_dir, batch_size=1, num_workers=2, audio_processor=None):
    """Build a non-shuffling DataLoader over the tracks in ``tracks_dir``.

    Args:
        tracks_dir: Root directory handed to ``AudioDataset``.
        batch_size: Number of tracks per batch.
        num_workers: Number of DataLoader worker processes.
        audio_processor: Optional processor forwarded to ``AudioDataset``.

    Returns:
        A ``DataLoader`` that collates its batches with ``audio_collate_fn``.
    """
    loader_options = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        collate_fn=audio_collate_fn,
    )
    track_dataset = AudioDataset(tracks_dir, audio_processor=audio_processor)
    return DataLoader(track_dataset, **loader_options)

In [4]:
# soundfile is imported by the AudioDataset cell and by the extraction cell,
# so this install has to run before either of them is executed.
!pip install soundfile



In [8]:
import numpy as np
import torch
import torchaudio
from transformers import AutoModel, Wav2Vec2FeatureExtractor
from tqdm import tqdm
import os

# Try to force torchaudio to use the soundfile backend (best effort only)
try:
    import soundfile
    torchaudio.set_audio_backend("soundfile")
    print("‚úÖ Audio backend set to: soundfile")
except Exception as exc:
    # Catch Exception rather than everything, so that KeyboardInterrupt and
    # SystemExit still propagate, and report why the backend could not be set.
    print("‚ö†Ô∏è Could not set soundfile backend explicitly. Hoping for auto-detection.")
    print(f"   Reason: {exc!r}")

# Extracted dataset (input) and Drive folder that receives the embeddings (output)
INPUT_DIR = "/content/data/processed_smp"
OUTPUT_DIR = "/content/drive/MyDrive/Plagiarism-Detection-System/data/mert_embeddings"

os.makedirs(OUTPUT_DIR, exist_ok=True)

def run_mert_extraction():
    """Extract time-pooled MERT hidden states and save one ``.npy`` file per track.

    For every track yielded by the dataloader over ``INPUT_DIR``, each version's
    segments are passed through the ``m-a-p/MERT-v1-95M`` model, every third hidden
    state starting at index 2 is kept, and the time axis is average-pooled by a
    factor of 10. The per-version results are stacked into a single array that is
    written to ``OUTPUT_DIR/<track>.npy``.

    Tracks whose output file already exists are skipped, so an interrupted run can
    be resumed. Files are written to a temporary path and then renamed into place,
    so a run interrupted in the middle of a write never leaves a truncated ``.npy``
    that a later run would mistake for a finished track.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"üöÄ Using device: {device}")

    # Load Model
    print("Loading MERT model...")
    processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v1-95M")
    model = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True).to(device)
    model.eval()

    # Time reduction layer: averages every 10 consecutive frames
    time_reduce = torch.nn.AvgPool1d(kernel_size=10, stride=10, count_include_pad=False)

    # Dataloader
    # Pass the processor so that resampling happens inside the dataset
    dataloader = create_audio_dataloader(INPUT_DIR, batch_size=1, audio_processor=processor)

    print(f"Starting extraction for {len(dataloader)} tracks...")

    with torch.no_grad():
        for batch in tqdm(dataloader):
            for track_name, versions in batch.items():
                output_path = os.path.join(OUTPUT_DIR, f"{track_name}.npy")

                # Resume support: this track was already extracted by an earlier run
                if os.path.exists(output_path):
                    continue

                track_versions = []

                for version_segments in versions:
                    version_segments = version_segments.to(device)

                    # Pass through MERT
                    hidden_states = model(version_segments, output_hidden_states=True).hidden_states

                    # Select Layers (2, 5, 8, 11) & Reduce Time Dimension.
                    # AvgPool1d pools over the last axis, hence the permute before and
                    # after (assumes each hidden state is (segments, frames, features)
                    # - TODO confirm against the model).
                    features = torch.stack(
                        [
                            time_reduce(h.detach().permute(0, 2, 1)).permute(0, 2, 1)
                            for h in hidden_states[2::3]
                        ],
                        dim=1
                    )

                    track_versions.append(features.cpu().numpy())

                final_array = np.array(track_versions)

                # Write to a temporary file first and rename it afterwards, so that
                # output_path only ever exists once it is complete. A file object is
                # passed to np.save so that no ".npy" suffix is appended to tmp_path.
                tmp_path = output_path + ".tmp"
                with open(tmp_path, "wb") as tmp_file:
                    np.save(tmp_file, final_array)
                os.replace(tmp_path, output_path)

    print(f"\n‚úÖ Extraction Complete! Saved to: {OUTPUT_DIR}")

run_mert_extraction()

‚ö†Ô∏è Could not set soundfile backend explicitly. Hoping for auto-detection.
üöÄ Using device: cuda
Loading MERT model...
Starting extraction for 158 tracks...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 158/158 [08:42<00:00,  3.31s/it]


‚úÖ Extraction Complete! Saved to: /content/drive/MyDrive/Plagiarism-Detection-System/data/mert_embeddings



