In [20]:
import h5py
from pathlib import Path

def explore_h5_structure(base_folder):
    """
    Explore the structure of H5 files in the dataset
    
    This function will:
    1. Find the first H5 file in the dataset
    2. Print its complete internal structure
    3. Show sample data shapes and types
    """
    # Convert to Path object if it's a string
    base_path = Path(base_folder)
    
    # Get the first work folder
    first_work = next(base_path.iterdir())
    print(f"Examining first work folder: {first_work.name}")
    
    # Get the first H5 file
    first_file = next(first_work.glob("*.h5"))
    print(f"Examining file: {first_file.name}")
    
    def print_h5_structure(name, item):
        """Helper function to recursively print H5 structure"""
        if isinstance(item, h5py.Dataset):
            print(f"\nDataset: {name}")
            print(f"  Shape: {item.shape}")
            print(f"  Type: {item.dtype}")
            print(f"  Attributes: {list(item.attrs.keys())}")
            
            # Print a small sample of data
            try:
                if len(item.shape) == 2:
                    print(f"  First few values shape: {item[:5, :5].shape}")
            except Exception as e:
                print(f"  Cannot read sample: {e}")
        elif isinstance(item, h5py.Group):
            print(f"\nGroup: {name}")
            print(f"  Attributes: {list(item.attrs.keys())}")
    
    try:
        with h5py.File(first_file, 'r') as f:
            print("\nFile structure:")
            print("=" * 50)
            print(f"Top-level keys: {list(f.keys())}")
            f.visititems(print_h5_structure)
            
    except Exception as e:
        print(f"Error reading file: {e}")

# Let's also create a function to validate multiple files
def validate_multiple_files(base_folder, num_files=3):
    """
    Check multiple H5 files to ensure consistent structure
    """
    base_path = Path(base_folder)
    structures = []
    
    print("\nValidating multiple files...")
    print("=" * 50)
    
    for work_folder in base_path.iterdir():
        if len(structures) >= num_files:
            break
            
        for h5_file in work_folder.glob("*.h5"):
            if len(structures) >= num_files:
                break
                
            try:
                with h5py.File(h5_file, 'r') as f:
                    structure = {
                        'file': h5_file.name,
                        'keys': list(f.keys()),
                        'shapes': {k: f[k].shape for k in f.keys() if isinstance(f[k], h5py.Dataset)}
                    }
                    structures.append(structure)
                    print(f"\nFile: {h5_file.name}")
                    print(f"Keys: {structure['keys']}")
                    print(f"Shapes: {structure['shapes']}")
                    
            except Exception as e:
                print(f"Error with file {h5_file}: {e}")
    
    return structures


base_folder = r'D:\TACOS\da-tacos_benchmark_subset_crema\da-tacos_benchmark_subset_crema'
print("Exploring single file structure:")
explore_h5_structure(base_folder)

print("\nValidating multiple files:")
structures = validate_multiple_files(base_folder)

Exploring single file structure:
Examining first work folder: W_1002_crema
Examining file: P_1002_crema.h5

File structure:
Top-level keys: ['crema']

Dataset: crema
  Shape: (19492, 12)
  Type: float32
  Attributes: ['CLASS', 'TITLE', 'VERSION']
  First few values shape: (5, 5)

Validating multiple files:

Validating multiple files...

File: P_1002_crema.h5
Keys: ['crema']
Shapes: {'crema': (19492, 12)}

File: P_122525_crema.h5
Keys: ['crema']
Shapes: {'crema': (17404, 12)}

File: P_129091_crema.h5
Keys: ['crema']
Shapes: {'crema': (18801, 12)}


In [21]:
import numpy as np
import h5py
from scipy.spatial.distance import cdist

class MemoryEfficientDTW:
    def __init__(self, window_size=None, target_length=2000):
        """
        Initialize the memory-efficient DTW comparator
        
        Parameters:
        window_size (int): Size of the Sakoe-Chiba window for constrained DTW
        target_length (int): Target length after downsampling
        """
        self.window_size = window_size
        self.target_length = target_length

    def load_features(self, h5_path):
        """
        Load and preprocess chromagram features efficiently
        """
        with h5py.File(h5_path, 'r') as f:
            # Load features as float32 instead of float64 to save memory
            features = np.array(f['crema'], dtype=np.float32)
            
            # Normalize features efficiently
            row_sums = np.sum(features, axis=1, keepdims=True)
            features = np.divide(features, row_sums, where=row_sums != 0)
            
            return features

    def downsample_sequence(self, sequence):
        """
        Downsample a sequence to reduce memory usage
        """
        # Calculate downsample factor based on sequence length
        downsample_factor = max(1, len(sequence) // self.target_length)
        return sequence[::downsample_factor]

    def compute_dtw_distance(self, seq1, seq2):
        """
        Compute DTW distance with memory efficiency optimizations
        """
        # Convert to float32 for memory efficiency
        seq1 = seq1.astype(np.float32)
        seq2 = seq2.astype(np.float32)
        
        # Downsample sequences
        seq1_ds = self.downsample_sequence(seq1)
        seq2_ds = self.downsample_sequence(seq2)
        
        N, M = len(seq1_ds), len(seq2_ds)
        
        # Use two rows instead of full matrix
        previous_row = np.full(M, np.inf, dtype=np.float32)
        current_row = np.full(M, np.inf, dtype=np.float32)
        
        # Initialize first cell
        current_row[0] = cdist(
            seq1_ds[0].reshape(1, -1), 
            seq2_ds[0].reshape(1, -1)
        )[0, 0]
        
        # Fill first row
        for j in range(1, M):
            current_row[j] = current_row[j-1] + cdist(
                seq1_ds[0].reshape(1, -1),
                seq2_ds[j].reshape(1, -1)
            )[0, 0]
        
        # Process rest of the sequences
        for i in range(1, N):
            # Swap rows
            previous_row, current_row = current_row, previous_row
            
            # Calculate window boundaries
            if self.window_size:
                j_start = max(0, i - self.window_size)
                j_end = min(M, i + self.window_size + 1)
            else:
                j_start, j_end = 0, M
            
            # Initialize first column
            current_row[0] = previous_row[0] + cdist(
                seq1_ds[i].reshape(1, -1),
                seq2_ds[0].reshape(1, -1)
            )[0, 0]
            
            # Fill current row
            for j in range(max(1, j_start), j_end):
                cost = cdist(
                    seq1_ds[i].reshape(1, -1),
                    seq2_ds[j].reshape(1, -1)
                )[0, 0]
                
                current_row[j] = cost + min(
                    previous_row[j],     # vertical
                    current_row[j-1],    # horizontal
                    previous_row[j-1]    # diagonal
                )
        
        # Return normalized distance
        return current_row[-1] / (N + M)

    def compare_performances(self, perf1_path, perf2_path):
        """
        Compare two performances with key-invariance
        """
        # Load features
        chroma1 = self.load_features(perf1_path)
        chroma2 = self.load_features(perf2_path)
        
        # Try all possible key shifts
        min_distance = float('inf')
        best_shift = 0
        
        for shift in range(12):
            # Shift second chromagram
            shifted_chroma2 = np.roll(chroma2, shift, axis=1)
            
            # Compute DTW distance
            distance = self.compute_dtw_distance(chroma1, shifted_chroma2)
            
            # Update if this is the best match
            if distance < min_distance:
                min_distance = distance
                best_shift = shift
        
        # Convert distance to similarity score
        similarity = 1 / (1 + min_distance)
        
        return similarity, best_shift

In [22]:
import os
import numpy as np
from pathlib import Path
import h5py
from itertools import combinations
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

class MemoryEfficientDatasetProcessor:
    def __init__(self, base_folder, dtw_comparator, max_files_per_work=5):
        """
        Initialize the dataset processor with memory efficiency in mind
        
        Parameters:
        base_folder (str): Path to the main folder containing work subfolders
        dtw_comparator: Instance of MemoryEfficientDTW
        max_files_per_work (int): Maximum number of files to process per work
        """
        self.base_folder = Path(base_folder)
        self.comparator = dtw_comparator
        self.max_files_per_work = max_files_per_work
        self.works = self._get_works()
    
    def _get_works(self):
        """Get all work folders in the dataset"""
        return [d for d in self.base_folder.iterdir() if d.is_dir()]
    
    def _get_performances(self, work_path):
        """Get a subset of h5 files for a specific work"""
        all_performances = list(work_path.glob("*.h5"))
        if len(all_performances) > self.max_files_per_work:
            return np.random.choice(all_performances, 
                                  self.max_files_per_work, 
                                  replace=False).tolist()
        return all_performances
    
    def evaluate_single_work(self, work_path):
        """Evaluate a subset of pairwise comparisons within a single work"""
        performances = self._get_performances(work_path)
        scores = []
        
        print(f"Processing work: {work_path.name}")
        for perf1, perf2 in tqdm(list(combinations(performances, 2)), 
                                desc="Comparing performances"):
            try:
                similarity, shift = self.comparator.compare_performances(
                    str(perf1), str(perf2)
                )
                scores.append({
                    'work': work_path.name,
                    'perf1': perf1.name,
                    'perf2': perf2.name,
                    'similarity': similarity,
                    'key_shift': shift,
                    'same_work': True
                })
            except Exception as e:
                print(f"Error processing {perf1.name} and {perf2.name}: {str(e)}")
        
        return scores
    
    def evaluate_between_works(self, work1_path, work2_path, num_samples=2):
        """Compare a small sample of performances between works"""
        perfs1 = self._get_performances(work1_path)[:num_samples]
        perfs2 = self._get_performances(work2_path)[:num_samples]
        
        scores = []
        print(f"Comparing {work1_path.name} with {work2_path.name}")
        for perf1 in perfs1:
            for perf2 in perfs2:
                try:
                    similarity, shift = self.comparator.compare_performances(
                        str(perf1), str(perf2)
                    )
                    scores.append({
                        'work1': work1_path.name,
                        'work2': work2_path.name,
                        'perf1': perf1.name,
                        'perf2': perf2.name,
                        'similarity': similarity,
                        'key_shift': shift,
                        'same_work': False
                    })
                except Exception as e:
                    print(f"Error comparing {perf1.name} and {perf2.name}: {str(e)}")
        
        return scores
    
    def analyze_dataset(self, num_works=None, num_between_samples=2):
        """
        Analyze a subset of the dataset with memory efficiency in mind
        
        Parameters:
        num_works: Number of works to analyze (None for all)
        num_between_samples: Number of samples for between-work comparisons
        """
        all_scores = []
        
        # Select works to analyze
        selected_works = self.works
        if num_works is not None:
            selected_works = np.random.choice(self.works, 
                                            min(num_works, len(self.works)), 
                                            replace=False)
        
        # Within-work comparisons
        print("Analyzing within-work comparisons...")
        for work in selected_works:
            scores = self.evaluate_single_work(work)
            all_scores.extend(scores)
            
            # Clear memory periodically
            if len(all_scores) > 1000:
                temp_df = pd.DataFrame(all_scores)
                all_scores = [temp_df]
        
        # Between-work comparisons
        print("Analyzing between-work comparisons...")
        for work1, work2 in combinations(selected_works, 2):
            scores = self.evaluate_between_works(
                work1, work2, num_between_samples
            )
            all_scores.extend(scores)
        
        # Combine all results
        if isinstance(all_scores[0], pd.DataFrame):
            final_df = pd.concat(all_scores)
        else:
            final_df = pd.DataFrame(all_scores)
        
        return final_df
    
    def plot_similarity_distribution(self, results_df, save_path=None):
        """Plot similarity score distribution with memory efficiency"""
        plt.figure(figsize=(12, 6))
        
        # Process data in chunks if necessary
        chunk_size = 1000
        for chunk in np.array_split(results_df, max(1, len(results_df) // chunk_size)):
            sns.histplot(data=chunk, x='similarity', hue='same_work', 
                        bins=30, alpha=0.6, stat='density')
        
        plt.title('Distribution of Similarity Scores')
        plt.xlabel('Similarity Score')
        plt.ylabel('Density')
        plt.legend(['Different Works', 'Same Work'])
        
        if save_path:
            plt.savefig(save_path)
            plt.close()
    
    def compute_metrics(self, results_df, threshold=0.7):
        """Compute evaluation metrics with memory efficiency"""
        # Process in chunks if the dataset is large
        chunk_size = 1000
        metrics = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
        
        for chunk in np.array_split(results_df, max(1, len(results_df) // chunk_size)):
            predictions = chunk['similarity'] >= threshold
            true_labels = chunk['same_work']
            
            metrics['tp'] += sum(predictions & true_labels)
            metrics['fp'] += sum(predictions & ~true_labels)
            metrics['tn'] += sum(~predictions & ~true_labels)
            metrics['fn'] += sum(~predictions & true_labels)
        
        precision = metrics['tp'] / (metrics['tp'] + metrics['fp']) if (metrics['tp'] + metrics['fp']) > 0 else 0
        recall = metrics['tp'] / (metrics['tp'] + metrics['fn']) if (metrics['tp'] + metrics['fn']) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        return {
            'threshold': threshold,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            **metrics
        }

    def plot_chromagram(self, file_path, downsample_factor=200, save_path=None):
        """Plot chromagram with aggressive downsampling for visualization"""
        features = self.comparator.load_features(str(file_path))
        features = features[::downsample_factor]
        
        plt.figure(figsize=(15, 5))
        plt.imshow(features.T, aspect='auto', origin='lower', 
                  interpolation='nearest', cmap='Blues')
        plt.colorbar(label='Magnitude')
        plt.ylabel('Pitch Class')
        plt.xlabel(f'Time (downsampled by factor of {downsample_factor})')
        plt.title(f'Chromagram: {Path(file_path).name}')
        
        if save_path:
            plt.savefig(save_path)
            plt.close()

In [23]:
# # Initialize with modest parameters for testing
# dtw_comparator = MusicDTWComparator(window_size=200)  # Larger window since we're downsampling
# processor = MusicDatasetProcessor(r'D:\TACOS\da-tacos_benchmark_subset_crema\da-tacos_benchmark_subset_crema', dtw_comparator)

# # Start with a small subset first
# results = processor.analyze_dataset(num_between_samples=2)  # Small number for testing

# # Create some visualizations
# processor.plot_similarity_distribution(results)
# plt.show()

# # Find the optimal threshold
# best_metrics = processor.find_optimal_threshold(results)
# print(f"Optimal threshold: {best_metrics['threshold']}")
# print(f"F1 Score: {best_metrics['f1_score']}")

In [None]:
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd

def analyze_dataset_subset(croma_folder, num_works=3, max_files_per_work=5):
    """
    Analyze a subset of the dataset with memory-efficient processing and visualization
    
    Parameters:
    croma_folder: Path to the main folder containing work subfolders
    num_works: Number of works to analyze
    max_files_per_work: Maximum number of files to process per work
    """
    # Initialize our memory-efficient DTW comparator
    dtw_comparator = MemoryEfficientDTW(
        window_size=50,
        target_length=1000
    )
    
    # Initialize the memory-efficient dataset processor
    processor = MemoryEfficientDatasetProcessor(
        croma_folder,
        dtw_comparator,
        max_files_per_work=max_files_per_work
    )
    
    # Select works to analyze
    works_list = list(Path(croma_folder).glob("W_*"))
    if not works_list:
        raise ValueError(f"No work folders found in {croma_folder}")
    
    selected_works = np.random.choice(
        works_list,
        min(num_works, len(works_list)),
        replace=False
    )
    
    # Create visualization directory
    viz_folder = Path("visualization_results")
    viz_folder.mkdir(exist_ok=True)
    
    print("Starting analysis...")
    try:
        # Analyze the dataset
        print("Processing selected works...")
        results = processor.analyze_dataset(
            num_works=len(selected_works),
            num_between_samples=2
        )
        
        # Create visualizations
        print("\nGenerating visualizations...")
        
        # 1. Plot example chromagram
        print("Plotting chromagram...")
        sample_file = next(selected_works[0].glob("*.h5"))
        processor.plot_chromagram(
            sample_file,
            downsample_factor=200,
            save_path=viz_folder / 'chromagram_example.png'
        )
        
        # 2. Plot similarity distribution
        print("Plotting similarity distribution...")
        processor.plot_similarity_distribution(
            results,
            save_path=viz_folder / 'similarity_distribution.png'
        )
        
        # 3. Save raw similarity scores
        print("Saving similarity data...")
        results.to_csv(viz_folder / 'similarity_scores.csv', index=False)
        
        # 4. Compute and save metrics
        print("Computing metrics...")
        metrics = processor.compute_metrics(results, threshold=0.7)
        
        # Save metrics to file
        with open(viz_folder / 'analysis_metrics.txt', 'w') as f:
            f.write("Analysis Metrics:\n")
            f.write("=" * 50 + "\n")
            for key, value in metrics.items():
                if isinstance(value, (int, float)):
                    f.write(f"{key}: {value:.3f}\n")
                else:
                    f.write(f"{key}: {value}\n")
        
        # Print summary statistics correctly based on DataFrame structure
        print("\nSummary Statistics:")
        print("-" * 50)
        
        # Get unique works from both work1 and work2 columns
        unique_works = pd.concat([
            results['work1'] if 'work1' in results.columns else pd.Series(),
            results['work2'] if 'work2' in results.columns else pd.Series()
        ]).unique()
        
        print(f"Number of works analyzed: {len(unique_works)}")
        print(f"Total comparisons made: {len(results)}")
        print(f"Average similarity score: {results['similarity'].mean():.3f}")
        print("\nUnique works analyzed:")
        
        return results, metrics
        
    except Exception as e:
        print(f"Error during analysis: {str(e)}")
        raise

# Usage example
if __name__ == "__main__":
    # Set the path to your dataset
    croma_folder = r'D:\TACOS\da-tacos_benchmark_subset_crema\da-tacos_benchmark_subset_crema'
    
    # Run analysis with conservative parameters
    results, metrics = analyze_dataset_subset(
        croma_folder,
        num_works=3,
        max_files_per_work=5
    )

Starting analysis...
Processing selected works...
Analyzing within-work comparisons...
Processing work: W_5283_crema


Comparing performances: 100%|██████████| 10/10 [01:32<00:00,  9.23s/it]


Processing work: W_21826_crema


Comparing performances: 100%|██████████| 10/10 [01:28<00:00,  8.88s/it]


Processing work: W_30309_crema


Comparing performances: 0it [00:00, ?it/s]


Analyzing between-work comparisons...
Comparing W_5283_crema with W_21826_crema
Comparing W_5283_crema with W_30309_crema
Comparing W_21826_crema with W_30309_crema

Generating visualizations...
Plotting chromagram...
Plotting similarity distribution...
Saving similarity data...
Computing metrics...

Summary Statistics:
--------------------------------------------------
Number of works analyzed: 4
Total comparisons made: 28
Average similarity score: 0.825

Unique works analyzed:
Error during analysis: '<' not supported between instances of 'str' and 'float'


  return bound(*args, **kwds)


TypeError: '<' not supported between instances of 'str' and 'float'