In [None]:
import json
from collections import defaultdict
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
class TACOSMetadataAnalyzer:
    """
    Load a TACOS metadata JSON file and summarise its work/performance structure.

    After a successful load_metadata() call:
    - works_data holds the parsed JSON exactly as read from disk.
    - cover_relationships maps each work ID to a dict with the keys
      'work_title', 'work_artist', 'performances' (a list of dicts) and
      'number_of_covers'.
    """

    def __init__(self, metadata_path):
        """
        Initialize the analyzer for the TACOS dataset metadata.

        The metadata is organized hierarchically:
        - Top level: Work IDs (W_xxxxx) representing original songs
        - Second level: Performance IDs (P_xxxxx) representing different versions

        Args:
            metadata_path (str | Path): Location of the metadata JSON file.
        """
        self.metadata_path = Path(metadata_path)
        self.works_data = None
        # Dictionary to store organized cover relationships
        self.cover_relationships = {}
        # Filled in by create_cover_key_mapping(); None until that has run.
        self.key_mapping_stats = None

    def load_metadata(self):
        """
        Load the metadata JSON file and organize it into cover relationships.

        The file structure has works (original songs) containing multiple
        performances (versions). Work-level title and artist are read from the
        first performance listed for that work. Works that list no
        performances are skipped because there is nothing to read them from.

        The analyzer's state is only replaced once the whole file has been
        processed, so a failed load leaves any previously loaded data intact
        and a repeated load never keeps entries from an earlier file.

        Returns:
            bool: True on success; False if the file could not be read,
            parsed, or did not have the expected structure (the error is
            printed).
        """
        try:
            with open(self.metadata_path, 'r', encoding='utf-8') as f:
                works_data = json.load(f)

            relationships = {}
            for work_id, performances in works_data.items():
                if not performances:
                    continue

                # The first performance supplies the work-level information.
                first_perf = next(iter(performances.values()))

                relationships[work_id] = {
                    'work_title': first_perf['work_title'],
                    'work_artist': first_perf['work_artist'],
                    'performances': [
                        {
                            'title': perf_data['perf_title'],
                            'artist': perf_data['perf_artist'],
                            'year': perf_data.get('release_year', 'Unknown'),
                            'performance_id': perf_id,
                            'instrumental': perf_data.get('instrumental', 'Unknown'),
                        }
                        for perf_id, perf_data in performances.items()
                    ],
                    # NOTE(review): this counts every performance of the work,
                    # including whichever one is the original recording.
                    'number_of_covers': len(performances),
                }
        except Exception as e:
            # Deliberate catch-all: this method reports failure through its
            # return value rather than by raising.
            print(f"Error loading metadata: {str(e)}")
            return False

        self.works_data = works_data
        self.cover_relationships = relationships
        return True

    def print_cover_relationships(self, min_covers=0):
        """
        Print organized information about each work and its covers.

        Args:
            min_covers (int): Minimum 'number_of_covers' a work needs in order
                to be printed.
        """
        if not self.cover_relationships:
            print("No data loaded. Please run load_metadata() first.")
            return

        print(f"\nFound {len(self.cover_relationships)} works with performances:")
        print("=" * 80)

        for work_info in self.cover_relationships.values():
            if work_info['number_of_covers'] < min_covers:
                continue

            print(f"\nOriginal Work: '{work_info['work_title']}'")
            print(f"Original Artist: {work_info['work_artist']}")
            print(f"Number of covers: {work_info['number_of_covers']}")

            print("\nPerformances:")
            for idx, perf in enumerate(work_info['performances'], 1):
                print(f"{idx}. '{perf['title']}' by {perf['artist']} ({perf['year']})")
                print(f"   Instrumental: {perf['instrumental']}")
            print("-" * 80)

    def get_statistics(self):
        """
        Calculate and return statistics about the dataset.

        Returns:
            dict | None: None when no data is loaded, otherwise a dict with:
                - 'total_works': number of loaded works
                - 'total_performances': number of performances over all works
                - 'works_with_covers': works whose 'number_of_covers' is > 0
                - 'most_covered': up to five (work_title, number_of_covers)
                  tuples, highest count first
                - 'instrumental_count': performances whose 'instrumental'
                  value is exactly 'Yes'
                - 'avg_covers_per_work': (total_performances - total_works)
                  / total_works, i.e. one performance per work is not counted
        """
        if not self.cover_relationships:
            return None

        works = list(self.cover_relationships.values())

        ranked = sorted(
            ((work['work_title'], work['number_of_covers']) for work in works),
            key=lambda pair: pair[1],
            reverse=True,
        )

        stats = {
            'total_works': len(works),
            'total_performances': sum(len(work['performances']) for work in works),
            # NOTE(review): load_metadata() stores len(performances) as
            # 'number_of_covers', so this equals 'total_works' for loaded data.
            'works_with_covers': sum(1 for work in works
                                     if work['number_of_covers'] > 0),
            'most_covered': ranked[:5],
            'instrumental_count': sum(
                1
                for work in works
                for perf in work['performances']
                if perf['instrumental'] == 'Yes'
            ),
        }

        # Calculate average covers per work
        stats['avg_covers_per_work'] = (
            (stats['total_performances'] - stats['total_works']) / stats['total_works']
        )

        return stats

    def find_work_by_artist(self, artist_name):
        """
        Find all works performed by a specific artist.

        Matching is a case-insensitive substring test against the work artist
        and against each performance artist.

        Args:
            artist_name (str): Name of the artist to search for

        Returns:
            list[dict]: One entry per match. 'original artist' entries carry
            'work_title', 'role' and 'number_of_covers'; 'cover artist'
            entries carry 'work_title', 'role', 'year' and 'original_artist'.
        """
        needle = artist_name.lower()
        found_works = []

        for work_info in self.cover_relationships.values():
            work_artist_lower = work_info['work_artist'].lower()

            # Check original artist
            if needle in work_artist_lower:
                found_works.append({
                    'work_title': work_info['work_title'],
                    'role': 'original artist',
                    'number_of_covers': work_info['number_of_covers']
                })

            # Check cover artists
            for perf in work_info['performances']:
                perf_artist_lower = perf['artist'].lower()
                # A performance by the work's own artist is not a cover.
                if needle in perf_artist_lower and perf_artist_lower != work_artist_lower:
                    found_works.append({
                        'work_title': work_info['work_title'],
                        'role': 'cover artist',
                        'year': perf['year'],
                        'original_artist': work_info['work_artist']
                    })

        return found_works

    def plot_covers_distribution(self):
        """
        Creates a histogram showing the distribution of how many songs have a certain number of covers.
        This visualization helps understand if most songs have few covers or if there are many songs
        with numerous covers.

        Prints a message and returns without plotting when no data is loaded.
        After showing the figure, prints the most common cover count and how
        many works have zero and five-or-more covers.
        """
        if not self.cover_relationships:
            # Without this guard the average below divides by zero.
            print("No data loaded. Please run load_metadata() first.")
            return

        # Extract the number of covers for each work
        covers_per_work = [work['number_of_covers']
                           for work in self.cover_relationships.values()]

        # Set up the plot style for better readability
        plt.figure(figsize=(12, 6))
        sns.set_style("whitegrid")

        # Create the histogram
        # We use bins='auto' to let matplotlib determine optimal bin size
        plt.hist(covers_per_work, bins='auto', edgecolor='black', alpha=0.7)

        # Enhance the plot with labels and title
        plt.xlabel('Number of Covers', fontsize=12)
        plt.ylabel('Number of Original Works', fontsize=12)
        plt.title('Distribution of Covers per Original Work', fontsize=14, pad=20)

        # Add statistical annotations
        avg_covers = sum(covers_per_work) / len(covers_per_work)
        max_covers = max(covers_per_work)

        # Add text box with statistics
        stats_text = (f'Total Works: {len(covers_per_work)}\n'
                      f'Average Covers: {avg_covers:.1f}\n'
                      f'Maximum Covers: {max_covers}')

        plt.text(0.95, 0.95, stats_text,
                 transform=plt.gca().transAxes,
                 verticalalignment='top',
                 horizontalalignment='right',
                 bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

        # Adjust layout to prevent text cutoff
        plt.tight_layout()
        plt.show()

        # Tally each cover count once instead of calling list.count() for
        # every distinct value.
        frequency = defaultdict(int)
        for count in covers_per_work:
            frequency[count] += 1

        # Print additional insights about the distribution
        print("\nDistribution Insights:")
        print(f"Most common: {max(set(covers_per_work), key=frequency.__getitem__)} covers")
        print(f"Number of works with no covers: "
              f"{sum(1 for x in covers_per_work if x == 0)}")
        print(f"Number of works with 5+ covers: "
              f"{sum(1 for x in covers_per_work if x >= 5)}")

    def create_cover_key_mapping(self):
        """
        Creates a focused mapping of only songs that have covers, storing just the key relationships.

        For each work in the loaded metadata, the performance with the
        smallest ID (string comparison) is treated as the original and every
        other performance ID is listed as a cover. Works with no remaining
        performance IDs, including works with no performances at all, are
        left out.

        Summary figures are stored on self.key_mapping_stats with the keys
        'total_works_with_covers', 'total_cover_performances', 'max_covers'
        and 'min_covers' (the last two are 0 when the mapping is empty).

        Returns:
            dict | None: None (after printing a message) when no metadata is
            loaded, otherwise a dictionary where:
                - key: work_id of the original song
                - value: list of performance_ids of its covers

        Example output format:
        {
            'W_00001': ['P_00002', 'P_00003'],  # Original work -> Cover performance IDs
            'W_00004': ['P_00008', 'P_00009', 'P_00010']
        }
        """
        if not self.works_data:
            print("No data loaded. Please run load_metadata() first.")
            return None

        cover_key_mapping = {}

        for work_id, performances in self.works_data.items():
            if not performances:
                # min() below would raise on an empty mapping.
                continue

            # NOTE(review): assumes the smallest performance ID is the
            # original version; confirm against the dataset documentation.
            first_perf_id = min(performances)

            # Get all cover performance IDs (excluding the original)
            cover_perf_ids = [pid for pid in performances if pid != first_perf_id]

            # Only include works that have at least one cover
            if cover_perf_ids:
                cover_key_mapping[work_id] = cover_perf_ids

        # Store statistics as class attribute for later use
        cover_counts = [len(covers) for covers in cover_key_mapping.values()]
        self.key_mapping_stats = {
            'total_works_with_covers': len(cover_key_mapping),
            'total_cover_performances': sum(cover_counts),
            'max_covers': max(cover_counts, default=0),
            'min_covers': min(cover_counts, default=0)
        }

        return cover_key_mapping

In [None]:
analyzer = TACOSMetadataAnalyzer(r'D:\TACOS\da-tacos_metadata\da-tacos_metadata\da-tacos_benchmark_subset_metadata.json')
# load_metadata() prints the error and returns False when loading fails, so
# only print the listing after a successful load.
if analyzer.load_metadata():
    # min_covers=1 lists every work whose number_of_covers is at least 1.
    analyzer.print_cover_relationships(min_covers=1)

In [None]:
from pprint import pprint
# get_statistics() returns None when no metadata has been loaded.
stats = analyzer.get_statistics()
pprint(stats)

In [None]:
# The search is a case-insensitive substring match, so "Flea" also matches
# any artist name that merely contains it.
artist_works = analyzer.find_work_by_artist("Flea")
pprint(artist_works)

In [None]:
# Shows the histogram of covers per work, then prints distribution insights.
analyzer.plot_covers_distribution()

In [None]:
cover_mapping = analyzer.create_cover_key_mapping()
# create_cover_key_mapping() returns None when no metadata is loaded; skip the
# write in that case so an existing cover_mapping.json is not replaced by "null".
if cover_mapping is not None:
    with open("cover_mapping.json", "w", encoding="utf-8") as outfile:
        json.dump(cover_mapping, outfile, indent=4)