In [2]:
import os
import json
import shutil
import hashlib
from pathlib import Path
from typing import Dict, List, Set
import re

class SongConsolidator:
    """Merge per-playlist download folders into one de-duplicated song library.

    Every sub-folder of ``base_songs_folder`` is treated as a playlist. A
    playlist folder is only processed when it contains an
    ``enhanced_download_summary.json`` file. Each successfully downloaded
    track is copied once into ``<output_folder>/songs`` under a generated
    song ID, and three JSON indexes are written to ``<output_folder>/metadata``.
    """

    # Lower-case file suffixes that are treated as audio files.
    AUDIO_EXTENSIONS = frozenset({'.mp3', '.webm', '.wav', '.flac', '.m4a'})

    def __init__(self, base_songs_folder: str, output_folder: str = "consolidated_songs"):
        """Create the output directories and the empty tracking tables.

        Args:
            base_songs_folder: Folder whose sub-folders are playlists.
            output_folder: Destination root; ``songs`` and ``metadata``
                sub-folders are created inside it immediately.
        """
        self.base_songs_folder = Path(base_songs_folder)
        self.output_folder = Path(output_folder)
        self.songs_output = self.output_folder / "songs"
        self.metadata_output = self.output_folder / "metadata"

        # Create output directories
        self.songs_output.mkdir(parents=True, exist_ok=True)
        self.metadata_output.mkdir(parents=True, exist_ok=True)

        # Data structures to track songs and playlists
        self.unique_songs: Dict[str, dict] = {}  # track_uri (or song_id when no URI) -> song_info
        self.playlist_data: Dict[str, dict] = {}  # playlist_name -> playlist_info
        self.song_to_playlists: Dict[str, List[str]] = {}  # song_id -> list of playlist names

    def generate_song_id(self, track_name: str, artists: str) -> str:
        """Return ``song_<12 hex chars>`` derived from track name and artists.

        The inputs are lower-cased and every character outside ``[a-z0-9_]``
        is dropped before hashing, so different inputs (for example titles
        written entirely in a non-Latin script) can yield the same ID. Callers
        that need uniqueness must handle that case; ``process_playlist_folder``
        does so through ``_allocate_song_id``.
        """
        clean_string = f"{track_name}_{artists}".lower()
        clean_string = re.sub(r'[^a-z0-9_]', '', clean_string)

        # MD5 is used only as a short fingerprint here, not for security.
        hash_object = hashlib.md5(clean_string.encode())
        return f"song_{hash_object.hexdigest()[:12]}"

    def normalize_filename(self, filename: str) -> str:
        """Return ``filename`` lower-cased with '-', '_' and spaces removed."""
        return filename.lower().replace('-', '').replace('_', '').replace(' ', '')

    def is_duplicate_song(self, song1: dict, song2: dict) -> bool:
        """Return True when two metadata dicts describe the same song.

        When both dicts carry a non-empty ``track_uri`` only the URIs are
        compared. Otherwise the songs match when ``track_name`` and
        ``artists_string`` are equal after lower-casing and stripping.
        Missing or null fields are treated as empty strings.
        """
        uri1 = song1.get('track_uri')
        uri2 = song2.get('track_uri')
        if uri1 and uri2:
            return uri1 == uri2

        def identity(song: dict) -> tuple:
            # ``or ''`` tolerates JSON nulls, which ``.get(key, '')`` would not.
            return tuple(
                (song.get(field) or '').lower().strip()
                for field in ('track_name', 'artists_string')
            )

        return identity(song1) == identity(song2)

    def find_song_files(self, playlist_folder: Path) -> List[Path]:
        """Return the audio files located directly inside ``playlist_folder``.

        Sub-folders are not searched; the suffix check is case-insensitive.
        """
        return [
            file_path
            for file_path in playlist_folder.iterdir()
            if file_path.is_file()
            and file_path.suffix.lower() in self.AUDIO_EXTENSIONS
        ]

    def _find_existing_song(self, metadata: dict):
        """Return the stored song_info that duplicates ``metadata``, or None."""
        return next(
            (
                song
                for song in self.unique_songs.values()
                if self.is_duplicate_song(metadata, song['metadata'])
            ),
            None,
        )

    def _allocate_song_id(self, metadata: dict) -> str:
        """Return a song ID for ``metadata`` that no stored song uses yet.

        The ID from ``generate_song_id`` is used unchanged whenever it is
        free. If a different song already owns it, a short fingerprint of the
        track URI (or of the raw name and artists when there is no URI) is
        appended, falling back to a numeric suffix if that is taken as well.
        """
        track_name = metadata.get('track_name', '')
        artists = metadata.get('artists_string', '')
        base_id = self.generate_song_id(track_name, artists)
        if base_id not in self.song_to_playlists:
            return base_id

        seed = str(metadata.get('track_uri') or f"{track_name}\n{artists}")
        fingerprint = hashlib.md5(seed.encode('utf-8')).hexdigest()[:6]
        candidate = f"{base_id}_{fingerprint}"
        counter = 2
        while candidate in self.song_to_playlists:
            candidate = f"{base_id}_{counter}"
            counter += 1
        return candidate

    def process_playlist_folder(self, playlist_folder: Path):
        """Read one playlist folder and merge its songs into the library.

        Folders without ``enhanced_download_summary.json`` or with an
        unreadable summary are reported and skipped. For each result whose
        ``status`` is ``'success'`` the audio file is located by normalised
        file name; a new song is copied to the output folder, while a song
        that duplicates a stored one only has this playlist added to it.
        """
        playlist_name = playlist_folder.name
        json_file = playlist_folder / "enhanced_download_summary.json"

        # Skip folders without the JSON file
        if not json_file.exists():
            print(f"Skipping {playlist_name}: No enhanced_download_summary.json found")
            return

        print(f"Processing playlist: {playlist_name}")

        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both malformed JSON and undecodable bytes.
            print(f"Error reading JSON from {playlist_name}: {e}")
            return
        if not isinstance(data, dict):
            print(f"Error reading JSON from {playlist_name}: top-level value is not an object")
            return

        # Store playlist metadata
        download_info = data.get('download_info') or {}
        self.playlist_data[playlist_name] = {
            'name': playlist_name,
            'total_tracks': download_info.get('total_tracks', 0),
            'successful_downloads': download_info.get('successful_downloads', 0),
            'source_url': download_info.get('source_url', ''),
            'timestamp': download_info.get('timestamp', ''),
            'folder_path': str(playlist_folder)
        }

        # Index the audio files once by normalised name; setdefault keeps the
        # first file seen when two names normalise to the same string.
        files_by_name: Dict[str, Path] = {}
        for song_file in self.find_song_files(playlist_folder):
            files_by_name.setdefault(self.normalize_filename(song_file.name), song_file)

        for result in data.get('download_results') or []:
            if result.get('status') != 'success':
                continue

            metadata = result.get('metadata') or {}
            filename = result.get('filename') or ''

            matching_file = files_by_name.get(self.normalize_filename(filename))
            if matching_file is None:
                print(f"Warning: Could not find file {filename} in {playlist_name}")
                continue

            existing_song = self._find_existing_song(metadata)

            if existing_song is not None:
                # Update the matched entry directly. Rebuilding its dictionary
                # key from this result's metadata is wrong when the match came
                # from the name/artist fallback (one side has no track_uri).
                song_id = existing_song['song_id']
                if playlist_name not in existing_song['playlists']:
                    existing_song['playlists'].append(playlist_name)

                linked_playlists = self.song_to_playlists.setdefault(song_id, [])
                if playlist_name not in linked_playlists:
                    linked_playlists.append(playlist_name)

                print(f"Duplicate found: {filename} (already exists as {song_id})")
                continue

            # Not a duplicate: copy the file under a fresh, unused song ID.
            song_id = self._allocate_song_id(metadata)
            new_filename = f"{song_id}{matching_file.suffix}"
            new_file_path = self.songs_output / new_filename

            try:
                shutil.copy2(matching_file, new_file_path)
            except OSError as e:
                print(f"Error copying {filename}: {e}")
                continue
            print(f"Copied: {filename} -> {new_filename}")

            track_uri = metadata.get('track_uri', '')
            self.unique_songs[track_uri or song_id] = {
                'song_id': song_id,
                'filename': new_filename,
                'original_filename': filename,
                'file_path': str(new_file_path),
                'metadata': metadata,
                'playlists': [playlist_name]
            }
            self.song_to_playlists[song_id] = [playlist_name]

    def _write_json(self, name: str, payload: dict) -> None:
        """Write ``payload`` as indented UTF-8 JSON into the metadata folder."""
        with open(self.metadata_output / name, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

    def save_metadata(self):
        """Write the song, playlist and mapping databases as JSON files.

        Produces ``songs_database.json``, ``playlists_database.json`` and
        ``song_playlist_mapping.json`` in the metadata folder. As a side
        effect each entry of ``self.playlist_data`` gains ``songs`` and
        ``unique_song_count`` keys.
        """
        # Imported here so the class does not depend on a file-level import.
        from datetime import datetime

        # Save song database
        self._write_json('songs_database.json', {
            'songs': {info['song_id']: info for info in self.unique_songs.values()},
            'stats': {
                'total_unique_songs': len(self.unique_songs),
                'total_playlists': len(self.playlist_data),
                'generated_at': datetime.now().isoformat()
            }
        })

        # Invert song -> playlists once instead of rescanning it per playlist.
        songs_by_playlist: Dict[str, List[str]] = {}
        for song_id, playlists in self.song_to_playlists.items():
            for playlist_name in playlists:
                songs_by_playlist.setdefault(playlist_name, []).append(song_id)

        for playlist_name, playlist_info in self.playlist_data.items():
            playlist_songs = songs_by_playlist.get(playlist_name, [])
            playlist_info['songs'] = playlist_songs
            playlist_info['unique_song_count'] = len(playlist_songs)

        # Save playlist database
        self._write_json('playlists_database.json', {
            'playlists': self.playlist_data,
            'stats': {
                'total_playlists': len(self.playlist_data),
                'generated_at': datetime.now().isoformat()
            }
        })

        # Save song-to-playlist mapping
        self._write_json('song_playlist_mapping.json', {
            'song_to_playlists': self.song_to_playlists,
            'stats': {
                'total_mappings': len(self.song_to_playlists),
                'generated_at': datetime.now().isoformat()
            }
        })

    def run(self):
        """Process every playlist folder, save the metadata, print a summary.

        Returns early, after printing a message, when the base folder does
        not exist or contains no sub-folders.
        """
        print("Starting consolidation process...")
        print(f"Base folder: {self.base_songs_folder}")
        print(f"Output folder: {self.output_folder}")

        if not self.base_songs_folder.exists():
            print(f"Error: Base songs folder '{self.base_songs_folder}' does not exist!")
            return

        # iterdir() yields entries in arbitrary order; sorting makes it
        # reproducible which playlist supplies the first copy of a song.
        playlist_folders = sorted(
            f for f in self.base_songs_folder.iterdir() if f.is_dir()
        )

        if not playlist_folders:
            print("No playlist folders found!")
            return

        print(f"Found {len(playlist_folders)} playlist folders")

        for playlist_folder in playlist_folders:
            self.process_playlist_folder(playlist_folder)

        # Save all metadata
        self.save_metadata()

        # Print summary
        print("\n" + "="*50)
        print("CONSOLIDATION COMPLETE")
        print("="*50)
        print(f"Total unique songs: {len(self.unique_songs)}")
        print(f"Total playlists processed: {len(self.playlist_data)}")
        print(f"Songs saved to: {self.songs_output}")
        print(f"Metadata saved to: {self.metadata_output}")

        # Every playlist membership beyond the first per song is a copy that
        # was not made.
        total_occurrences = sum(len(song_info['playlists']) for song_info in self.unique_songs.values())
        duplicates_avoided = total_occurrences - len(self.unique_songs)
        print(f"Duplicates avoided: {duplicates_avoided}")

        print("\nGenerated files:")
        print("- songs_database.json (complete song information)")
        print("- playlists_database.json (playlist information with song lists)")
        print("- song_playlist_mapping.json (song to playlist relationships)")


def main():
    """Consolidate the ``songs`` folder into ``consolidated_music``."""
    # Source folder holding one sub-folder per playlist, then the output root.
    source_dir, target_dir = "songs", "consolidated_music"

    SongConsolidator(source_dir, target_dir).run()


# Run the consolidation only when this file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

Starting consolidation process...
Base folder: songs
Output folder: consolidated_music
Found 11 playlist folders
Processing playlist: All in one
Copied: Hold-Me-Now-Yuvan-Shankar-Raja-Sanjith-Hegde-Thurga.mp3 -> song_06ffdb5ab169.mp3
Copied: Kanne-Kanne-Madras-Gig-Leon-James-Jonita-Gandhi.mp3 -> song_d0fb39333ab3.mp3
Copied: High-On-Love-Yuvan-Shankar-Raja.mp3 -> song_cbcc73d7e617.mp3
Copied: Vilambara-Idaiveli-From-Imaikkaa-Nodigal-Hiphop-Tamizha-Christopher-Stanley-Sudarshan-Ashok-Srinisha.mp3 -> song_47b89f5754c9.mp3
Copied: Vaadi-Pulla-Vaadi-Hiphop-Tamizha.mp3 -> song_2937d1e0634a.mp3
Copied: Thangamey-Anirudh-Ravichander.mp3 -> song_b1eed02e084e.mp3
Copied: Oh-Penne-Anirudh-Ravichander-Vishal-Dadlani.mp3 -> song_afd87a393b33.mp3
Copied: Kannala-Kannala-The-Melting-Point-of-Love-Hiphop-Tamizha-Kaushik-Krish-Padmalatha.mp3 -> song_1600ddfc5d9c.mp3
Copied: Idhazhin-Oram-The-Innocence-of-Love-Anirudh-Ravichander-Ajesh.mp3 -> song_f904bcb561f8.mp3
Copied: Po-Nee-Po-The-Pain-of-Love-Ani