In [1]:
import os
import re
import shutil
import subprocess
import unicodedata
from zipfile import ZipFile

import pandas as pd
from google.colab import files

# CSV read by pd.read_csv below; the "Track Name" and "Artist(s)" columns are used.
CSV_PATH = "/content/spotify_metadata_batched.csv"
# Directory that yt-dlp writes the extracted MP3 files into.
DOWNLOAD_DIR = "/content/yt-dlp-downloads"
# Archive built from the MP3 files in DOWNLOAD_DIR at the end of the run.
ZIP_NAME = "/content/yt_dlp_all.zip"

# ✅ STEP 4: Load and clean track queries
# This runs BEFORE the cleanup in STEP 3 on purpose: if the CSV is missing or
# lacks the required columns, the script must fail without having deleted the
# previous run's downloads.
if not os.path.isfile(CSV_PATH):
    raise FileNotFoundError(
        f"Track list not found: {CSV_PATH!r}. Upload the CSV to that path "
        "(or update CSV_PATH) and re-run."
    )
df = pd.read_csv(CSV_PATH)
# Rows missing either field cannot produce a usable search query.
# (Raises KeyError if one of the two columns is absent from the CSV.)
df = df.dropna(subset=["Track Name", "Artist(s)"])

# ✅ STEP 3: Clean previous runs
# Start from an empty download directory so the final ZIP only contains
# files produced by this run.
if os.path.exists(DOWNLOAD_DIR):
    shutil.rmtree(DOWNLOAD_DIR)
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

def clean_query(text):
    """Reduce a raw "track artist" string to a plain-ASCII search query.

    The transformations run in a fixed order: accents and other non-ASCII
    characters are dropped, parenthesised segments are removed, a fixed list
    of variant keywords (remix, live, demo, ...) is removed, punctuation is
    removed, and finally runs of whitespace collapse to a single space.

    Returns the cleaned string with leading/trailing whitespace stripped;
    this may be empty if nothing survives the cleaning.
    """
    # NFD splits accented letters into base letter + combining mark; the
    # ASCII encode with 'ignore' then discards every non-ASCII code point.
    ascii_bytes = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    cleaned = ascii_bytes.decode('utf-8')

    # (pattern, replacement, flags) applied strictly in this order.
    substitutions = (
        (r'\([^)]*\)', '', 0),
        (r'\b(slowed|reverb|sped up|remix|edit|instrumental|version|demo|live)\b', '', re.I),
        (r'[^\w\s]', '', 0),
        (r'\s+', ' ', 0),
    )
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()

# Build one cleaned "<track> <artist>" search query per CSV row.
# Iterating the two columns in lockstep avoids row-wise DataFrame.apply(axis=1),
# which constructs a Series for every row and whose return type on a zero-row
# frame is inferred by pandas rather than guaranteed; this form always yields a
# plain list (empty when df has no rows).
search_queries = [
    clean_query(f"{track} {artist}")
    for track, artist in zip(df["Track Name"], df["Artist(s)"])
]

# ✅ STEP 5: Track already downloaded files to avoid duplicates
# Every MP3 found in DOWNLOAD_DIR contributes its file stem with punctuation
# removed, whitespace runs collapsed to one space, and lower-cased.
existing_titles = {
    re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', mp3_name.rsplit('.', 1)[0])).lower()
    for mp3_name in os.listdir(DOWNLOAD_DIR)
    if mp3_name.endswith(".mp3")
}

# ✅ STEP 6: yt-dlp download loop with duplicate check
def _normalize_title(text):
    """Return *text* with punctuation removed, whitespace collapsed, lower-cased."""
    return re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', text)).lower()


def _downloaded_titles(directory):
    """Return the normalized file stems of every ``.mp3`` file in *directory*."""
    return {
        _normalize_title(name.rsplit('.', 1)[0])
        for name in os.listdir(directory)
        if name.endswith(".mp3")
    }


# Fail once with a clear message instead of once per track.
if shutil.which("yt-dlp") is None:
    raise RuntimeError("yt-dlp was not found on PATH; install it before running the download loop.")

# Normalized queries already attempted in this run; catches exact repeats in
# the CSV even when the downloaded file's title does not contain the query.
seen_queries = set()
total = len(search_queries)

for i, query in enumerate(search_queries):
    norm_query = _normalize_title(query).strip()

    # A query can be empty when the track/artist consisted only of characters
    # removed by cleaning. Searching then would fetch an arbitrary
    # "official audio" result, and "" is a substring of every title, so it
    # must be handled before the duplicate check.
    if not norm_query:
        print(f"⚠️ [{i+1}] Skipping empty query (nothing left after cleaning)")
        continue

    if norm_query in seen_queries or any(norm_query in title for title in existing_titles):
        print(f"⏩ [{i+1}] Skipping duplicate: {query}")
        continue
    seen_queries.add(norm_query)

    print(f"🎯 [{i+1}/{total}] Searching: {query}")
    # Argument list (no shell): nothing in the query or path is ever
    # interpreted by a shell, regardless of how the query was produced.
    result = subprocess.run(
        [
            "yt-dlp",
            f"ytsearch1:{query} official audio",
            "-x",
            "--audio-format", "mp3",
            "--no-playlist",
            "--newline",
            "-o", f"{DOWNLOAD_DIR}/%(title)s.%(ext)s",
        ],
        check=False,  # best-effort: one failed track must not abort the batch
    )
    if result.returncode != 0:
        print(f"❌ [{i+1}] yt-dlp failed (exit code {result.returncode}): {query}")

    # Update seen titles with newly downloaded files
    existing_titles.update(_downloaded_titles(DOWNLOAD_DIR))

# ✅ STEP 7: Zip the downloads
# List the MP3 files once so the archive contents and the reported count
# cannot disagree (the directory may also hold non-MP3 files, which are
# not archived).
mp3_files = [name for name in os.listdir(DOWNLOAD_DIR) if name.endswith(".mp3")]

with ZipFile(ZIP_NAME, 'w') as zipf:
    for name in mp3_files:
        # arcname keeps the archive flat: entries carry no directory prefix.
        zipf.write(os.path.join(DOWNLOAD_DIR, name), arcname=name)

print(f"📦 Zipped {len(mp3_files)} files to {ZIP_NAME}")

# ✅ STEP 8: Download the ZIP to your device
files.download(ZIP_NAME)


FileNotFoundError: [Errno 2] No such file or directory: '/content/spotify_metadata_batched.csv'