In [2]:
"""
Spotify Library Dataset Builder
Combines Kaggle dataset with user's Spotify library and adds 'in_library' flag
"""

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import time
from tqdm.auto import tqdm
from dotenv import load_dotenv
import os

# Configuration
KAGGLE_DATASET_PATH = 'data/data.csv'
OUTPUT_PATH = 'data/final_dataset.csv'

load_dotenv()
client_id = os.getenv('client_id')
client_secret = os.getenv('client_secret')

# Spotify API credentials
CLIENT_ID = client_id
CLIENT_SECRET = client_secret
REDIRECT_URI = "http://127.0.0.1:8080"
SCOPE = 'user-library-read'


def setup_spotify_client():
    """Initialize Spotify client with authentication"""
    auth_manager = SpotifyOAuth(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        redirect_uri=REDIRECT_URI,
        scope=SCOPE
    )
    sp = spotipy.Spotify(auth_manager=auth_manager)
    return sp


def get_all_saved_tracks(sp):
    """
    Fetch all tracks from user's Spotify library (basic info only)
    Returns a list of track dictionaries
    Note: Audio features are deprecated, so we only fetch basic track info
    and rely on the Kaggle dataset for audio features
    """
    saved_tracks = []
    offset = 0
    limit = 50  # Max allowed by Spotify API
    
    print("Fetching your saved tracks from Spotify...")
    print("Note: Audio features API is deprecated. We'll match your library")
    print("tracks with the Kaggle dataset to get audio features.")
    
    try:
        # Get first batch to determine total
        results = sp.current_user_saved_tracks(limit=limit, offset=offset)
        total = results['total']
        
        print(f"Total tracks in your library: {total}")
        
        # Handle empty library
        if total == 0:
            print("Your library is empty!")
            return []
        
        # Use tqdm for progress bar
        with tqdm(total=total, desc="Downloading library") as pbar:
            while offset < total:
                try:
                    results = sp.current_user_saved_tracks(limit=limit, offset=offset)
                    
                    # Check if we got any items
                    if not results or not results.get('items'):
                        break
                    
                    # Extract basic track info
                    for item in results['items']:
                        track = item.get('track')
                        if track and track.get('id'):
                            # Extract year from release_date
                            release_date = track.get('album', {}).get('release_date', '')
                            year = release_date.split('-')[0] if release_date else None
                            
                            saved_tracks.append({
                                'id': track['id'],
                                'name': track.get('name', 'Unknown'),
                                'artist': track['artists'][0]['name'] if track.get('artists') and len(track['artists']) > 0 else 'Unknown',
                                'album': track['album']['name'] if track.get('album') else 'Unknown',
                                'year': year,
                                'duration_ms': track.get('duration_ms'),
                                'explicit': track.get('explicit', False),
                                'popularity': track.get('popularity')
                            })
                    
                    pbar.update(len(results['items']))
                    
                    # Check if there are more tracks
                    if results['next'] is None or len(results['items']) == 0:
                        break
                    
                    offset += limit
                    time.sleep(0.1)  # Be nice to the API
                    
                except Exception as e:
                    print(f"\nError fetching tracks at offset {offset}: {e}")
                    print("Continuing with tracks fetched so far...")
                    break
        
        print(f"Successfully fetched {len(saved_tracks)} tracks from your library")
        print(f"Audio features will be matched from the Kaggle dataset")
        return saved_tracks
        
    except Exception as e:
        print(f"Error initializing library fetch: {e}")
        print("Make sure you're authenticated and have granted the correct permissions.")
        return []


def load_kaggle_dataset(file_path):
    """Load the Kaggle Spotify dataset"""
    print(f"Loading Kaggle dataset from {file_path}...")
    df = pd.read_csv(file_path)
    print(f"Loaded {len(df)} tracks from Kaggle dataset")
    print(f"Columns in dataset: {df.columns.tolist()}")
    return df


def create_track_identifier(row, id_col='id', name_col='name', artist_col='artists'):
    """
    Create a unique identifier for matching tracks
    Handles different possible column names in Kaggle dataset
    """
    # Try to use Spotify ID first (most reliable)
    if id_col in row.index and pd.notna(row[id_col]):
        return str(row[id_col]).strip()
    
    # Fallback to name+artist combination
    name = str(row[name_col]).lower().strip() if name_col in row.index else ''
    artist = str(row[artist_col]).lower().strip() if artist_col in row.index else ''
    
    return f"{name}||{artist}"


def merge_datasets(kaggle_df, library_tracks):
    """
    Merge Kaggle dataset with user's library
    Add 'in_library' column
    """
    print("\nMerging datasets...")
    print(f"Kaggle dataset shape: {kaggle_df.shape}")
    print(f"Library tracks count: {len(library_tracks)}")
    
    if kaggle_df is None or len(kaggle_df) == 0:
        print("Error: Kaggle dataset is empty or None")
        return None
    
    # Create set of track IDs from user's library for fast lookup
    library_ids = {track['id'] for track in library_tracks if track.get('id')}
    
    # Also create name+artist combinations as backup
    library_name_artist = {
        f"{track['name'].lower().strip()}||{track['artist'].lower().strip()}"
        for track in library_tracks
        if track.get('name') and track.get('artist')
    }
    
    print(f"Library contains {len(library_ids)} unique track IDs")
    
    # Determine column names in Kaggle dataset
    possible_id_cols = ['id', 'track_id', 'spotify_id']
    possible_name_cols = ['name', 'track_name', 'song_name']
    possible_artist_cols = ['artists', 'artist', 'artist_name']
    
    id_col = next((col for col in possible_id_cols if col in kaggle_df.columns), None)
    name_col = next((col for col in possible_name_cols if col in kaggle_df.columns), None)
    artist_col = next((col for col in possible_artist_cols if col in kaggle_df.columns), None)
    
    print(f"Using columns: id='{id_col}', name='{name_col}', artist='{artist_col}'")
    
    # Add 'in_library' column
    def check_in_library(row):
        # Try matching by Spotify ID first
        if id_col and id_col in row.index and pd.notna(row[id_col]):
            track_id = str(row[id_col]).strip()
            if track_id in library_ids:
                return 1
        
        # Fallback to name+artist matching
        if name_col and artist_col:
            identifier = create_track_identifier(row, id_col, name_col, artist_col)
            if identifier in library_name_artist:
                return 1
        
        return 0
    
    print("Flagging tracks in your library...")
    tqdm.pandas(desc="Processing tracks")
    kaggle_df['in_library'] = kaggle_df.progress_apply(check_in_library, axis=1)
    
    matches = kaggle_df['in_library'].sum()
    print(f"\nFound {matches} tracks from your library in the Kaggle dataset")
    if len(library_tracks) > 0:
        print(f"Match rate: {matches/len(library_tracks)*100:.1f}%")
    
    print(f"Returning dataset with shape: {kaggle_df.shape}")
    print(f"'in_library' column added: {'in_library' in kaggle_df.columns}")
    print(f"Dataset type: {type(kaggle_df)}")
    
    # Explicit verification before return
    if kaggle_df is None:
        print("ERROR: Dataset is None before return!")
        return None
    
    if 'in_library' not in kaggle_df.columns:
        print("ERROR: 'in_library' column missing before return!")
        return None
        
    return kaggle_df


def check_missing_library_tracks(kaggle_df, library_tracks):
    """
    Report on tracks from user's library that aren't in Kaggle dataset
    Since audio features API is deprecated, we can't fetch them for missing tracks
    """
    print("\nChecking for library tracks missing from Kaggle dataset...")
    
    # Determine ID column
    possible_id_cols = ['id', 'track_id', 'spotify_id']
    id_col = next((col for col in possible_id_cols if col in kaggle_df.columns), None)
    
    if not id_col:
        print("Warning: Could not find ID column. Skipping missing tracks check.")
        return kaggle_df
    
    kaggle_ids = set(kaggle_df[id_col].dropna().astype(str))
    library_ids = {track['id'] for track in library_tracks if track['id']}
    missing_ids = library_ids - kaggle_ids
    
    print(f"Found {len(missing_ids)} tracks in your library not in Kaggle dataset")
    
    if len(missing_ids) > 0:
        print("\nNote: These tracks cannot be added to the dataset because")
        print("Spotify's audio features API has been deprecated.")
        print("Your ML model will only train on tracks present in the Kaggle dataset.")
        print(f"\nCoverage: {len(library_ids - missing_ids)}/{len(library_ids)} ")
        print(f"({(len(library_ids - missing_ids)/len(library_ids)*100):.1f}%) of your library tracks are in the dataset")
        
        # Show some examples of missing tracks
        missing_tracks = [t for t in library_tracks if t['id'] in missing_ids]
        if len(missing_tracks) > 0:
            print("\nExample missing tracks:")
            for track in missing_tracks[:5]:
                print(f"  - {track['name']} by {track['artist']}")
    
    return kaggle_df




In [3]:
def main():
    """Main execution function"""
    print("=" * 60)
    print("Spotify Library Dataset Builder")
    print("=" * 60)
    
    # Step 1: Load Kaggle dataset
    kaggle_df = load_kaggle_dataset(KAGGLE_DATASET_PATH)
    if kaggle_df is None or len(kaggle_df) == 0:
        print("Error: Failed to load Kaggle dataset. Exiting.")
        return None
    
    # Step 2: Setup Spotify client
    print("\nAuthenticating with Spotify...")
    sp = setup_spotify_client()
    print("Successfully authenticated!")
    
    # Step 3: Get user's saved tracks
    library_tracks = get_all_saved_tracks(sp)
    if len(library_tracks) == 0:
        print("Warning: No tracks found in your library.")
        user_choice = input("Continue anyway? (y/n): ").lower()
        if user_choice != 'y':
            return None
    
    # Step 4: Merge datasets
    print("\n" + "=" * 60)
    print("MERGING DATASETS")
    print("=" * 60)
    final_df = merge_datasets(kaggle_df.copy(), library_tracks)
    
    # Verify merge was successful
    if final_df is None:
        print("Error: Merge failed, final_df is None")
        return None
    
    if 'in_library' not in final_df.columns:
        print("Error: 'in_library' column not found after merge")
        return None
    
    print(f"✓ Merge successful! Dataset shape: {final_df.shape}")
    print(f"✓ 'in_library' column present: {final_df['in_library'].sum()} tracks flagged")
    
    if final_df is None:
        print("Error: Dataset became None after adding missing tracks")
        return None
    
    # Step 6: Save final dataset
    print(f"\nSaving final dataset to {OUTPUT_PATH}...")
    try:
        final_df.to_csv(OUTPUT_PATH, index=False)
        print(f"✓ Dataset successfully saved!")
    except Exception as e:
        print(f"Error saving dataset: {e}")
        return None
    
    # Print summary statistics
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total tracks in dataset: {len(final_df)}")
    print(f"Tracks in your library: {final_df['in_library'].sum()}")
    print(f"Tracks not in your library: {(final_df['in_library'] == 0).sum()}")
    print(f"Percentage in library: {final_df['in_library'].mean() * 100:.2f}%")
    print(f"\nDataset saved to: {OUTPUT_PATH}")
    print("=" * 60)
    
    # Show sample of data
    print("\nSample of the final dataset:")
    print(final_df.head(10))
    print("\nColumns:", final_df.columns.tolist())
    print("\nLibrary tracks sample:")
    if final_df['in_library'].sum() > 0:
        print(final_df[final_df['in_library'] == 1].head(5))
    
    return final_df


if __name__ == "__main__":
    main()

Spotify Library Dataset Builder
Loading Kaggle dataset from data/data.csv...
Loaded 170653 tracks from Kaggle dataset
Columns in dataset: ['valence', 'year', 'acousticness', 'artists', 'danceability', 'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date', 'speechiness', 'tempo']

Authenticating with Spotify...
Successfully authenticated!
Fetching your saved tracks from Spotify...
Note: Audio features API is deprecated. We'll match your library
tracks with the Kaggle dataset to get audio features.
Total tracks in your library: 10021


Downloading library:   0%|          | 0/10021 [00:00<?, ?it/s]

Successfully fetched 10021 tracks from your library
Audio features will be matched from the Kaggle dataset

MERGING DATASETS

Merging datasets...
Kaggle dataset shape: (170653, 19)
Library tracks count: 10021
Library contains 10021 unique track IDs
Using columns: id='id', name='name', artist='artists'
Flagging tracks in your library...


Processing tracks:   0%|          | 0/170653 [00:00<?, ?it/s]


Found 3080 tracks from your library in the Kaggle dataset
Match rate: 30.7%
Returning dataset with shape: (170653, 20)
'in_library' column added: True
Dataset type: <class 'pandas.core.frame.DataFrame'>
✓ Merge successful! Dataset shape: (170653, 20)
✓ 'in_library' column present: 3080 tracks flagged

Saving final dataset to data/final_dataset.csv...
✓ Dataset successfully saved!

SUMMARY
Total tracks in dataset: 170653
Tracks in your library: 3080
Tracks not in your library: 167573
Percentage in library: 1.80%

Dataset saved to: data/final_dataset.csv

Sample of the final dataset:
   valence  year  acousticness  \
0   0.0594  1921         0.982   
1   0.9630  1921         0.732   
2   0.0394  1921         0.961   
3   0.1650  1921         0.967   
4   0.2530  1921         0.957   
5   0.1960  1921         0.579   
6   0.4060  1921         0.996   
7   0.0731  1921         0.993   
8   0.7210  1921         0.996   
9   0.7710  1921         0.982   

                                   