In [None]:
pip install pandas matplotlib os python-dotenv

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement os (from versions: none)

[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for os


In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Spotify API credentials; os.getenv returns None when a variable is not set.
SPOTIFY_CLIENT_ID = os.getenv('SPOTIFY_CLIENT_ID')
SPOTIFY_CLIENT_SECRET = os.getenv('SPOTIFY_CLIENT_SECRET')

# Report only whether each credential was found. Printing the values themselves
# would store the secret in the notebook's saved output, where it can leak
# through version control or sharing.
print(f"Client ID loaded: {bool(SPOTIFY_CLIENT_ID)}")
print(f"Client Secret loaded: {bool(SPOTIFY_CLIENT_SECRET)}")

In [None]:
# Import necessary libraries
import json
import os
import random
from datetime import datetime, timedelta

import pandas as pd
import requests
from dotenv import load_dotenv

In [None]:
# Function to get user ID from input
def get_user_id():
    """Prompt for the user's ID and return it in normalised form.

    The entered text is stripped of surrounding whitespace and lower-cased.
    Stripping matters because the ID is interpolated into file names by the
    data-loading and export steps, so a stray leading or trailing space would
    make every file lookup miss.

    Returns:
        str: The trimmed, lower-cased user ID.
    """
    return input("Enter the user's ID: ").strip().lower()


In [None]:
# Function to get the number of data chunks from input
def get_num_chunks():
    """Prompt for how many data chunks to read and return the count.

    Returns:
        int: The number typed by the user.

    Raises:
        ValueError: If the entered text cannot be parsed as an integer.
    """
    typed_count = input("Enter the number of chunks: ")
    return int(typed_count)

In [None]:
# Function to read and process data from multiple JSON files
def read_and_process_data(user_id, num_chunks, base_path='wrapped_files/'):
    """Load a user's listening history from numbered JSON chunk files.

    Looks for ``{user_id}_music_{i}.json`` under ``base_path`` for every
    ``i`` in ``range(num_chunks)``. Missing files are reported and skipped;
    the contents of the files that exist are concatenated in chunk order.

    Args:
        user_id: Identifier used as the file-name prefix and stored in the
            ``user_id`` column of the result.
        num_chunks: Number of chunk indices to probe, starting at 0.
        base_path: Directory that holds the chunk files.

    Returns:
        pandas.DataFrame: All loaded records, with a ``user_id`` column added
        and ``endTime`` converted with ``pd.to_datetime``.

    Raises:
        ValueError: If no records were loaded from any chunk.
    """
    chunk_paths = [
        os.path.join(base_path, f'{user_id}_music_{chunk}.json')
        for chunk in range(num_chunks)
    ]

    records = []
    for chunk_path in chunk_paths:
        print(f"Checking for file: {chunk_path}")

        if os.path.exists(chunk_path):
            print(f"Reading data from {chunk_path}")
            with open(chunk_path, 'r', encoding='utf-8') as handle:
                records += json.load(handle)
        else:
            print(f"File not found: {chunk_path}")

    if len(records) == 0:
        raise ValueError("No data files were found or all were empty.")

    listening = pd.DataFrame(records)
    listening['user_id'] = user_id
    listening['endTime'] = pd.to_datetime(listening['endTime'])

    print(f"Data read successfully for {len(listening)} records.")
    return listening


In [None]:
# Function to export data to a CSV file
def export_to_csv(df, user_id):
    """Write ``df`` to ``{user_id}_listening_data.csv`` in the working directory.

    The DataFrame index is not written. A confirmation line naming the output
    file is printed afterwards.

    Args:
        df: DataFrame to export.
        user_id: Identifier used as the output file-name prefix.
    """
    destination = '{}_listening_data.csv'.format(user_id)
    df.to_csv(destination, index=False)
    print("Data exported to " + destination)


In [None]:
# Function to track and save unique songs to a CSV file
def track_unique_songs(df, unique_songs_file):
    """Merge the unique (track, artist) pairs in ``df`` into a CSV catalogue.

    A song is identified by its ``trackName``/``artistName`` pair. When the
    catalogue file already exists, its rows are kept and only songs not yet
    present are appended; otherwise a new file is created.

    De-duplication is done on the two key columns only. Comparing whole rows
    would treat an already-enriched row (extra metadata columns filled in) and
    the bare pair from ``df`` as different songs, so every known song would be
    appended again on each run.

    Args:
        df: Listening data containing ``trackName`` and ``artistName`` columns.
        unique_songs_file: Path of the catalogue CSV to create or update.
    """
    key_columns = ['trackName', 'artistName']
    unique_songs = df[key_columns].drop_duplicates()
    print(f"Tracking {len(unique_songs)} unique songs.")

    try:
        existing_songs = pd.read_csv(unique_songs_file)
    except FileNotFoundError:
        updated_songs = unique_songs
        print("Unique songs file not found. Creating a new one.")
    else:
        # Existing rows come first so keep='first' preserves whatever extra
        # columns they already carry.
        updated_songs = pd.concat(
            [existing_songs, unique_songs], ignore_index=True
        ).drop_duplicates(subset=key_columns, keep='first')
        print(f"Existing unique songs loaded, total unique songs now {len(updated_songs)}.")

    updated_songs.to_csv(unique_songs_file, index=False)
    print(f"Unique songs tracked and saved to {unique_songs_file}.")


In [11]:
# Function to get Spotify access token
def get_spotify_access_token(client_id, client_secret):
    """Request an access token from the Spotify Accounts service.

    Posts a ``client_credentials`` grant to the token endpoint and returns the
    ``access_token`` field of the JSON response.

    Args:
        client_id: Spotify application client ID.
        client_secret: Spotify application client secret.

    Returns:
        The access token string from the token response.

    Raises:
        ValueError: If either credential is missing or empty.
        requests.HTTPError: If the token endpoint answers with an error status.
    """
    # Fail early with a clear message instead of sending a request that is
    # bound to be rejected (e.g. when the .env file was not loaded).
    if not client_id or not client_secret:
        raise ValueError(
            "Spotify credentials are missing; set SPOTIFY_CLIENT_ID and "
            "SPOTIFY_CLIENT_SECRET."
        )

    auth_url = 'https://accounts.spotify.com/api/token'
    auth_response = requests.post(
        auth_url,
        data={
            'grant_type': 'client_credentials',
            'client_id': client_id,
            'client_secret': client_secret,
        },
        timeout=10,  # requests has no default timeout; avoid hanging the kernel
    )

    # Surface rejected credentials as an HTTP error rather than as a
    # KeyError on 'access_token' below.
    auth_response.raise_for_status()

    auth_response_data = auth_response.json()
    return auth_response_data['access_token']

In [None]:
# Function to get song details from Spotify API
def get_song_details(artist_name, track_name, access_token):
    """Look up one track through the Spotify search endpoint.

    Sends a search for ``artist:{artist_name} track:{track_name}`` limited to
    a single track result and extracts a flat dictionary from the first hit.

    Args:
        artist_name: Artist name used in the search query.
        track_name: Track title used in the search query.
        access_token: Bearer token sent in the ``Authorization`` header.

    Returns:
        dict | None: A dictionary with the keys ``spotify_id``, ``album``,
        ``release_date``, ``popularity``, ``duration_ms``, ``track_number``,
        ``album_artwork`` (``None`` when the album has no images),
        ``external_urls`` and ``artists_involved`` (comma-separated names);
        or ``None`` when the response contains no matching track.

    Raises:
        requests.HTTPError: If the search endpoint answers with an error status.
    """
    search_url = 'https://api.spotify.com/v1/search'
    headers = {
        'Authorization': f'Bearer {access_token}'
    }
    params = {
        'q': f'artist:{artist_name} track:{track_name}',
        'type': 'track',
        'limit': 1
    }

    response = requests.get(
        search_url,
        headers=headers,
        params=params,
        timeout=10,  # requests has no default timeout; avoid hanging the kernel
    )
    # Report an error status as such instead of failing later on a missing key.
    response.raise_for_status()
    response_data = response.json()

    # Tolerate payloads without a 'tracks' section and treat them as "no match".
    items = (response_data.get('tracks') or {}).get('items') or []
    if not items:
        return None

    track_info = items[0]
    album = track_info['album']
    images = album['images']
    return {
        'spotify_id': track_info['id'],
        'album': album['name'],
        'release_date': album['release_date'],
        'popularity': track_info['popularity'],
        'duration_ms': track_info['duration_ms'],
        'track_number': track_info['track_number'],
        'album_artwork': images[0]['url'] if images else None,
        'external_urls': track_info['external_urls']['spotify'],
        'artists_involved': ", ".join(artist['name'] for artist in track_info['artists'])
    }


In [None]:
# Function to update unique songs table with Spotify info
def update_unique_songs(unique_songs_file='unique_songs.csv'):
    """Add Spotify metadata columns to the unique-songs CSV and rewrite it.

    Reads the CSV, (re)initialises every metadata column to ``None``, obtains
    an access token with the module-level ``SPOTIFY_CLIENT_ID`` and
    ``SPOTIFY_CLIENT_SECRET``, then queries ``get_song_details`` once per row.
    Rows without a match keep ``None`` in the metadata columns. The result is
    written back to the same file without the index.

    Args:
        unique_songs_file: Path of the CSV to enrich in place; it must contain
            ``artistName`` and ``trackName`` columns.
    """
    # Metadata fields, in the order they are added as columns; each name is
    # also the key read from the details dictionary.
    detail_columns = (
        'spotify_id',
        'album',
        'release_date',
        'popularity',
        'duration_ms',
        'track_number',
        'album_artwork',
        'external_urls',
        'artists_involved',
    )

    songs = pd.read_csv(unique_songs_file)
    for column in detail_columns:
        songs[column] = None

    token = get_spotify_access_token(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET)

    for row_label, song in songs.iterrows():
        details = get_song_details(song['artistName'], song['trackName'], token)
        if not details:
            continue
        for column in detail_columns:
            songs.at[row_label, column] = details[column]

    songs.to_csv(unique_songs_file, index=False)
    print(f"Unique songs table updated with Spotify info and saved to {unique_songs_file}.")

In [None]:
# Run the pipeline: prompt for its inputs, load the listening history,
# export it to CSV, and record the unique songs.
user_id = get_user_id()
num_chunks = get_num_chunks()
base_path = '../wrapped_files/'  # Relative to the notebook's location
unique_songs_file = 'unique_songs.csv'

try:
    df = read_and_process_data(user_id, num_chunks, base_path)
    export_to_csv(df, user_id)
    track_unique_songs(df, unique_songs_file)
except ValueError as error:
    # Raised, for example, when no chunk file yielded any records; show the
    # message instead of a traceback.
    print(error)
else:
    print("Data processing complete!")

Checking for file: ../wrapped_files/ezra_music_0.json
Reading data from ../wrapped_files/ezra_music_0.json
Checking for file: ../wrapped_files/ezra_music_1.json
Reading data from ../wrapped_files/ezra_music_1.json
Data read successfully for 10518 records.
Data exported to ezra_listening_data.csv
Tracking 4419 unique songs.
Unique songs file not found. Creating a new one.
Unique songs tracked and saved to unique_songs.csv.
Data processing complete!


In [None]:
# Enrich the unique-songs CSV with Spotify metadata (one search request per row)
update_unique_songs(unique_songs_file='unique_songs.csv')