In [None]:
import json
import csv
import os
from datetime import datetime

# Root folder that is walked recursively for track JSON files
root_directory = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\track_information'

# Where the generated CSV is written
csv_file_path = 'tracks_data.csv'

# Today's date (YYYY-MM-DD), used for both the "added to DB" and
# "last modified" columns
current_date = f"{datetime.now():%Y-%m-%d}"

# CSV header, in output order
csv_columns = [
    # Identity and core metadata
    'id',
    'song_spotify_id',
    'song_title',
    'song_duration',
    'song_album_type',
    'song_album_id',
    'song_explicit',
    'song_popularity',
    'song_preview_url',
    # Audio features
    'song_track_features_added',
    'song_acousticness',
    'song_danceability',
    'song_energy',
    'song_instrumentalness',
    'song_liveness',
    'song_loudness',
    'song_speechiness',
    'song_tempo',
    'song_valence',
    'song_key',
    'song_time_signature',
    # Bookkeeping dates
    'song_date_added_to_db',
    'song_date_last_modified',
]

# Collects one dict per track until the CSV is written
csv_rows = []

def process_json_file(file_path):
    """Append one CSV row per track in a JSON file to ``csv_rows``.

    The file must contain a JSON object. Its ``tracks`` list is iterated; a
    missing ``tracks`` key is treated as an empty list and ``None`` entries
    are skipped. Audio-feature columns are filled with ``-1`` placeholders
    and ``song_track_features_added`` is set to ``False``.

    Args:
        file_path: Path of the JSON file to read.

    Raises:
        KeyError: If a track lacks one of the required fields
            (``id``, ``name``, ``duration_ms``, ``album``, ``explicit``,
            ``popularity``).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which may not be UTF-8 and would garble or reject
    # non-ASCII titles.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for track in data.get('tracks', []):
        # Skip processing if the track is None
        if track is None:
            continue

        album = track['album']
        row = {
            'id': -1,  # This will be updated later with the actual ID
            'song_spotify_id': track['id'],
            'song_title': track['name'],
            'song_duration': track['duration_ms'],
            'song_album_type': album['album_type'].upper(),
            'song_album_id': album['id'],
            'song_explicit': track['explicit'],
            'song_popularity': track['popularity'],
            # A key that is present but null would bypass a .get() default,
            # so normalise both "missing" and "null" to an empty string.
            'song_preview_url': track.get('preview_url') or '',
            'song_track_features_added': False,
            'song_acousticness': -1,
            'song_danceability': -1,
            'song_energy': -1,
            'song_instrumentalness': -1,
            'song_liveness': -1,
            'song_loudness': -1,
            'song_speechiness': -1,
            'song_tempo': -1,
            'song_valence': -1,
            'song_key': -1,
            'song_time_signature': -1,
            'song_date_added_to_db': current_date,
            'song_date_last_modified': current_date,
        }
        csv_rows.append(row)

# Walk the whole tree below the root directory and parse every *.json file
for subdir, dirs, files in os.walk(root_directory):
    for filename in files:
        if not filename.endswith('.json'):
            continue
        print(f"Processing {filename}...")
        process_json_file(os.path.join(subdir, filename))

# Dump the collected rows to a semicolon-delimited CSV
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=';')
    writer.writeheader()

    # Replace the placeholder ID with a 1-based sequence number, then write
    for i, row in enumerate(csv_rows, start=1):
        row['id'] = i
    writer.writerows(csv_rows)

print(f"CSV file has been successfully created at {csv_file_path} with {len(csv_rows)} tracks.")


In [None]:
import json
import csv
import os
from datetime import datetime

# Root folder that is walked recursively for album JSON files
root_directory = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\ALBUMS'

# Where the generated CSV is written
csv_file_path = 'album_data.csv'

# Today's date (YYYY-MM-DD), used for both the "added to DB" and
# "last modified" columns
current_date = f"{datetime.now():%Y-%m-%d}"

# CSV header, in output order
csv_columns = [
    # Identity and core metadata
    'id',
    'album_spotify_id',
    'album_name',
    'album_cover_art',
    'album_release_date',
    'release_date_precision',
    'album_popularity',
    'album_type',
    # External identifiers
    'spotify_album_upc',
    'spotify_album_ean',
    'spotify_album_isrc',
    # Bookkeeping
    'date_added_to_db',
    'date_last_modified',
    'musicbrainz_metadata_added',
    'musicbrainz_id',
]

# Collects one dict per album until the CSV is written
albums_data = []

# Function to process each JSON file
def process_json_file(file_path):
    """Append one CSV row per album in a JSON file to ``albums_data``.

    The file must contain a JSON object. Its ``albums`` list is iterated; a
    missing ``albums`` key is treated as an empty list and ``None`` entries
    are skipped. The cover art is the URL of the first image, or ``''``
    when there are no images. UPC/EAN/ISRC come from ``external_ids`` and
    default to ``''`` when absent.

    Args:
        file_path: Path of the JSON file to read.

    Raises:
        KeyError: If an album lacks one of the required fields (``id``,
            ``name``, ``release_date``, ``release_date_precision``,
            ``popularity``, ``album_type``).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which may not be UTF-8 and would garble or reject
    # non-ASCII album names.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for album in data.get('albums', []):
        # Skip null entries instead of failing on album['id']
        if album is None:
            continue

        # Nullable fields: resolve once and fall back to empty values
        images = album.get('images')
        external_ids = album.get('external_ids') or {}

        album_data = {
            'id': -1,  # Placeholder, will be updated later
            'album_spotify_id': album['id'],
            'album_name': album['name'],
            'album_cover_art': images[0]['url'] if images else '',
            'album_release_date': album['release_date'],
            'release_date_precision': album['release_date_precision'],
            'album_popularity': album['popularity'],
            'album_type': album['album_type'],
            'spotify_album_upc': external_ids.get('upc', ''),
            'spotify_album_ean': external_ids.get('ean', ''),
            'spotify_album_isrc': external_ids.get('isrc', ''),
            'date_added_to_db': current_date,
            'date_last_modified': current_date,
            'musicbrainz_metadata_added': False,  # Placeholder
            'musicbrainz_id': '',  # Placeholder
        }
        albums_data.append(album_data)

# Walk the whole tree below the root directory and parse every *.json file
for subdir, dirs, files in os.walk(root_directory):
    for filename in files:
        if not filename.endswith('.json'):
            continue
        process_json_file(os.path.join(subdir, filename))

# Dump the collected albums to a semicolon-delimited CSV
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=';')
    writer.writeheader()

    # Replace the placeholder ID with a 1-based sequence number, then write
    for i, album_data in enumerate(albums_data, start=1):
        album_data['id'] = i
    writer.writerows(albums_data)

print(f"CSV file has been successfully created at {csv_file_path} with {len(albums_data)} albums.")


In [11]:
import pandas as pd

# Read the ID columns as strings so pandas does not reinterpret them
dtype_dict = {'id': str, 'song_album_id': str, 'album_spotify_id': str}
tracks_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\tracks_data.csv', delimiter=';', dtype=dtype_dict)
albums_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\album_data.csv', delimiter=';', dtype=dtype_dict)

# Lookup table: album_spotify_id -> integer album id
# (album rows without an id are dropped before the conversion to int)
album_id_map = albums_df.dropna(subset=['id']).set_index('album_spotify_id')['id'].astype(int).to_dict()

# Translate each track's Spotify album id into the integer album id.
# The nullable 'Int64' dtype keeps matched ids as integers (no trailing ".0"
# in the CSV) while unmatched tracks stay <NA>. This avoids a -1 sentinel
# round trip and an in-place replace on a column selection, which does not
# update the DataFrame when pandas copy-on-write is active.
tracks_df['album_id'] = tracks_df['song_album_id'].map(album_id_map).astype('Int64')

# Save the updated tracks DataFrame to a new CSV file (<NA> is written as an empty field)
tracks_df.to_csv('path_to_updated_tracks.csv', index=False, sep=';')


In [12]:
import os
import json
import pandas as pd

# Read the artists CSV (IDs kept as strings) and build a lookup table:
# artist_spotify_id -> id
artists_df = pd.read_csv(
    'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\artists_data.csv',
    delimiter=';',
    dtype={'id': str, 'artist_spotify_id': str},
)
artist_id_map = artists_df.set_index('artist_spotify_id')['id'].to_dict()

# album_spotify_id -> id of the album's main artist
album_artist_map = {}

# Scan every album JSON file below the ALBUMS folder
for root, dirs, files in os.walk('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\ALBUMS'):
    for file in files:
        if not file.endswith('.json'):
            continue

        file_path = os.path.join(root, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for album in data['albums']:
            album_spotify_id = album['id']
            # The first listed artist is taken to be the main artist
            main_artist_spotify_id = album['artists'][0]['id']
            main_artist_id = artist_id_map.get(main_artist_spotify_id)
            # Albums whose main artist is not in the artists CSV are left unmapped
            if not main_artist_id:
                continue
            album_artist_map[album_spotify_id] = main_artist_id

# Read the albums CSV (IDs kept as strings)
albums_df = pd.read_csv(
    'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\album_data.csv',
    delimiter=';',
    dtype={'id': str, 'album_spotify_id': str},
)

# Attach the main artist id; albums without a mapping get a missing value
albums_df['MAIN_ARTIST_ID'] = albums_df['album_spotify_id'].map(album_artist_map)

# Write the enriched albums table to a new semicolon-delimited file
albums_df.to_csv('path_to_updated_albums_csv', index=False, sep=';')


In [13]:
import os
import json
from collections import defaultdict

# song ID -> set of IDs of every album the song appears on
song_albums_map = defaultdict(set)

# Scan every album JSON file below the ALBUMS folder
for root, dirs, files in os.walk('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\ALBUMS'):
    for file in files:
        if not file.endswith('.json'):
            continue

        file_path = os.path.join(root, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Record, for each track of each album, which album it was seen on
        for album in data['albums']:
            album_id = album['id']
            for track in album['tracks']['items']:
                song_albums_map[track['id']].add(album_id)

# Keep only the songs that were seen on more than one album
duplicate_tracks = {}
for song_id, album_ids in song_albums_map.items():
    if len(album_ids) > 1:
        duplicate_tracks[song_id] = album_ids

# Show the multi-album songs together with their album IDs
print(duplicate_tracks)


{'7BO7ZbaEmgCSSp3znPMxN4': {'1aJ1OS2xdkKGvmLvkU9qn3', '37rNuexqEXWeSIOiJtn3A9'}, '5t9KYe0Fhd5cW6UYT4qP8f': {'5Q4YAfyREjs05kndnKl5b3', '3hep7TwxnkSEIbZAd6dsys', '37rNuexqEXWeSIOiJtn3A9', '4Pt15cJBBgAbjtWxULYbKh', '6J0It6ch0fqrvTYxHqnF4i', '1QbPTcTHwoWHu2sD6QzY7W'}, '3LFeYtxcstS7tezHLWuUY5': {'3HBijmDB3SyMRpt0o5THfa', '2eAK1pBzED09E9ooIAznXP'}, '4uGsmjXhF8NYuj1WI1mnq5': {'0wJRO5fA0Xh47PBS6E7g9b', '3HBijmDB3SyMRpt0o5THfa'}, '3dxuJoE3uTklekmqv0W37f': {'3HBijmDB3SyMRpt0o5THfa', '0IyOee3GxUYvowsggMEXwK'}, '6dZ7QsOOBtamjVpZ8d427I': {'3HBijmDB3SyMRpt0o5THfa', '61r8RVSOccHFbMtUDzGpns'}, '24tNzAYyAyPvDZV0ozm0Vt': {'6YpYaGFJFed2XKq26Qthz5', '3HBijmDB3SyMRpt0o5THfa'}, '00RO3KGPvR2v369wwjDIsk': {'58p5IFT7CEcMHSrAIuO5DJ', '3HBijmDB3SyMRpt0o5THfa', '61r8RVSOccHFbMtUDzGpns'}, '3BlDtGT0W0vngtHfTCssoI': {'3HBijmDB3SyMRpt0o5THfa', '2LihOIn7e1VBTWRZKAm1to'}, '6SND4ZKrlRZK0YCXMwhG7W': {'4hwFFfpDp28EYWbVZoZ9pi', '32ZtKKC18URbfeWP8Kd4Ql'}, '6Amxac8VmEnPvLyU0YFTcO': {'4hwFFfpDp28EYWbVZoZ9pi', '32ZtKKC18URbfeW