In [None]:
import json
import csv
import os
from datetime import datetime

# Root folder that is searched (recursively) for track JSON files
root_directory = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\track_information'

# Where the generated CSV is written
csv_file_path = 'tracks_data.csv'

# Today's date as YYYY-MM-DD; stored in both date columns of every row
current_date = datetime.now().strftime('%Y-%m-%d')

# Column order of the output CSV (also the keys of every row dict)
csv_columns = [
    'id',
    'song_spotify_id',
    'song_title',
    'song_duration',
    'song_album_type',
    'song_album_id',
    'song_explicit',
    'song_popularity',
    'song_preview_url',
    'song_track_features_added',
    'song_acousticness',
    'song_danceability',
    'song_energy',
    'song_instrumentalness',
    'song_liveness',
    'song_loudness',
    'song_speechiness',
    'song_tempo',
    'song_valence',
    'song_key',
    'song_time_signature',
    'song_date_added_to_db',
    'song_date_last_modified',
]

# Row dicts collected from all processed files, in processing order
csv_rows = []

def process_json_file(file_path):
    """Append one CSV row dict to ``csv_rows`` for every track in a JSON file.

    The file must contain a JSON object. Its ``tracks`` list is read (a
    missing key is treated as an empty list) and ``None`` entries are
    skipped. All audio-feature columns are filled with the placeholder
    ``-1`` and ``song_track_features_added`` with ``False``; ``id`` is a
    placeholder that is replaced when the CSV is written.

    Args:
        file_path: Path of the JSON file to read.

    Raises:
        KeyError: If a track lacks one of the required fields.
    """
    # Decode explicitly as UTF-8 rather than with the platform default
    # encoding, so non-ASCII titles are read correctly on Windows too.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for track in data.get('tracks', []):
        # Skip processing if the track is None
        if track is None:
            continue

        album = track['album']
        csv_rows.append({
            'id': -1,  # This will be updated later with the actual ID
            'song_spotify_id': track['id'],
            'song_title': track['name'],
            'song_duration': track['duration_ms'],
            'song_album_type': album['album_type'].upper(),
            'song_album_id': album['id'],
            'song_explicit': track['explicit'],
            'song_popularity': track['popularity'],
            # A missing or null preview URL is stored as an empty string
            'song_preview_url': track.get('preview_url') or '',
            'song_track_features_added': False,
            'song_acousticness': -1,
            'song_danceability': -1,
            'song_energy': -1,
            'song_instrumentalness': -1,
            'song_liveness': -1,
            'song_loudness': -1,
            'song_speechiness': -1,
            'song_tempo': -1,
            'song_valence': -1,
            'song_key': -1,
            'song_time_signature': -1,
            'song_date_added_to_db': current_date,
            'song_date_last_modified': current_date,
        })

# Feed every *.json file found anywhere below root_directory to the parser
for folder, _subfolders, names in os.walk(root_directory):
    for name in names:
        if not name.endswith('.json'):
            continue
        print(f"Processing {name}...")
        process_json_file(os.path.join(folder, name))

# Number the rows 1..N in collection order, replacing the placeholder id
for position, record in enumerate(csv_rows, start=1):
    record['id'] = position

# Dump everything as a ';'-separated CSV with a header line
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=';')
    writer.writeheader()
    writer.writerows(csv_rows)

print(f"CSV file has been successfully created at {csv_file_path} with {len(csv_rows)} tracks.")


In [None]:
import json
import csv
import os
from datetime import datetime

# Root folder that is searched (recursively) for album JSON files
root_directory = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\ALBUMS'

# Where the generated CSV is written
csv_file_path = 'album_data.csv'

# Today's date as YYYY-MM-DD; stored in both date columns of every row
current_date = datetime.now().strftime('%Y-%m-%d')

# Column order of the output CSV (also the keys of every row dict)
csv_columns = [
    'id',
    'album_spotify_id',
    'album_name',
    'album_cover_art',
    'album_release_date',
    'release_date_precision',
    'album_popularity',
    'album_type',
    'spotify_album_upc',
    'spotify_album_ean',
    'spotify_album_isrc',
    'date_added_to_db',
    'date_last_modified',
    'musicbrainz_metadata_added',
    'musicbrainz_id',
]

# Row dicts collected from all processed files, in processing order
albums_data = []

def process_json_file(file_path):
    """Append one CSV row dict to ``albums_data`` for every album in a JSON file.

    The file must contain a JSON object. Its ``albums`` list is read (a
    missing key is treated as an empty list) and ``None`` entries are
    skipped, mirroring how the track parser treats ``None`` tracks.
    The cover art is the URL of the first image, or ``''`` when there are
    no images; UPC/EAN/ISRC default to ``''`` when ``external_ids`` or the
    individual code is absent. ``id`` is a placeholder that is replaced
    when the CSV is written.

    Args:
        file_path: Path of the JSON file to read.

    Raises:
        KeyError: If an album lacks one of the required fields.
    """
    # Decode explicitly as UTF-8 rather than with the platform default
    # encoding, so non-ASCII album names are read correctly on Windows too.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for album in data.get('albums', []):
        # A null entry carries no data; indexing it would raise TypeError
        if album is None:
            continue

        images = album.get('images')
        external_ids = album.get('external_ids') or {}
        albums_data.append({
            'id': -1,  # Placeholder, will be updated later
            'album_spotify_id': album['id'],
            'album_name': album['name'],
            'album_cover_art': images[0]['url'] if images else '',
            'album_release_date': album['release_date'],
            'release_date_precision': album['release_date_precision'],
            'album_popularity': album['popularity'],
            'album_type': album['album_type'],
            'spotify_album_upc': external_ids.get('upc', ''),
            'spotify_album_ean': external_ids.get('ean', ''),
            'spotify_album_isrc': external_ids.get('isrc', ''),
            'date_added_to_db': current_date,
            'date_last_modified': current_date,
            'musicbrainz_metadata_added': False,  # Placeholder
            'musicbrainz_id': '',  # Placeholder
        })

# Feed every *.json file found anywhere below root_directory to the parser
for folder, _subfolders, names in os.walk(root_directory):
    for name in names:
        if not name.endswith('.json'):
            continue
        process_json_file(os.path.join(folder, name))

# Number the rows 1..N in collection order, replacing the placeholder id
for position, record in enumerate(albums_data, start=1):
    record['id'] = position

# Dump everything as a ';'-separated CSV with a header line
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=';')
    writer.writeheader()
    writer.writerows(albums_data)

print(f"CSV file has been successfully created at {csv_file_path} with {len(albums_data)} albums.")


In [None]:
import pandas as pd

# Read all ID columns as strings so they are compared as text when mapping
dtype_dict = {'id': str, 'song_album_id': str, 'album_spotify_id': str}
tracks_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\tracks_data.csv', delimiter=';', dtype=dtype_dict)
albums_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\album_data.csv', delimiter=';', dtype=dtype_dict)

# Map album_spotify_id -> integer CSV id; album rows without an id are ignored
album_id_map = albums_df.dropna(subset=['id']).set_index('album_spotify_id')['id'].astype(int).to_dict()

# Look up each track's album. Tracks whose album is not in the map get a
# missing value. The nullable 'Int64' dtype keeps matched ids as integers
# (no trailing ".0") while leaving unmatched ones as <NA>, which to_csv
# writes as an empty field. This replaces the earlier fill(-1)/replace(-1)
# round-trip, whose chained `inplace=True` call on a selected column is not
# guaranteed to modify tracks_df under pandas Copy-on-Write.
tracks_df['album_id'] = tracks_df['song_album_id'].map(album_id_map).astype('Int64')

# Save the updated tracks DataFrame to a new CSV file
tracks_df.to_csv('path_to_updated_tracks.csv', index=False, sep=';')


In [None]:



import pandas as pd
import os
import json

# Read the albums and artists CSVs into DataFrames (IDs kept as strings)
albums_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\album_data.csv', delimiter=';', dtype={'id': str, 'album_spotify_id': str})
artists_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\artists_data_full.csv', delimiter=';', dtype={'id': str, 'artist_spotify_id': str})

# Dictionary to map Spotify album ID to CSV album ID
album_id_map = albums_df.set_index('album_spotify_id')['id'].to_dict()

# Dictionary to map Spotify artist ID to CSV artist ID
artist_id_map = artists_df.set_index('artist_spotify_id')['id'].to_dict()

# Initialize a list to hold the artist-album mappings
artist_album_mapping = []

# Walk every JSON file below the ALBUMS folder and pair each known album
# with each of its known artists
for root, dirs, files in os.walk('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\ALBUMS'):
    for file in files:
        if not file.endswith('.json'):
            continue
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        # A file without an 'albums' key contributes nothing
        for album in data.get('albums', []):
            # A null entry carries no data; indexing it would raise TypeError
            if album is None:
                continue
            album_id = album_id_map.get(album['id'])
            if not album_id:
                continue  # album is not present in album_data.csv
            for artist in album['artists']:
                artist_id = artist_id_map.get(artist['id'])
                if artist_id:
                    artist_album_mapping.append({'artistID': artist_id, 'albumID': album_id})

# Fix the column names explicitly so that an empty result still produces a
# CSV with the expected header line
artist_album_df = pd.DataFrame(artist_album_mapping, columns=['artistID', 'albumID'])

# Remove duplicates if there are any
artist_album_df = artist_album_df.drop_duplicates()

# Save the DataFrame to CSV
artist_album_df.to_csv('artist_album_mappings_new.csv', index=False, sep=';')


In [None]:
import pandas as pd
import os
import json

# Read the albums and artists CSVs into DataFrames (IDs kept as strings)
albums_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\album_data.csv', delimiter=';', dtype={'id': str, 'album_spotify_id': str})
artists_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\artists_data_full.csv', delimiter=';', dtype={'id': str, 'artist_spotify_id': str})

# Dictionary to map Spotify album ID to CSV album ID
album_id_map = albums_df.set_index('album_spotify_id')['id'].to_dict()

# Dictionary to map Spotify artist ID to CSV artist ID
artist_id_map = artists_df.set_index('artist_spotify_id')['id'].to_dict()

# Initialize a list to hold the artist-album mappings
artist_album_mapping = []

# List to hold artist Spotify IDs where CSV artist ID was not found
# (one entry per occurrence, so an ID can appear more than once)
missing_artist_ids = []

# Walk every JSON file below the ALBUMS folder; for each known album, pair
# it with its known artists and record the artists that cannot be resolved
for root, dirs, files in os.walk('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\ALBUMS'):
    for file in files:
        if not file.endswith('.json'):
            continue
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        # A file without an 'albums' key contributes nothing
        for album in data.get('albums', []):
            # A null entry carries no data; indexing it would raise TypeError
            if album is None:
                continue
            album_id = album_id_map.get(album['id'])
            if not album_id:
                continue  # album is not present in album_data.csv
            for artist in album['artists']:
                artist_spotify_id = artist['id']
                artist_id = artist_id_map.get(artist_spotify_id)
                if artist_id:
                    artist_album_mapping.append({'artistID': artist_id, 'albumID': album_id})
                else:
                    missing_artist_ids.append(artist_spotify_id)

# Print the list of artist Spotify IDs where the CSV artist ID was not found
print("List of artist Spotify IDs where the CSV artist ID was not found:")
for artist_id in missing_artist_ids:
    print(artist_id)

In [None]:
import pandas as pd

# Destination of the CSV listing the unresolved Spotify artist IDs
file_path = 'missing_artist_ids.csv'  # You can specify your own path

# Single-column frame built from the IDs gathered by the scan above
missing_artist_ids_df = pd.DataFrame({'missing_artist_spotify_id': missing_artist_ids})

# Write it out without the index column
missing_artist_ids_df.to_csv(file_path, index=False)

print(f'The missing artist IDs have been saved to {file_path}')

In [None]:
import pandas as pd

# Comma-separated MusicBrainz contributor export used as input
input_file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\musicBrainzAllContributors.csv'
# ';'-separated file produced by this cell
output_file_path = 'contributor_data_prelink.csv'

# Step 1: load the input
input_df = pd.read_csv(input_file_path, delimiter=',')

# Step 2: output column -> input column it is copied from.
# NAME and MAINARTIST are both taken from 'artist_credit_name'.
column_sources = {
    'NAME': 'artist_credit_name',
    'ROLE': 'role',
    'INSTRUMENT': 'instrument',
    'MUSICBRAINZ_ID': 'artist_mbid',
    'MAINARTIST': 'artist_credit_name',
    'SONGTITLE': 'recording_name',
}
output_columns = {'ID': range(1, len(input_df) + 1)}  # sequential 1-based ID
for target_name, source_name in column_sources.items():
    output_columns[target_name] = input_df[source_name]
output_df = pd.DataFrame(output_columns)

# Step 3: write the result
output_df.to_csv(output_file_path, sep=';', index=False)

print(f"Output CSV saved to {output_file_path}")


In [None]:
import pandas as pd
from fuzzywuzzy import process

# Inputs: the contributor file written by the prelink step and the Spotify tracks CSV
musicbrainz_df = pd.read_csv('C:\\Users\\Music\\team_project\\team37\\prepopulationStuff\\PythonNotebooksForPrepopulation\\contributor_data_prelink.csv', sep=';')
spotify_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\tracks_data.csv', sep=';')

# One "<song_title> <song_album_id>" string per Spotify row, in row order, so a
# position in this list is also the row position in spotify_df
combined_series = spotify_df.apply(
    lambda track: f"{track['song_title']} {track['song_album_id']}",
    axis=1,
)
spotify_combined_list = combined_series.tolist()

def find_best_match(mb_row, spotify_combined_list):
    """Find the Spotify row whose combined string best matches a MusicBrainz row.

    The query is "<SONGTITLE> <MAINARTIST>" from ``mb_row``; it is passed to
    ``process.extractOne`` together with ``spotify_combined_list``.

    Args:
        mb_row: Mapping/row providing 'SONGTITLE' and 'MAINARTIST'.
        spotify_combined_list: Candidate strings, aligned by position with
            the rows of the module-level ``spotify_df``.

    Returns:
        ``(row, score)`` where ``row`` is the ``spotify_df`` row at the position
        of the first list entry equal to the matched text, or ``(None, 0)``
        when ``extractOne`` returns nothing.
    """
    query = f"{mb_row['SONGTITLE']} {mb_row['MAINARTIST']}"
    result = process.extractOne(query, spotify_combined_list)

    # Nothing matched: report an empty result with a zero score
    if not result:
        return None, 0

    matched_text, matched_score = result
    # list.index gives the first occurrence, which is also the row position
    position = spotify_combined_list.index(matched_text)
    return spotify_df.iloc[position], matched_score

# Column order of the output file
matched_columns = ['CONTRIBUTOR_ID', 'SONG_TABLE_ID', 'Spotify_Song_Title', 'Spotify_Artist', 'MusicBrainz_Song_Title', 'MusicBrainz_Artist']

# Accepted matches are collected in a plain list and turned into a DataFrame
# once at the end: DataFrame.append was removed in pandas 2.0, and appending
# row by row copies the whole frame on every call.
matched_rows = []

# Iterate over MusicBrainz entries to find matches
for index, mb_row in musicbrainz_df.iterrows():
    best_match_row, score = find_best_match(mb_row, spotify_combined_list)
    if best_match_row is not None and score > 80:  # Adjust the threshold as needed
        matched_rows.append({
            'CONTRIBUTOR_ID': mb_row['ID'],
            'SONG_TABLE_ID': best_match_row['id'],
            'Spotify_Song_Title': best_match_row['song_title'],
            'Spotify_Artist': best_match_row['song_album_id'],  # Note: Adjust if there's a more direct artist name column
            'MusicBrainz_Song_Title': mb_row['SONGTITLE'],
            'MusicBrainz_Artist': mb_row['MAINARTIST']
        })

# Passing the columns explicitly keeps the header even when nothing matched
matched_df = pd.DataFrame(matched_rows, columns=matched_columns)

# Output the matched DataFrame to CSV
matched_df.to_csv('linked_songs.csv', sep=';', index=False)

print("Linked songs CSV generated.")

In [None]:
import pandas as pd

# Read the non-normalized contributors export
df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\contributors_non-normalized.csv')

# Keep just the credit name and credit id columns
wanted_columns = ['artist_credit_name', 'artist_credit_id']
df_selected = df.loc[:, wanted_columns]

# One row per artist_credit_id (the first occurrence is the one kept)
df_unique = df_selected.drop_duplicates(subset='artist_credit_id', keep='first')

# Write the de-duplicated credits without the index column
df_unique.to_csv('unique_artist_credits.csv', index=False)

print('Unique artist credits CSV file has been saved as unique_artist_credits.csv.')


In [None]:
import pandas as pd

def parse_manual(array_str):
    """Split a brace-delimited array string such as ``{"a, b",c}`` into items.

    The surrounding braces are removed and the remainder is parsed as one
    comma-separated record in which double quotes group an element (so a
    comma inside quotes does not split it) and a backslash escapes the next
    character. Surrounding whitespace is stripped from every item.

    Args:
        array_str: The array text. Non-string values (for example the NaN
            pandas uses for an empty cell) are treated as an empty array.

    Returns:
        list[str]: The elements in order; ``[]`` for an empty array.
    """
    import csv  # local import: the cell defining this helper only imports pandas

    if not isinstance(array_str, str):
        return []

    inner = array_str.strip().strip('{}')
    if not inner.strip():
        return []

    # Use the stdlib CSV parser instead of a plain split(',') so that quoted
    # elements containing commas stay in one piece.
    reader = csv.reader(
        [inner],
        delimiter=',',
        quotechar='"',
        escapechar='\\',
        doublequote=False,
        skipinitialspace=True,
    )
    return [item.strip() for item in next(reader, [])]

# Load the CSV file
df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\contributors_non-normalized.csv')

# Prepare a list to collect all artist name-MBID pairs
artist_pairs = []

# Iterate over the DataFrame rows, keeping the row label for diagnostics
for row_index, row in df.iterrows():
    # Parse the 'artist_mbids' and 'individual_artist_names' fields manually
    artist_mbids = parse_manual(row['artist_mbids'])
    individual_artist_names = parse_manual(row['individual_artist_names'])

    # Names and MBIDs are paired by position, so rows where the two lists
    # differ in length cannot be paired reliably and are skipped
    if len(artist_mbids) != len(individual_artist_names):
        print("Warning: Mismatched MBIDs and artist names for a row, skipping.")
        # Report the actual DataFrame index of the offending row (previously
        # this printed a running count of mismatches instead)
        print(f"Row index: {row_index}")
        continue

    artist_pairs.extend(zip(individual_artist_names, artist_mbids))

# De-duplicate while keeping first-seen order, so the output file is
# identical from run to run (a set would yield an arbitrary order)
unique_pairs = list(dict.fromkeys(artist_pairs))
df_pairs = pd.DataFrame(unique_pairs, columns=['individual_artist_name', 'artist_mbid'])

# Save the DataFrame to a new CSV file
df_pairs.to_csv('artist_name_mbid_pairs.csv', index=False)

print('Artist name-MBID pairs CSV file has been saved as artist_name_mbid_pairs.csv.')



In [None]:
import pandas as pd

# Name/MBID pairs produced earlier (comma-separated)
mbid_df = pd.read_csv('C:\\Users\\Music\\team_project\\team37\\prepopulationStuff\\PythonNotebooksForPrepopulation\\artist_name_mbid_pairs.csv')

# Spotify artists (';'-separated)
spotify_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\artists_data.csv', delimiter=';')

# Left join on the artist name: every Spotify artist is kept, and an MBID is
# attached only where 'artist_name' equals 'individual_artist_name' exactly
merged_df = spotify_df.merge(
    mbid_df,
    how='left',
    left_on='artist_name',
    right_on='individual_artist_name',
)

# The right-hand join key duplicates 'artist_name' after the merge; remove it
merged_df = merged_df.drop(columns=['individual_artist_name'])

# Write the merged table without the index column
merged_df.to_csv('spotify_artist_with_mbid_direct_match.csv', index=False)

print('Spotify artist data with direct match MBIDs has been saved as spotify_artist_with_mbid_direct_match.csv.')


In [None]:
import pandas as pd

# Location of the ';'-separated Spotify artists CSV
spotify_csv_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\artists_data.csv'

# Load it and show the column labels as a quick sanity check
spotify_df = pd.read_csv(spotify_csv_path, sep=';')
print(spotify_df.columns)
