In [None]:
import json
import csv
import os
from datetime import datetime

# Root folder whose subfolders hold the scraped track JSON files
root_directory = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\track_information'

# Destination of the generated CSV (relative to the working directory)
csv_file_path = 'tracks_data.csv'

# Today's date as YYYY-MM-DD; stored in both song date columns of every row
current_date = datetime.now().date().isoformat()

# Header of the output CSV, in column order
csv_columns = [
    # identity / descriptive fields
    'id',
    'song_spotify_id',
    'song_title',
    'song_duration',
    'song_album_type',
    'song_album_id',
    'song_explicit',
    'song_popularity',
    'song_preview_url',
    # audio-feature fields (filled with placeholders at this stage)
    'song_track_features_added',
    'song_acousticness',
    'song_danceability',
    'song_energy',
    'song_instrumentalness',
    'song_liveness',
    'song_loudness',
    'song_speechiness',
    'song_tempo',
    'song_valence',
    'song_key',
    'song_time_signature',
    # bookkeeping and link fields
    'song_date_added_to_db',
    'song_date_last_modified',
    'recording_mbid',
    'album_id',
]

# Collects one dict per track across every processed JSON file
csv_rows = []

def process_json_file(file_path):
    """Append one CSV row dict to ``csv_rows`` for every track in a JSON file.

    The file is expected to hold an object with an optional ``'tracks'`` list.
    ``None`` entries in that list are skipped. Audio-feature columns are filled
    with ``-1`` placeholders and both date columns with ``current_date``.

    Args:
        file_path: Path of the JSON file to read.

    Raises:
        KeyError: If a track lacks one of the required keys.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly: without an encoding, open() falls back to the
    # platform default, which can mis-decode or reject non-ASCII titles.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for track in data.get('tracks', []):
        # Skip processing if the track is None
        if track is None:
            continue

        album = track['album']
        row = {
            'id': -1,  # Placeholder; the real ID is assigned when the CSV is written
            'song_spotify_id': track['id'],
            'song_title': track['name'],
            'song_duration': track['duration_ms'],
            'song_album_type': album['album_type'].upper(),
            'song_album_id': album['id'],
            'song_explicit': track['explicit'],
            'song_popularity': track['popularity'],
            'song_preview_url': track.get('preview_url', ''),
            'song_track_features_added': False,
            'song_acousticness': -1,
            'song_danceability': -1,
            'song_energy': -1,
            'song_instrumentalness': -1,
            'song_liveness': -1,
            'song_loudness': -1,
            'song_speechiness': -1,
            'song_tempo': -1,
            'song_valence': -1,
            'song_key': -1,
            'song_time_signature': -1,
            'song_date_added_to_db': current_date,
            'song_date_last_modified': current_date,
            'recording_mbid': '0',
            'album_id': '0'
        }
        csv_rows.append(row)

# Walk the whole directory tree and feed every .json file to the track parser
for folder, _subfolders, names in os.walk(root_directory):
    for name in names:
        if not name.endswith('.json'):
            continue
        print(f"Processing {name}...")
        json_path = os.path.join(folder, name)
        process_json_file(json_path)

# Dump the collected rows as a semicolon-separated CSV
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as out_file:
    writer = csv.DictWriter(out_file, fieldnames=csv_columns, delimiter=';')
    writer.writeheader()

    # IDs are consecutive, starting at 3769556, in the order rows were collected
    for offset, track_row in enumerate(csv_rows):
        track_row['id'] = 3769556 + offset
        writer.writerow(track_row)

print(f"CSV file has been successfully created at {csv_file_path} with {len(csv_rows)} tracks.")


In [None]:
import json
import csv
import os
from datetime import datetime

# Root folder whose subfolders hold the scraped album JSON files
root_directory = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\NEWNEW\\album_information'

# Destination of the generated CSV (relative to the working directory)
csv_file_path = 'album_data.csv'

# Today's date as YYYY-MM-DD; stored in both date columns of every row
current_date = datetime.now().date().isoformat()

# Header of the output CSV, in column order
csv_columns = [
    # identity / descriptive fields
    'id',
    'album_spotify_id',
    'album_name',
    'album_cover_art',
    'album_release_date',
    'release_date_precision',
    'album_popularity',
    'album_type',
    # external identifiers
    'spotify_album_upc',
    'spotify_album_ean',
    'spotify_album_isrc',
    # bookkeeping and MusicBrainz link fields
    'date_added_to_db',
    'date_last_modified',
    'musicbrainz_metadata_added',
    'musicbrainz_id',
]

# Collects one dict per album across every processed JSON file
albums_data = []

# Function to process each JSON file
def process_json_file(file_path):
    """Append one row dict to ``albums_data`` for every album in a JSON file.

    The file is expected to hold an object with an optional ``'albums'`` list.
    Missing or empty ``images`` / ``external_ids`` yield empty strings for the
    cover-art and UPC/EAN/ISRC columns. The release date is passed through
    ``convert_date_format``.

    Args:
        file_path: Path of the JSON file to read.

    Raises:
        KeyError: If an album lacks one of the required keys.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly: without an encoding, open() falls back to the
    # platform default, which can mis-decode or reject non-ASCII album names.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for album in data.get('albums', []):
        # Null entries would crash on album['id']; skip them, as the track
        # importer in this notebook does for null tracks.
        if album is None:
            continue

        images = album.get('images')
        external_ids = album.get('external_ids') or {}

        album_data = {
            'id': -1,  # Placeholder, will be updated later
            'album_spotify_id': album['id'],
            'album_name': album['name'],
            'album_cover_art': images[0]['url'] if images else '',
            'album_release_date': convert_date_format(album['release_date']),
            'release_date_precision': album['release_date_precision'],
            'album_popularity': album['popularity'],
            'album_type': album['album_type'],
            'spotify_album_upc': external_ids.get('upc', ''),
            'spotify_album_ean': external_ids.get('ean', ''),
            'spotify_album_isrc': external_ids.get('isrc', ''),
            'date_added_to_db': current_date,
            'date_last_modified': current_date,
            'musicbrainz_metadata_added': False,  # Placeholder
            'musicbrainz_id': ''  # Placeholder
        }
        albums_data.append(album_data)


def convert_date_format(date_str):
    """Rewrite a ``DD/MM/YYYY`` date string as ``YYYY-MM-DD``.

    Any string that does not parse as ``DD/MM/YYYY`` is returned unchanged.
    """
    try:
        parsed = datetime.strptime(date_str, '%d/%m/%Y')
    except ValueError:
        # Not in day/month/year form: hand the input back untouched
        return date_str
    return parsed.strftime('%Y-%m-%d')


# Walk the whole directory tree and feed every .json file to the album parser
for folder, _subfolders, names in os.walk(root_directory):
    for name in names:
        if not name.endswith('.json'):
            continue
        process_json_file(os.path.join(folder, name))

# Dump the collected rows as a semicolon-separated CSV
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as out_file:
    writer = csv.DictWriter(out_file, fieldnames=csv_columns, delimiter=';')
    writer.writeheader()

    # IDs are consecutive, starting at 538765, in the order rows were collected
    for offset, album_row in enumerate(albums_data):
        album_row['id'] = 538765 + offset
        writer.writerow(album_row)

print(f"CSV file has been successfully created at {csv_file_path} with {len(albums_data)} albums.")


In [None]:
import pandas as pd

# Read ID columns as strings so they are not reinterpreted as numbers
dtype_dict = {'id': str, 'song_album_id': str, 'album_spotify_id': str}
tracks_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\tracks_data.csv', delimiter=';', dtype=dtype_dict)
albums_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\album_data.csv', delimiter=';', dtype=dtype_dict)

# Map album_spotify_id -> integer album id, ignoring albums without an id
album_id_map = albums_df.dropna(subset=['id']).set_index('album_spotify_id')['id'].astype(int).to_dict()

# Look up each track's album id. The nullable 'Int64' dtype keeps matched ids
# as integers and unmatched tracks as <NA> (written as an empty CSV field).
# This replaces the former fillna(-1)/astype(int)/replace(-1, NA, inplace=True)
# round trip, whose chained in-place replace is not guaranteed to update the frame.
tracks_df['album_id'] = tracks_df['song_album_id'].map(album_id_map).astype('Int64')

# Save the updated tracks DataFrame to a new CSV file
tracks_df.to_csv('path_to_updated_tracks.csv', index=False, sep=';')


In [None]:



import pandas as pd
import os
import json

# Load the production album and artist tables; ID columns stay strings
albums_df = pd.read_csv(
    'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\album_table.csv',
    sep=';',
    dtype={'id': str, 'album_spotify_id': str},
)
artists_df = pd.read_csv(
    'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\artists_table.csv',
    sep=';',
    dtype={'id': str, 'artist_spotify_id': str},
)

# Lookup tables: Spotify ID -> table ID
album_id_map = albums_df.set_index('album_spotify_id')['id'].to_dict()
artist_id_map = artists_df.set_index('artist_spotify_id')['id'].to_dict()

# One {'artistID', 'albumID'} dict per (artist, album) pair found in the JSON files
artist_album_mapping = []

# Scan every album JSON file below the ALBUMS folder
for root, dirs, files in os.walk('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\ALBUMS'):
    for file in files:
        if not file.endswith('.json'):
            continue
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        for album in data['albums']:
            album_id = album_id_map.get(album['id'])
            if not album_id:
                # Album is not in the album table: nothing to link
                continue
            for artist in album['artists']:
                artist_id = artist_id_map.get(artist['id'])
                if artist_id:
                    artist_album_mapping.append({'artistID': artist_id, 'albumID': album_id})

# Build the link table and drop repeated pairs
artist_album_df = pd.DataFrame(artist_album_mapping).drop_duplicates()

# Save the DataFrame to CSV
artist_album_df.to_csv('artist_album_mappings_new.csv', index=False, sep=';')


In [None]:
import pandas as pd
import os
import json

# Load the album and artist tables; ID columns stay strings
albums_df = pd.read_csv(
    'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\album_data.csv',
    sep=';',
    dtype={'id': str, 'album_spotify_id': str},
)
artists_df = pd.read_csv(
    'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\artists_data_full.csv',
    sep=';',
    dtype={'id': str, 'artist_spotify_id': str},
)

# Lookup tables: Spotify ID -> table ID
album_id_map = albums_df.set_index('album_spotify_id')['id'].to_dict()
artist_id_map = artists_df.set_index('artist_spotify_id')['id'].to_dict()

# Resolved (artist, album) pairs, and Spotify artist IDs with no table entry
artist_album_mapping = []
missing_artist_ids = []

# Scan every album JSON file below the ALBUMS folder
for root, dirs, files in os.walk('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\ALBUMS'):
    for file in files:
        if not file.endswith('.json'):
            continue
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        for album in data['albums']:
            album_id = album_id_map.get(album['id'])
            if not album_id:
                # Album is not in the album table: its artists are not inspected
                continue
            for artist in album['artists']:
                artist_spotify_id = artist['id']
                artist_id = artist_id_map.get(artist_spotify_id)
                if not artist_id:
                    # Recorded once per occurrence, so an ID may appear repeatedly
                    missing_artist_ids.append(artist_spotify_id)
                else:
                    artist_album_mapping.append({'artistID': artist_id, 'albumID': album_id})

# Report every unresolved Spotify artist ID, one per line
print("List of artist Spotify IDs where the CSV artist ID was not found:")
for artist_id in missing_artist_ids:
    print(artist_id)

In [None]:
import pandas as pd

# Output location for the report (relative to the working directory)
file_path = 'missing_artist_ids.csv'  # You can specify your own path

# Single-column frame holding the unresolved Spotify artist IDs gathered earlier
missing_artist_ids_df = pd.DataFrame(
    data=missing_artist_ids,
    columns=['missing_artist_spotify_id'],
)

# Write without the index column
missing_artist_ids_df.to_csv(file_path, index=False)

print(f'The missing artist IDs have been saved to {file_path}')

In [None]:
import pandas as pd

# Source: comma-separated MusicBrainz contributor dump
input_file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\musicBrainzAllContributors.csv'
# Target: semicolon-separated contributor table (relative to the working directory)
output_file_path = 'contributor_data_prelink.csv'

# Step 1: read the dump
input_df = pd.read_csv(input_file_path, sep=',')

# Step 2: project the dump onto the contributor schema.
# ID is a running 1-based counter; NAME and MAINARTIST both come from
# 'artist_credit_name'.
row_count = len(input_df)
output_df = pd.DataFrame({
    'ID': range(1, row_count + 1),
    'NAME': input_df['artist_credit_name'],
    'ROLE': input_df['role'],
    'INSTRUMENT': input_df['instrument'],
    'MUSICBRAINZ_ID': input_df['artist_mbid'],
    'MAINARTIST': input_df['artist_credit_name'],
    'SONGTITLE': input_df['recording_name']
})

# Step 3: write the result without the index column
output_df.to_csv(output_file_path, sep=';', index=False)

print(f"Output CSV saved to {output_file_path}")


In [None]:
import pandas as pd
from fuzzywuzzy import process

# Contributor table produced by the prelink step, and the Spotify track table
musicbrainz_df = pd.read_csv(
    'C:\\Users\\Music\\team_project\\team37\\prepopulationStuff\\PythonNotebooksForPrepopulation\\contributor_data_prelink.csv',
    sep=';',
)
spotify_df = pd.read_csv(
    'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\tracks_data.csv',
    sep=';',
)

# One "<song_title> <song_album_id>" string per Spotify row, in row order.
# NOTE(review): the second part is the album ID, not an artist name.
combined_per_row = spotify_df.apply(lambda x: f"{x['song_title']} {x['song_album_id']}", axis=1)
spotify_combined_list = combined_per_row.tolist()

def find_best_match(mb_row, spotify_combined_list):
    """Fuzzy-match one MusicBrainz row against the combined Spotify strings.

    The query is ``"<SONGTITLE> <MAINARTIST>"`` from ``mb_row``. The matching
    row is looked up in the module-level ``spotify_df`` by the position of the
    first list entry equal to the matched text.

    Returns:
        ``(spotify_row, score)`` for the best candidate, or ``(None, 0)`` when
        ``process.extractOne`` yields nothing.
    """
    query = f"{mb_row['SONGTITLE']} {mb_row['MAINARTIST']}"

    candidate = process.extractOne(query, spotify_combined_list)
    if not candidate:
        return None, 0

    matched_text, matched_score = candidate
    # First list position holding the matched text identifies the Spotify row
    position = spotify_combined_list.index(matched_text)
    return spotify_df.iloc[position], matched_score

# Column order of the linked-songs output
matched_columns = ['CONTRIBUTOR_ID', 'SONG_TABLE_ID', 'Spotify_Song_Title', 'Spotify_Artist', 'MusicBrainz_Song_Title', 'MusicBrainz_Artist']

# Accepted matches are collected as plain dicts and turned into a DataFrame once.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
matched_rows = []

# Iterate over MusicBrainz entries to find matches
for index, mb_row in musicbrainz_df.iterrows():
    best_match_row, score = find_best_match(mb_row, spotify_combined_list)
    if best_match_row is not None and score > 80:  # Adjust the threshold as needed
        matched_rows.append({
            'CONTRIBUTOR_ID': mb_row['ID'],
            'SONG_TABLE_ID': best_match_row['id'],
            'Spotify_Song_Title': best_match_row['song_title'],
            'Spotify_Artist': best_match_row['song_album_id'],  # Note: Adjust if there's a more direct artist name column
            'MusicBrainz_Song_Title': mb_row['SONGTITLE'],
            'MusicBrainz_Artist': mb_row['MAINARTIST']
        })

# Passing the columns explicitly keeps the header even when nothing matched
matched_df = pd.DataFrame(matched_rows, columns=matched_columns)

# Output the matched DataFrame to CSV
matched_df.to_csv('linked_songs.csv', sep=';', index=False)

print("Linked songs CSV generated.")

In [None]:
import pandas as pd

# Read the non-normalized contributor dump
df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\contributors_non-normalized.csv')

# Keep just the credit name and credit ID columns
credit_columns = ['artist_credit_name', 'artist_credit_id']
df_selected = df[credit_columns]

# One row per credit ID: the first occurrence of each ID is kept
df_unique = df_selected.drop_duplicates(subset='artist_credit_id')

# Write the result without the index column
df_unique.to_csv('unique_artist_credits.csv', index=False)

print('Unique artist credits CSV file has been saved as unique_artist_credits.csv.')


In [None]:
import pandas as pd

def parse_manual(array_str):
    """Split a brace-delimited array string such as ``{a,"b, c"}`` into items.

    Leading/trailing braces are removed and the remainder is split on commas
    that are *outside* double quotes, so a quoted element containing a comma
    stays one item. Quote characters are dropped, a backslash inside quotes
    escapes the next character, and each item is whitespace-stripped.

    Args:
        array_str: The array text. Non-string values (e.g. NaN from an empty
            CSV cell) are treated as "no items".

    Returns:
        list[str]: The parsed items. An empty array (``'{}'``) yields ``['']``,
        matching the previous behaviour; a non-string input yields ``[]``.
    """
    if not isinstance(array_str, str):
        return []

    items = []
    current = []
    in_quotes = False
    escaped = False
    for ch in array_str.strip('{}'):
        if escaped:
            # Character after a backslash is taken literally
            current.append(ch)
            escaped = False
        elif ch == '\\' and in_quotes:
            escaped = True
        elif ch == '"':
            in_quotes = not in_quotes
        elif ch == ',' and not in_quotes:
            items.append(''.join(current).strip())
            current = []
        else:
            current.append(ch)
    items.append(''.join(current).strip())
    return items

# Load the CSV file
df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\contributors_non-normalized.csv')

# Prepare a list to collect all artist name-MBID pairs
artist_pairs = []

# Iterate over the DataFrame rows
for row_index, row in df.iterrows():
    # Parse the 'artist_mbids' and 'individual_artist_names' fields manually
    artist_mbids = parse_manual(row['artist_mbids'])
    individual_artist_names = parse_manual(row['individual_artist_names'])

    # Names and MBIDs are paired by position, so unequal lengths cannot be trusted
    if len(artist_mbids) != len(individual_artist_names):
        print("Warning: Mismatched MBIDs and artist names for a row, skipping.")
        # Report the DataFrame index of the offending row (previously this
        # printed a running count of mismatches instead)
        print(f"Row index: {row_index}")
        continue

    # Pair each individual artist name with its corresponding MBID
    artist_pairs.extend(zip(individual_artist_names, artist_mbids))

# De-duplicate while keeping first-seen order, so the output file is the same
# on every run (a set would make the row order vary between runs)
unique_pairs = list(dict.fromkeys(artist_pairs))
df_pairs = pd.DataFrame(unique_pairs, columns=['individual_artist_name', 'artist_mbid'])

# Save the DataFrame to a new CSV file
df_pairs.to_csv('artist_name_mbid_pairs.csv', index=False)

print('Artist name-MBID pairs CSV file has been saved as artist_name_mbid_pairs.csv.')



In [None]:
import pandas as pd

# Name/MBID pairs (comma-separated) and the Spotify artist table (semicolon-separated)
mbid_df = pd.read_csv('C:\\Users\\Music\\team_project\\team37\\prepopulationStuff\\PythonNotebooksForPrepopulation\\artist_name_mbid_pairs.csv')
spotify_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\artists_data.csv', sep=';')

# Left join on exact name equality: every Spotify artist is kept, and artists
# without a matching 'individual_artist_name' get empty MBID fields.
joined_df = spotify_df.merge(
    mbid_df,
    how='left',
    left_on='artist_name',
    right_on='individual_artist_name',
)

# The right-hand join key duplicates 'artist_name', so it is dropped
merged_df = joined_df.drop(columns=['individual_artist_name'])

# Write the result without the index column
merged_df.to_csv('spotify_artist_with_mbid_direct_match.csv', index=False)

print('Spotify artist data with direct match MBIDs has been saved as spotify_artist_with_mbid_direct_match.csv.')


In [None]:
import pandas as pd

# Location of the semicolon-separated Spotify artist table
spotify_csv_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\artists_data.csv'
spotify_df = pd.read_csv(spotify_csv_path, sep=';')

# Show the header so the column names can be checked by eye
column_index = spotify_df.columns
print(column_index)


In [None]:
import pandas as pd
from datetime import datetime

# Read the semicolon-separated artist table that the checks below validate
csv_file_path = 'C:\\Users\\Music\\team_project\\team37\\downloaded_files\\artists_data_full.csv'
df = pd.read_csv(csv_file_path, sep=';')

# Function to check for empty values in columns that shouldn't have them
def check_empty_values(df, columns):
    """Print, for each listed column, whether it contains any null value.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column names; each must exist in ``df``.
    """
    for name in columns:
        has_missing = df[name].isnull().any()
        if not has_missing:
            print(f"No empty values in column: {name}")
        else:
            print(f"Empty values found in column: {name}")

# Function to check data types
def check_data_types(df, column_types=None):
    """Print every column whose dtype does not match its expected kind.

    Supported kinds are ``'numeric'``, ``'string'`` and ``'date'`` (a column is
    a date column when ``pd.to_datetime`` accepts all of it). Other kinds are
    ignored. When nothing is wrong a single confirmation line is printed.

    Args:
        df: DataFrame to inspect.
        column_types: Mapping of column name -> expected kind. Defaults to the
            module-level ``expected_column_types`` so existing one-argument
            calls keep working.
    """
    if column_types is None:
        # Backward-compatible fallback to the notebook-level mapping
        column_types = expected_column_types

    errors = []
    for column, expected_type in column_types.items():
        if expected_type == 'numeric':
            if not pd.api.types.is_numeric_dtype(df[column]):
                errors.append(f"Column {column} is not of type {expected_type}")
        elif expected_type == 'string':
            if not pd.api.types.is_string_dtype(df[column]):
                errors.append(f"Column {column} is not of type {expected_type}")
        elif expected_type == 'date':
            try:
                pd.to_datetime(df[column])
            except ValueError:
                errors.append(f"Column {column} contains non-date values")

    if errors:
        for error in errors:
            print(error)
    else:
        print("All columns match their expected data types.")

# Expected kind of every column in the artist table
expected_column_types = {
    'id': 'numeric',
    'artist_spotify_id': 'string',
    'artist_name': 'string',
    'artist_popularity': 'numeric',
    'artist_image_small': 'string',
    'artist_image_medium': 'string',
    'artist_image_large': 'string',
    'artist_followers': 'numeric',
    'date_added_to_db': 'date',
    'date_last_modified': 'date'
}

# Every column of the schema is mandatory, so the not-null list is exactly the
# typed columns, in the same order
mandatory_columns = list(expected_column_types)

# Run both validations against the loaded table
check_empty_values(df, mandatory_columns)
check_data_types(df)


In [None]:
import pandas as pd

# Read the semicolon-separated album table
df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\album_data_full.csv', sep=';')  # Adjust the path and separator as needed

# Rewrite the 4th- and 3rd-from-last columns as YYYY-MM-DD strings.
# Values are parsed day-first before being re-formatted.
for position in (-4, -3):
    parsed = pd.to_datetime(df.iloc[:, position], dayfirst=True)
    df.iloc[:, position] = parsed.dt.strftime('%Y-%m-%d')

# Write the modified table to a new semicolon-separated file, without the index
df.to_csv('path_to_modified_csv.csv', index=False, sep=';')  # Adjust the path and separator as needed


In [None]:
import pandas as pd

# Location of the semicolon-separated album table that the checks below validate
csv_file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\album_data_full.csv'

# Read it into a DataFrame
df = pd.read_csv(csv_file_path, sep=';')

# Function to check for empty values in mandatory columns
def check_empty_values(df, columns):
    """Print, for each listed column, whether it contains any null value.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column names; each must exist in ``df``.
    """
    for name in columns:
        has_missing = df[name].isnull().any()
        if not has_missing:
            print(f"No empty values in column: {name}")
        else:
            print(f"Empty values found in column: {name}")

# Function to verify data types
def check_data_types(df, column_types):
    """Print, per column, whether its data matches the expected kind.

    Supported kinds are ``'numeric'``, ``'string'``, ``'date'`` (accepted when
    ``pd.to_datetime`` parses the whole column) and ``'boolean'`` (bool dtype,
    or every non-null value is one of 0, 1, 'True', 'False', True, False).
    Unknown kinds are ignored.

    Args:
        df: DataFrame to inspect.
        column_types: Mapping of column name -> expected kind.
    """
    for column, expected_type in column_types.items():
        if expected_type == 'numeric':
            if not pd.api.types.is_numeric_dtype(df[column]):
                print(f"Column {column} is not of numeric type.")
            else:
                print(f"Column {column} is of correct numeric type.")
        elif expected_type == 'string':
            if not pd.api.types.is_string_dtype(df[column]):
                print(f"Column {column} is not of string type.")
            else:
                print(f"Column {column} is of correct string type.")
        elif expected_type == 'date':
            try:
                pd.to_datetime(df[column])
                print(f"Column {column} is of correct date type.")
            except ValueError:
                print(f"Column {column} contains incorrect date format.")
        elif expected_type == 'boolean':
            # Previously 'boolean' columns were skipped without any message.
            # Same acceptance rule as the track-table checker in this notebook.
            is_boolean = pd.api.types.is_bool_dtype(df[column]) or all(
                df[column].dropna().isin([0, 1, 'True', 'False', True, False])
            )
            if not is_boolean:
                print(f"Column {column} is not of boolean type.")
            else:
                print(f"Column {column} is of correct boolean type.")

# Mandatory columns (not nullable)
mandatory_columns = [
    'id', 'album_spotify_id', 'album_name', 'album_cover_art',
    'album_release_date', 'release_date_precision', 'album_popularity',
    'album_type', 'date_added_to_db', 'date_last_modified', 'musicbrainz_metadata_added'
]

# Columns with specific data types
column_types = {
    'id': 'numeric',
    'album_spotify_id': 'string',
    'album_name': 'string',
    'album_cover_art': 'string',
    'album_release_date': 'date',
    'release_date_precision': 'string',
    'album_popularity': 'numeric',
    'album_type': 'string',
    'spotify_album_upc': 'string',  # Nullable
    'spotify_album_ean': 'string',  # Nullable
    'spotify_album_isrc': 'string',  # Nullable
    'date_added_to_db': 'date',
    'date_last_modified': 'date',
    'musicbrainz_metadata_added': 'boolean',  # Assuming true/false representation
    'musicbrainz_id': 'string'  # Nullable
}

# Check for empty values in mandatory columns
check_empty_values(df, mandatory_columns)

# Check if data types are correct
# (a stray pasted statement, `31045;59`, that followed this call was removed:
# it did nothing except echo 59 as the cell output)
check_data_types(df, column_types)


In [None]:
import pandas as pd

# Location of the semicolon-separated album table that the checks below validate
csv_file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\album_data_full.csv'

# Read it into a DataFrame
df = pd.read_csv(csv_file_path, sep=';')

# Function to check for empty values in mandatory columns
def check_empty_values(df, columns):
    """Print, for each listed column, whether it contains any null value.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column names; each must exist in ``df``.
    """
    for name in columns:
        has_missing = df[name].isnull().any()
        if not has_missing:
            print(f"No empty values in column: {name}")
        else:
            print(f"Empty values found in column: {name}")

# Function to verify data types and print incorrect date formats
def check_data_types_and_dates(df, column_types):
    """Print, per column, whether its data matches the expected kind.

    Supported kinds are ``'numeric'``, ``'string'``, ``'date'`` and
    ``'boolean'`` (bool dtype, or every non-null value is one of 0, 1, 'True',
    'False', True, False). For ``'date'`` columns each value is tested with
    ``check_date_format`` and the offending rows are printed. Unknown kinds
    are ignored.

    Args:
        df: DataFrame to inspect.
        column_types: Mapping of column name -> expected kind.
    """
    for column, expected_type in column_types.items():
        if expected_type == 'numeric':
            if not pd.api.types.is_numeric_dtype(df[column]):
                print(f"Column {column} is not of numeric type.")
            else:
                print(f"Column {column} is of correct numeric type.")
        elif expected_type == 'string':
            if not pd.api.types.is_string_dtype(df[column]):
                print(f"Column {column} is not of string type.")
            else:
                print(f"Column {column} is of correct string type.")
        elif expected_type == 'date':
            # Boolean mask: True where the value could not be parsed as a date
            incorrect_format = df[column].apply(check_date_format)
            if incorrect_format.any():
                print(f"Rows with incorrect date format in column '{column}':")
                print(df[incorrect_format])
            else:
                print(f"Column {column} is of correct date type.")
        elif expected_type == 'boolean':
            # Previously 'boolean' columns were skipped without any message.
            # Same acceptance rule as the track-table checker in this notebook.
            is_boolean = pd.api.types.is_bool_dtype(df[column]) or all(
                df[column].dropna().isin([0, 1, 'True', 'False', True, False])
            )
            if not is_boolean:
                print(f"Column {column} is not of boolean type.")
            else:
                print(f"Column {column} is of correct boolean type.")

# Helper function to check date format
def check_date_format(date_string):
    """Return True when ``pd.to_datetime`` rejects the value with a ValueError.

    Note the polarity: ``True`` means the value is *not* a parseable date,
    ``False`` means it parsed.
    """
    try:
        pd.to_datetime(date_string, errors='raise')
    except ValueError:
        return True  # could not be parsed
    else:
        return False  # parsed fine

# Columns that must never be empty
mandatory_columns = [
    'id',
    'album_spotify_id',
    'album_name',
    'album_cover_art',
    'album_release_date',
    'release_date_precision',
    'album_popularity',
    'album_type',
    'date_added_to_db',
    'date_last_modified',
    'musicbrainz_metadata_added',
]

# Expected kind of each column; columns tagged "Nullable" may be empty
column_types = {
    'id': 'numeric',
    'album_spotify_id': 'string',
    'album_name': 'string',
    'album_cover_art': 'string',
    'album_release_date': 'date',
    'release_date_precision': 'string',
    'album_popularity': 'numeric',
    'album_type': 'string',
    'spotify_album_upc': 'string',  # Nullable
    'spotify_album_ean': 'string',  # Nullable
    'spotify_album_isrc': 'string',  # Nullable
    'date_added_to_db': 'date',
    'date_last_modified': 'date',
    'musicbrainz_metadata_added': 'boolean',  # Assuming true/false representation
    'musicbrainz_id': 'string',  # Nullable
}

# Null check first, then the type and date-format check
check_empty_values(df, mandatory_columns)
check_data_types_and_dates(df, column_types)


In [None]:
import pandas as pd

# Location of the semicolon-separated track table that the checks below validate
csv_file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\tracks_data_full.csv'  # Update this to the path of your CSV file

# Read it into a DataFrame
df = pd.read_csv(csv_file_path, sep=';')  # Ensure delimiter matches your CSV format

# Function to check for empty values in mandatory columns
def check_empty_values(df, columns):
    """Print, for each listed column, whether it contains any null value.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column names; each must exist in ``df``.
    """
    for name in columns:
        has_missing = df[name].isnull().any()
        if not has_missing:
            print(f"No empty values in column: {name}")
        else:
            print(f"Empty values found in column: {name}")

# Function to verify data types
def check_data_types(df, column_types):
    """Print, per column, whether its data matches the expected kind.

    Supported kinds are ``'numeric'``, ``'string'``, ``'date'`` (accepted when
    ``pd.to_datetime`` parses the whole column) and ``'boolean'`` (bool dtype,
    or every non-null value is one of 0, 1, 'True', 'False', True, False).
    Unknown kinds are ignored.

    Args:
        df: DataFrame to inspect.
        column_types: Mapping of column name -> expected kind.
    """
    dtype_checks = pd.api.types
    for name, kind in column_types.items():
        if kind == 'numeric':
            if dtype_checks.is_numeric_dtype(df[name]):
                print(f"Column {name} is of correct numeric type.")
            else:
                print(f"Column {name} is not of numeric type.")
            continue

        if kind == 'string':
            if dtype_checks.is_string_dtype(df[name]):
                print(f"Column {name} is of correct string type.")
            else:
                print(f"Column {name} is not of string type.")
            continue

        if kind == 'date':
            try:
                pd.to_datetime(df[name])
            except ValueError:
                print(f"Column {name} contains incorrect date format.")
            else:
                print(f"Column {name} is of correct date type.")
            continue

        if kind == 'boolean':
            # Real bool dtype passes outright; otherwise every non-null value
            # must be one of the accepted true/false spellings
            if dtype_checks.is_bool_dtype(df[name]) or all(df[name].dropna().isin([0, 1, 'True', 'False', True, False])):
                print(f"Column {name} is of correct boolean type.")
            else:
                print(f"Column {name} is not of boolean type.")

# Columns that must never be empty
mandatory_columns = [
    'id',
    'song_spotify_id',
    'song_title',
    'song_duration',
    'song_album_type',
    'song_album_id',
    'song_explicit',
    'song_popularity',
    'song_track_features_added',
    'song_date_added_to_db',
    'song_date_last_modified',
]

# Expected kind of each column; columns tagged "Nullable" may be empty
column_types = {
    'id': 'numeric',
    'song_spotify_id': 'string',
    'song_title': 'string',
    'song_duration': 'numeric',
    'song_album_type': 'string',
    'song_album_id': 'string',
    'song_explicit': 'boolean',
    'song_popularity': 'numeric',
    'song_preview_url': 'string',  # Nullable
    'song_track_features_added': 'boolean',
    # Audio features: float-valued, checked as 'numeric'
    'song_acousticness': 'numeric',  # Nullable
    'song_danceability': 'numeric',  # Nullable
    'song_energy': 'numeric',  # Nullable
    'song_instrumentalness': 'numeric',  # Nullable
    'song_liveness': 'numeric',  # Nullable
    'song_loudness': 'numeric',  # Nullable
    'song_speechiness': 'numeric',  # Nullable
    'song_tempo': 'numeric',  # Nullable
    'song_valence': 'numeric',  # Nullable
    'song_key': 'numeric',  # Nullable
    'song_time_signature': 'numeric',  # Nullable
    'song_date_added_to_db': 'date',
    'song_date_last_modified': 'date',
    'album_id': 'numeric',  # Nullable
}

# Null check first, then the type check
check_empty_values(df, mandatory_columns)
check_data_types(df, column_types)


In [None]:
import pandas as pd

# Display settings: show every row and column, untruncated cell contents,
# and a 1000-character line width
display_settings = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 1000,
    'display.max_colwidth': None,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

# Location of the semicolon-separated track table
csv_file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\tracks_data_full.csv'  # Update this to the path of your CSV file

# Read it into a DataFrame
df = pd.read_csv(csv_file_path, sep=';')

# Function to print rows with empty values in a specific column
def print_rows_with_empty_values_in_column(df, column_name):
    """Report which rows have no value in ``column_name``.

    A value counts as empty when it is null/NaN or the empty string. For the
    affected rows the ``song_spotify_id`` column is printed; otherwise a
    single "nothing found" line is printed. Nothing is returned.
    """
    column = df[column_name]
    empty_rows = df[column.isnull() | (column == '')]

    if empty_rows.empty:
        print(f"No empty values found in column '{column_name}'.")
        return

    print(f"Rows with empty values in column '{column_name}':")
    # Identify the offending rows by their Spotify ID
    print(empty_rows['song_spotify_id'])

# Print all rows where the song_title is empty
print_rows_with_empty_values_in_column(df, 'song_title')



In [None]:
import pandas as pd
import os
import json

# Load the album and artist tables. IDs are read as strings so they are never
# coerced to numbers.
albums_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\album_data_full.csv', delimiter=';', dtype={'id': str, 'album_spotify_id': str})
artists_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\artists_data_full.csv', delimiter=';', dtype={'id': str, 'artist_spotify_id': str})

# Spotify ID -> internal table ID lookups
album_id_map = albums_df.set_index('album_spotify_id')['id'].to_dict()
artist_id_map = artists_df.set_index('artist_spotify_id')['id'].to_dict()

artist_album_mapping = []

# Walk every scraped album JSON file; each is expected to hold an 'albums' list
# whose entries carry an 'id' and a list of 'artists'.
for root, dirs, files in os.walk('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\ALBUMS'):
    for file in files:
        if not file.endswith('.json'):
            continue
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        for album in data['albums']:
            # Scraped payloads can contain null entries (the track import in
            # this notebook guards against the same thing); skip them instead
            # of crashing on album['id'].
            if album is None:
                continue
            album_id = album_id_map.get(album['id'])
            if not album_id:
                print(f"Album ID {album['id']} not found in album CSV.")
                continue
            for artist in album['artists']:
                artist_id = artist_id_map.get(artist['id'])
                if not artist_id:
                    print(f"Artist ID {artist['id']} not found in artist CSV.")
                    continue
                artist_album_mapping.append({'artistID': artist_id, 'albumID': album_id})

# One row per distinct (artist, album) pair
artist_album_df = pd.DataFrame(artist_album_mapping).drop_duplicates()

# Save the mapping as a semicolon-delimited CSV
artist_album_df.to_csv('artist_album_mappings_new.csv', index=False, sep=';')


In [None]:
import pandas as pd
import os
import json

# Load the album and artist tables. IDs are read as strings so they are never
# coerced to numbers.
albums_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\album_table.csv', delimiter=';', dtype={'id': str, 'album_spotify_id': str})
artists_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\artists_data_full.csv', delimiter=';', dtype={'id': str, 'artist_spotify_id': str})

# Spotify ID -> internal table ID lookups
album_id_map = albums_df.set_index('album_spotify_id')['id'].to_dict()
artist_id_map = artists_df.set_index('artist_spotify_id')['id'].to_dict()

artist_album_mapping = []
missing_artists = {}  # missing artist Spotify ID -> list of album Spotify IDs crediting it

# Walk every scraped album JSON file; each is expected to hold an 'albums' list
# whose entries carry an 'id' and a list of 'artists'.
for root, dirs, files in os.walk('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\ALBUMS'):
    for file in files:
        if not file.endswith('.json'):
            continue
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        for album in data['albums']:
            # Scraped payloads can contain null entries (the track import in
            # this notebook guards against the same thing); skip them instead
            # of crashing on album['id'].
            if album is None:
                continue
            album_id = album_id_map.get(album['id'])
            if not album_id:
                print(f"Album ID {album['id']} not found in album CSV.")
                continue
            for artist in album['artists']:
                artist_id = artist_id_map.get(artist['id'])
                if not artist_id:
                    # Remember which album credited the unknown artist
                    missing_artists.setdefault(artist['id'], []).append(album['id'])
                    print(f"Artist ID {artist['id']} not found in artist CSV, attributed to Album ID {album['id']}.")
                    continue
                artist_album_mapping.append({'artistID': artist_id, 'albumID': album_id})

# Summary: every missing artist with all albums that reference it
for artist_id, album_ids in missing_artists.items():
    print(f"Artist ID {artist_id} (not found) is attributed to Album IDs: {', '.join(album_ids)}")


In [None]:
import pandas as pd

df1 = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\WORKING\\recordings_artist_list.csv', delimiter=',', dtype={'id': str, 'artist_spotify_id': str})
df2 = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\artists_table.csv', delimiter=';', dtype={'id': str, 'artist_spotify_id': str})


# Convert the musicbrainz_id column into a set for fast lookup
musicbrainz_ids = set(df2['musicbrainz_id'])

# Define a function to check if any artist_mbids match the musicbrainz_ids
def matches_musicbrainz_ids(artist_mbids_str):
    """Return True if any MBID listed in ``artist_mbids_str`` is a known artist.

    Parameters
    ----------
    artist_mbids_str : str
        Brace-wrapped, comma-separated MBID list such as ``{mbid1,mbid2}``.
        Individual entries may be surrounded by whitespace or double quotes.
        Anything that is not a string (e.g. NaN from an empty CSV cell, or
        None) is treated as "no MBIDs".

    Returns
    -------
    bool
        True when at least one entry is contained in the module-level
        ``musicbrainz_ids`` set, otherwise False.
    """
    # Empty CSV cells arrive as float NaN, which has no .strip(); they can
    # never match, so answer False rather than raising AttributeError.
    if not isinstance(artist_mbids_str, str):
        return False

    entries = artist_mbids_str.strip().strip('{}').split(',')
    # Normalise each entry: drop padding and optional surrounding quotes
    candidates = (entry.strip().strip('"').strip() for entry in entries)
    return any(mbid in musicbrainz_ids for mbid in candidates)

# Flag every recording whose artist list contains a known MBID. The flag is
# stored on df1 as a helper column.
df1['artist_mbids_match'] = df1['artist_mbids'].apply(matches_musicbrainz_ids)

# Keep only the flagged rows and remove the helper column from the result
matched_rows = df1.loc[df1['artist_mbids_match']]
filtered_df1 = matched_rows.drop(columns=['artist_mbids_match'])

# Write the surviving rows to a new CSV file
output_csv_path = 'recordings_artist_list_onlyMatchedArtists.csv'
filtered_df1.to_csv(output_csv_path, index=False)

print(f"Filtered data saved to {output_csv_path}")


In [None]:
import pandas as pd

# Load the matched recordings and the normalized contributor list
csv_path_1 = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\WORKING\\recordings_artist_list_onlyMatchedArtists.csv'
csv_path_2 = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\WORKING\\contributors_normalized.csv'

df1 = pd.read_csv(csv_path_1)
df2 = pd.read_csv(csv_path_2)

# Inner-join on 'recording_mbid'; the key must exist in both DataFrames
# exactly as named.
merged_df = pd.merge(df1, df2, on='recording_mbid', how='inner')

# Keep only the required columns and restore the plain 'recording_name' header
# (the merge suffixed it with '_x'). Renaming via a returned copy, instead of
# inplace=True on a column slice, avoids pandas' SettingWithCopyWarning.
final_df = merged_df[
    ['recording_mbid', 'recording_name_x', 'artist_names', 'artist_mbids',
     'artist_mbid', 'individual_artist_name', 'role', 'instrument']
].rename(columns={'recording_name_x': 'recording_name'})

# Save the merged DataFrame to a new CSV file
output_csv_path = 'contributors_normalized_onlyMatchedArtists.csv'
final_df.to_csv(output_csv_path, index=False)

print(f"Combined data saved to {output_csv_path}")


In [None]:
import pandas as pd

# Load the CSV files. The contributors file is comma-delimited; the table
# exports are semicolon-delimited.
contributors_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\WORKING\\contributors_normalized_onlyMatchedArtists.csv')
# Path separator fixed: the original '\\\P' contained the invalid escape
# sequence '\P', which newer Python versions warn about.
artists_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\artists_table.csv', sep=';')
artist_album_mapping_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\artist_album_mapping.csv', sep=';')
track_data_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\tracks_data_full.csv', sep=';')

# Step 1: Merge artists with their albums (artist 'id' == mapping 'artistID')
artists_with_albums = pd.merge(artists_df, artist_album_mapping_df, left_on='id', right_on='artistID', how='inner')

In [None]:
# Step 2: Merge the above with track data on albumID
artist_tracks = pd.merge(artists_with_albums, track_data_df, left_on='albumID', right_on='album_id', how='inner')

In [None]:
contributors_dict = contributors_df.groupby('artist_mbid')['recording_mbid'].apply(set).to_dict()


In [None]:
# Group tracks by the artist's MusicBrainz ID:
#   musicbrainz_id -> [{'song_title': ..., 'song_spotify_id': ...}, ...]
track_dict = {}
for _, row in artist_tracks.iterrows():
    mbid = row['musicbrainz_id']
    track_entry = {
        'song_title': row['song_title'],
        'song_spotify_id': row['song_spotify_id']
    }
    # setdefault creates the list on first sight of an MBID
    track_dict.setdefault(mbid, []).append(track_entry)

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed


In [None]:
# Function to map recording_mbid to song_spotify_id based on musicbrainz_id and song title
def find_recording_mbid():
    """Match MusicBrainz recordings to Spotify tracks by artist and title.

    Reads three module-level objects:

    * ``contributors_dict``: artist MBID -> iterable of recording MBIDs
    * ``track_dict``: artist MBID -> list of dicts with ``song_title`` and
      ``song_spotify_id``
    * ``contributors_df``: DataFrame with ``recording_mbid`` and
      ``recording_name`` columns

    For every artist present in both dictionaries, a recording is paired with
    a track when the recording's name (taken from the first row of
    ``contributors_df`` with that ``recording_mbid``) equals the track's
    ``song_title`` exactly.

    Returns
    -------
    list[dict]
        One ``{'song_spotify_id': ..., 'recording_mbid': ...}`` per match.
    """
    # Build the recording_mbid -> recording_name lookup once. Filtering the
    # whole DataFrame inside the innermost loop (as before) rescans every row
    # for each (track, recording) pair. drop_duplicates keeps the first row
    # per recording, matching the previous `.iloc[0]` choice.
    name_by_recording = (
        contributors_df
        .dropna(subset=['recording_mbid'])
        .drop_duplicates(subset='recording_mbid')
        .set_index('recording_mbid')['recording_name']
        .to_dict()
    )

    results = []
    for mbid, recordings in contributors_dict.items():
        if mbid not in track_dict:
            continue
        for record in track_dict[mbid]:
            for recording_mbid in recordings:
                # Recordings unknown to contributors_df can never match
                if recording_mbid not in name_by_recording:
                    continue
                if name_by_recording[recording_mbid] == record['song_title']:
                    results.append({
                        'song_spotify_id': record['song_spotify_id'],
                        'recording_mbid': recording_mbid
                    })
    return results

In [None]:
final_results = find_recording_mbid()


In [None]:
import pandas as pd

def load_and_process_csv(file_path):
    """Expand each CSV row into one row per credited artist.

    The ``artist_names`` and ``artist_mbids`` columns hold brace-wrapped,
    comma-separated lists (e.g. ``{"Earth, Wind & Fire",Prince}`` and
    ``{mbid1,mbid2}``). Names and MBIDs are paired positionally; for each pair
    a copy of the row is emitted with both columns rewritten to single-element
    lists of the form ``{"value"}``.

    Parameters
    ----------
    file_path : str
        Path to a comma-delimited CSV containing ``artist_names`` and
        ``artist_mbids`` columns.

    Returns
    -------
    pandas.DataFrame
        The expanded rows, all other columns copied unchanged.
    """
    import csv  # local import: only needed for parsing the braced lists

    def _split_braced_list(value):
        # Parse '{a,"b, c",d}' into ['a', 'b, c', 'd']. A plain split(',')
        # would cut quoted names that contain commas and misalign names with
        # MBIDs; the csv reader honours the double quotes instead. Surrounding
        # quotes are removed from every element.
        inner = value.strip('{}')
        if not inner:
            return ['']
        return next(csv.reader([inner], quotechar='"', escapechar='\\', doublequote=False))

    # Load the CSV file
    df = pd.read_csv(file_path)

    expanded_rows = []
    for index, row in df.iterrows():
        artist_names = _split_braced_list(row['artist_names'])
        artist_mbids = _split_braced_list(row['artist_mbids'])

        # One output row per (name, MBID) pair; zip stops at the shorter list
        for artist_name, artist_mbid in zip(artist_names, artist_mbids):
            new_row = row.copy()
            new_row['artist_names'] = f'{{"{artist_name}"}}'
            new_row['artist_mbids'] = f'{{"{artist_mbid}"}}'
            expanded_rows.append(new_row)

    # Create a new DataFrame with the expanded rows
    expanded_df = pd.DataFrame(expanded_rows)

    return expanded_df


# Example file path
file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\WORKING\\contributors_normalized_onlyMatchedArtists.csv'  # Update this with the actual file path

# Process the CSV file
processed_df = load_and_process_csv(file_path)

# Save the processed DataFrame to a new CSV file
processed_df.to_csv('processed_output.csv', index=False)

In [None]:
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed

# Load the per-artist contributor lines (comma-delimited) and the production
# tables (semicolon-delimited).
contributors_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\WORKING\\contributors_normalized_onlyMatchedArtists_artistlinebyline.csv')
# Path separator fixed: the original '\\\P' contained the invalid escape
# sequence '\P', which newer Python versions warn about.
artists_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\artists_table.csv', sep=';')
artist_album_mapping_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\artist_album_mappings.csv', sep=';')
track_data_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\tracks_data_actually_full.csv', sep=';')

# artists -> their albums -> the tracks on those albums
artists_with_albums = pd.merge(artists_df, artist_album_mapping_df, left_on='id', right_on='artistID', how='inner')
artist_tracks = pd.merge(artists_with_albums, track_data_df, left_on='albumID', right_on='album_id', how='inner')
#contributors_dict = contributors_df.groupby('artist_mbid')['recording_mbid'].apply(set).to_dict()

# Group tracks by the artist's MusicBrainz ID:
#   musicbrainz_id -> [{'song_title': ..., 'song_spotify_id': ...}, ...]
track_dict = {}
for _, row in artist_tracks.iterrows():
    mbid = row['musicbrainz_id']
    if mbid not in track_dict:
        track_dict[mbid] = []
    track_dict[mbid].append({
        'song_title': row['song_title'],
        'song_spotify_id': row['song_spotify_id']
    })



In [None]:
# artist MBID -> list of recordings the artist is credited on, either as an
# individual contributor (with their role) or as a main artist.
contributors_dict = {}
# (main artist MBID, recording MBID) pairs already recorded, so a main artist
# is listed at most once per recording.
recording_main_artist_added = set()

for _, row in contributors_df.iterrows():
    # Strip stray quotes/whitespace from every MBID before using it as a key
    mbid = row['artist_mbid'].replace('"', '').strip()
    recording_mbid = row['recording_mbid']
    main_artist_mbids = [
        raw.replace('"', '').strip()
        for raw in row['artist_mbids'].strip('{}').split(',')
    ]

    # Individual contributor entry
    contributors_dict.setdefault(mbid, []).append({
        'recording_mbid': recording_mbid,
        'recording_name': row['recording_name'],
        'role': row['role']
    })

    # Main artist entries, de-duplicated per recording
    for main_mbid in main_artist_mbids:
        pair = (main_mbid, recording_mbid)
        if pair in recording_main_artist_added:
            continue
        contributors_dict.setdefault(main_mbid, []).append({
            'recording_mbid': recording_mbid,
            'recording_name': row['recording_name'],
            'role': 'main_artist'
        })
        recording_main_artist_added.add(pair)


In [None]:
# artist MBID -> list of {'recording_mbid', 'recording_name'} entries, one per
# contributor row (no quote stripping, no main-artist expansion).
contributors_dict = {}
for _, row in contributors_df.iterrows():
    mbid = row['artist_mbid']
    recording_entry = {
        'recording_mbid': row['recording_mbid'],
        'recording_name': row['recording_name']
    }
    contributors_dict.setdefault(mbid, []).append(recording_entry)


In [None]:
# TESTING CONTRIBUTORS_DICT
# Re-read the contributor lines with every column as a string, then rebuild
# contributors_dict and dump it to CSV for inspection.
contributors_df = pd.read_csv(
    'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\WORKING\\contributors_normalized_onlyMatchedArtists_artistlinebyline.csv',
    escapechar='\\',  # Add escape character if needed
    quotechar='"',    # Set quote character to handle quoted strings properly
    dtype=str         # Treat all data as strings
)
contributors_dict = {}
for _, row in contributors_df.iterrows():
    mbid = row['artist_mbid']
    if mbid not in contributors_dict:
        contributors_dict[mbid] = []
    contributors_dict[mbid].append({
        'recording_mbid': row['recording_mbid'],
        'recording_name': row['recording_name'],
        'role': row['role'],  # Assuming you want to also capture the role
        'individual_artist_name': row['individual_artist_name']  # Capturing individual artist name
    })

import csv

# File path for the CSV output
output_file = 'output_contributors_TEST.csv'

# Writing to a CSV file. The encoding is set explicitly: recording names can
# contain characters the platform default code page (e.g. cp1252 on Windows)
# cannot encode, which would abort the export with UnicodeEncodeError.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['Artist MBID', 'Recording MBID', 'Recording Name'])
    
    # Write data rows (role and individual artist name are not exported)
    for artist_mbid, recordings in contributors_dict.items():
        for recording in recordings:
            writer.writerow([artist_mbid, recording['recording_mbid'], recording['recording_name']])

print(f"Data successfully written to {output_file}")

In [None]:
import csv

# File path for the CSV output
output_file = 'output_contributors_with_mainartists_quotes_stripped.csv'

# Dump contributors_dict as (artist, recording, name) rows. The encoding is
# set explicitly: recording names can contain characters the platform default
# code page (e.g. cp1252 on Windows) cannot encode, which would abort the
# export with UnicodeEncodeError.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['Artist MBID', 'Recording MBID', 'Recording Name'])
    
    # Write data rows
    for artist_mbid, recordings in contributors_dict.items():
        for recording in recordings:
            writer.writerow([artist_mbid, recording['recording_mbid'], recording['recording_name']])

print(f"Data successfully written to {output_file}")

In [None]:
import csv

# File path for the CSV output
output_file = 'track_dict_1.csv'

# Dump track_dict as (artist, title, Spotify ID) rows. The encoding is set
# explicitly: song titles can contain characters the platform default code
# page (e.g. cp1252 on Windows) cannot encode, which would abort the export
# with UnicodeEncodeError.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, quoting=csv.QUOTE_MINIMAL)
    
    # Write the header row
    writer.writerow(['Artist MBID', 'Song Title', 'Song Spotify ID'])
    
    # Write data rows
    for mbid, tracks in track_dict.items():
        for track in tracks:
            writer.writerow([mbid, track['song_title'], track['song_spotify_id']])

print(f"Data successfully written to {output_file}")

In [None]:
# recording MBID -> Spotify track ID, matched per artist on exact title
mbid_to_spotify_id = {}

for mbid, tracks in track_dict.items():
    # Only artists that also have contributor records can produce matches
    if mbid not in contributors_dict:
        continue
    recordings = contributors_dict[mbid]

    # Title -> Spotify ID lookup for this artist's tracks (a later track with
    # the same title overwrites an earlier one)
    song_title_to_spotify_id = {track['song_title']: track['song_spotify_id'] for track in tracks}

    for recording in recordings:
        recording_name = recording['recording_name']
        if recording_name not in song_title_to_spotify_id:
            continue
        spotify_id = song_title_to_spotify_id[recording_name]
        mbid_to_spotify_id[recording['recording_mbid']] = spotify_id

In [None]:
# recording MBID -> {'spotify_id': ..., 'artist_mbid': ...}, matched per
# artist on exact title
mbid_to_spotify_id_and_artist = {}

for mbid, tracks in track_dict.items():
    # Only artists that also have contributor records can produce matches
    if mbid not in contributors_dict:
        continue
    recordings = contributors_dict[mbid]

    # Title -> Spotify ID lookup for this artist's tracks (a later track with
    # the same title overwrites an earlier one)
    song_title_to_spotify_id = {track['song_title']: track['song_spotify_id'] for track in tracks}

    for recording in recordings:
        recording_name = recording['recording_name']
        if recording_name not in song_title_to_spotify_id:
            continue
        spotify_id = song_title_to_spotify_id[recording_name]
        # Keep the artist MBID that produced the match alongside the Spotify ID
        mbid_to_spotify_id_and_artist[recording['recording_mbid']] = {'spotify_id': spotify_id, 'artist_mbid': mbid}

# mbid_to_spotify_id_and_artist now maps recording MBIDs to Spotify IDs along with Artist MBIDs


In [None]:
# Destination for the recording -> Spotify mapping
output_file = 'output_mbid_spotify_mapping_with_main_artists_withquotesstripped.csv'

# Header first, then one row per mapped recording
with open(output_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Recording MBID', 'Spotify ID', 'Artist MBID'])
    writer.writerows(
        [recording_mbid, info['spotify_id'], info['artist_mbid']]
        for recording_mbid, info in mbid_to_spotify_id_and_artist.items()
    )

print(f"Data successfully written to {output_file}")

In [None]:

from multiprocessing import Pool, cpu_count


def process_tracks(mbid, track_details):
    """Match one artist's tracks against that artist's recordings by title.

    Parameters
    ----------
    mbid : str
        Artist MusicBrainz ID, looked up in the module-level
        ``contributors_dict``.
    track_details : list[dict]
        The artist's tracks, each with ``song_title`` and ``song_spotify_id``.

    Returns
    -------
    list[dict]
        One ``{'song_spotify_id': ..., 'recording_mbid': ...}`` per track whose
        title equals the name of the first ``contributors_df`` row for that
        recording. Empty when the artist has no contributor records.
    """
    results = []
    if mbid not in contributors_dict:
        return results
    for record in track_details:
        for entry in contributors_dict[mbid]:
            # contributors_dict is built in two shapes in this notebook: plain
            # recording MBIDs, or dicts carrying a 'recording_mbid' key.
            recording_mbid = entry['recording_mbid'] if isinstance(entry, dict) else entry
            contributor_row = contributors_df[contributors_df['recording_mbid'] == recording_mbid]
            if not contributor_row.empty and contributor_row['recording_name'].iloc[0] == record['song_title']:
                results.append({
                    'song_spotify_id': record['song_spotify_id'],
                    'recording_mbid': recording_mbid
                })
    return results


# Using multiprocessing.Pool to parallelize the processing.
# NOTE(review): with the 'spawn' start method (Windows), workers may be unable
# to import a function defined in a notebook cell - confirm this runs in your
# environment or move process_tracks into an importable module.
if __name__ == '__main__':  # Required for multiprocessing safety on Windows
    with Pool(processes=cpu_count()) as pool:  # Using all available CPU cores
        # starmap unpacks each (mbid, tracks) item into the two parameters of
        # process_tracks; map() would pass only the key and raise TypeError.
        results = pool.starmap(process_tracks, list(track_dict.items()))

    # Everything below depends on `results`, so it stays inside the guard.

    # Flatten the list of results
    flattened_results = [item for sublist in results for item in sublist]

    # Convert results to DataFrame and save to CSV
    final_df = pd.DataFrame(flattened_results)
    final_df.to_csv('final_tracks_with_mapped_mbid.csv', sep=';', index=False)

    print("Mapping complete and saved to final_tracks_with_mapped_mbid.csv")

In [None]:
import os

# Get the number of CPU cores
cpu_cores = os.cpu_count()
print(f"Number of CPU cores: {cpu_cores}")

In [None]:
# Keep only the tracks that received a recording MBID.
# NOTE(review): no cell visible in this part of the notebook adds a
# 'mapped_recording_mbid' column to artist_tracks - confirm where it is
# created, otherwise this line raises KeyError on a fresh kernel.
final_tracks_with_mbid = artist_tracks.dropna(subset=['mapped_recording_mbid'])

# Save the final DataFrame to a new CSV file
final_tracks_with_mbid.to_csv('final_tracks_with_mapped_mbid.csv', sep=';', index=False)


In [None]:
# Filter out rows where no recording_mbid was mapped (if necessary)
# final_tracks_with_mbid = artist_tracks.dropna(subset=['mapped_recording_mbid'])

# Save the final DataFrame to a new CSV file
final_tracks_with_mbid.to_csv('final_tracks_with_mapped_mbid.csv', sep=';', index=False)

print("Mapping complete and saved to final_tracks_with_mapped_mbid.csv")

In [None]:
import pandas as pd
# Semicolon-delimited scrape result to clean
file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\scraping_final.csv'
df = pd.read_csv(file_path, delimiter=';')

# Discard every row that has no 'album_id'
df_filtered = df.dropna(subset=['album_id'])

# Write the remaining rows to a new CSV file
output_file_path = 'scraping_final_new.csv'
df_filtered.to_csv(output_file_path, index=False, sep=';')

print(f"Filtered file saved to {output_file_path}")

In [None]:
import pandas as pd

# Assuming you have already read the CSV into a DataFrame named df
df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\scraping_final_new.csv', delimiter=';')

# Function to truncate strings longer than 254 characters
def truncate_string(s):
    """Cap string values at 254 characters.

    Strings longer than 254 characters are cut to their first 254; shorter
    strings and non-string values (e.g. NaN) are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    if len(s) <= 254:
        return s
    return s[:254]

# Apply the truncation function to the 'song_title' column
df['song_title'] = df['song_title'].apply(truncate_string)

# Save the DataFrame back to CSV
df.to_csv('scraping_final_new_new.csv', index=False, sep=';')

In [None]:
import pandas as pd

# Parameters
input_file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\NONSPLIT\\tracks_full.csv'  # Replace with the path to your large CSV
output_file_base_path = 'tracks'  # Replace with your desired output base path
max_rows_per_file = 200000  # Maximum rows per subfile
delimiter = ';'  # Define the delimiter used in your CSV file

# Stream the input in chunks and write each chunk to its own numbered file.
# files_written tracks how many files actually exist; the old counter had
# already been advanced past the last chunk when it was printed, so the
# message over-reported by one.
files_written = 0
chunks = pd.read_csv(input_file_path, delimiter=delimiter, chunksize=max_rows_per_file)
for chunk_number, chunk in enumerate(chunks, start=1):
    output_file_path = f"{output_file_base_path}_part_{chunk_number}.csv"
    chunk.to_csv(output_file_path, index=False, sep=delimiter)
    files_written = chunk_number

print(f"Splitting complete. {files_written} files were created.")


In [None]:
import pandas as pd

# Source CSV (semicolon-delimited) and destination for the numbered copy
input_file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\related_artists.csv'
output_file_path = 'related_artists.csv'

df = pd.read_csv(input_file_path, delimiter=';')

# Prepend a sequential 'ID' column: 1 for the first row, len(df) for the last
row_count = len(df)
df.insert(0, 'ID', range(1, row_count + 1))

# Write the numbered table out
df.to_csv(output_file_path, index=False, sep=';')

print(f"New CSV file with 'ID' column saved to {output_file_path}")


In [None]:
import pandas as pd

import ast

# Load the text file containing the album IDs (a list literal of strings).
# ast.literal_eval only parses literals; the previous eval() would execute
# arbitrary code found in the file.
# NOTE(review): if the file ever contains JSON-only tokens (true/false/null),
# switch to json.loads.
with open('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\artist_albums\\FULL_ALBUM_LIST.txt', 'r') as file:
    id_list = ast.literal_eval(file.read())

# Load the album table (semicolon-delimited)
df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\album_table.csv', delimiter=';')

# Album Spotify IDs already present in the table, keyed for fast membership tests
csv_ids_dict = {id_: True for id_ in df['album_spotify_id'].tolist()}

# IDs that are in the text file list but not in the CSV column
missing_ids = [id_ for id_ in id_list if id_ not in csv_ids_dict]

# One missing ID per line
with open('missing_ids.txt', 'w') as output_file:
    for id_ in missing_ids:
        output_file.write(id_ + '\n')


In [None]:


import pandas as pd

# Tracks already stored in the table (semicolon-delimited)
df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\tracks_data_full.csv', delimiter=';')

# Known Spotify IDs as dictionary keys for fast membership tests
song_ids_dict = dict.fromkeys(df['song_spotify_id'].tolist(), True)

# Candidate IDs: one per line in the text file
with open('unique_track_ids.txt', 'r') as file:
    song_list = file.read().splitlines()

# Candidates that do not appear in the table yet
unique_song_ids = [candidate for candidate in song_list if candidate not in song_ids_dict]

# Write them out, one ID per line
with open('unique_song_ids.txt', 'w') as output_file:
    output_file.writelines(song_id + '\n' for song_id in unique_song_ids)

print("Unique song IDs have been saved to 'unique_song_ids.txt'")

In [None]:
import pandas as pd
from datetime import datetime

# Define the path to your CSV file
csv_file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\album_table.csv'

# Load the CSV file
df = pd.read_csv(csv_file_path, delimiter=';')

# Function to convert date format
def convert_date_format(date_str):
    """Convert a ``DD/MM/YYYY`` date string to ``YYYY-MM-DD``.

    Parameters
    ----------
    date_str : str
        Date in ``%d/%m/%Y`` format.

    Returns
    -------
    The reformatted string, or ``date_str`` unchanged when it is not a string
    in the expected format (already-converted dates, partial dates, or missing
    values such as NaN/None).
    """
    try:
        return datetime.strptime(date_str, '%d/%m/%Y').strftime('%Y-%m-%d')
    except (ValueError, TypeError):
        # ValueError: a string in some other format.
        # TypeError: a non-string value, e.g. float NaN from an empty CSV cell;
        # previously this escaped and aborted the whole .apply().
        return date_str

# Apply the conversion function to the album_release_date column
df['album_release_date'] = df['album_release_date'].apply(convert_date_format)

# Specify the path where you want to save the updated CSV
output_csv_file_path = 'updated_album_data.csv'

# Save the DataFrame to a new CSV file
df.to_csv(output_csv_file_path, index=False, sep=';')


In [None]:
import pandas as pd

# Load the CSV file
file_path = 'merged_output_file.csv'
df = pd.read_csv(file_path, sep=';')

# Convert 'album_id' to integers while keeping missing values. The nullable
# 'Int64' dtype is required: applying int() element-wise leaves NaN in the
# column, pandas then re-infers float64, and the IDs are written back as
# '123.0'.
# NOTE(review): astype('Int64') raises on non-integral values (int() used to
# truncate them) - assumes album IDs are whole numbers.
if 'album_id' in df.columns:
    df['album_id'] = pd.to_numeric(df['album_id']).astype('Int64')

# Save the modified DataFrame back to a new CSV file (missing IDs stay empty)
output_file_path = 'modified_file.csv'
df.to_csv(output_file_path, index=False, sep=';')

print(f"CSV file has been processed and saved as {output_file_path}")

In [None]:
import pandas as pd

# Load the CSV files: the track table (semicolon-delimited) and the
# recording -> Spotify mapping (comma-delimited)
file_path_songs = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\tracks_data_actually_full.csv'
file_path_recordings = 'output_mbid_spotify_mapping_with_main_artists_withquotesstripped.csv'

df_songs = pd.read_csv(file_path_songs, sep=';')
df_recordings = pd.read_csv(file_path_recordings, sep=',')

output_file_path = 'merged_output_file.csv'

# Check and process necessary columns
if 'Recording MBID' in df_recordings.columns and 'Spotify ID' in df_recordings.columns:
    # Rename columns for clarity and to prevent conflicts
    df_recordings = df_recordings.rename(columns={'Recording MBID': 'recording_mbid', 'Spotify ID': 'song_spotify_id'})
    
    # Convert to string if not already (important for matching)
    df_songs['song_spotify_id'] = df_songs['song_spotify_id'].astype(str)
    df_recordings['song_spotify_id'] = df_recordings['song_spotify_id'].astype(str)
    
    # Left-merge so every song is kept, with recording_mbid where one is mapped
    df_merged = pd.merge(df_songs, df_recordings[['song_spotify_id', 'recording_mbid']], on='song_spotify_id', how='left')

    # Save only when the merge actually ran. Saving unconditionally either
    # raised NameError (fresh kernel) or silently wrote a stale df_merged left
    # over from an earlier run.
    df_merged.to_csv(output_file_path, index=False, sep=';')

    print(f"CSV file has been processed and saved as {output_file_path}")
else:
    print("Required columns are missing in the recordings data.")


In [None]:
import pandas as pd

# Inputs: the track table (semicolon-delimited) and the recording -> Spotify
# mapping (comma-delimited)
file_path_songs = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\tracks_full.csv'
file_path_recordings = 'output_mbid_spotify_mapping_with_main_artists_withquotesstripped.csv'

df_songs = pd.read_csv(file_path_songs, sep=';')
# The mapping's MBID column gets a temporary name so it cannot collide with
# the 'recording_mbid' column already present in the track table.
df_recordings = pd.read_csv(file_path_recordings, sep=',').rename(
    columns={'Recording MBID': 'new_recording_mbid', 'Spotify ID': 'song_spotify_id'}
)

# Both join keys as strings so they compare equal
df_songs['song_spotify_id'] = df_songs['song_spotify_id'].astype(str)
df_recordings['song_spotify_id'] = df_recordings['song_spotify_id'].astype(str)

# Left-merge: every song is kept, mapped or not
df_merged = pd.merge(
    df_songs,
    df_recordings[['song_spotify_id', 'new_recording_mbid']],
    on='song_spotify_id',
    how='left',
)

# Prefer the newly mapped MBID; fall back to the existing value where the
# mapping has none
has_new_mbid = pd.notnull(df_merged['new_recording_mbid'])
df_merged['recording_mbid'] = df_merged['new_recording_mbid'].where(has_new_mbid, df_merged['recording_mbid'])

# The temporary column is no longer needed
df_merged = df_merged.drop(columns=['new_recording_mbid'])

# Write the updated track table
output_file_path = 'updated_songs_file.csv'
df_merged.to_csv(output_file_path, index=False, sep=';')

print(f"CSV file has been processed and saved as {output_file_path}")


In [None]:
import pandas as pd

# Load the CSV files: the track table (semicolon-delimited) and the
# recording -> Spotify mapping (comma-delimited)
path_csv_1 = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\tracks_full.csv'  # Update with the path to your first CSV
path_csv_2 = 'output_mbid_spotify_mapping_with_main_artists_withquotesstripped.csv'  # Update with the path to your second CSV

df1 = pd.read_csv(path_csv_1, sep=';')
df2 = pd.read_csv(path_csv_2, sep=',')

# Rename columns in df2 for clarity
df2.rename(columns={'Recording MBID': 'recording_mbid', 'Spotify ID': 'spotify_id'}, inplace=True)

# Left-merge the mapping onto the tracks by Spotify ID.
# NOTE(review): the lines below read df1['recording_mbid'] and drop
# 'recording_mbid_y' after the merge, which presumes tracks_full.csv has no
# column literally named 'recording_mbid' (pandas would suffix it) but already
# contains 'recording_mbid_y' - TODO confirm against the file's header.
df1 = df1.merge(df2[['spotify_id', 'recording_mbid']], left_on='song_spotify_id', right_on='spotify_id', how='left')

# Overwrite recording_mbid_x with the value from df2 (NaN where no mapping
# row matched, because of the left merge)
df1['recording_mbid_x'] = df1['recording_mbid']

# Drop the temporary columns and the recording_mbid_y column
df1.drop(['spotify_id', 'recording_mbid', 'recording_mbid_y'], axis=1, inplace=True)

# Save the modified DataFrame back to a new CSV file
output_file_path = 'updated_file.csv'
df1.to_csv(output_file_path, index=False, sep=';')

print(f"CSV file has been updated and saved as {output_file_path}")



In [None]:
import pandas as pd
import re
import ast

# Load your CSV files
df1 = pd.read_csv('output_mbid_spotify_mapping_with_main_artists_withquotesstripped.csv')
df2 = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\WORKING\\contributors_non-normalized.csv')

def convert_to_set(mbids_str):
    """
    Parse a textual collection of MBIDs into a set of MBID strings.

    Bare (unquoted) 36-character hex/dash tokens are wrapped in quotes so the
    text becomes a valid Python literal, which is then evaluated safely with
    ``ast.literal_eval``.

    Parameters:
        mbids_str: Text such as ``"{uuid1, uuid2}"`` or ``"[uuid1]"``. Tokens
            that are already quoted are left as they are. Non-string values
            (e.g. the NaN pandas uses for empty cells) are accepted.

    Returns:
        set: The parsed MBIDs. A lone MBID yields a one-element set. An empty
        set is returned for non-string input and for text that cannot be
        parsed into a collection of hashable values.
    """
    # Empty cells arrive as float NaN / None; re.sub would raise on them
    if not isinstance(mbids_str, str):
        return set()

    # Quote only bare tokens; the lookarounds skip tokens already inside quotes
    mbids_str_quoted = re.sub(r"(?<!['\"])([a-f0-9-]{36})(?!['\"])", r"'\1'", mbids_str)

    try:
        parsed = ast.literal_eval(mbids_str_quoted)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the errors literal_eval documents for malformed input
        return set()

    # set() over a bare string would split it into single characters
    if isinstance(parsed, str):
        return {parsed}

    try:
        return set(parsed)
    except TypeError:
        # Not iterable (e.g. a number) or contains unhashable items
        return set()


# Parse each 'artist_mbids' string into a set of MBIDs
df2['artist_mbids'] = df2['artist_mbids'].apply(convert_to_set)

# One row per (recording, artist MBID) pair
df2 = df2.explode('artist_mbids')

# Number of distinct artist MBIDs credited on each recording
artist_count = df2.groupby('recording_mbid')['artist_mbids'].nunique().reset_index()
artist_count.columns = ['Recording MBID', 'Unique Artist Count']

# Attach the count to every mapping row; recordings that have no contributor
# rows keep NaN in 'Unique Artist Count' because this is a left merge
df1 = df1.merge(artist_count, on='Recording MBID', how='left')

# For each Spotify ID keep the candidate recording with the most distinct artists.
# A Spotify ID whose candidates all have a NaN count would make the grouped
# idxmax return NaN (or raise, depending on the pandas version), and the .loc
# lookup below would then fail. NaN is therefore ranked as -1 for the selection
# only: it loses against every real count, and an all-NaN group falls back to
# its first row. The 'Unique Artist Count' column itself is left unchanged.
ranking_counts = df1['Unique Artist Count'].fillna(-1)
best_row_labels = ranking_counts.groupby(df1['Spotify ID']).idxmax()
result = df1.loc[best_row_labels]

# Save the selected rows (comma-separated, without the index)
result.to_csv('filtered_output.csv', index=False)


In [None]:
import pandas as pd

# Inputs: the per-Spotify-ID recording matches (comma-separated) and the
# semicolon-separated track table
df_filtered_output = pd.read_csv('filtered_output.csv')
df_songs = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\tracks_full.csv', delimiter=';')

# Left-join the matches onto the tracks by Spotify ID: every track is kept and
# tracks without a match get NaN in the joined columns
merged_df = pd.merge(df_songs, df_filtered_output, left_on='song_spotify_id', right_on='Spotify ID', how='left')

# Overwrite 'recording_mbid_x' with the matched 'Recording MBID'
merged_df['recording_mbid_x'] = merged_df['Recording MBID']

# Remove 'recording_mbid_y' and the helper columns brought in by the merge
# NOTE(review): presumes the track file already carries 'recording_mbid_x' /
# 'recording_mbid_y' columns - verify against the current tracks_full.csv.
merged_df.drop(columns=['recording_mbid_y', 'Recording MBID', 'Spotify ID', 'Artist MBID', 'Unique Artist Count'], inplace=True)

# Write with ';' as the separator, matching the track table this frame was
# read from (the default ',' produced a file in a different format)
merged_df.to_csv('final_song_data.csv', index=False, sep=';')
merged_df.head()  # Display the first few rows to check


In [None]:
# Re-export the merged track table to 'final_song_data.csv' using ';' as the separator
merged_df.to_csv('final_song_data.csv', sep=';', index=False)

In [None]:
import pandas as pd

# Inputs: the selected recording matches and the normalized contributors table
matched_recordings_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\filtered_output.csv'
contributors_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\WORKING\\contributors_normalized.csv'
first_csv = pd.read_csv(matched_recordings_path)
second_csv = pd.read_csv(contributors_path)

# Compare the MBIDs as plain strings on both sides
first_csv['Recording MBID'] = first_csv['Recording MBID'].astype(str)
second_csv['recording_mbid'] = second_csv['recording_mbid'].astype(str)

# Keep only the contributor rows whose recording appears among the matches
is_matched_recording = second_csv['recording_mbid'].isin(first_csv['Recording MBID'])
filtered_second_csv = second_csv.loc[is_matched_recording]

# Write the kept rows as a semicolon-separated file
filtered_second_csv.to_csv('contributors_normalized_filteredToOnlyMatched.csv', sep=';', index=False)


In [None]:



import pandas as pd

# Input: the filtered contributors table (semicolon-separated)
filtered_contributors_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\contributors_normalized_filteredToOnlyMatched.csv'
df = pd.read_csv(filtered_contributors_path, sep=';')

# Put a sequential 'ID' column (1, 2, 3, ...) in front of all existing columns
sequential_ids = range(1, len(df) + 1)
df.insert(loc=0, column='ID', value=sequential_ids)

# Write the table, now including 'ID', as a semicolon-separated file
df.to_csv('modified_output.csv', index=False, sep=';')

# Show the table with the new column
print(df)


In [None]:
import pandas as pd

# Inputs: the filtered contributors table (df1) and the track table (df2),
# both semicolon-separated
contributors_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\contributors_normalized_filteredToOnlyMatched.csv'
tracks_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\tracks_full.csv'
df1 = pd.read_csv(contributors_path, sep=';')
df2 = pd.read_csv(tracks_path, sep=';')

# Join key is compared as a plain string in both tables
for frame in (df1, df2):
    frame['recording_mbid'] = frame['recording_mbid'].astype(str)

# Inner join on 'recording_mbid'; columns present in both tables are told apart
# by the '_df1' (contributors) and '_df2' (tracks) suffixes
mapping_df = df1.merge(df2, suffixes=('_df1', '_df2'), on='recording_mbid')

# Keep just the two 'id' columns and give them their final names
# NOTE(review): relies on both input files having an 'id' column - confirm.
id_column_names = {'id_df1': 'CONTRIBUTOR_ID', 'id_df2': 'SONG_TABLE_ID'}
final_df = mapping_df[list(id_column_names)].rename(columns=id_column_names)

# Write the mapping as a semicolon-separated file
final_df.to_csv('mapping_output.csv', index=False, sep=';')

# Show the mapping
print(final_df)


In [None]:
import pandas as pd

# Input: the filtered contributors table (semicolon-separated)
filtered_contributors_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\contributors_normalized_filteredToOnlyMatched.csv'
df = pd.read_csv(filtered_contributors_path, sep=';')

# Columns left out of the reduced export
unwanted_columns = ['recording_mbid', 'recording_name', 'artist_credit_name', 'artist_credit_id']
df_dropped = df.drop(unwanted_columns, axis=1)

# Write the remaining columns as a semicolon-separated file
df_dropped.to_csv('reduced_output.csv', index=False, sep=';')

# Show the reduced table
print(df_dropped)

In [None]:
import pandas as pd

# Production tables, all semicolon-separated: the artist/album link table,
# the artist table, and the album table that is stored in two parts
artists_albums_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\artist_album_mapping.csv', sep=';')
artists_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\artist_data.csv', sep=';')
albums1_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\album_part_1.csv', sep=';')
albums2_df = pd.read_csv('C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\album_part_2.csv', sep=';')

# Stack the two album parts into one table with a fresh 0..n-1 index
album_parts = [albums1_df, albums2_df]
combined_albums_df = pd.concat(album_parts, ignore_index=True)

def check_ids(main_df, check_df, main_column, check_column, name):
    """
    Print whether every value of main_df[main_column] occurs in check_df[check_column].

    Parameters:
        main_df: DataFrame whose IDs are being verified.
        check_df: DataFrame holding the reference IDs.
        main_column: Column of main_df to verify.
        check_column: Column of check_df that holds the reference IDs.
        name: Label for the reference table, used only in the printed message.

    Prints a confirmation when nothing is missing; otherwise prints a header
    followed by the rows of main_df whose ID has no counterpart. Returns None.
    """
    has_counterpart = main_df[main_column].isin(check_df[check_column])
    unmatched_rows = main_df[~has_counterpart]
    if not unmatched_rows.empty:
        print(f"Missing {main_column} in {name}:")
        print(unmatched_rows)
        return
    print(f"All {main_column} in {name} are present.")

# Every artistID in the link table must exist as an 'id' in the artist table
check_ids(main_df=artists_albums_df, check_df=artists_df, main_column='artistID', check_column='id', name='artists.csv')

# Every albumID in the link table must exist as an 'id' in the combined album table
check_ids(main_df=artists_albums_df, check_df=combined_albums_df, main_column='albumID', check_column='id', name='combined albums.csv')



In [None]:
import pandas as pd

# Input: the unsplit track table (semicolon-separated)
tracks_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\NEWEST\\PRODFILES\\NONSPLIT\\tracks_full.csv'
df = pd.read_csv(tracks_path, sep=';')

# Rows identical in every column are collapsed; the first occurrence is kept
df_cleaned = df.drop_duplicates(keep='first')

# Write the de-duplicated table to a new semicolon-separated file
df_cleaned.to_csv('tracks_full_NODUPES.csv', sep=';', index=False)

# To replace the source file instead, write df_cleaned back to that path
# (and keep sep=';' so the format stays the same).


In [2]:
import pandas as pd

# Input: the matched-artists table, read with pandas' default comma separator
matched_artists_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\working\\matched_artists.csv'
df = pd.read_csv(matched_artists_path)

# Drop the first and third columns, selected by position rather than by name.
# In the recorded output of this cell the columns left over are 'artist_mbid'
# and 'spotify_id'.
positions_to_drop = [0, 2]
df_reduced = df.drop(columns=df.columns[positions_to_drop])

# Write the remaining columns (comma-separated, without the index).
# NOTE(review): an earlier cell writes a semicolon-separated file under this
# same name, 'modified_output.csv'; whichever cell runs last overwrites it.
df_reduced.to_csv('modified_output.csv', index=False)

# Show the reduced table
print(df_reduced)

                                artist_mbid              spotify_id
0      2f7bc75e-4ffe-4400-a200-69cc9adaac23  3Wkf3Zv5wMaIsz2MwkVOKF
1      870ab832-904b-4b49-ab72-7e6b0e0da65f  4xcYVPssil6vbG6tq3W43S
2      2e0f7014-ce5e-4ce1-a3d9-f52c249482dc  5bmpsjaT6cyVlVFX8vYnBm
3      a972372a-6d87-4b21-8f5c-5ba6e2d640b0  07tB6knv7c0JsSbkfV5iZd
4      831fe0ce-c4b3-4044-bbf6-8db6b9ac2070  1SlJJwJtVLy0X1RjfDTmVm
...                                     ...                     ...
63708  8bc56b79-d760-431f-a1d3-a9f71318680a  1Bxew4Eedjg5DLlAyQZK9c
63709  ff7b28cc-34d2-4182-a5f3-b06336e14c23  1XaPi3wwjPdMMsSw58bgCO
63710  c47028fa-0086-426d-a7b8-ed20eba3ed61  0WmzT6tMLhdST5BfYagbha
63711  86cb3243-62c1-4b0c-aeb9-bc8502bba200  7MRWNE1dN3o6bPMLPH0c3h
63712  60a21ce6-01ef-49dd-8566-6351599609f4  6HkzZenXCsrdGcqMgWJECN

[63713 rows x 2 columns]
