In [None]:
import pandas as pd
import re  # Import the re module for regular expressions

# Updated function to extract Spotify URL IDs
def extract_spotify_url_id(urls_str):
    """Return the Spotify artist ID found in a brace-wrapped, comma-separated URL list.

    Args:
        urls_str: Text such as ``'{https://a.example/x,https://open.spotify.com/artist/<id>}'``.
            Non-string values (e.g. the NaN pandas uses for an empty CSV cell, or
            ``None``) are treated as "no URLs".

    Returns:
        The ID that follows ``https://open.spotify.com/artist/`` in the first URL
        that carries a non-empty one, with any query string, fragment, trailing
        path, surrounding quotes or whitespace removed; ``None`` when no such URL
        is present.
    """
    # Missing cells are not strings; there is nothing to parse.
    if not isinstance(urls_str, str):
        return None

    spotify_url_pattern = 'https://open.spotify.com/artist/'
    for url in urls_str.strip("{}").split(','):
        # Only look at the text *after* the prefix so leading quotes or spaces
        # on the list element cannot leak into the ID.
        _, found, tail = url.partition(spotify_url_pattern)
        if not found:
            continue
        # The ID ends at the first delimiter: query, path, fragment, quote,
        # brace or whitespace.
        id_match = re.match(r'[^\s?/#"\'{}]+', tail)
        if id_match:
            return id_match.group(0)
    return None

def filter_spotify_artists(input_csv_path, output_csv_path):
    """Write a CSV listing the artists that have a Spotify artist ID.

    Reads the comma-separated file at ``input_csv_path``, derives a
    ``spotify_id`` column by applying ``extract_spotify_url_id`` to
    ``associated_urls``, and saves the ``artist_mbid``, ``artist_name`` and
    ``spotify_id`` columns of every row with a non-null ID to
    ``output_csv_path`` (comma-separated, no index column).

    Raises:
        ValueError: If the input file has no ``associated_urls`` column.
    """
    artists = pd.read_csv(input_csv_path, sep=',')

    # Fail early with a clear message when the expected column is missing.
    if 'associated_urls' not in artists.columns:
        raise ValueError("Column 'associated_urls' does not exist in the CSV.")

    # Diagnostic output: show which columns were loaded.
    print(artists.columns)

    artists['spotify_id'] = artists['associated_urls'].apply(extract_spotify_url_id)

    # Keep only the rows for which an ID was extracted, and only the output columns.
    has_spotify_id = artists['spotify_id'].notnull()
    output_columns = ['artist_mbid', 'artist_name', 'spotify_id']
    artists.loc[has_spotify_id, output_columns].to_csv(output_csv_path, index=False, sep=',')


# Example usage: extract Spotify artist IDs from the scraped URL list
input_csv_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\artist_url_list.csv'  # Update this to the path of your input CSV file (it must contain an 'associated_urls' column)
output_csv_path = 'spotify_artists.csv'    # The output file will be saved with this name (relative path)

# NOTE: this call is active; running the cell reads the input CSV and writes the output CSV
filter_spotify_artists(input_csv_path, output_csv_path)


In [None]:
import pandas as pd

def generate_matched_and_unmatched_csvs(input_csv_path, spotify_csv_path, matched_output_path, unmatched_output_path):
    """Split the input artists into those with and without a Spotify entry.

    Both CSV files are joined on ``artist_mbid``. Rows present in both files
    are written (with the columns of both) to ``matched_output_path``. Rows
    present only in the input file are written to ``unmatched_output_path``
    with just ``individual_artist_name`` and ``artist_mbid``. Neither output
    file gets an index column.
    """
    artists = pd.read_csv(input_csv_path)
    spotify = pd.read_csv(spotify_csv_path)

    # Inner join: only artists whose MBID appears in both files survive.
    artists.merge(spotify, how='inner', on='artist_mbid').to_csv(matched_output_path, index=False)

    # Outer join against the key column alone; the '_merge' indicator records
    # which side each row came from, so 'left_only' marks input-only rows.
    spotify_keys = spotify[['artist_mbid']]
    joined = artists.merge(spotify_keys, how='outer', on='artist_mbid', indicator=True)
    only_in_input = joined['_merge'] == 'left_only'
    unmatched_columns = ['individual_artist_name', 'artist_mbid']
    joined.loc[only_in_input, unmatched_columns].to_csv(unmatched_output_path, index=False)

# Example usage: split the name/MBID pairs by whether they appear in the Spotify CSV
input_csv_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\artist_name_mbid_pairs.csv'  # Path to the CSV with individual_artist_name, artist_mbid
spotify_csv_path = 'spotify_artists.csv'  # Path to the CSV with artist_mbid, artist_name, spotify_id
matched_output_path = 'matched_artists.csv'  # Output path for matched artists
unmatched_output_path = 'unmatched_artists.csv'  # Output path for unmatched artists

# NOTE: this call is active; running the cell reads both input CSVs and writes both output CSVs
generate_matched_and_unmatched_csvs(input_csv_path, spotify_csv_path, matched_output_path, unmatched_output_path)


In [None]:
# Path of the full artists CSV used by the search cells: C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\artists_data_full.csv

In [None]:
# Import necessary libraries
import pandas as pd
from rapidfuzz import process, fuzz

# Function to search for a string in the artist_name column
def search_artist_name(csv_file, search_query, threshold=85, limit=5):
    """Fuzzy-search the ``artist_name`` column of a ';'-delimited CSV file.

    Args:
        csv_file: Path of the CSV file; it must contain an ``artist_name`` column.
        search_query: Text to look for.
        threshold: Minimum ``fuzz.WRatio`` score a name must reach, passed to
            RapidFuzz as ``score_cutoff``.
        limit: Maximum number of matches to return. The default of 5 is the
            RapidFuzz default this function previously relied on implicitly;
            pass ``None`` to return every name that reaches ``threshold``.

    Returns:
        A DataFrame with one row per match, in the order RapidFuzz reported them.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file, delimiter=';')

    # Ensure the artist_name column is a string to avoid any type-related errors
    df['artist_name'] = df['artist_name'].astype(str)

    # Using RapidFuzz to find matches above the threshold
    matches = process.extract(
        search_query,
        df['artist_name'],
        scorer=fuzz.WRatio,
        score_cutoff=threshold,
        limit=limit,
    )

    # Each result is (name, score, key). The key identifies the matched entry
    # itself, so rows are selected by it directly. Looking rows up again by
    # name would return only the first of several artists sharing a name and
    # rescan the whole column once per match.
    match_indices = [key for _name, _score, key in matches]

    # Return the matching rows
    return df.loc[match_indices]

# Example usage: fuzzy-search the artists CSV with RapidFuzz (default threshold of 85)
csv_file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\artists_data_full.csv'  # Update this path to your CSV file (';'-delimited, with an artist_name column)
search_query = 'deadmaus'  # Replace this with the artist name you're searching for
matching_rows = search_artist_name(csv_file_path, search_query)

# Show the rows whose artist_name matched the query
print(matching_rows)


In [17]:
# Import necessary libraries
import pandas as pd
import difflib

# Function to search for a string in the artist_name column using difflib
def search_artist_name_with_difflib(csv_file, search_query, cutoff=0.8):
    """Find rows of a ';'-delimited CSV whose ``artist_name`` resembles the query.

    Args:
        csv_file: Path of the CSV file; it must contain an ``artist_name`` column.
        search_query: Text to look for.
        cutoff: Minimum similarity passed to ``difflib.get_close_matches``.

    Returns:
        A DataFrame with every row whose ``artist_name`` equals one of the close
        matches; empty (with the file's columns) when nothing matches or the
        file has no data rows.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file, delimiter=';')

    # Ensure the artist_name column is a string
    df['artist_name'] = df['artist_name'].astype(str)

    # difflib rejects n <= 0 with ValueError, which is what len(df) would be
    # for a file with no data rows; nothing can match there anyway.
    if len(df) == 0:
        close_matches = []
    else:
        # Use difflib to find close matches; n=len(df) means "no cap on results"
        close_matches = difflib.get_close_matches(search_query, df['artist_name'], n=len(df), cutoff=cutoff)
    print(f"Close matches found: {close_matches}")

    # Filter the DataFrame to only include rows with matching artist names
    matched_df = df[df['artist_name'].isin(close_matches)]

    return matched_df

# Example usage: search the artists CSV with difflib (default cutoff of 0.8)
csv_file_path = 'C:\\Users\\Music\\Desktop\\PROJECTS\\Spotify Project\\SCRAPED_DATA\\FINISHED\\artists_data_full.csv'  # Update this path to your CSV file (';'-delimited, with an artist_name column)
search_query = "Magnus Karlsson's Free Fall"  # Replace this with the artist name you're searching for
matching_rows = search_artist_name_with_difflib(csv_file_path, search_query)

# Show the rows whose artist_name is a close match for the query
print(matching_rows)


Close matches found: ['Magnus Karlsson’s Free Fall']
           id       artist_spotify_id                  artist_name  \
118939  87909  44xWxpnL7VA4R3bziyzvfh  Magnus Karlsson’s Free Fall   

        artist_popularity                                 artist_image_small  \
118939                 19  https://i.scdn.co/image/ab6761610000f178f05c26...   

                                      artist_image_medium  \
118939  https://i.scdn.co/image/ab67616100005174f05c26...   

                                       artist_image_large  artist_followers  \
118939  https://i.scdn.co/image/ab6761610000e5ebf05c26...           11438.0   

       date_added_to_db date_last_modified  
118939       2024-03-24         2024-03-24  
