In [1]:
import requests

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import os
import time  # Importing time for delay

def get_artist_musicbrainz_url(artist_name):
    """Search for the artist on MusicBrainz and get the artist URL."""
    base_url = "https://musicbrainz.org"
    search_url = f"{base_url}/search?query={artist_name.replace(' ', '+')}&type=artist&method=indexed"
    try:
        response = requests.get(search_url, headers={"User-Agent": "YourAppName/1.0 (your-email@example.com)"})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the first search result link
        artist_link = soup.select_one("table.tbl a")
        if artist_link:
            return base_url + artist_link['href']
        return None
    except requests.RequestException as e:
        print(f"Error fetching artist page: {e}")
        return None

def clean_text(text):
    """Remove unnecessary characters and clean up text."""
    text = text.replace('\n', ' ').replace('\r', '').strip()
    return " ".join(text.split())

def scrape_wikipedia_summary(artist_url):
    """Scrape the Wikipedia summary from the artist's MusicBrainz page."""
    try:
        response = requests.get(artist_url, headers={"User-Agent": "YourAppName/1.0 (your-email@example.com)"})
        response.raise_for_status()

        # Adding a small delay after the request to ensure content has loaded
        time.sleep(1)  # Wait for 1 second

        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find the Wikipedia section using the div with class "wikipedia-extract-body"
        wiki_div = soup.find("div", class_="wikipedia-extract-body wikipedia-extract-collapse")
        if wiki_div:
            print(wiki_div)
            # Extract all paragraphs separately and join them with proper spacing
            paragraphs = wiki_div.find_all("p")
            summary_parts = []
            for p in paragraphs:
                # Extract the text from each paragraph
                paragraph_text = p.get_text(separator=" ", strip=True)
                cleaned_text = clean_text(paragraph_text)
                summary_parts.append(cleaned_text)

            # Join all paragraphs into a single summary
            full_summary = " ".join(summary_parts)

            # Remove any "Continue reading at Wikipedia" links or extra texts
            if "Continue reading at Wikipedia" in full_summary:
                full_summary = full_summary.split("Continue reading at Wikipedia")[0].strip()

            return full_summary
        return "N/A"
    except requests.RequestException as e:
        print(f"Error scraping Wikipedia summary: {e}")
        return "N/A"

def fetch_artist_details(artist_name):
    """Fetch the Wikipedia summary directly from the MusicBrainz artist page."""
    artist_url = get_artist_musicbrainz_url(artist_name)
    if artist_url:
        print(f"Found artist page: {artist_url}")
        wikipedia_summary = scrape_wikipedia_summary(artist_url)
        return {"wikipedia_summary": wikipedia_summary}
    else:
        print("Artist page not found.")
        return {"wikipedia_summary": "N/A"}

def process_file(file_path):
    """Process each JSONL file to extract song and artist details."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                try:
                    # Parse the JSON line
                    song_data = json.loads(line.strip())

                    # Handle single and multiple artist formats
                    artists = song_data.get("artist")
                    if isinstance(artists, str):
                        artists = [artists]

                    song_name = song_data.get("title", "N/A")

                    for artist_name in artists:
                        # Fetch the Wikipedia summary
                        artist_details = fetch_artist_details(artist_name)
                        wikipedia_summary = artist_details.get("wikipedia_summary", "N/A")

                        print(f"Title: {song_name}")
                        print(f"Artist: {artist_name}")
                        print(f"Wikipedia Summary: {wikipedia_summary}")
                        print("-" * 50)
                        break
                except json.JSONDecodeError:
                    print("Error: Invalid JSON line.")
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")

# Base directory for files
base_dir = '/Users/chrisapton/Desktop/Spring 2025/DSCI 558/Music-KG-Project/data/raw_data/'

# Loop through pages 1 to 10
for page_number in range(1, 11):
    file_name = f'whosampled_tracks_2024_{page_number}.jsonl'
    file_path = os.path.join(base_dir, file_name)
    print(f"Processing file: {file_name}")
    process_file(file_path)
    print("=" * 80)




















In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
import json
import os
import time

def get_artist_musicbrainz_url(artist_name):
    """Search for the artist on MusicBrainz and get the artist URL."""
    base_url = "https://musicbrainz.org"
    search_url = f"{base_url}/search?query={artist_name.replace(' ', '+')}&type=artist&method=indexed"
    attempts = 5  # Number of retries

    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(search_url, headers={"User-Agent": "YourAppName/1.0 (your-email@example.com)"})
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the first search result link
            artist_link = soup.select_one("table.tbl a")
            if artist_link:
                full_url = base_url + artist_link['href']
                print(f"Found artist URL: {full_url} (Attempt {attempt})")
                return full_url
            
            print(f"Attempt {attempt}: No artist link found. Retrying...")
            time.sleep(1)

        except requests.RequestException as e:
            print(f"Attempt {attempt}: Error fetching artist page: {e}")
            time.sleep(1)  # Brief pause before retrying

    # If all attempts fail, print the fetched page content for debugging
    print(f"Failed to retrieve artist URL after {attempts} attempts.")
    print(f"Search URL: {search_url}")
    print("Page content:")
    print(response.text[:1000])  # Print the first 1000 characters to avoid overwhelming output
    return None


def get_release_date(recording):
    """Extract the release date from a recording."""
    # Check for "first-release-date" field first
    if 'first-release-date' in recording:
        return recording['first-release-date']

    # Fallback: Check inside the "releases" list
    release_dates = []
    if 'releases' in recording and recording['releases']:
        for release in recording['releases']:
            date = release.get('date', 'N/A')
            if date != 'N/A':
                release_dates.append(date)

        # Sort by length and value to prioritize full dates (YYYY-MM-DD)
        if release_dates:
            return min(release_dates, key=lambda x: (len(x), x))

    return "N/A"


def get_genres(recording):
    """Extract the genres from a recording."""
    print(recording)
    genres = set()

    # Check for genres directly in the recording tags
    if 'tags' in recording:
        for tag in recording['tags']:
            genres.add(tag['name'])

    # Fallback: Check the release group for genres
    if 'releases' in recording and recording['releases']:
        for release in recording['releases']:
            if 'release-group' in release and 'tags' in release['release-group']:
                for tag in release['release-group']['tags']:
                    genres.add(tag['name'])

    # If no genres found, try to check the artist tags
    if not genres and 'artist-credit' in recording:
        for artist_credit in recording['artist-credit']:
            artist = artist_credit.get('artist', {})
            if 'tags' in artist:
                for tag in artist['tags']:
                    genres.add(tag['name'])

    return list(genres)


def scrape_wikipedia_summary_selenium(artist_url):
    """Scrape the Wikipedia summary from the artist's MusicBrainz page using Selenium."""
    try:
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        # Path to your ChromeDriver
        driver_path = '/Users/chrisapton/Desktop/Spring 2025/DSCI 558/chromedriver-mac-arm64/chromedriver'
        service = Service(driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        driver.get(artist_url)

        try:
            wiki_div = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'wikipedia-extract-body'))
            )
            summary = wiki_div.text.strip()

            # Remove any "Continue reading at Wikipedia" text
            if "Continue reading at Wikipedia" in summary:
                summary = summary.split("Continue reading at Wikipedia")[0].strip()

            driver.quit()
            return summary if summary else "N/A"
        except Exception as e:
            print(artist_url)
            print(f"Error finding Wikipedia summary: {e}")

            driver.quit()
            return "N/A"
    except Exception as e:
        print(f"Error with Selenium: {e}")
        return "N/A"

def fetch_song_details(song_name, artists):
    """Fetch song release date and genre from MusicBrainz."""

    if not artists:
        print(f"No artists found for song: {song_name}. Skipping...")
        return {
            "title": song_name,
            "artist": "N/A",
            "release_date": "N/A",
            "genres": [],
            "wikipedia_summary": "N/A"
        }
    # Join artist names with " AND " for the query
    artist_query = " OR ".join([f'artist:"{artist}"' for artist in artists])
    # Combine the title and artist queries
    query = f'recording:"{song_name}" AND ({artist_query})'
    # Encode the query to make it URL-safe
    encoded_query = requests.utils.quote(query)
    # Construct the full URL
    url = f'https://musicbrainz.org/ws/2/recording/?query={encoded_query}&fmt=json'
    print(url)
    
    try:
        response = requests.get(url, headers={"User-Agent": "YourAppName/1.0 (your-email@example.com)"})
        response.raise_for_status()
        data = response.json()

        if 'recordings' in data and data['recordings']:
            for recording in data['recordings']:
                title = recording.get('title', 'N/A')
                artist_credit = recording.get('artist-credit', [])

                # Extract all artist names from the recording
                recording_artists = [credit.get('name', 'N/A').lower() for credit in artist_credit]

                # Check if all given artists are present in the recording
                if all(artist.lower() in recording_artists for artist in artists) and song_name.lower() in title.lower():
                    release_date = get_release_date(recording)
                    genres = get_genres(recording)
                    return {
                        "title": title,
                        "artist": [artist['name'] for artist in artist_credit],
                        "release_date": release_date,
                        "genres": genres
                    }
                else: 
                    print("can't find all artists: approx search")
                    release_date = get_release_date(recording)
                    genres = get_genres(recording)
                    return {
                        "title": title,
                        "artist": [artist['name'] for artist in artist_credit],
                        "release_date": release_date,
                        "genres": genres
                    }
        return {"title": song_name, "artist": ", ".join(artists), "release_date": "N/A", "genres": "N/A"}
    except requests.RequestException as e:
        print(f"Error fetching song details: {e}")
        return {"title": song_name, "artist": ", ".join(artists), "release_date": "N/A", "genres": "N/A"}

def fetch_artist_wikipedia(artist_name):
    """Fetch Wikipedia summary for a given artist."""
    artist_url = get_artist_musicbrainz_url(artist_name)
    wikipedia_summary = scrape_wikipedia_summary_selenium(artist_url)
    return wikipedia_summary

def fetch_artist_genres(artist_name):
    """
    Fetches the top genres of an artist from the MusicBrainz API.
    """
    try:
        # Step 1: Search for the artist to get their MusicBrainz ID
        search_url = f'https://musicbrainz.org/ws/2/artist/?query={artist_name}&fmt=json'
        search_response = requests.get(search_url, headers={"User-Agent": "YourAppName/1.0 (your-email@example.com)"})
        search_response.raise_for_status()
        search_data = search_response.json()
        
        # Get the first matching artist ID
        if search_data['artists']:
            artist_id = search_data['artists'][0]['id']
        else:
            print(f"No artist found for {artist_name}")
            return []

        # Step 2: Get the artist details using the ID
        artist_url = f'https://musicbrainz.org/ws/2/artist/{artist_id}?inc=tags&fmt=json'
        response = requests.get(artist_url, headers={"User-Agent": "YourAppName/1.0 (your-email@example.com)"})
        response.raise_for_status()
        artist_data = response.json()

        # Extract genres from the tags field
        genres = [tag['name'] for tag in artist_data.get('tags', [])]

        # Return top 3-4 genres if available
        return genres[:4] if genres else []
    
    except requests.RequestException as e:
        print(f"Error fetching artist genres: {e}")
        return []

def process_file(file_path, page_number):
    """Process each JSONL file to extract song and artist details."""
    output_file = f'musicbrainz_tracks_{page_number}.jsonl'

    # Get the number of lines already processed in the output file
    processed_count = 0
    try:
        with open(output_file, 'r', encoding='utf-8') as file:
            processed_count = sum(1 for _ in file)
    except FileNotFoundError:
        pass  # If the file doesn't exist, start from scratch

    print(f"Resuming from line {processed_count + 1}")

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                # Skip already processed lines
                if line_number <= processed_count:
                    continue

                try:
                    song_data = json.loads(line.strip())
                    artists = song_data.get("artist")
                    if isinstance(artists, str):
                        artists = [artists]

                    song_name = song_data.get("title", "N/A")

                    # Fetch song details
                    artist_details = fetch_song_details(song_name, artists)
                    wikipedia_summaries = list(map(fetch_artist_wikipedia, artists))

                    # Fetch genres if none found initially
                    if len(artist_details['genres']) == 0:
                        for artist in artists:
                            artist_details['genres'].extend(fetch_artist_genres(artist))
                        artist_details['genres'] = list(set(artist_details['genres']))

                    # Print extracted details
                    print(f"ID: {song_data.get('whosampled_id', 'N/A')}")
                    print(f"Title: {artist_details['title']}")
                    print(f"Artist: {artist_details['artist']}")
                    print(f"Release Date: {artist_details['release_date']}")
                    print(f"Genres: {artist_details['genres']}")
                    print(f"Wikipedia Summary: {wikipedia_summaries}")
                    print("-" * 50)

                    # Create the song info dictionary
                    song_info = {
                        "id": song_data.get("whosampled_id", "N/A"),
                        "title": artist_details.get("title", "N/A"),
                        "artist": artist_details.get("artist", "N/A"),
                        "release_date": artist_details.get("release_date", "N/A"),
                        "genres": artist_details.get("genres", "N/A"),
                        "wikipedia_summary": wikipedia_summaries,
                    }

                    # Append the song info to the output file
                    with open(output_file, 'a', encoding='utf-8') as outfile:
                        outfile.write(json.dumps(song_info) + '\n')

                except (json.JSONDecodeError, KeyError) as e:
                    print(f"Error processing line {line_number}: {e}")
                    song_info = {
                        "id": song_data.get("whosampled_id", "N/A"),
                        "title": artist_details.get("title", "N/A"),
                        "artist": artist_details.get("artist", "N/A"),
                        "release_date": artist_details.get("release_date", "N/A"),
                        "genres": artist_details.get("genres", "N/A"),
                        "wikipedia_summary": ["N/A"],
                    }

                    # Append the song info to the output file
                    with open(output_file, 'a', encoding='utf-8') as outfile:
                        outfile.write(json.dumps(song_info) + '\n')

                except Exception as e:
                    print(f"Unexpected error at line {line_number}: {e}")
                    song_info = {
                        "id": song_data.get("whosampled_id", "N/A"),
                        "title": artist_details.get("title", "N/A"),
                        "artist": artist_details.get("artist", "N/A"),
                        "release_date": artist_details.get("release_date", "N/A"),
                        "genres": artist_details.get("genres", "N/A"),
                        "wikipedia_summary": ["N/A"],
                    }

                    # Append the song info to the output file
                    with open(output_file, 'a', encoding='utf-8') as outfile:
                        outfile.write(json.dumps(song_info) + '\n')

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")       

# Base directory for files
base_dir = '/Users/chrisapton/Desktop/Spring 2025/DSCI 558/Music-KG-Project/data/processed/'

# Loop through pages 1 to 10
for page_number in range(2022, 2025):
    file_name = f'whosampled_tracks_{page_number}.jsonl'
    file_path = os.path.join(base_dir, file_name)
    print(f"Processing file: {file_name}")
    process_file(file_path, page_number)
    print("=" * 80)




Processing file: whosampled_tracks_2022.jsonl
Resuming from line 7687
Processing file: whosampled_tracks_2023.jsonl
Resuming from line 5009
Processing file: whosampled_tracks_2024.jsonl
Resuming from line 2218
https://musicbrainz.org/ws/2/recording/?query=recording%3A%22Paid%20in%20Full%22%20AND%20%28artist%3A%22Eric%20B.%20%26%20Rakim%22%29&fmt=json
{'id': 'ff856d27-f68c-4546-bb79-fc12c1e7690f', 'score': 100, 'title': 'Paid in Full', 'length': 206000, 'video': None, 'artist-credit': [{'name': 'Eric B. & Rakim', 'artist': {'id': '925228de-bbe5-4c7b-b76f-78e382ec9148', 'name': 'Eric B. & Rakim', 'sort-name': 'Eric B. & Rakim', 'aliases': [{'sort-name': 'Eric B & Rakim', 'type-id': '1937e404-b981-3cb7-8151-4c86ebfc8d8e', 'name': 'Eric B & Rakim', 'locale': None, 'type': 'Search hint', 'primary': None, 'begin-date': None, 'end-date': None}, {'sort-name': 'Eric B. and Rakim', 'type-id': '1937e404-b981-3cb7-8151-4c86ebfc8d8e', 'name': 'Eric B. and Rakim', 'locale': None, 'type': 'Search hin