### download albums

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# Scrape the yearly album list pages (1960-2023) and write one CSV per year
# to album/top100_albums_<year>.csv with columns: Album Name, Artist Name, Year.

# Browser-like headers for the HTTP requests; identical for every year, so
# they are built once outside the loop.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.82 Safari/537.36"
    )
}

# Directory that receives the per-year CSV files
os.makedirs("album", exist_ok=True)


def _parse_album_lines(lines):
    """Turn text lines of the form '<rank>. <album> - <artist>' into row dicts.

    Lines without the " - " separator are ignored. Everything up to the first
    "." on the left-hand side is dropped before the album name is taken.

    Args:
        lines: iterable of strings taken from a list <div>.

    Returns:
        A list of {"Album Name": ..., "Artist Name": ...} dicts, one per
        matching line, in input order.
    """
    parsed = []
    for line in lines:
        if " - " not in line:
            continue
        left, right = line.split(" - ", 1)
        parsed.append({
            "Album Name": left.split(".", 1)[-1].strip(),
            "Artist Name": right.strip(),
        })
    return parsed


# Loop through the years from 1960 to 2023
for year in range(1960, 2024):
    # URL to scrape
    url = f"https://digitaldreamdoor.com/pages/albums_by_year/albums_{year}.html"

    # A timeout keeps one stalled connection from hanging the whole run, and a
    # transport error for one year must not abort the remaining years.
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Request failed for {year}: {exc}")
        continue

    if response.status_code != 200:
        # Print an error message if the request fails
        print(f"Failed request, status code: {response.status_code}")
        continue

    # Parse the webpage content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    divs = soup.find_all("div", class_="list")

    # First structure: one album per newline-separated row of the div text
    albums_and_artists = []
    for div in divs:
        albums_and_artists.extend(_parse_album_lines(div.text.split('\n')))

    # Second structure (fallback if the first finds nothing): one album per
    # whitespace-stripped text node of the div
    if not albums_and_artists:
        for div in divs:
            albums_and_artists.extend(_parse_album_lines(div.stripped_strings))

    # Without this guard `df` would be undefined (first year) or would still
    # hold the previous year's rows, which would then be saved under this
    # year's filename.
    if not albums_and_artists:
        print(f"No album data found for {year}, skipping")
        continue

    df = pd.DataFrame(albums_and_artists)
    df['Year'] = year

    # Save the data to a CSV file
    df.to_csv(f"album/top100_albums_{year}.csv", index=False, encoding="utf-8")


### download the songs in each album

In [None]:
import musicbrainzngs
import pandas as pd
import time

# Register the application name, version and contact address that
# musicbrainzngs sends with its MusicBrainz API requests.
# NOTE(review): "your_email@example.com" is a placeholder — replace it with a
# real contact address before running.
musicbrainzngs.set_useragent("AlbumInfoFetcher", "1.0", "your_email@example.com")

# Function to fetch album songs and metadata
def get_album_songs(album_name, artist_name=None):
    """Look up an album on MusicBrainz and return one dict per track.

    The first search hit is taken as the album. Tracks from every medium
    (disc) of that release are returned, not only those on the first one.

    Args:
        album_name: Album title to search for.
        artist_name: Optional artist name used to narrow the search. When
            omitted the search uses the title alone, which can pick a
            same-titled release by a different artist.

    Returns:
        A list of dicts, one per track, each combining track fields
        ("Track Number", "Title", "Length (ms)", "Recording Genres") with the
        album-level fields built below. Returns an empty list when nothing is
        found or when any error occurs (the error is printed, not raised).
    """
    try:
        # Search for the album using the MusicBrainz API
        search_fields = {"release": album_name}
        if artist_name:
            search_fields["artist"] = artist_name
        result = musicbrainzngs.search_releases(limit=1, **search_fields)

        # Check if any results are found
        releases = result.get('release-list')
        if not releases:
            print(f"No results found for album: {album_name}")
            return []

        # Select the first matching album
        release_id = releases[0]['id']

        # Fetch detailed album information using the release ID
        album_data = musicbrainzngs.get_release_by_id(release_id, includes=["recordings", "tags"])
        release_info = album_data['release']

        # Extract general album metadata
        text_repr = release_info.get('text-representation', {})
        cover_art = release_info.get('cover-art-archive', {})
        album_details = {
            "Album Name": album_name,
            "Status": release_info.get('status', 'Unknown'),
            "Quality": release_info.get('quality', 'Unknown'),
            "Packaging": release_info.get('packaging', 'Unknown'),
            "Language": text_repr.get('language', 'Unknown'),
            "Script": text_repr.get('script', 'Unknown'),
            "Release Date": release_info.get('date', 'Unknown'),
            "Release Country": release_info.get('country', 'Unknown'),
            "Has Artwork": cover_art.get('artwork', 'false'),
            "Front Cover Available": cover_art.get('front', 'false'),
            "Back Cover Available": cover_art.get('back', 'false'),
        }

        # Extract track-level metadata from every medium; multi-disc releases
        # keep their tracks on separate media. "Track Number" is the position
        # reported for the track, so it can repeat across discs.
        songs = []
        for medium in release_info.get('medium-list', []):
            for track in medium.get('track-list', []):
                recording = track.get('recording', {})

                # Tags attached to the recording, if the response carries any
                recording_tags = recording.get('tag-list', [])
                recording_genres = ", ".join(tag['name'] for tag in recording_tags)

                # Combine album-level and track-level metadata
                songs.append({
                    "Track Number": track.get('position', 'Unknown'),
                    "Title": recording.get('title', 'Unknown'),
                    "Length (ms)": recording.get('length', 'Unknown'),
                    "Recording Genres": recording_genres or "Unknown",
                    **album_details,
                })

        return songs

    except Exception as e:
        # Best-effort lookup: report the problem and let the caller continue
        # with the next album instead of aborting the whole run.
        print(f"Error fetching album data: {e}")
        return []


In [None]:
import os
# For each year, look up the tracks of every album in that year's album list
# and write them to songs/top100_albums_<year>_songs.csv.
if not os.path.exists("songs"):
    os.makedirs("songs")

for year in range(1960, 2024):
    album_csv = f'album/top100_albums_{year}.csv'

    # A year has no album file if its page could not be scraped; skip it
    # rather than letting the missing file abort all later years.
    if not os.path.exists(album_csv):
        print(f"Album list not found for {year}, skipping")
        continue

    df = pd.read_csv(album_csv)
    all_songs = []

    # Blank names cannot be searched, and the lookup is keyed on the album
    # name alone, so repeating a name would only duplicate the same tracks.
    for name in df['Album Name'].dropna().unique():
        songs_data = get_album_songs(name)
        if songs_data:
            all_songs.extend(songs_data)

    # Only write a file when at least one track was found for the year
    if all_songs:
        df_all_songs = pd.DataFrame(all_songs)
        df_all_songs.to_csv(f"songs/top100_albums_{year}_songs.csv", index=False)
        print(f'finish exacting{year}')

### download the lyrics of each song 

In [None]:
import os
import re
import time
import pandas as pd
import lyricsgenius

# Make sure the output folders exist: "lyrics" for the lyric text files and
# "songs_net" for the song tables filtered down to tracks with lyrics.
for folder in ("lyrics", "songs_net"):
    if not os.path.exists(folder):
        os.makedirs(folder)

def safe_filename(filename):
    """Return *filename* with characters unsuitable for file names replaced.

    Each of ``< > : " / \\ | ? *`` is replaced by an underscore.

    Args:
        filename: Value to sanitize. Non-string values (for example a number
            or NaN read back from a CSV cell) are converted with ``str()``
            first instead of raising ``TypeError``.

    Returns:
        The sanitized string.
    """
    return re.sub(r'[<>:"/\\|?*]', '_', str(filename))

# Initialize Genius API
# SECURITY: the access token is a credential and must not be committed to
# source control, so it is read from the GENIUS_ACCESS_TOKEN environment
# variable instead of being written here.
_genius_token = os.environ.get("GENIUS_ACCESS_TOKEN")
if not _genius_token:
    raise RuntimeError(
        "Set the GENIUS_ACCESS_TOKEN environment variable to a Genius API access token"
    )
genius = lyricsgenius.Genius(_genius_token)

# For every year: join tracks with their album's artist, fetch lyrics for each
# track from Genius, store them as text files, and save the track table
# restricted to the tracks whose lyrics were stored.
for year in range(1960, 2024):  # Adjust year range as needed
    print(f"Processing year: {year}")
    try:
        # Read this year's track table and album table
        df_song = pd.read_csv(f'songs/top100_albums_{year}_songs.csv')
        df_album = pd.read_csv(f'album/top100_albums_{year}.csv')

        # Attach the album columns to each track via the shared 'Album Name'
        merged_left = df_song.merge(df_album, on='Album Name', how='left')

        # Row labels of tracks that end up without a lyrics file
        index_record = []

        for index, row in merged_left.iterrows():
            try:
                # Pause before each lookup to avoid hitting the API rate limit
                time.sleep(1)
                song = genius.search_song(row['Title'], row['Artist Name'])
                lyrics = song.lyrics if song else None

                if lyrics:
                    # File name: <album>_<artist>_<title>.txt, each part sanitized
                    name_parts = [
                        safe_filename(row[column])
                        for column in ("Album Name", "Artist Name", "Title")
                    ]
                    file_path = 'lyrics/' + "_".join(name_parts) + '.txt'
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(lyrics)
                else:
                    # Nothing usable came back: report it and mark the row
                    print(f"Lyrics not found for {row['Artist Name']} - {row['Title']}")
                    index_record.append(index)

            except Exception as e:
                # A failure on one track must not stop the rest of the year
                print(f"Error downloading lyrics for {row['Artist Name']} - {row['Title']}: {e}")
                index_record.append(index)

        # Keep only the tracks for which a lyrics file was written
        merged_left = merged_left.drop(index_record)

        # Write the filtered table for this year
        merged_left.to_csv(f'songs_net/top100_albums_{year}_songs.csv', index=False)
        print(f"Finished processing year: {year}")

    except Exception as e:
        # A failure for one year (e.g. a missing input file) is reported and
        # the loop moves on to the next year
        print(f"Error processing year {year}: {e}")
