In [None]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [None]:
def fetch_all_artists(estilo_id, increment=50, max_retries=3, delay=1):
    """
    Collect every artist URL listed for one style by paging through the index.

    The index endpoint is queried repeatedly, advancing the 'ini' offset by
    ``increment`` after each page. Paging stops when a page cannot be fetched
    (after retries), when it has no <ul id="i_main"> element, when that list
    yields no links, or when it yields only links that were already collected.

    Parameters:
    - estilo_id (str): Style code sent as the 'req_estilo' query parameter.
    - increment (int): The amount to increment 'ini' each time.
    - max_retries (int): Number of extra attempts made for a page after its
      first request fails. Values below 0 are treated as 0.
    - delay (int or float): Seconds to wait between requests, including
      between retry attempts.

    Returns:
    - List of all artist URLs (absolute), in the order they were found.
    """
    index_url = "https://acordes.lacuerda.net/ARCH/indices.php"
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; Bot/1.0; +https://yourdomain.com/bot)"
    }
    attempts_per_page = 1 + max(0, max_retries)

    all_artists = []
    seen = set()
    ini = 0

    # The context manager releases the pooled connections when paging ends.
    with requests.Session() as session:
        while True:
            params = {
                "ini": ini,
                "req_pais": "ar",
                "req_estilo": estilo_id
            }

            # Try the page up to `attempts_per_page` times before giving up.
            resp = None
            for attempt in range(1, attempts_per_page + 1):
                try:
                    candidate = session.get(
                        index_url, params=params, headers=headers, timeout=10
                    )
                    candidate.raise_for_status()
                except requests.RequestException as e:
                    if attempt < attempts_per_page:
                        print(
                            f"Request for ini={ini} failed "
                            f"(attempt {attempt}/{attempts_per_page}); retrying. Error: {e}"
                        )
                        time.sleep(delay)
                    else:
                        print(f"Failed to fetch page with ini={ini}. Error: {e}")
                else:
                    resp = candidate
                    break

            # Compare with None explicitly: a Response's truthiness reflects
            # its HTTP status, not whether a response object exists.
            if resp is None:
                break  # Stop on request failure

            soup = BeautifulSoup(resp.text, "html.parser")

            main_ul = soup.find("ul", id="i_main")
            if not main_ul:
                print(f"No artist list found on page with ini={ini}. Stopping.")
                break  # No more artists to fetch

            # Extract artist links from the current page: the first <a> of
            # each <li>, provided it carries an href.
            artist_links = []
            for li in main_ul.find_all("li"):
                a_tag = li.find("a")
                if a_tag and a_tag.has_attr("href"):
                    relative_link = a_tag["href"].strip()
                    full_url = urljoin("https://acordes.lacuerda.net", relative_link)
                    artist_links.append(full_url)

            if not artist_links:
                print(f"No artists found on page with ini={ini}. Stopping.")
                break  # No artists found; assume no more pages

            # Guard against an endless loop: if the server keeps returning a
            # page we've already seen (or 'ini' does not advance), stop.
            if all(link in seen for link in artist_links):
                print(f"No new artists on page with ini={ini}. Stopping.")
                break
            seen.update(artist_links)

            print(f"Fetched {len(artist_links)} artists from ini={ini}.")

            all_artists.extend(artist_links)

            ini += increment
            time.sleep(delay)

    return all_artists

In [None]:
def extract_urls_from_page(page_url):
    """
    Download a page and collect the links listed in its <ul id="b_main">.

    Every <a> inside that list that carries an href is resolved against
    ``page_url`` so the result contains absolute URLs only.

    Args:
        page_url (str): Address of the page to download and scan.

    Returns:
        list: Absolute URLs in document order. Empty when the request fails,
              the list element is missing, or it holds no links.
    """
    request_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/58.0.3029.110 Safari/537.3'
        )
    }

    try:
        page = requests.get(page_url, headers=request_headers, timeout=10)
        page.raise_for_status()  # 4XX / 5XX responses become exceptions
    except requests.exceptions.RequestException as err:
        print(f"Error fetching the URL: {err}")
        return []

    document = BeautifulSoup(page.text, 'html.parser')

    # The links of interest live in the <ul id="b_main"> element.
    link_list = document.find('ul', id='b_main')
    if not link_list:
        print("No <ul> with id 'b_main' found.")
        return []

    # Only anchors that actually have an href are considered.
    anchors = link_list.find_all('a', href=True)
    if not anchors:
        print("No <a> tags with href found within <ul id='b_main'>.")
        return []

    # Resolve each (possibly relative) href against the page it came from.
    return [urljoin(page_url, anchor['href'].strip()) for anchor in anchors]

In [None]:
def extract_lyrics_from_url(page_url):
    """
    Download a song page and return the text of its <div class="rLetra">.

    The text fragments found inside the element are stripped of surrounding
    whitespace and joined with the literal marker '<br>'.

    Args:
        page_url (str): Address of the song page to download.

    Returns:
        str: The joined text of the element, or an empty string when the
             request fails or the element is not present.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    )

    try:
        page = requests.get(page_url, headers={'User-Agent': user_agent}, timeout=10)
        page.raise_for_status()  # 4XX / 5XX responses become exceptions
    except requests.exceptions.RequestException as err:
        print(f"Error fetching the URL: {err}")
        return ""

    document = BeautifulSoup(page.text, 'html.parser')

    # The lyrics are expected inside the <div class="rLetra"> element.
    lyrics_div = document.find('div', class_='rLetra')
    if not lyrics_div:
        print("No <div> with class 'rLetra' found.")
        return ""

    return lyrics_div.get_text(separator='<br>', strip=True)

In [None]:
# Style codes to scrape; each one is passed to fetch_all_artists as estilo_id.
estilos_id = [
    "bal", "can", "gru",
    "pop", "rel", "rok",
    "tra", "tro", "rom",
]

In [None]:
import os

# Create the output directory up front, so a missing folder cannot make
# to_csv fail after a whole style has already been scraped.
os.makedirs("datasets", exist_ok=True)

# Fixed column order, so a style with no rows still gets a CSV header.
csv_columns = ["estilo", "artist_url", "song_url", "lyrics"]

# dict.fromkeys drops duplicate style codes while keeping the list order,
# which makes the processing order deterministic (unlike set()).
for estilo in dict.fromkeys(estilos_id):
    print(estilo)
    df_data = []
    artist_urls = fetch_all_artists(
        estilo_id=estilo,
        increment=50,
        max_retries=3,
        delay=1,
    )
    # One row per song: style -> artist page -> song page -> lyrics.
    for artist_url in artist_urls:
        songs_urls = extract_urls_from_page(artist_url)
        for song_url in songs_urls:
            lyrics = extract_lyrics_from_url(song_url)
            df_data.append({
                "estilo": estilo,
                "artist_url": artist_url,
                "song_url": song_url,
                "lyrics": lyrics,
            })
    # Each style is written to its own CSV file.
    df = pd.DataFrame(df_data, columns=csv_columns)
    df.to_csv(f"datasets/{estilo}.csv")