In [3]:
from dotenv import load_dotenv
import os
import httpx
# 1. Load variables from a local .env file into the process environment
load_dotenv()

# 2. Get the token safely
# Read it from the environment (variable name: GENIUS_CLIENT_ACCESS_TOKEN) so
# the secret never appears in the notebook source.
GENIUS_TOKEN = os.getenv("GENIUS_CLIENT_ACCESS_TOKEN")

# 3. Validation check
# Fail fast here instead of building an "Authorization: Bearer None" header below.
if not GENIUS_TOKEN:
    raise ValueError("❌ Token not found! Make sure your .env file is in the same folder and contains 'GENIUS_CLIENT_ACCESS_TOKEN=your_token_here'")

# Confirm success without echoing any part of the secret: cell outputs are
# stored inside the notebook file and travel with it when shared or committed.
print("✅ Token loaded successfully.")

# 4. Set constants
ARTIST_NAME = "Yeat"
HEADERS = {"Authorization": f"Bearer {GENIUS_TOKEN}"}
API_BASE_URL = "https://api.genius.com"

✅ Token loaded successfully: dqOwh...


In [None]:
import os
import httpx
import asyncio
import random
import polars as pl
from lxml import html
from dataclasses import dataclass
from dotenv import load_dotenv

# Re-load the environment so this cell also works when run on a fresh kernel.
load_dotenv()
GENIUS_TOKEN = os.getenv("GENIUS_CLIENT_ACCESS_TOKEN")
# Fail fast: without a token every API call below would be sent with
# "Bearer None" and the failure would only surface as an empty song list.
if not GENIUS_TOKEN:
    raise ValueError("GENIUS_CLIENT_ACCESS_TOKEN is not set; add it to your .env file before running this cell")
API_BASE_URL = "https://api.genius.com"

# Pool of browser User-Agent strings; one is picked at random per request.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
]

@dataclass
class Song:
    """One song record: its title, the URL of its page, and its numeric id."""

    title: str  # song title as returned by the API
    url: str    # page URL; fetched later to scrape the lyrics
    id: int     # song id as returned by the API

async def fetch_song_list(artist_name: str = "Yeat") -> "list[Song]":
    """Collect every song whose primary artist is ``artist_name``.

    The artist id is resolved from the first search hit whose primary artist
    name matches ``artist_name`` (case-insensitively). The artist's songs are
    then paged through 50 at a time, sorted by popularity, until the API
    reports no next page.

    Args:
        artist_name: Artist to look up. Defaults to "Yeat", so existing
            zero-argument calls keep working.

    Returns:
        A list of ``Song`` objects. The list is empty when the search request
        fails or no hit matches the artist, and partial when a page request
        returns a non-200 status. Each of these cases prints a short reason.

    Raises:
        httpx.HTTPError: network-level failures are not caught here.
    """
    headers = {"Authorization": f"Bearer {GENIUS_TOKEN}", "User-Agent": random.choice(USER_AGENTS)}
    wanted = artist_name.casefold()

    async with httpx.AsyncClient(headers=headers, timeout=30) as client:
        search = await client.get(f"{API_BASE_URL}/search", params={"q": artist_name})
        if search.status_code != 200:
            print(f"Search for {artist_name!r} failed: HTTP {search.status_code}")
            return []

        hits = search.json()['response']['hits']
        artist_id = next(
            (
                hit['result']['primary_artist']['id']
                for hit in hits
                if hit['result']['primary_artist']['name'].casefold() == wanted
            ),
            None,
        )
        if artist_id is None:
            print(f"No search hit has {artist_name!r} as primary artist")
            return []

        songs = []
        page = 1
        # The API returns next_page as a falsy value on the last page, which ends the loop.
        while page:
            # Small random pause between pages to stay polite to the API.
            await asyncio.sleep(random.uniform(0.5, 1.5))
            res = await client.get(
                f"{API_BASE_URL}/artists/{artist_id}/songs",
                params={"per_page": 50, "page": page, "sort": "popularity"}
            )
            if res.status_code != 200:
                print(f"Stopped at page {page}: HTTP {res.status_code} ({len(songs)} songs collected so far)")
                break

            data = res.json()['response']
            # The endpoint also lists songs the artist only features on; keep
            # those where they are the primary artist.
            songs.extend(
                Song(s['title'], s['url'], s['id'])
                for s in data['songs']
                if s['primary_artist']['id'] == artist_id
            )

            print(f"Page {page} done", end="\r")
            page = data.get('next_page')

        return songs

def _lyrics_container_text(container):
    """Flatten one lyrics container element to text, one newline per <br>."""
    # itertext() only yields text nodes, so <br> elements contribute nothing
    # and lines separated only by <br> would be fused together. Prefixing each
    # <br>'s tail with a newline preserves the line structure.
    for br in container.xpath('.//br'):
        br.tail = "\n" + (br.tail or "")
    return "".join(container.itertext())


async def scrape_song_safe(client, song):
    """Download one song page and extract its lyrics, best-effort.

    Args:
        client: An ``httpx.AsyncClient`` used to issue the request.
        song: Object with ``url`` and ``title`` attributes (a ``Song``).

    Returns:
        ``{"title": ..., "text": ...}`` on success, or ``None`` when the page
        returns an unexpected status, contains no lyrics container, cannot be
        parsed, or the retry budget (3 attempts) is exhausted.

    HTTP 429/403 responses and httpx network errors consume one attempt each.
    Task cancellation and KeyboardInterrupt are deliberately NOT swallowed.
    """
    retries = 3
    while retries > 0:
        try:
            # Random delay before every attempt to avoid hammering the site.
            await asyncio.sleep(random.uniform(1.0, 3.5))
            headers = {"User-Agent": random.choice(USER_AGENTS)}
            res = await client.get(song.url, headers=headers, follow_redirects=True)

            if res.status_code in (429, 403):
                print(f"Rate limit: {song.title}. Waiting 60s...")
                await asyncio.sleep(60)
                retries -= 1
                continue

            if res.status_code != 200:
                return None

            tree = html.fromstring(res.content)
            # Primary selector first; fall back to the class-based one.
            divs = (
                tree.xpath('//div[@data-lyrics-container="true"]')
                or tree.xpath('//div[contains(@class, "Lyrics__Container")]')
            )
            if not divs:
                return None

            full_text = "\n".join(_lyrics_container_text(div) for div in divs)
            return {"title": song.title, "text": full_text}

        except httpx.HTTPError as exc:
            # Transient transport problem (timeout, connection reset, ...):
            # spend one attempt and try again instead of dropping the song.
            retries -= 1
            print(f"Request error for {song.title}: {exc!r} ({retries} attempts left)")
        except Exception as exc:
            # Keep the scrape best-effort, but say why the song was skipped.
            # `Exception` (unlike a bare `except:`) lets asyncio.CancelledError
            # and KeyboardInterrupt propagate.
            print(f"Skipping {song.title}: {exc!r}")
            return None
    return None

async def download_lyrics_safe(songs, max_concurrency=3):
    """Scrape lyrics for all ``songs`` with bounded concurrency.

    Args:
        songs: Iterable of song objects accepted by ``scrape_song_safe``.
        max_concurrency: Maximum number of songs being scraped at the same
            time (default 3, matching the connection limit used so far).

    Returns:
        A list with the result dict of every song that was scraped
        successfully; failed songs (``None`` results) are dropped.

    Raises:
        ValueError: if ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    # Keep-alive connections cannot usefully exceed the total connection limit.
    limits = httpx.Limits(
        max_connections=max_concurrency,
        max_keepalive_connections=max_concurrency,
    )
    # Without this gate every task starts at once and queues on the small
    # connection pool; on long song lists most of them would exceed the 20s
    # timeout while waiting for a free connection and be dropped.
    gate = asyncio.Semaphore(max_concurrency)

    async with httpx.AsyncClient(timeout=20, limits=limits) as client:

        async def scrape_gated(song):
            async with gate:
                return await scrape_song_safe(client, song)

        results = await asyncio.gather(*(scrape_gated(s) for s in songs))
    return [r for r in results if r]

all_songs = await fetch_song_list()
if all_songs:
    raw_data = await download_lyrics_safe(all_songs)
    df = pl.DataFrame(raw_data)
    df_clean = df.with_columns(
        pl.col("text")
        .str.replace_all(r"\[.*?\]", "")
        .str.strip_chars()
    ).filter(pl.col("text").str.len_chars() > 20)

    os.makedirs("data", exist_ok=True)
    df_clean.write_csv("data/yeat_lyrics.csv")

🔍 Searching for Yeat...
❌ Search failed.
💾 Saved song list to data/yeat_songs_list.json


In [5]:
# Display the song list assigned from fetch_song_list() in the previous cell.
# NOTE(review): the saved output below shows dicts with only 'title'/'url' keys,
# while fetch_song_list() builds Song dataclass instances — the output appears
# to come from an earlier version of the code; re-run to refresh it.
all_songs

[{'title': 'Talk', 'url': 'https://genius.com/Yeat-talk-lyrics'},
 {'title': 'Monëy so big',
  'url': 'https://genius.com/Yeat-money-so-big-lyrics'},
 {'title': 'Dub', 'url': 'https://genius.com/Yeat-dub-lyrics'},
 {'title': 'Flawlëss', 'url': 'https://genius.com/Yeat-flawless-lyrics'},
 {'title': 'Rich Minion', 'url': 'https://genius.com/Yeat-rich-minion-lyrics'},
 {'title': 'Gët Busy', 'url': 'https://genius.com/Yeat-get-busy-lyrics'},
 {'title': 'Already Rich',
  'url': 'https://genius.com/Yeat-already-rich-lyrics'},
 {'title': 'Sorry Bout That',
  'url': 'https://genius.com/Yeat-sorry-bout-that-lyrics'},
 {'title': 'If We Being Rëal',
  'url': 'https://genius.com/Yeat-if-we-being-real-lyrics'},
 {'title': 'Out thë way', 'url': 'https://genius.com/Yeat-out-the-way-lyrics'},
 {'title': 'Poppin', 'url': 'https://genius.com/Yeat-poppin-lyrics'},
 {'title': 'Big tonka', 'url': 'https://genius.com/Yeat-big-tonka-lyrics'},
 {'title': 'Turban', 'url': 'https://genius.com/Yeat-turban-l