# YEAT GPT

## Scrape Genius for Yeat Lyrics

Genius is really strict and kept rate limiting me. It was for only a few hours at first and then it was for over a day. This was my solution to avoid that. This script took over a few hours to run.

I decided to just save the raw HTML of each webpage. It autosaves and pickles every 5 songs.

In [None]:
import nest_asyncio
nest_asyncio.apply()

import os
import httpx
import asyncio
import random
import pickle
from dataclasses import dataclass
from dotenv import load_dotenv
from tqdm.auto import tqdm

In [None]:
load_dotenv()
GENIUS_TOKEN = os.getenv("GENIUS_CLIENT_ACCESS_TOKEN")

ARTIST_NAME = "Yeat"
API_BASE_URL = "https://api.genius.com"
PROGRESS_FILE = "yeat_raw_html_progress.pkl"

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
]

@dataclass
class Song:
    title: str
    url: str
    id: int

In [None]:

# we add basic saving and resume logic here because it will take hours to scrape all songs safely
scraped_data = []
scraped_ids = set()

if os.path.exists(PROGRESS_FILE):
    with open(PROGRESS_FILE, "rb") as f:
        scraped_data = pickle.load(f)
        scraped_ids = {entry['song'].id for entry in scraped_data}
    print(f"Resuming! Found {len(scraped_data)} songs already downloaded.")

async def fetch_song_list():
    headers = {"Authorization": f"Bearer {GENIUS_TOKEN}", "User-Agent": random.choice(USER_AGENTS)}
    async with httpx.AsyncClient(headers=headers, timeout=30) as client:
        # get artist id
        print(f"Finding Artist ID for {ARTIST_NAME}...")
        search = await client.get(f"{API_BASE_URL}/search", params={"q": ARTIST_NAME})
        if search.status_code != 200: return []
        
        hits = search.json()['response']['hits']
        artist_id = next((h['result']['primary_artist']['id'] for h in hits if h['result']['primary_artist']['name'].lower() == ARTIST_NAME.lower()), None)
        
        if not artist_id: return []
        print(f"Found Artist ID: {artist_id}")

        # get songs
        songs = []
        page = 1
        with tqdm(desc="Fetching Metadata", unit="page") as pbar:
            while page:
                await asyncio.sleep(0.5)
                res = await client.get(f"{API_BASE_URL}/artists/{artist_id}/songs", params={"per_page": 50, "page": page, "sort": "popularity"})
                if res.status_code != 200: break
                
                data = res.json()['response']
                for s in data['songs']:
                    if s['primary_artist']['id'] == artist_id:
                        songs.append(Song(s['title'], s['url'], s['id']))
                
                pbar.update(1)
                page = data['next_page']
        return songs

async def download_safely(songs):
    songs_to_download = [s for s in songs if s.id not in scraped_ids]
    
    if not songs_to_download:
        print("All songs are already downloaded!")
        return

    print(f"Starting safe download for {len(songs_to_download)} new songs.")
    
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        for song in tqdm(songs_to_download, desc="Downloading Lyrics", unit="song"):
            
            retries = 3
            while retries > 0:
                try:
                    # 5 to 10 sec pause :/
                    await asyncio.sleep(random.uniform(5.0, 10.0))

                    headers = {"User-Agent": random.choice(USER_AGENTS)}
                    res = await client.get(song.url, headers=headers)

                    # handle blocks
                    if res.status_code in [403, 429, 1015]:
                        tqdm.write(f"RATE LIMIT HIT. Pausing 2 mins... ({song.title})")
                        await asyncio.sleep(120) 
                        retries -= 1
                        continue 
                    
                    if res.status_code == 200:
                        entry = {"song": song, "html": res.text}
                        scraped_data.append(entry)
                        
                        # autosave every 5 songs
                        if len(scraped_data) % 5 == 0:
                            with open(PROGRESS_FILE, "wb") as f:
                                pickle.dump(scraped_data, f)
                        break 
                    
                except Exception as e:
                    tqdm.write(f"Error: {e}")
                    await asyncio.sleep(5)
                
                retries -= 1
    
    with open(PROGRESS_FILE, "wb") as f:
        pickle.dump(scraped_data, f)
    print(f"\nSaved {len(scraped_data)} songs to {PROGRESS_FILE}")

async def main():
    all_songs = await fetch_song_list()
    if all_songs:
        await download_safely(all_songs)

await main()

  from .autonotebook import tqdm as notebook_tqdm


Resuming! Found 1280 songs already downloaded.
Finding Artist ID for Yeat...
Found Artist ID: 1476681


Fetching Metadata: 87page [01:24,  1.03page/s]

All songs are already downloaded!





## Data Cleaning

Now we use XPath selectors to find the lyrics and then clean to only include Yeat's raw lyrics.

Genius doesn't do a great job designated featured verses, producer tags, adlibs, etc. so the data isn't going to be that clean. That is okay for a project like this :)

In [None]:
import pickle
import re
from lxml import html

with open("yeat_raw_html_progress.pkl", "rb") as f:
    scraped_data = pickle.load(f)

clean_songs = []

def process_lyrics_with_styles(lyric_containers):
    final_lyrics = []
    
    for container in lyric_containers:
        current_style_needed = None 
        
        for element in container.xpath("./* | ./text()"):
            text_val = element if isinstance(element, str) else element.text_content()
            
            header_match = re.search(r'\[(.*?):(.*?)]', text_val)
            if header_match:
                header_content = header_match.group(0).lower()
                if "yeat" in header_content:
                    if "italic" in header_content:
                        current_style_needed = "i"
                    elif "bold" in header_content:
                        current_style_needed = "b"
                    else:
                        current_style_needed = None 
                else:
                    current_style_needed = "SKIP"
                continue

            if current_style_needed == "SKIP":
                continue

            if isinstance(element, str):
                if current_style_needed is None:
                    final_lyrics.append(element)
            else:
                tag = element.tag
                if current_style_needed is None:
                    final_lyrics.append(element.text_content())
                elif tag == current_style_needed:
                    final_lyrics.append(element.text_content())
                if tag == "br":
                    final_lyrics.append("\n")

    return "".join(final_lyrics)

for entry in scraped_data:
    raw_html = entry['html']
    tree = html.fromstring(raw_html)
    lyric_containers = tree.xpath('//div[@data-lyrics-container="true"]')
    
    if lyric_containers:
        yeat_only_text = process_lyrics_with_styles(lyric_containers)
        
        # kill everything from "Read More" onwards (metadata/descriptions)
        # This catches the "The snippet of this track... Read More" blocks
        yeat_only_text = re.sub(r'.*?Read More.*', '', yeat_only_text, flags=re.DOTALL)
        
        # kill the "Embed" text and trailing digits that Genius adds at the bottom
        yeat_only_text = re.sub(r'\d*Embed$', '', yeat_only_text.strip())
        yeat_only_text = re.sub(r'\d+Embed', '', yeat_only_text)

        # this is from features that are missed or producer tags. we do not want this in the training data!!!!
        n_word_pattern = r'\bn[i|e]gg[a|e][rh]?s?\b'
        yeat_only_text = re.sub(n_word_pattern, "[SCRUBBED]", yeat_only_text, flags=re.IGNORECASE)
        
        # remove brackets [Chorus], [Verse], etc. 
        yeat_only_text = re.sub(r'\[.*?\]', '', yeat_only_text)
        
        # remove parentheses but keep content
        yeat_only_text = yeat_only_text.replace("(", "").replace(")", "")

        # fix cases where the break tag was misplaced and there isn't a space
        yeat_only_text = re.sub(r'([a-z0-9\!\?\'\"]) ([A-Z])', r'\1 \2', yeat_only_text)
        
        lines = []
        for line in yeat_only_text.splitlines():
            clean_line = line.strip()

            # filter out Genius UI clutter
            noise_words = ["You might also like", "Contributors", "Lyrics", "Songs Like", "Translations"]
            if clean_line and not any(x in clean_line for x in noise_words):
                lines.append(clean_line)
        
        if lines:
            clean_songs.append("\n".join(lines))


if clean_songs:
    output_file = "yeat_lyrics_clean.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        content = "\n\n<|endoftext|>\n\n".join(clean_songs)
        f.write(content)
    print(f"Success! Saved {len(clean_songs)} songs.")

Success! Saved 970 songs.


In [None]:
import torch
print(f"CUDA status: {torch.cuda.is_available()}")
print(f"Active GPU: {torch.cuda.get_device_name(0)}")

CUDA status: True
Active GPU: NVIDIA GeForce RTX 3070


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# setup
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

dataset = load_dataset('text', data_files='yeat_lyrics_clean.txt')
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="yeat-gpt-v1",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,   
    gradient_accumulation_steps=2,    
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,                       
    dataloader_num_workers=0,         
    
    logging_strategy="steps",
    logging_steps=100,
    disable_tqdm=False,
    report_to="none",
    
    save_steps=500,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
)

print("Training starting... (Logs will only appear every 100 steps)")
trainer.train()

model.save_pretrained("yeat-gpt-final")
tokenizer.save_pretrained("yeat-gpt-final")
print("Model saved to 'yeat-gpt-final'")

  from .autonotebook import tqdm as notebook_tqdm


Training starting... (Logs will only appear every 100 steps)


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,3.8873
200,3.5302
300,3.3822
400,3.3311
500,3.3028
600,3.1943
700,3.1578
800,3.1592
900,3.1346
1000,3.0455


Model saved to 'yeat-gpt-final'


In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_path = "yeat-gpt-final"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)

def generate_yeat_verse(prompt="I just pulled up", max_new_tokens=100):

    inputs = tokenizer(prompt, return_tensors='pt', padding=True).to(device)

    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=100,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id
    )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(generated_text)



  from .autonotebook import tqdm as notebook_tqdm


In [48]:
generate_yeat_verse("Can't decide between")

Can't decide between the X and the Percs, bitch I'm a drug addict Yeah, fuck it, yeah, what? What happened?, where you been at for so long? Where you been in town? Go hit up that mall Oh, woo-yeah, oh, uh-uh, let's go get this shit Huh, huh, woah, wowUh, no wayI don’t got time to waitin' on nothing No wayI just came back from the dead Let's all com


In [83]:
generate_yeat_verse("Call up")

Call up Eliantte, I got diamonds on my ring Yeah, yeah, yes, diamond necklace Woo, woo, woo, fuck it, let's go Fuck it, what? Huh, no, woah, bitch, you ain't heard me call out that shit Ah-aah, ah, buh-buh-boh-bop Phew, phe—, boo-phew, pseudorеs Oh, oh-yeah, they been talkin


In [282]:
generate_yeat_verse("I just pulled up in a")

I just pulled up in a Tonka, I'ma blow it 'til the sun come back on Turn around then turn that bitch to an elephant Woah, woah, wooh-ooh-oop Yeah-yeah-Yeah-yeah, yeah-yeah, let's go Woo-oow, woo-oosh Buh-brah, buh-bitch, brazy, bang, oh, ah Ah-aah, hah-hae-hae-haee, hrr
