# Scraping for rap and poetry

**Author** : Elsa Bidant

**Date** : April 2025

## 1. Library import

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin

## 2. Rap corpus import

In [None]:
df = pd.read_csv("corpus.csv")
pd.set_option('display.max_colwidth', 50)
display(df)

Unnamed: 0,artist,title,year,lyrics,pageviews,contributors,url,topic,topic_clean,ranking,...,birthdate_artist,age_artist,pageview_mean,pageviews_2,n_sexe,hate,sexism,n_lines,sentiment2,born_in_france
0,Dee Nasty,Paname City Rappin,1984.0,"Paname city rappin', c'est le Paname city rap...",0,9,https://genius.com/Dee-nasty-paname-city-rappi...,bt-7,Rap coquin,3466.0,...,,,0.000000,,1,0.217241,0.108526,87,-0.208669,
1,IAM,IAM concept,1989.0,"En force, Asiatic attaque en rimes Déclarant ...",0,7,https://genius.com/Iam-iam-concept-lyrics,bt-6,Egotrip/Méta-rap,1860.0,...,1966.0,23.0,0.000000,,0,0.290877,0.106583,81,-0.023471,0.857143
2,IAM,Total Kheops,1989.0,"Quand ce sera la fin d'la musique, tu tireras...",0,4,https://genius.com/Iam-total-kheops-lyrics,bt-6,Egotrip/Méta-rap,192.0,...,1966.0,23.0,0.000000,,0,0.427435,0.331550,85,-0.363965,0.857143
3,IAM,Elle est à moi,1989.0,Aton almighty la rentrée sur le beat Marquant...,0,6,https://genius.com/Iam-elle-est-a-moi-lyrics,bt-7,Rap coquin,2494.0,...,1966.0,23.0,0.000000,,1,0.230399,0.135270,76,-0.079048,0.857143
4,IAM,Il n’y a pas d’autre alternative,1989.0,"""Ain't no other alternative"" Ils traquent, as...",0,2,https://genius.com/Iam-il-ny-a-pas-dautre-alte...,bt-0,Rap conscient,1204.0,...,1966.0,23.0,0.000000,,1,0.294673,0.145438,89,-0.113286,0.857143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37302,BraSco (FRA),Un autre jour,,Juste un autre jour... Yeh yeh yeh yeh yeh La ...,0,1,https://genius.com/Brasco-fra-un-autre-jour-ly...,bt-1,Chansons d'amour,3133.0,...,,,0.800895,0.0,1,0.204019,0.073574,54,-0.056830,
37303,Black M,Parce que c’est dar,,Parce que c'est dar Parce que maintenant même...,0,5,https://genius.com/Black-m-parce-que-cest-dar-...,bt-0,Rap conscient,2627.0,...,1984.0,,0.800895,0.0,1,0.223225,0.181337,28,-0.412702,1.000000
37304,Black M,Sans concession solo,,Rien de bon akhi à l'envers ça donne un bon à...,0,2,https://genius.com/Black-m-sans-concession-sol...,bt-0,Rap conscient,,...,1984.0,,0.800895,0.0,0,0.250734,0.267365,50,-0.571320,1.000000
37305,Black M,Red Bull Studios Challenge #11 (Lolita),,(Ritchy Boy & Lynda] Toi tu es ma Lolita (Loli...,0,1,https://genius.com/Black-m-red-bull-studios-ch...,bt-1,Chansons d'amour,3408.0,...,1984.0,,0.800895,0.0,0,0.173207,0.373303,41,-0.408719,1.000000


## 3. Poetry corpus scraping

To create a dataset on poetry texts I chose to scrape the poetica website (https://www.poetica.fr/) because it contains a wide variety of poets from different styles and periods, without concentrating solely on the poets of the canon.

In [None]:
BASE_URL = "https://www.poetica.fr"

def get_author_links():
    # Scrape all author links in the aside#nav_menu-3 section

    url = BASE_URL  # URL of the homepage or the page containing the authors section
    res = requests.get(url)
    if res.status_code != 200:
        print(f"Error retrieving the main page: {res.status_code}")
        return []

    soup = BeautifulSoup(res.text, "html.parser")
    nav_menu = soup.select_one("aside#nav_menu-3")  # Select the authors section

    if not nav_menu:
        print("The authors section was not found.")
        return []

    links = nav_menu.select("a")  # Select all links in this section
    print(f"Number of authors found: {len(links)}")
    return [a['href'] for a in links if 'href' in a.attrs]

def get_full_poem_text(poem_url):
    # Retrieve the full text of the poem by following the 'continuer...' link if necessary

    res = requests.get(poem_url)
    if res.status_code != 200:
        print(f"Error retrieving the poem {poem_url}: {res.status_code}")
        return ""

    soup = BeautifulSoup(res.text, "html.parser")
    poem_div = soup.select_one(".entry-content")
    if not poem_div:
        return ""

    # Remove navigation tags and other irrelevant elements
    for tag in poem_div.select(".sharedaddy, .code-block, .wp-caption-text"):
        tag.decompose()

    return poem_div.get_text(separator = "\n").strip()

def extract_date_from_text(poem_text):
    # Extract the date from the poem text

    # Use a regular expression to find the date at the end of the text
    date_pattern = re.compile(r"(\d{4})")
    match = date_pattern.search(poem_text)
    if match:
        return match.group(1)
    return None

def get_poem_data_from_articles(author_url, max_poems_per_author=None):
    # Retrieve poem data directly from the articles listed on the author's page

    res = requests.get(author_url)
    if res.status_code != 200:
        print(f"Error retrieving the author {author_url}: {res.status_code}")
        return []

    soup = BeautifulSoup(res.text, "html.parser")
    articles = soup.select("article.post")  # Select articles containing poems
    print(f"Number of articles found for the author {author_url}: {len(articles)}")

    results = []
    for i, article in enumerate(articles):
        if max_poems_per_author is not None and i >= max_poems_per_author:
            break

        title_tag = article.select_one(".entry-title a")
        title = title_tag.text.strip() if title_tag else "Unknown Title"

        author_tag = article.select_one(".cat-links a[href*='/categories/']")
        author = author_tag.text.strip() if author_tag else "Unknown"

        poem_div = article.select_one(".entry-summary")
        poem_text = poem_div.get_text(separator="\n").strip() if poem_div else ""

        # Check if there is a "continuer..." link
        continue_link = article.select_one("a:contains('continuer...')")
        if continue_link:
            poem_url = urljoin(BASE_URL, continue_link['href'])
            poem_text = get_full_poem_text(poem_url)
        else:
            poem_url = urljoin(BASE_URL, title_tag['href']) if title_tag else ""

        # Extract the date from the poem text
        date = extract_date_from_text(poem_text)

        print(f"Data extracted for the poem {title} by {author}")
        results.append({
            "title": title,
            "author": author,
            "date": date,
            "text": poem_text,
            "url": poem_url
        })

    return results

def scrape_all_poems(max_poems_per_author = None):
    # Scrape poems from all authors found in the aside#nav_menu-3 section
    author_links = get_author_links()
    results = []
    for i, author_url in enumerate(author_links):
        try:
            print(f"[{i+1}/{len(author_links)}] Author: {author_url}")
            poem_data = get_poem_data_from_articles(author_url, max_poems_per_author)
            results.extend(poem_data)
            time.sleep(1)  # Respect the server
        except Exception as e:
            print(f"Error author {author_url}: {e}")
    return results


data = scrape_all_poems(max_poems_per_author = None) 

# Export CSV
df = pd.DataFrame(data)
df.to_csv("corpus_poetica.csv", index = False, encoding = "utf-8") 
print("Export completed: corpus_poetica.csv")


Number of authors found: 120
[1/120] Author: #
Error author #: Invalid URL '#': No scheme supplied. Perhaps you meant https://#?
[2/120] Author: https://www.poetica.fr/categories/louise-ackermann/
Number of articles found for the author https://www.poetica.fr/categories/louise-ackermann/: 48
Data extracted for the poem À Alfred de Musset by Louise Ackermann
Data extracted for the poem À une artiste by Louise Ackermann
Data extracted for the poem Adieu à la poésie by Louise Ackermann
Data extracted for the poem Aux femmes by Louise Ackermann
Data extracted for the poem Bel astre voyageur by Louise Ackermann
Data extracted for the poem Daphné by Louise Ackermann
Data extracted for the poem De la Lumière ! by Louise Ackermann
Data extracted for the poem Deux vers d’Alcée by Louise Ackermann
Data extracted for the poem Élan mystique by Louise Ackermann
Data extracted for the poem Endymion by Louise Ackermann
Data extracted for the poem Hébé by Louise Ackermann
Data extracted for the poem I

In [3]:
df2 = pd.read_csv("corpus_poetica.csv", encoding = 'utf-8')
df2['text'] = df2['text'].apply(lambda x: x.replace("\n", " ") if isinstance(x, str) else x) # remove \n in the text
df2.to_csv("corpus_poetica_cleaned.csv", index = False, encoding = "utf-8") 
pd.set_option('display.max_colwidth', 200)
display(df2)

Unnamed: 0,title,author,date,text,url
0,À Alfred de Musset,Louise Ackermann,1871.0,"Un poète est parti ; sur sa tombe fermée Pas un chant, pas un mot dans cette langue aimée Dont la douceur divine ici-bas l’enivrait. Seul, un pauvre arbre triste à la pâle verdure, Le saule qu...",https://www.poetica.fr/poeme-726/louise-ackermann-a-alfred-de-musset/
1,À une artiste,Louise Ackermann,1871.0,"Puisque les plus heureux ont des douleurs sans nombre, Puisque le sol est froid, puisque les cieux sont lourds, Puisque l’homme ici-bas promène son cœur sombre Parmi les vains regrets et les co...",https://www.poetica.fr/poeme-3961/louise-ackermann-a-une-artiste/
2,Adieu à la poésie,Louise Ackermann,1835.0,"Mes pleurs sont à moi, nul au monde Ne les a comptés ni reçus, Pas un oeil étranger qui sonde Les désespoirs que j’ai conçus L’être qui souffre est un mystère Parmi ses frères ici-bas ; Il ...",https://www.poetica.fr/poeme-453/louise-ackermann-adieu-a-la-poesie/
3,Aux femmes,Louise Ackermann,1835.0,"S’il arrivait un jour, en quelque lieu sur terre, Qu’une entre vous vraiment comprît sa tâche austère, Si, dans le sentier rude avançant lentement, Cette âme s’arrêtait à quelque dévouement, S...",https://www.poetica.fr/poeme-455/louise-ackermann-aux-femmes/
4,Bel astre voyageur,Louise Ackermann,1861.0,"À La Comète de 1861 Bel astre voyageur, hôte qui nous arrives Des profondeurs du ciel et qu’on n’attendait pas, Où vas-tu ? Quel dessein pousse vers nous tes pas ? Toi qui vogues au large en ...",https://www.poetica.fr/poeme-524/louise-ackermann-bel-astre-voyageur/
...,...,...,...,...,...
3160,Marioupol,Kamal Zerdoumi,2022.0,Ville fantôme tas de cendres proie des détrousseurs de cadavres Des bateaux chargés de matière première quittent ton port à destination de tes exterminateurs La résistance dans les entraille...,https://www.poetica.fr/poeme-7439/kamal-zerdoumi-marioupol/
3161,Martyrs,Kamal Zerdoumi,2022.0,"Sebastian Abbo, Liberté pour l’Ukraine , 2022. Gravure édition limitée disponible dans notre Galerie d’Art Vous faites vos adieux à l’humanité en demandant que l’on se souvienne de vous ...",https://www.poetica.fr/poeme-7405/kamal-zerdoumi-martyrs/
3162,Maternelle,Kamal Zerdoumi,2020.0,Cette main de la nuit lumineuse caresse un cerveau ignorant dans son tabernacle Dehors l’impatience solaire de ceux qui savent assiège le tête-à-tête sacré de la mère et de l’enfant Vain...,https://www.poetica.fr/poeme-6681/kamal-zerdoumi-maternelle/
3163,Matin,Kamal Zerdoumi,,Sur le rebord de ma fenêtre l’oeil aux aguets vient boire l’âme au coeur d’oiseau Son chant en quête de lui-même se glisse dans mon oreille y verse la candeur du réveil Je suis à Casabla...,https://www.poetica.fr/poeme-4415/kamal-zerdoumi-matin/
