In [41]:
from prompt_tuning_moliere.dataset.crawler import Crawler
from os import path as osp
from itertools import chain
from typing import List

import pandas as pd

# Fetch data from a bag of HTML fragments
The file `bag_of_html.txt` contains uncleaned HTML fragments with embedded URLs pointing to the different chapters of Molière's works. Load the fragments and extract each URL.

In [11]:
# Resolve the fragment dump to an absolute path (relative to the current working
# directory) and load it as a list of lines; each line keeps its trailing newline.
bag_of_html_path = osp.abspath("../../bag_of_html.txt")
with open(bag_of_html_path) as f:
    html_fragments = list(f)

In [17]:
# Extract the chapter URL embedded in each HTML fragment.
#
# A fragment is used only when it contains the literal marker `url: "` and a
# `.html` suffix somewhere after it. The URL is the text between the last marker
# and the first `.html` that follows it, suffix included. Fragments missing
# either piece are skipped explicitly instead of being turned into a bogus URL
# (the string operations involved never raise, so no try/except is needed).
URL_MARKER = 'url: "'
URL_SUFFIX = ".html"

urls = []
for fragment in html_fragments:
    if URL_MARKER not in fragment:
        continue  # no quoted `url: "..."` entry on this line
    after_marker = fragment.rpartition(URL_MARKER)[2]
    if URL_SUFFIX not in after_marker:
        continue  # marker present, but it does not point to an .html page
    urls.append(after_marker.partition(URL_SUFFIX)[0] + URL_SUFFIX)
print(f"{len(urls)} URLs retrieved")

435 URLs retrieved


In [18]:
# Build a crawler over the extracted chapter URLs.
# NOTE(review): presumably max_depth=0 means "fetch only the listed pages, do not
# follow links" — confirm against the Crawler implementation.
crawler = Crawler(urls=urls, max_depth=0)
# Run the crawl; the results are later read from `crawler.crawled_docs`.
crawler.run()

2024-02-26 21:56:54,102 INFO:Crawling: URL(url='https://www.texteslibres.fr/l-avare/acte-premier-scene-i-540.html', depth=0)
2024-02-26 21:56:54,323 INFO:Crawling: URL(url='https://www.texteslibres.fr/l-avare/acte-premier-scene-ii-541.html', depth=0)
2024-02-26 21:56:54,682 INFO:Crawling: URL(url='https://www.texteslibres.fr/l-avare/acte-premier-scene-iii-542.html', depth=0)
2024-02-26 21:56:54,914 INFO:Crawling: URL(url='https://www.texteslibres.fr/l-avare/acte-i-scene-iii-543.html', depth=0)
2024-02-26 21:56:55,899 INFO:Crawling: URL(url='https://www.texteslibres.fr/l-avare/acte-i-scene-iv-544.html', depth=0)
2024-02-26 21:56:56,302 INFO:Crawling: URL(url='https://www.texteslibres.fr/l-avare/acte-i-scene-v-545.html', depth=0)
2024-02-26 21:56:56,479 INFO:Crawling: URL(url='https://www.texteslibres.fr/l-avare/acte-i-scene-6-546.html', depth=0)
2024-02-26 21:56:56,885 INFO:Crawling: URL(url='https://www.texteslibres.fr/l-avare/acte-i-scene-7-547.html', depth=0)
2024-02-26 21:56:57,140 

# Now parse each HTML page to extract the dialogue lines

In [23]:
def maybe_extract_speaker_from_line(line: str):
    """Return the speaker name announced on ``line``, or ``None`` if there is none.

    A speaker is announced by a ``<span class="personnage">`` opening tag. The
    name is the text between the last such tag on the line and the next ``<``
    character (or the end of the line when no ``<`` follows). The result may be
    an empty string when the tag is immediately followed by ``<``.
    """
    opening_tag = '<span class="personnage">'
    if opening_tag not in line:
        return None
    after_tag = line.rpartition(opening_tag)[2]
    return after_tag.partition("<")[0]

In [48]:
def extract_dialogue_from_html_chapter(html_chapter: str) -> List[dict]:
    """Pair each speaker found in a chapter with the line that follows it.

    The chapter is scanned line by line. A line on which
    ``maybe_extract_speaker_from_line`` finds a (non-empty) speaker arms that
    speaker; the very next non-speaker line is then recorded as their text,
    truncated at the first ``<br />``. Only that single line is kept per
    speaker: the speaker is cleared after any non-speaker line, including a
    line containing ``didascalie``, which is skipped without being recorded.

    Args:
        html_chapter: Raw HTML of one chapter, with ``\\n`` line separators.

    Returns:
        A list of ``{"speaker": ..., "text": ...}`` dicts in document order.
    """
    entries = []
    pending_speaker = None
    for raw_line in html_chapter.split("\n"):
        announced = maybe_extract_speaker_from_line(raw_line)
        if announced:
            pending_speaker = announced
            continue

        # Any non-speaker line consumes the pending speaker, recorded or not.
        current_speaker, pending_speaker = pending_speaker, None
        if not current_speaker or "didascalie" in raw_line:
            continue

        spoken_text = raw_line.partition("<br />")[0]
        entries.append({"speaker": current_speaker, "text": spoken_text})
    return entries
        

In [49]:
# Parse every crawled chapter and flatten the per-chapter results into a single
# table with one row per extracted {"speaker", "text"} entry.
dialogues = pd.DataFrame(
    list(
        chain.from_iterable(
            extract_dialogue_from_html_chapter(chapter_html)
            for chapter_html in crawler.crawled_docs
        )
    )
)

In [50]:
# Report how many rows were extracted, then show the frame as the cell's output.
print(f"Finally got {len(dialogues)} lines of text")
dialogues

Finally got 9460 lines of text


Unnamed: 0,speaker,text
0,Valère,"Hé quoi ! charmante Élise, vous devenez mélanc..."
1,Élise,"Non, Valère, je ne puis pas me repentir de tou..."
2,Valère,"Eh ! que pouvez-vous craindre, Élise, dans les..."
3,Élise,Hélas ! cent choses à la fois l'emportement d'...
4,Valère,"Ah ! ne me faites pas ce tort, de juger de moi..."
...,...,...
9455,Une Égyptienne,"La gloire,"
9456,Un Égyptien,"Les grandeurs,"
9457,Une Égyptienne,"Les sceptres qui font tant d'envie,"
9458,Un Égyptien,"Tout n'est rien, si l'amour n'y mêle ses ardeurs."


In [51]:
# Persist the extracted dialogue as CSV; the path is relative to the current
# working directory.
# NOTE(review): with pandas' default `index=True` the row index is written as an
# unnamed first column — pass `index=False` if downstream readers should not
# get that extra column.
dialogues.to_csv("../../dialogues_moliere.csv")