# Prototyping:
This notebook is a scratchpad for prototyping and quickly iterating on the utilities I will implement: HTML text extraction, language identification, and PII masking.

In [9]:
from Scripts.rst2odt import output
from resiliparse.parse.encoding import detect_encoding, bytes_to_str
from resiliparse.extract.html2text import extract_plain_text


In [28]:
def extract_text(html_bytes: bytes) -> str:
    """Decode a raw HTML payload and return its plain-text content.

    Args:
        html_bytes: The undecoded bytes of an HTML document.

    Returns:
        The result of ``extract_plain_text`` applied to the decoded markup.
    """
    # Guess the encoding from the bytes themselves; no fallback encodings
    # are supplied to the decoder.
    guessed_encoding = detect_encoding(html_bytes)
    html = bytes_to_str(html_bytes, guessed_encoding, fallback_encodings=[])
    return extract_plain_text(html, noscript=True)

# Smoke-test the extractor on the Moby-Dick HTML fixture.
moby_path = "../../tests/fixtures/moby.html"

# Read the fixture as raw bytes: extract_text does its own decoding.
with open(moby_path, mode="rb") as fixture:
    moby_bytes = fixture.read()

# Last expression of the cell, so the extracted text is displayed.
extract_text(moby_bytes)

"Herman Melville - Moby-Dick\n\nAvailing himself of the mild, summer-cool weather that now reigned in these latitudes, and in preparation for the peculiarly active pursuits shortly to be anticipated, Perth, the begrimed, blistered old blacksmith, had not removed his portable forge to the hold again, after concluding his contributory work for Ahab's leg, but still retained it on deck, fast lashed to ringbolts by the foremast; being now almost incessantly invoked by the headsmen, and harpooneers, and bowsmen to do some little job for them; altering, or repairing, or new shaping their various weapons and boat furniture.\n\nThis is a test paragraph with a link.\n\n  • Novel"

In [30]:
from fastwarc.warc import ArchiveIterator, WarcRecordType
from fastwarc.stream_io import GZipStream, FileStream
from pathlib import Path

# Extract plain text from every HTTP response record of one Common Crawl
# WARC file and write it to a sibling .txt file.
output_file = Path("../../data/CC-MAIN-20250417135010-20250417165010-00065.txt")

stream = GZipStream(FileStream("../../data/CC-MAIN-20250417135010-20250417165010-00065.warc.gz", "rb"))
try:
    # Mode "w" (not "a"): the iterator always restarts at the first record,
    # so appending after an interrupted or repeated run would only duplicate
    # text that is already in the file. open() also creates the file, so no
    # separate touch() is needed.
    with open(output_file, "w", encoding="utf-8") as f:
        for record in ArchiveIterator(stream, record_types=WarcRecordType.response):
            record_bytes = record.reader.read()
            f.write(extract_text(record_bytes))
finally:
    # Release the underlying file handle even when the (long-running) loop
    # is aborted, e.g. by a KeyboardInterrupt from the notebook.
    stream.close()

KeyboardInterrupt: 

### Language identification

In [48]:
import fasttext

# Number of characters sliced off the front of the predicted label; this is
# the length of the "__label__" prefix (9), which leaves the bare code.
LABEL_PREFIX_LEN = len("__label__")

model = fasttext.load_model("../../classifier_models/fasttext_language_ID.bin")

# predict() returns parallel sequences of labels and probabilities; only the
# first (top) entry of each is used below.
label, proba = model.predict("c'est tres interessant et facile a utuliser n'est ce pas")

# (language code, confidence as a plain Python float)
(label[0][LABEL_PREFIX_LEN:], float(proba[0]))

('fr', 0.9914689660072327)

### Masking PII

In [86]:
import re
# Sample input for the masking demo.
text = """
Hi John, please email me at john.doe@e.com or call 
My SSN is 123-45-6789 and I live at 1234 Elm Street.
"""

# Both patterns are compiled once so they are used the same way below.
EMAIL_RE = re.compile(r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')

# US-style phone numbers. Compared with the first draft of this pattern:
#   * a separator (space, dot or dash) is accepted after "(ddd)", so
#     "(283)-182-3829" is matched;
#   * "." is accepted as a separator ("283.182.3829");
#   * digit look-arounds stop the pattern from masking the first ten digits
#     of a longer digit run (IDs, card numbers, ...).
PHONE_RE = re.compile(
    r"""
    (?<!\d)                              # not in the middle of a number
    (?:\+\d{1,3}[\s.-]?)?                # optional country code
    (?:\(\d{3}\)[\s.-]?|\d{3}[\s.-]?)    # area code, with or without parens
    \d{3}[\s.-]?                         # exchange
    \d{4}                                # line number
    (?:\s?(?:[Ee]xt[:.]?|x|X)\s?\d+)?    # optional extension
    (?!\d)                               # not followed by more digits
    """,
    re.VERBOSE,
)

# Keep one counter per PII type instead of overwriting a single `count`.
text_replaced, email_count = EMAIL_RE.subn("|||EMAIL_ADDRESS|||", text)
text_replaced, phone_count = PHONE_RE.subn("|||PHONE_NUMBER|||", text_replaced)
count = phone_count  # same final value the original single `count` held
text_replaced

'\nHi John, please email me at |||EMAIL_ADDRESS||| or call 98995849\nMy SSN is 123-45-6789 and I live at 1234 Elm Street.\n'