# Prototyping:
I will use this notebook to prototype and do fast iterations on the different utilities I will implement.

In [9]:
from Scripts.rst2odt import output
from resiliparse.parse.encoding import detect_encoding, bytes_to_str
from resiliparse.extract.html2text import extract_plain_text


In [28]:
def extract_text(html_bytes:bytes) -> str:
    decoded = bytes_to_str(html_bytes, detect_encoding(html_bytes), fallback_encodings=[])
    text = extract_plain_text(decoded, noscript=True)
    return text

moby_path = "../../tests/fixtures/moby.html"
with open(moby_path, "rb") as f:
        moby_bytes = f.read()
extract_text(moby_bytes)

"Herman Melville - Moby-Dick\n\nAvailing himself of the mild, summer-cool weather that now reigned in these latitudes, and in preparation for the peculiarly active pursuits shortly to be anticipated, Perth, the begrimed, blistered old blacksmith, had not removed his portable forge to the hold again, after concluding his contributory work for Ahab's leg, but still retained it on deck, fast lashed to ringbolts by the foremast; being now almost incessantly invoked by the headsmen, and harpooneers, and bowsmen to do some little job for them; altering, or repairing, or new shaping their various weapons and boat furniture.\n\nThis is a test paragraph with a link.\n\n  • Novel"

In [30]:
from fastwarc.warc import ArchiveIterator, WarcRecordType
from fastwarc.stream_io import GZipStream, FileStream
from pathlib import Path

output_file = Path("../../data/CC-MAIN-20250417135010-20250417165010-00065.txt")
Path.touch(output_file, exist_ok=True)
# 
stream = GZipStream(FileStream("../../data/CC-MAIN-20250417135010-20250417165010-00065.warc.gz", "rb"))

with open(output_file,"a", encoding="utf-8") as f:
    for record in ArchiveIterator(stream, record_types= WarcRecordType.response):
        record_bytes = record.reader.read()
        extracted_text = extract_text(record_bytes)
        f.write(extracted_text)

KeyboardInterrupt: 

### Language identification

In [48]:
import fasttext

model = fasttext.load_model("../../classifier_models/fasttext_language_ID.bin")
label, proba = model.predict("c'est tres interessant et facile a utuliser n'est ce pas")
label[0][9:], float(proba[0])

('fr', 0.9914689660072327)

### Masking PII

In [86]:
import re
text = """
Hi John, please email me at john.doe@e.com or call 
My SSN is 123-45-6789 and I live at 1234 Elm Street.
"""
EMAIL_RE = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b'
PHONE_RE = re.compile(r'(\+\d{1,3}\s?)?((\(\d{3}\)\s?)|(\d{3})(\s|-?))(\d{3}(\s|-?))(\d{4})(\s?(([Ee]xt[:\.]?)|x|X)(\s?\d+))?')
text_replaced, count = re.subn(EMAIL_RE, "|||EMAIL_ADDRESS|||", text)
text_replaced, count = re.subn(PHONE_RE, "|||PHONE_NUMBER|||", text_replaced)
text_replaced

'\nHi John, please email me at |||EMAIL_ADDRESS||| or call 98995849\nMy SSN is 123-45-6789 and I live at 1234 Elm Street.\n'

### Harmful content

In [88]:
model_nsfw = fasttext.load_model("../../classifier_models/jigsaw_fasttext_bigrams_nsfw_final.bin")

(('__label__nsfw',), array([1.00001001]))

In [95]:
model_nsfw.predict("you are a piece of shit")

(('__label__nsfw',), array([1.00001001]))

In [None]:
model_nsfw.predict("shit is a noun, it has been used recently in unpleasant ways but that does not mean we shouldn't use the word in scientific settings")

In [98]:
model_nsfw.predict("bitch is used to characterize the female dog, it has been used recently in unpleasant ways but that does not mean we shouldn't use the word in scientific settings")

(('__label__non-nsfw',), array([0.99772972]))

### Deduplication

In [26]:
# some normalization:
import unicodedata
import string
import re
text = """   English text with café and résumé. Español con acentos: niño, mañana. 
        Français: être, très, où. português: coração, não, então."""

text = text.lower()    
text = unicodedata.normalize("NFD", text)
text = "".join(char for char in text if unicodedata.category(char) != "Mn")
text = text.translate(str.maketrans("", "", string.punctuation))
text = re.sub(r"\s+", " ", text).strip()
word_list = text.split(" ")
n = 3
ngrams_set = (set
        (" ".join(word for word in word_list[i: i + n])
         for i in range(len(word_list) - n + 1)
    ))
ngrams_set

{'acentos nino manana',
 'and resume espanol',
 'cafe and resume',
 'con acentos nino',
 'coracao nao entao',
 'english text with',
 'espanol con acentos',
 'etre tres ou',
 'francais etre tres',
 'manana francais etre',
 'nino manana francais',
 'ou portugues coracao',
 'portugues coracao nao',
 'resume espanol con',
 'text with cafe',
 'tres ou portugues',
 'with cafe and'}

In [29]:
import mmh3
num_hashes = 3
signature = []
for seed in range(num_hashes):
        min_hash = min([mmh3.hash(ngram, seed) & 0xffffffff for ngram in ngrams_set]) 
        signature.append(min_hash)
signature

[103687852, 655216250, 40863664]

In [31]:
def compute_minhash_signature(ngrams_set, num_hashes: int):
    signature = []
    for seed in range(num_hashes):
        min_hash = min([mmh3.hash(ngram, seed) & 0xffffffff for ngram in ngrams_set])
        signature.append(min_hash)
    return signature

In [32]:
doc1 = {"quick brown", "brown fox", "fox jumps"}
doc2 = {"quick brown", "brown fox", "fox leaps"}

print(compute_minhash_signature(doc1, 10))
print(compute_minhash_signature(doc2, 10))

[170037781, 2235656572, 1991133704, 2563705786, 189524970, 286324933, 816939020, 1006671516, 1294049192, 2706901154]
[170037781, 814793719, 2038137965, 354557187, 2318800073, 286324933, 816939020, 1838360388, 1294049192, 877606299]


In [2]:
import fasttext
model = fasttext.load_model("../../classifier_models/quality_fasttext.ftz")

In [1]:
from data_filtering.filtering_utilities.harmful_content import classify_harmful_content
text = """
Photo of Small chapel near Onsovice
|||PHONE_NUMBER||| pictures, photos, wallpapers with free licences!
Please login in order to download photos in full size
Login or E-mail Password
If you are not registered, please register for free: www.Free-Photos.biz/register
Please note to download premium images you also need to join as a free member..
You can also save the photos without the registration - but only in small and average sizes, and some of them will have the site's watermark. Please simply click your right mouse button and save the image.
Please login in order to like photos
Login or E-mail Password
If you are not registered, please register for free:
Sorry, non-members can download up to 100 full-size photos per month.
It looks like you have used up your limit.
Free members can download an unlimited number of full-size photos - including the premium free photos.
Join as a member today for FREE! - and download the images without limitations:
You can also save the images without the membership - but only in small and average sizes, and some of them may have the site's watermark. Please simply click your right mouse button and save the image.
You are trying to download a premium photo in full size.
This photo can be downloaded in full size only by free members of Free-Photos.biz.
You can also save this image without the membership - but only in small and average sizes, and it may have the site's watermark. Please simply click your right mouse button and save the image.
All free photos → Pictures of Architecture → Photos of Churches → Photo of Small chapel near Onsovice
Download top free photographs!
This photo was viewed times and was downloaded in full size 0 times.
This photo was liked 0 times
If you are a member, please login in order to see the source link of the above image.
DescriptionSmall chapel near Onsovice.JPG
Čeština: Kaplička poblíž obce Onšovice, část obce Čkyně, okres Prachatice, Česko
English: Small chapel near Onšovice village, part of Čkyně, Prachatice District, Czech Republic
49° 06′ 42.51″ N, 13° 46′ 45.99″ E View this and other nearby images on: OpenStreetMap - Google Earth 49.|||PHONE_NUMBER|||; 13.|||PHONE_NUMBER|||
Date 9 May 2011, 14:40:37
I, the copyright holder of this work, hereby publish it under the following licenses:
Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.2 or any later version published by the Free Software Foundation; with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. A copy of the license is included in the section entitled GNU Free Documentation License.https://www.gnu.org/copyleft/fdl.htmlGFDLGNU Free Documentation Licensetruetrue
This file is licensed under the Creative Commons Attribution-Share Alike 3.0 Unported, 2.5 Generic, 2.0 Generic and 1.0 Generic license.
to share – to copy, distribute and transmit the work
to remix – to adapt the work
Under the following conditions:
attribution – You must attribute the work in the manner specified by the author or licensor (but not in any way that suggests that they endorse you or your use of the work).
share alike – If you alter, transform, or build upon this work, you may distribute the resulting work only under the same or similar license to this one.
https://creativecommons.org/licenses/by-sa/3.0 CC BY-SA 3.0 Creative Commons Attribution-Share Alike 3.0 truetrue
You may select the license of your choice.
This image has been assessed using the Quality image guidelines and is considered a Quality image.
Kaplička poblíž obce Onšovice, část obce Čkyně, okres Prachatice, Česko Small chapel near Onšovice village, part of Čkyně, Prachatice District, Czech Republic Camera location 49° 06′ 42.51″ N, 13° 46′ 45.99″ E
GNU Free Documentation License
Only registered users can post comments. Please login.
Image resolution in width direction
Image resolution in height direction
Unit of X and Y resolution
Date and time original image was generated
Date and time image was made digital data
Meaning of each component
Focal plane X resolution
Focal plane Y resolution
Focal plane resolution unit
Search ALL words Search ANY words
The images at Free-Photos.biz come mainly from Wikimedia Commons or from our own production. The photos are either in the public domain, or licensed under free linceses: Free-Photos.biz license, GPL, Creative Commons or Free-Art license. Some very few other photos where uploaded to Free-Photos.biz by our users and released into the public domain or into free usage under another free license (like GPL etc.)
While the copyright and licensing information supplied for each photo is believed to be accurate, Free-Photos.biz does not provide any warranty regarding the copyright status or correctness of licensing terms. If you decide to reuse the images from Free-Photos.biz, you should verify the copyright status of each image just as you would when obtaining images from other sources.
The use of depictions of living or deceased persons may be restricted in some jurisdictions by laws regarding personality rights. Such images are exhibited at Free-Photos.biz as works of art that serve higher artistic interests.
By registering your account and/or by subscribing to new and newly rated photographs you agree we may send you the links to photos and we may occasionally share other information with you.
We do NOT disclose your personal data.
© Copyright Free-Photos.biz, |||PHONE_NUMBER|||, Vyshenskoho st., Lviv 79010, Ukraine, e-mail: SerhiyLvivsky at gmail dot com .
"""

In [3]:
classify_harmful_content(text, model)

('good', 0.9986441731452942)

In [22]:
import re
from data_filtering.filtering_utilities.blacklist import BLACKLIST

def filter_lines(text: str,
                 min_chars:int,
                 ) -> str:
    kept_lines = []

    BLACKLIST_RE = re.compile(
        r'\b(?:' + '|'.join(map(re.escape, BLACKLIST)) + r')\b',
        re.IGNORECASE | re.UNICODE
    )

    for line in text.splitlines():
        print(line.split())
        if len(line.split()) < 3:
            continue
        if BLACKLIST_RE.search(line):
            continue
        kept_lines.append(line)

    return "\n".join(kept_lines)

In [23]:
filter_lines(text, 35)

[]
['Royal', 'Doulton', 'Slater’s', 'Patent', 'White', '&', 'Gold', 'Floral', 'Blue', 'Beige', 'Stoneware', 'Ewer', 'Nice', 'antique', 'British', 'blue,', 'green,', 'and']
['brown', 'stoneware', 'jug', 'with', 'gold', 'bands.', 'Has', 'a', 'beige', 'Doulton', 'Slater’s', 'Patent', 'design', 'with', 'blue', 'and', 'white', 'enamel', 'and', 'gold']
['flowers.', 'Marked', 'on', 'the', 'bottom', 'Royal', 'Doulton', 'England.', 'This', 'mark', 'was', 'used', 'between', '1902', 'and', '1922.', 'Does', 'not', 'have', 'the']
['Slater’s', 'Patent', 'mark.', 'Has', 'some', 'wear', 'to', 'the', 'gilding,', 'but', 'otherwise', 'good', 'condition.', 'Measures', 'approximately', '4', '5/8']
['inches', 'wide', 'x', '11', '1/2', 'inches', 'tall.', 'This', 'is', 'an', 'antique', 'and', 'may', 'have', 'small', 'in', 'manufacture', 'defects', 'that', 'do', 'not', 'affect']
['the', 'display', 'of', 'the', 'piece.']
['One', 'moment,', 'please...']
['Loader']
['Please', 'wait', 'while', 'your', 'request', '

'Royal Doulton Slater’s Patent White & Gold Floral Blue Beige Stoneware Ewer Nice antique British blue, green, and \nbrown stoneware jug with gold bands. Has a beige Doulton Slater’s Patent design with blue and white enamel and gold \nflowers. Marked on the bottom Royal Doulton England. This mark was used between 1902 and 1922. Does not have the \nSlater’s Patent mark. Has some wear to the gilding, but otherwise good condition. Measures approximately 4 5/8 \ninches wide x 11 1/2 inches tall. This is an antique and may have small in manufacture defects that do not affect \nthe display of the piece.\nOne moment, please...\nPlease wait while your request is being verified...'