# Create a function that receives one URL and scrapes 100 other URLs connected to it via href links

In [2]:
# Open the S3 connection once at notebook start. The crawl functions defined
# further down persist scraped text through this module-level client
# (they call S3.save_html).
from utils import create_s3_connection
S3 = create_s3_connection()

2023-04-25 10:54:47 [info     ] Success connecting S3          endpoint=localhost:9010
2023-04-25 10:54:47 [info     ] Bucket at S3 already exists    bucket_name=dev-nlp-chatbot


In [3]:
pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 KB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6
[0mNote: you may need to restart the kernel to use updated packages.


In [28]:
import random
import re
import string
from pprint import pprint
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup
from unidecode import unidecode

def sanitize(text):
    """Normalise raw page text into a lowercase, space-separated string of words.

    Steps, in order:
      1. Replace every run of non-word characters with a single space.
      2. Transliterate the text to ASCII with ``unidecode``.
      3. Lower-case the result.
      4. Keep only purely alphabetic tokens that are longer than one character.

    Args:
        text: Raw text to clean (e.g. the text extracted from an HTML page).

    Returns:
        The surviving tokens joined by single spaces; ``''`` when none survive.
    """
    # Remove any non-word characters from text
    text = re.sub(r'\W+', ' ', text)

    # Normalize: transliterate accented / non-Latin characters to ASCII.
    text = unidecode(text)

    # Lower-case AFTER transliteration so the output is uniformly lowercase.
    # NOTE(review): unidecode appears to emit capitalised romanisations for
    # some scripts (e.g. CJK) -- confirm; lowering here is harmless either way.
    text = text.lower()

    # Remove all single characters by filtering tokens instead of using a regex:
    # the previous pattern r'\s+[a-zA-Z]\s+' consumed the whitespace around each
    # match, so adjacent single letters ("a b c") and single letters at the very
    # start or end of the text were left behind.
    return ' '.join(
        word for word in text.split() if len(word) > 1 and word.isalpha()
    )

def is_link_to_file(link: str):
    """Return True when the URL's path ends with a known file extension.

    Only the path component of the URL is inspected, case-insensitively, so a
    host name or query string that merely contains an extension-like substring
    (``www.substack.com``, ``www.jsfiddle.net``, ``api.docker.com``) is not
    mistaken for a file link.

    Args:
        link: Absolute or relative URL.

    Returns:
        True if the path ends with one of the listed extensions, else False.
        URLs that cannot be parsed are reported as False.
    """
    # NOTE(review): ".html"/".htm" are kept from the original list, which means
    # ordinary pages whose path ends in .html are skipped by the crawler --
    # confirm this is intended.
    extensions = (
        ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx", ".txt",
        ".html", ".htm", ".css", ".js", ".xml", ".json", ".csv", ".tsv",
        ".zip", ".tar", ".gz", ".rar", ".7z", ".bz2", ".apk", ".exe",
        ".bmp", ".jpg", ".jpeg", ".png", ".gif", ".tif", ".tiff", ".svg",
        ".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".mp4", ".avi",
        ".mkv", ".mov", ".wmv", ".flv", ".webm", ".srt", ".sub", ".ass",
        ".epub", ".mobi", ".azw", ".azw3", ".djvu", ".fb2", ".ibook",
    )
    try:
        path = urlsplit(link).path
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. a broken IPv6 host).
        return False
    # str.endswith accepts a tuple and checks every suffix in one call.
    return path.lower().endswith(extensions)
    
def filter_link(link: str):
    """Decide whether a link is worth crawling.

    A link is rejected (``None`` is returned) when it is empty, does not start
    with ``http://`` or ``https://``, contains the name of one of the blocked
    sites anywhere in the URL, or is reported as a file link by
    ``is_link_to_file``. Otherwise the link is returned unchanged.
    """
    blocked_sites = ('facebook', 'twitter', 'youtube', 'instagram', 'linkedin', 'github')

    # Empty/missing hrefs and non-web schemes (relative paths, mailto:, ...).
    if not link or not link.startswith(('http://', 'https://')):
        return None

    # Substring match against the whole URL, not only the host name.
    for site in blocked_sites:
        if site in link:
            return None

    return None if is_link_to_file(link) else link


def get_external_links(url, timeout=10):
    """Download a page and return its sanitized text plus its outgoing links.

    Args:
        url: URL of the page to fetch. It is first passed through
            ``filter_link``; rejected URLs are never requested.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            non-responding server cannot stall the crawl indefinitely.

    Returns:
        A ``(text, links)`` tuple. ``text`` is the page text extracted with
        ``soup.get_text`` and cleaned by ``sanitize``; ``links`` is the set of
        ``href`` values of the page's ``<a>`` tags that ``filter_link`` accepts.
        ``('', set())`` is returned when the URL is filtered out or when the
        request fails (connection error, timeout, or HTTP error status).
    """
    url = filter_link(url)
    if not url:
        return '', set()

    # Send a GET request to the URL. A failing page is reported and skipped
    # rather than aborting the whole crawl; raise_for_status keeps HTTP error
    # pages (4xx/5xx) from being scraped as if they were content.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f'Could not fetch {url}: {exc}')
        return '', set()

    # Parse the HTML content of the response with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect the href of every <a> tag that passes filter_link
    # (absolute http/https links that are not blocked sites or files).
    external_links = set()
    for anchor in soup.find_all('a'):
        filtered_link = filter_link(link=anchor.get('href'))
        if filtered_link:
            external_links.add(filtered_link)

    all_text = sanitize(soup.get_text(strip=True, separator=' '))
    return all_text, external_links


def crawl(url: str, n_sites: int = 10, depth: int = 2):
    """Scrape ``url``, store its text in S3, then crawl the pages it links to.

    Args:
        url: Seed page to start from.
        n_sites: Maximum number of links from the seed page to follow
            (forwarded to ``crawl_children``). Defaults to the previously
            hard-coded value of 10.
        depth: Number of link levels to follow below the seed page
            (forwarded to ``crawl_children``). Defaults to the previously
            hard-coded value of 2.

    Returns:
        None. The page text is written through ``S3.save_html`` under
        ``html_content/<url without scheme>.txt``.
    """
    # First website to crawl
    all_text, links = get_external_links(url)
    destination_fpath = f"html_content/{url.replace('https://', '').replace('http://', '').strip('/')}.txt"
    S3.save_html(content=all_text, fpath=destination_fpath)

    # get_external_links already returns a set, so the links are unique.
    unique_links = list(links)
    if not unique_links:
        print('Did not find links to crawl')
        return

    crawl_children(urls=unique_links, n_sites=n_sites, depth=depth)
    
def crawl_children(urls: list[str], n_sites: int, depth: int, visited=None):
    """Recursively crawl a random sample of ``urls`` and store each page in S3.

    Args:
        urls: Candidate links to crawl at this level.
        n_sites: Maximum number of links crawled at this level; when more
            candidates are available a random sample of this size is taken.
            Recursive calls always use a fan-out of 5.
        depth: Remaining link levels to follow. Nothing is crawled when it is
            zero or negative.
        visited: Optional set of URLs that were already crawled. It is updated
            in place and shared with the recursive calls so that no page is
            fetched (and overwritten in S3) twice. A fresh set is created when
            omitted.

    Returns:
        None. Each page's text is written through ``S3.save_html``.
    """
    # Crawl N children websites until depth == 0 ("<=" also stops a negative
    # depth from recursing without bound).
    if depth <= 0:
        return

    if visited is None:
        visited = set()

    candidates = [url for url in urls if url not in visited]

    # random.sample raises ValueError when asked for more items than exist (or
    # for a negative count), so only sample when there is a surplus; otherwise,
    # as before, every candidate is crawled.
    if 0 <= n_sites < len(candidates):
        candidates = random.sample(candidates, n_sites)

    for idx, url in enumerate(candidates):
        # A deeper recursion started from an earlier sibling may already have
        # crawled this URL.
        if url in visited:
            continue
        visited.add(url)

        print('\nIdx: ', idx)
        all_text, links = get_external_links(url)
        destination_fpath = f"html_content/{url.replace('https://', '').replace('http://', '').strip('/')}.txt"
        S3.save_html(content=all_text, fpath=destination_fpath)
        print('\n')
        unique_links = list(links)
        pprint(unique_links)
        print(f'Save content at: {destination_fpath}')
        crawl_children(urls=unique_links, n_sites=5, depth=depth-1, visited=visited)

In [61]:
# Seed page for the crawl. The commented-out assignments are alternative seed
# pages; the uncommented assignment below is the one actually crawled.
#url = 'https://realpython.com/build-a-chatbot-python-chatterbot/'
#url = 'https://www.receiteria.com.br/receitas-de-comidas-faceis-de-fazer/'
url = 'https://pt.wikipedia.org/wiki/Python'
crawl(url)


Idx:  0


['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://gl.wikipedia.org/wiki/Python',
 'https://stats.wikimedia.org/#/commons.wikimedia.org',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://petscan.wmflabs.org/?language=commons&categories=Python_(programming_language)&project=wikimedia&ns%5B6%5D=1',
 'https://en.wikiquote.org/wiki/en:Python',
 'https://id.loc.gov/authorities/sh96008834',
 'https://it.wikipedia.org/wiki/Python',
 'https



['https://tg.wikipedia.org/wiki/Python',
 'https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%BE%E0%A4%87%E0%A4%A5%E0%A4%A8',
 'https://jbo.wikipedia.org/wiki/paiton',
 'https://zh.wikipedia.org/wiki/Python',
 'https://gl.wikipedia.org/wiki/Python',
 'https://lmo.wikipedia.org/wiki/Python',
 'https://ko.wikipedia.org/wiki/%ED%8C%8C%EC%9D%B4%EC%8D%AC',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://nl.wik



['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%BE%E0%A4%87%E0%A4%A5%E0%A4%A8',
 'https://jbo.wikipedia.org/wiki/paiton',
 'https://zh.wikipedia.org/wiki/Python',
 'https://gl.wikipedia.org/wiki/Python',
 'https://lmo.wikipedia.org/wiki/Python',
 'https://ko.wikipedia.org/wiki/%ED%8C%8C%EC%9D%B4%EC%8D%AC',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://nl.wikipedia.org/wiki/Python_(programmeertaal)'



['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'http://www.amk.ca/python/writing/gvr-interview',
 'https://gl.wikipedia.org/wiki/Python',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://it.wikipedia.org/wiki/Python',
 'https://nqo.wikipedia.org/wiki/%DF%94%DF%8A%DF%8C%DF%95%DF%90%DF%B2%DF%AC',
 'https://lt.wikipedia.org/wiki/Python',
 'https://zh-classical.wikipedia.org/wiki/%E7%9A%AE%E5%90%8C',
 'https://www.mediawiki.org/',
 'https://hr.wikipe



['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://www.python.org/downloads/release/python-3812/',
 'https://gl.wikipedia.org/wiki/Python',
 'https://an.wikipedia.org/wiki/Python',
 'https://www.python.org/downloads/release/python-3120a6/',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://it.wikibooks.org/wiki/Python',
 'https://www.python.org/news/',
 'https://www.python.org/dev/peps/pep-0569/',
 'https://it.wikipedia.org/wiki/Python',
 'https://golden.com/wiki/Python_%28programming_language%29-MNA48',
 'https://nqo.wikipedia.org/wiki/%DF



['https://tg.wikipedia.org/wiki/Python',
 'https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%BE%E0%A4%87%E0%A4%A5%E0%A4%A8',
 'https://jbo.wikipedia.org/wiki/paiton',
 'https://zh.wikipedia.org/wiki/Python',
 'https://gl.wikipedia.org/wiki/Python',
 'http://www.amk.ca/python/writing/gvr-interview',
 'https://lmo.wikipedia.org/wiki/Python',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://ko.wikipedia.org/



['https://tg.wikipedia.org/wiki/Python',
 'https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%BE%E0%A4%87%E0%A4%A5%E0%A4%A8',
 'https://stats.wikimedia.org/#/my.wikipedia.org',
 'https://jbo.wikipedia.org/wiki/paiton',
 'https://zh.wikipedia.org/wiki/Python',
 'https://gl.wikipedia.org/wiki/Python',
 'http://www.amk.ca/python/writing/gvr-interview',
 'https://lmo.wikipedia.org/wiki/Python',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wiki



['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://gl.wikipedia.org/wiki/Python',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'http://www.jython.org',
 'https://it.wikipedia.org/wiki/Python',
 'https://nqo.wikipedia.org/wiki/%DF%94%DF%8A%DF%8C%DF%95%DF%90%DF%B2%DF%AC',
 'https://lt.wikipedia.org/wiki/Python',
 'https://zh-classical.wikipedia.org/wiki/%E7%9A%AE%E5%90%8C',
 'https://www.mediawiki.org/',
 'https://hr.wikipedia.org/wiki/Python_(prog



['https://tg.wikipedia.org/wiki/Python',
 'https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%BE%E0%A4%87%E0%A4%A5%E0%A4%A8',
 'https://jbo.wikipedia.org/wiki/paiton',
 'https://zh.wikipedia.org/wiki/Python',
 'https://gl.wikipedia.org/wiki/Python',
 'https://lmo.wikipedia.org/wiki/Python',
 'https://ko.wikipedia.org/wiki/%ED%8C%8C%EC%9D%B4%EC%8D%AC',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://nl.wik



['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'http://aleph.nkp.cz/F/?func=find-c&local_base=aut&ccl_term=ica=ph170668&CON_LNG=ENG',
 'https://gl.wikipedia.org/wiki/Python',
 'https://www.wikidata.org/wiki/Special:EntityPage/Q4306983',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://web.archive.org/web/20190511065650/http://insidetech.monster.com/training/articles/8114-15-ways-python-is-a-powerful-force-on-the-web',
 'https://www.wikidata.org/wiki/Special:EntityPage/Q28923017',
 'https://www.h2desk.com



['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'http://aleph.nkp.cz/F/?func=find-c&local_base=aut&ccl_term=ica=ph170668&CON_LNG=ENG',
 'https://gl.wikipedia.org/wiki/Python',
 'https://stackoverflow.com/questions/10104805/python-2to3-windows-cmd',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://www.pcsteps.gr/224468-%ce%bc%ce%b1%ce%b8%ce%ae%ce%bc%ce%b1%cf%84%ce%b1-%cf%80%cf%81%ce%bf%ce%b3%cf%81%ce%b1%ce%bc%ce%bc%ce%b1%cf%84%ce%b9%cf%83%ce%bc%ce%



[]
Save content at: html_content/stats.wikimedia.org/#/my.wikipedia.org.txt

Idx:  2


['https://slashdotmedia.com/terms-of-use/',
 'https://disqus.com/?ref_noscript',
 'https://slashdotmedia.com/privacy-statement/',
 'https://www.linuxjournal.com/article/6011#disqus_thread']
Save content at: html_content/www.linuxjournal.com/article/6011.txt

Idx:  0


['https://sourceforge.net/',
 'https://library.slashdotmedia.com/',
 'https://slashdotmedia.com/advertising-and-marketing-services/native-advertising/',
 'https://slashdotmedia.com/audience/',
 'https://slashdotmedia.com/advertising-and-marketing-services/demand-generation/',
 'https://slashdotmedia.com/',
 'https://slashdotmedia.com/advertising-and-marketing-services/custom-content/',
 'https://slashdotmedia.com/contact/',
 'https://slashdot.org/',
 'https://slashdotmedia.com/passport/',
 'https://slashdotmedia.com/advertising-and-marketing-services/email-marketing/',
 'https://slashdotmedia.com/advertising-and-marketing-services/dis



['https://developer.wikimedia.org',
 'https://wikimedia.org',
 'https://meta.wikimedia.org/wiki/Special:MyLanguage/Wikimedia_chapters',
 'https://meta.wikimedia.org/wiki/Special:MyLanguage/Data_retention_guidelines',
 'https://wikimediafoundation.org/w/index.php?title=Donor_policy/en&oldid=94164',
 'https://meta.wikimedia.org/wiki/Special:MyLanguage/Wikimedia_movement_affiliates',
 'https://www.mediawiki.org/',
 'https://creativecommons.org/licenses/by-sa/3.0/',
 'https://foundation.wikimedia.org/w/index.php?title=Donor_privacy_policy/en&oldid=122990',
 'https://meta.wikimedia.org/wiki/Special:MyLanguage/Fundraising_principles',
 'https://wikimediafoundation.org/news/',
 'https://wikimediafoundation.org/',
 'https://meta.wikimedia.org/wiki/Special:MyLanguage/Fundraising/Reports',
 'https://wikimediafoundation.org/w/index.php?title=Donor_policy&oldid=62924',
 'https://donate.wikimedia.org/wiki/',
 'https://meta.wikimedia.org/wiki/Special:MyLanguage/Answers',
 'https://stats.wikimedia.



['https://developer.wikimedia.org',
 'https://foundation.wikimedia.org/wiki/Special:MyLanguage/Donor_privacy_policy',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://foundation.wikimedia.org/wiki/Special:MyLanguage/Legal:Data_retention_guidelines',
 'https://foundation.wikimedia.org/wiki/Talk:Legal:Data_retention_guidelines',
 'https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy/FAQ',
 'https://www.wikidata.org/wiki/Q82069695',
 'https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement',
 'https://foundation.wikimedia.org/wiki/Privacy_policy',
 'https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Donor_privacy_policy',
 'https://www.wikidata.org/wiki/Q81068910',
 'https://www.mediawiki.org/',
 'https://creativecommons.org/licenses/by-sa/3.0/',
 'https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Access_to_nonpublic_personal_data_policy/Underage_exemptions',
 'https://www.mediawiki.org/wik



['https://sk.wikipedia.org/wiki/Hlavn%C3%A1_str%C3%A1nka',
 'https://din.wikipedia.org/wiki/Apam_k%C3%ABd%C3%AFt',
 'https://br.wikipedia.org/wiki/Degemer',
 'https://mdf.wikipedia.org/wiki/%D0%9F%D1%80%D1%8F_%D0%BB%D0%BE%D0%BF%D0%B0',
 'https://crh.wikipedia.org/wiki/Ba%C5%9F_Saife',
 'https://gcr.wikipedia.org/wiki/Paj_Prensipal',
 'https://ur.wikipedia.org/wiki/%D8%B5%D9%81%D8%AD%DB%82_%D8%A7%D9%88%D9%84',
 'https://kw.wikipedia.org/wiki/Folen_dre',
 'https://es.wikipedia.org/wiki/Wikipedia:Portada',
 'https://hu.wikipedia.org/wiki/Kezd%C5%91lap',
 'https://sd.wikipedia.org/wiki/%D9%85%D9%8F%DA%A9_%D8%B5%D9%81%D8%AD%D9%88',
 'https://udm.wikipedia.org/wiki/%D0%9A%D1%83%D1%82%D1%81%D0%BA%D0%BE%D0%BD_%D0%B1%D0%B0%D0%BC',
 'https://myv.wikipedia.org/wiki/%D0%9F%D1%80%D1%8F%D0%B2%D1%82%D0%BB%D0%BE%D0%BF%D0%B0',
 'https://gag.wikipedia.org/wiki/Ba%C5%9F_yaprak',
 'https://pi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%AE%E0%A5%81%E0%A4%96_%E0%A4%AA%E0%A4%A4%E0%A5%8D%E0%A4%A4_Pamukha_patta',
 'h



['https://mg.wikipedia.org/wiki/',
 'https://ast.wikiquote.org/wiki/Portada',
 'https://tg.wikipedia.org/wiki/',
 'https://ms.wikipedia.org/wiki/',
 'https://species.wikimedia.org/wiki/P%C3%A1xina_principal',
 'https://vi.wikipedia.org/wiki/',
 'https://hy.wikipedia.org/wiki/',
 'https://hr.wikipedia.org/wiki/',
 'https://www.wikidata.org/',
 'https://he.wikipedia.org/wiki/',
 'https://eu.wikipedia.org/wiki/',
 'https://sv.wikipedia.org/wiki/',
 'https://pt.wikipedia.org/wiki/',
 'https://uk.wikipedia.org/wiki/',
 'https://www.wikinews.org/',
 'https://ast.wikipedia.org/w/index.php?title=Portada&oldid=3920607',
 'https://it.wikipedia.org/wiki/',
 'https://www.mediawiki.org/',
 'https://ml.wikipedia.org/wiki/',
 'https://ast.wikiquote.org/wiki/',
 'https://lt.wikipedia.org/wiki/',
 'https://wikisource.org/wiki/Main_Page/Asturianu',
 'https://ext.wikipedia.org/wiki/',
 'https://ur.wikipedia.org/wiki/',
 'https://ast.wiktionary.org/wiki/',
 'https://no.wikipedia.org/wiki/',
 'https://pm



['https://be.wikipedia.org/wiki/%D0%9D%D0%B5%D0%BA%D0%B0%D0%BC%D0%B5%D1%80%D1%86%D1%8B%D0%B9%D0%BD%D0%B0%D1%8F_%D0%B0%D1%80%D0%B3%D0%B0%D0%BD%D1%96%D0%B7%D0%B0%D1%86%D1%8B%D1%8F',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://eo.wikipedia.org/wiki/Neprofitcela_organiza%C4%B5o',
 'https://ta.wikipedia.org/wiki/%E0%AE%87%E0%AE%B2%E0%AE%BE%E0%AE%AA_%E0%AE%A8%E0%AF%8B%E0%AE%95%E0%AF%8D%E0%AE%95%E0%AE%B1%E0%AF%8D%E0%AE%B1_%E0%AE%85%E0%AE%AE%E0%AF%88%E0%AE%AA%E0%AF%8D%E0%AE%AA%E0%AF%81',
 'https://books.google.com/books?id=5OFFNw0a1dkC',
 'https://hbr.org/2013/03/nonprofits-need-to-compete-for',
 'https://wikimediafoundation.org/wiki/Wikimedia_Foundation_bylaws#ARTICLE_III_-_MEMBERSHIP',
 'https://nl.wikipedia.org/wiki/Non-profit',
 'https://tl.wikipedia.org/wiki/Samahang_hindi_pangkalakalan',
 'https://www.mediawiki.org/',
 'https://aleph.nkp.cz/F/?func=find-c&local_base=aut&ccl_term=ica=ph123286&CON_LNG=ENG',
 'https://inh.wikipedia.org/wiki/%D0%9A%D0%BE%D0%BC%D0%B



['https://pt.wikipedia.org/w/index.php?title=Especial:Registro&type=protect&page=Predefini%C3%A7%C3%A3o:Linguagens_de_programa%C3%A7%C3%A3o',
 'https://developer.wikimedia.org',
 'https://meta.wikimedia.org/wiki/Privacy_policy/pt-br',
 'https://www.wikidata.org/wiki/Special:EntityPage/Q7480464',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://www.mediawiki.org/',
 'https://stats.wikimedia.org/#/pt.wikipedia.org',
 'https://pt.wikipedia.org/wiki/Predefinição:Linguagens_de_programação',
 'https://pt.wikipedia.org/w/index.php?title=Especial:Entrar&type=signup',
 'https://wikimediafoundation.org/',
Save content at: html_content/pt.wikipedia.org/w/index.php?title=Predefini%C3%A7%C3%A3o:Linguagens_de_programa%C3%A7%C3%A3o&action=edit.txt

Idx:  0


['https://developer.wikimedia.org',
 'https://wikimediafoundation.org/',
 'https://meta.wikimedia.org/wiki/Privacy_policy/pt-br',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://www.mediawiki.org/',
 'ht



['https://hsb.wikipedia.org/wiki/P%C5%99ed%C5%82oha:Nawigaciska_lajsta_programowanskich_r%C4%9B%C4%8Dow',
 'https://developer.wikimedia.org',
 'https://zh.wikipedia.org/wiki/Template:%E7%A8%8B%E5%BA%8F%E8%AE%BE%E8%AE%A1%E8%AF%AD%E8%A8%80',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://fi.wikipedia.org/wiki/Malline:Ohjelmointikielet',
 'https://id.wikipedia.org/wiki/Templat:Bahasa_pemrograman',
 'https://ko.wikipedia.org/wiki/%ED%8B%80:%ED%94%84%EB%A1%9C%EA%B7%B8%EB%9E%98%EB%B0%8D_%EC%96%B8%EC%96%B4',
 'https://si.wikipedia.org/wiki/%E0%B7%83%E0%B7%90%E0%B6%9A%E0%B7%92%E0%B6%BD%E0%B7%8A%E0%B6%BD:Programming_languages',
 'https://th.wikipedia.org/wiki/%E0%B9%81%E0%B8%A1%E0%B9%88%E0%B9%81%E0%B8%9A%E0%B8%9A:%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1',
 'https://vi.wikipedia.org/wiki/B%E1%BA%A3n_m%E1%BA%ABu:Ng%C3%B4n_ng%E1%BB%AF_l%E1%BA%ADp_tr%C3%ACnh',
 'https://stats.wikimedia.org/#/www.wikidata.org',
 'http



['https://survey.stackoverflow.co/2022',
 'https://stackoverflow.co/company/work-here',
 'https://stackoverflow.com/legal/privacy-policy',
 'https://stackoverflow.co/',
 'https://stackoverflow.co/company/contact',
 'https://stackoverflow.co/advertising',
 'https://stackoverflow.blog/',
 'https://stackoverflow.com/legal/cookie-policy',
 'https://stackoverflow.com/questions',
 'https://stackoverflow.co/collectives',
 'https://stackoverflow.com/legal',
 'https://stackoverflow.com/legal/terms-of-service',
 'https://stackoverflow.blog/podcast',
 'https://stackoverflow.co/company/press',
 'https://resources.stackoverflow.co/advertising',
 'https://stackoverflow.co/talent',
 'https://stackoverflow.co/teams',
 'https://stackoverflow.blog/newsletter']
Save content at: html_content/www.stackoverflowbusiness.com/advertise.txt

Idx:  2


['https://stackoverflowteams.com/teams/create/free?utm_source=so-owned&utm_medium=side-bar&utm_campaign=campaign-38&utm_content=cta',
 'https://stackoverflow.bl



['https://stackoverflowteams.com/teams/create/free?utm_source=so-owned&utm_medium=side-bar&utm_campaign=campaign-38&utm_content=cta',
 'https://superuser.com/questions/1780665/what-prevents-unsolicited-internet-packets-from-using-up-my-monthly-data-limit',
 'https://physics.stackexchange.com/questions/761135/why-is-non-abelian-gauge-theory-unique-in-4-dimensional-spacetime',
 'https://stackoverflow.co/company/work-here',
 'https://stackoverflow.com/legal/privacy-policy',
 'https://physics.stackexchange.com/questions/761139/how-does-special-relativity-work-at-lower-velocities',
 'https://stackoverflow.co/advertising',
 'https://meta.stackoverflow.com/questions/423798/content-discovery-initiative-april-13-update-related-questions-using-a-machine',
 'https://try.stackoverflow.co/why-teams/?utm_source=so-owned&utm_medium=side-bar&utm_campaign=campaign-38&utm_content=cta',
 'https://scifi.stackexchange.com/questions/275048/short-story-about-reincarnation-and-inheriting-debt-from-your-prev



['https://web.archive.org/web/20070501105422/http://www.python.org/~guido/',
 'https://web.archive.org/web/20070501105422/mailto:comments@amk.ca',
 'https://web.archive.org/web/20070501105422/http://www.cwi.nl/',
 'https://web.archive.org/web/20070501105422/http://www.m3.org/',
 'https://web.archive.org/web/20070501105422/http://www.swig.org/',
 'https://web.archive.org/web/20070501105422/http://www.cnri.reston.va.us/']
Save content at: html_content/web.archive.org/web/20070501105422/www.amk.ca/python/writing/gvr-interview.txt

Idx:  1


['https://www.mediawiki.org/wiki/Template:Main_page/sr',
 'https://en.wikipedia.org/wiki/Main_Page',
 'https://www.mediawiki.org/wiki/Template:Main_page/gu',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://www.mediawiki.org/wiki/Template:Main_page/yue',
 'https://www.mediawiki.org/wiki/Template:Main_page/ru',
 'https://www.mediawiki.org/w/index.php?title=MediaWiki&oldid=3878227',
 'https://foundation.wikimedia.org/wiki/Privacy_po



['https://tg.wikipedia.org/wiki/Python',
 'https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%BE%E0%A4%87%E0%A4%A5%E0%A4%A8',
 'https://jbo.wikipedia.org/wiki/paiton',
 'https://zh.wikipedia.org/wiki/Python',
 'https://gl.wikipedia.org/wiki/Python',
 'https://lmo.wikipedia.org/wiki/Python',
 'https://ko.wikipedia.org/wiki/%ED%8C%8C%EC%9D%B4%EC%8D%AC',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://nl.wik



['https://tg.wikipedia.org/wiki/Python',
 'https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%BE%E0%A4%87%E0%A4%A5%E0%A4%A8',
 'https://jbo.wikipedia.org/wiki/paiton',
 'https://zh.wikipedia.org/wiki/Python',
 'https://gl.wikipedia.org/wiki/Python',
 'https://lmo.wikipedia.org/wiki/Python',
 'https://ko.wikipedia.org/wiki/%ED%8C%8C%EC%9D%B4%EC%8D%AC',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://nl.wikipedia.org/wiki/Python_(programmeertaal)',
 'https://ca.wikipedia.org/wiki/Python',
 'https://it.wikipedia.org/wiki/Python',
 'https://nqo.wikipedia.org/wiki/%DF%94%



['https://tg.wikipedia.org/wiki/Python',
 'https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%BE%E0%A4%87%E0%A4%A5%E0%A4%A8',
 'https://zh.wikipedia.org/wiki/Python',
 'https://gl.wikipedia.org/wiki/Python',
 'https://lmo.wikipedia.org/wiki/Python',
 'https://ko.wikipedia.org/wiki/%ED%8C%8C%EC%9D%B4%EC%8D%AC',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://nl.wikipedia.org/wiki/Python_(programmeertaal)',



['https://tg.wikipedia.org/wiki/Python',
 'https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%BE%E0%A4%87%E0%A4%A5%E0%A4%A8',
 'https://jbo.wikipedia.org/wiki/paiton',
 'https://zh.wikipedia.org/wiki/Python',
 'https://gl.wikipedia.org/wiki/Python',
 'https://lmo.wikipedia.org/wiki/Python',
 'https://ko.wikipedia.org/wiki/%ED%8C%8C%EC%9D%B4%EC%8D%AC',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://nl.wik



['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://gl.wikipedia.org/wiki/Python',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://www.python.org/downloads/',
 'https://it.wikipedia.org/wiki/Python',
 'https://nqo.wikipedia.org/wiki/%DF%94%DF%8A%DF%8C%DF%95%DF%90%DF%B2%DF%AC',
 'https://lt.wikipedia.org/wiki/Python',
 'https://zh-classical.wikipedia.org/wiki/%E7%9A%AE%E5%90%8C',
 'https://www.mediawiki.org/',
 'https://hr.wikipedia.org/wiki/



['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://gl.wikipedia.org/wiki/Python',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://it.wikipedia.org/wiki/Python',
 'https://nqo.wikipedia.org/wiki/%DF%94%DF%8A%DF%8C%DF%95%DF%90%DF%B2%DF%AC',
 'https://lt.wikipedia.org/wiki/Python',
 'https://zh-classical.wikipedia.org/wiki/%E7%9A%AE%E5%90%8C',
 'https://www.mediawiki.org/',
 'https://km.wikipedia.org/w/index.php?title=ផាយថុន&oldid=260517',
 'h



['https://www.gtk.org/docs/language-bindings/python',
 'https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://web.archive.org/web/20201001161823/https://fedoramagazine.org/using-gns3-with-fedora/',
 'https://web.archive.org/web/20200614153717/https://www.python.org/dev/peps/pep-0289/',
 'http://www.amk.ca/python/writing/gvr-interview',
 'https://ru.wikisource.org/wiki/%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:Python',
 'https://gl.wikipedia.org/wiki/Python',
 'https://dx.doi.org/10.1088%2F1742-6596%2F423%2F1%2F012027',
 'https://www.python.org/doc/sunset-python-2/',
 'https://web.archive.org/web/20210609072244/https://hkvalidate.perfdrive.com/?ssa=7e557f91-2a23-4954-9336-4d6aa8eff543&ssb=13723216276&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.1088%2F1742-6596%2F423%2F1%2F012027&ssi=68a9da23-8427-477c-a539-9f5448a945ec&ssk=support%40shieldsquare.com&ssm=42114570057895697108641720190350&ssn=b37115f7a9ef8f78371ddc2fc86e93ac7b7d49c27ea8-1bd1-433f-87f



['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'http://www.amk.ca/python/writing/gvr-interview',
 'https://gl.wikipedia.org/wiki/Python',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://zh-min-nan.wikipedia.org/w/index.php?title=Pang-b%C3%B4%CD%98:Programming_languages&action=edit',
 'https://it.wikipedia.org/wiki/Python',
 'https://nqo.wikipedia.org/wiki/%DF%94%DF%8A%DF%8C%DF%95%DF%90%DF%B2%DF%AC',
 'https://stats.wikimedia.org/#/zh-min-nan.wik



['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://gl.wikipedia.org/wiki/Python',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://it.wikipedia.org/wiki/Python',
 'https://nqo.wikipedia.org/wiki/%DF%94%DF%8A%DF%8C%DF%95%DF%90%DF%B2%DF%AC',
 'https://lt.wikipedia.org/wiki/Python',
 'https://zh-classical.wikipedia.org/wiki/%E7%9A%AE%E5%90%8C',
 'https://www.mediawiki.org/',
 'https://www.python.org',
 'https://hr.wikipedia.org/wiki/Python_(pro



['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://gl.wikipedia.org/wiki/Python',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://www.tiobe.com/tiobe-index/',
 'http://data.bnf.fr/ark:/12148/cb13560465c',
 'https://id.loc.gov/authorities/sh96008834',
 'https://it.wikipedia.org/wiki/Python',
 'https://nqo.wikipedia.org/wiki/%DF%94%DF%8A%DF%8C%DF%95%DF%90%DF%B2%DF%AC',
 'https://lt.wikipedia.org/wiki/Python',
 'https://zh-classical.wikipedia.



['https://www.python.org/',
 'http://www.cs.arizona.edu/icon/',
 'http://sf.net/cvs/?group_id=5470',
 'http://python.ca/nas/python/generator.diff',
 'http://www.stackless.com/']
Save content at: html_content/www.python.org/dev/peps/pep-0255.txt

Idx:  3


['https://tg.wikipedia.org/wiki/Python',
 'https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%BE%E0%A4%87%E0%A4%A5%E0%A4%A8',
 'https://jbo.wikipedia.org/wiki/paiton',
 'https://zh.wikipedia.org/wiki/Python',
 'https://gl.wikipedia.org/wiki/Python',
 'https://lmo.wikipedia.org/wiki/Python',
 'https://ko.wikipedia.org/wiki/%ED%8C%8C%EC%9D%B4%EC%8D%AC',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wi



['https://la.wikipedia.org/wiki/Python_(lingua_programmandi)',
 'https://gl.wikipedia.org/wiki/Python',
 'https://foundation.wikimedia.org/wiki/Cookie_statement',
 'https://an.wikipedia.org/wiki/Python',
 'https://te.wikipedia.org/wiki/%E0%B0%AA%E0%B1%88%E0%B0%A5%E0%B0%BE%E0%B0%A8%E0%B1%8D_(%E0%B0%95%E0%B0%82%E0%B0%AA%E0%B1%8D%E0%B0%AF%E0%B1%82%E0%B0%9F%E0%B0%B0%E0%B1%8D_%E0%B0%AD%E0%B0%BE%E0%B0%B7)',
 'https://da.wikipedia.org/wiki/Python_(programmeringssprog)',
 'https://mk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%98%D1%82%D0%BE%D0%BD_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B0%D0%B7%D0%B8%D0%BA)',
 'https://uk.wikipedia.org/wiki/Python',
 'https://www.python.org/downloads/',
 'https://it.wikipedia.org/wiki/Python',
 'https://nqo.wikipedia.org/wiki/%DF%94%DF%8A%DF%8C%DF%95%DF%90%DF%B2%DF%AC',
 'https://lt.wikipedia.org/wiki/Python',
 'https://zh-classical.wikipedia.org/wiki/%E7%9A%AE%E5%90%8C',
 'https://www.mediawiki.org/',
 'https://hr.wikipedia.org/wiki/

In [29]:
from pprint import pprint

# Show every key currently stored in dic_texts, one per line.
# Iterating a dict yields its keys, so no explicit .keys() call is needed.
pprint(list(dic_texts))

['br.pinterest.com/receiteria',
 'www.grupocontteudo.com.br',
 'www.receiteria.com.br',
 'www.receiteria.com.br/arroz-e-risotos',
 'www.receiteria.com.br/bebidas',
 'www.receiteria.com.br/bolos',
 'www.receiteria.com.br/carnes',
 'www.receiteria.com.br/como-fazer-leite-evaporado',
 'www.receiteria.com.br/como-fazer-pipoca-de-cinema',
 'www.receiteria.com.br/doces-e-sobremesas',
 'www.receiteria.com.br/entradas-e-petiscos',
 'www.receiteria.com.br/entrar',
 'www.receiteria.com.br/imprimir?id=142617',
 'www.receiteria.com.br/lanches-e-salgados',
 'www.receiteria.com.br/molhos-e-pates',
 'www.receiteria.com.br/paes',
 'www.receiteria.com.br/privacidade',
 'www.receiteria.com.br/receita/arroz-com-calabresa-na-pressao-facil',
 'www.receiteria.com.br/receita/batata-rustica',
 'www.receiteria.com.br/receita/biscoito-amanteigado-sem-fermento',
 'www.receiteria.com.br/receita/bisnaguinha-sem-gluten',
 'www.receiteria.com.br/receita/bolinho-de-arroz-recheado-com-queijo',
 'www.receiteria.com.br/

# Load content from S3

In [5]:
# Rebuild the {path: content} mapping from the objects listed under
# 'html_content/' in S3.
dic_texts = {}

for s3_path in S3.list_files_recursive(folder='html_content/'):
    # Strip only the trailing '.txt'. The previous split('.txt')[0] truncated
    # the key at the FIRST '.txt' found anywhere in the path, which mangles
    # (and can collide) paths that contain '.txt' before the final suffix.
    key = s3_path.removesuffix('.txt')
    value = S3.read_html(fpath='html_content/'+s3_path)
    dic_texts[key] = value


# Quick sanity check: print each key with the first 10 characters of its content.
for k,v in dic_texts.items():
    print(f'Path: {k}\nContent: {v[:10]}\n')

Path: als.wikipedia.org/wiki/Python_(Programmiersprache)
Content: python pro

Path: ast.wikipedia.org/wiki/Portada
Content: wikipedia 

Path: az.wikipedia.org/wiki/Python_(proqramla%C5%9Fd%C4%B1rma_dili)
Content: python pro

Path: br.pinterest.com/receiteria
Content: receiteria

Path: ca.wikipedia.org/wiki/Python
Content: python viq

Path: ceb.wikipedia.org/wiki/Python_(programming_language)
Content: python pro

Path: code.google.com/p/unladen-swallow/wiki/ProjectPlan
Content: internal s

Path: commons.wikimedia.org/wiki/Special:Search/Category:Python_(programming_language)
Content: category p

Path: disqus.com/?ref_noscript
Content: disqus the

Path: el.wikipedia.org/wiki/Python
Content: python bik

Path: en.wikipedia.org/wiki/Non-profit_organization
Content: nonprofit 

Path: et.wikipedia.org/wiki/Python_(programmeerimiskeel)
Content: python pro

Path: eu.wikipedia.org/wiki/Python_(informatika)
Content: python inf

Path: fa.wikipedia.org/wiki/%D9%BE%D8%A7%DB%8C%D8%AA%D9%88%D9%86_(%D8

In [1]:
pip install scikit-learn

[0mNote: you may need to restart the kernel to use updated packages.


# Build reverse-index

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

def build_reverse_index(dic_texts: dict):
    """Build a word -> {url: tf-idf weight} index from the given texts.

    Args:
        dic_texts: mapping whose keys identify the documents (the URLs/paths)
            and whose values are the document texts fed to TfidfVectorizer.

    Returns:
        dict mapping every word of the fitted vocabulary to a dict
        {document key: tf-idf weight}, holding only the documents where the
        weight is greater than zero. An empty ``dic_texts`` yields ``{}``.
    """
    # Nothing to index: return early instead of fitting a vectorizer on an
    # empty corpus.
    if not dic_texts:
        return {}

    content_from_urls = list(dic_texts.values())
    website_urls      = list(dic_texts.keys())

    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(content_from_urls)

    # feature_names[j] is the word whose column index in `tfidf` is j.
    feature_names = vectorizer.get_feature_names_out()
    res = {word: {} for word in feature_names}

    # Walk only the stored (non-zero) entries of the sparse matrix rather than
    # probing tfidf[i, j] for every (document, word) pair: scalar indexing of
    # a sparse matrix is slow and almost every probe would hit a zero.
    entries = tfidf.tocoo()
    for i, j, weight in zip(entries.row, entries.col, entries.data):
        if weight > 0:
            res[feature_names[j]][website_urls[i]] = weight

    return res

# Index every text currently held in dic_texts (word -> {url: tf-idf weight}).
reverse_index = build_reverse_index(dic_texts=dic_texts)

In [15]:
def search_words(list_words: list[str], rev_index, n: int):
    """Rank documents by the summed index weights of the query words.

    Args:
        list_words: query words; must be a plain ``list`` (checked with an
            assert). Each word is lower-cased before the lookup.
        rev_index: mapping word -> {document: weight}.
        n: maximum number of documents to return.

    Returns:
        A tuple ``(ranking, not_found)``: ``ranking`` maps the top ``n``
        documents to their accumulated weight, highest first, and
        ``not_found`` lists the lower-cased query words missing from
        ``rev_index``. ``ranking`` is ``{}`` when no word matched.
    """
    assert type(list_words) is list

    scores = {}
    missing = []

    for raw_word in list_words:
        term = raw_word.lower()

        # Unknown word: remember it and move on to the next one.
        if term not in rev_index:
            missing.append(term)
            continue

        # Add this word's weight to each document that contains it.
        for doc, weight in rev_index[term].items():
            scores[doc] = scores[doc] + weight if doc in scores else weight

    if not scores:
        return {}, missing

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:n]), missing

# Example query: up to five best-scoring documents for the word 'carne'.
search_words(['carne'], reverse_index, 5)

({'www.receiteria.com.br/carnes': 0.14227529655628074,
  'www.receiteria.com.br/receita/penne-com-frango': 0.08051617862138481,
  'www.receiteria.com.br': 0.08002302073683487,
  'www.receiteria.com.br/receita/maionese-de-ovo-cozido': 0.0732510728163526,
  'www.receiteria.com.br/receita/molho-de-shimeji': 0.07110451840364586},
 [])