In [17]:
import requests
from bs4 import BeautifulSoup
from typing import List, Dict
import time
from tqdm import tqdm

class WikipediaSearch:
    """Scrapes the Portuguese Wikipedia "new pages" special page and the articles it lists.

    Typical use is ``get_last_pt_articles``, which lists the newest pages and then
    downloads each one, pausing between downloads to respect a request rate.
    """

    BASE_URL = "https://pt.wikipedia.org"

    def __init__(self, timeout: float = 10.0):
        """
        Args:
            timeout (float): Seconds passed as ``timeout`` to every ``requests.get`` call,
                so a stalled connection raises instead of blocking the whole crawl.
        """
        self.timeout = timeout

    def list_last_pt_articles(self, total_limit: int = 500) -> List[Dict]:
        """Lists the most recently created pages.

        Args:
            total_limit (int): Value sent as the ``limit`` query parameter of the listing page.

        Returns:
            List[Dict]: One ``{'date': ..., 'link': ...}`` dict per ``<li>`` found inside every
            ``<ul class="mw-contributions-list">``. ``date`` is the stripped text of the first
            ``<a>`` in the item and ``link`` is its ``href`` prefixed with the site root.
            Either value is ``None`` when the anchor (or its ``href``) is missing.

        Raises:
            ValueError: If the listing page does not answer with HTTP 200.
        """
        url = f"{self.BASE_URL}/w/index.php?title=Especial:P%C3%A1ginas_novas&limit={total_limit}"

        # A single request returns the whole listing.
        res = requests.get(url, timeout=self.timeout)
        if res.status_code != 200:
            raise ValueError(f"Failed to retrieve data. Status code: {res.status_code}")

        soup = BeautifulSoup(res.content, 'html.parser')

        search_results = []
        for ul_element in soup.find_all('ul', class_='mw-contributions-list'):
            for item in ul_element.find_all('li'):
                item_hyperlink = item.find('a')
                if item_hyperlink is None:
                    search_results.append({'date': None, 'link': None})
                    continue

                # .get() instead of ['href']: an anchor without href must not abort the listing.
                href = item_hyperlink.get('href')
                search_results.append({
                    'date': item_hyperlink.text.strip(),
                    'link': f"{self.BASE_URL}{href}" if href else None,
                })

        return search_results

    @staticmethod
    def _extract_paragraph_text(html) -> str:
        """Returns the newline-joined text of every <p> inside the 'mw-parser-output' div."""
        soup = BeautifulSoup(html, 'html.parser')
        content = soup.find('div', class_='mw-parser-output')
        if not content:
            return "No content available"
        return '\n'.join(para.get_text(strip=False) for para in content.find_all('p'))

    def parse_article(self, article: Dict, processing_type: str = 'default', parse_text_processing_type: str = 'simple') -> Dict:
        """Fetches the article content from its link and adds the text to the article dict.

        The dict is modified in place and also returned.

        Args:
            article (Dict): The article dictionary; its "link" entry is the URL to fetch.
            processing_type (str): How the text is extracted from the response.
                'default' joins the text of the <p> elements of the article body;
                'simple' stores the raw response text.
            parse_text_processing_type (str): Post-processing selector.
                'simple' stores the raw response text, overriding whatever
                ``processing_type`` would have produced; 'default' keeps the
                ``processing_type`` result unchanged. Any other value keeps that result
                and appends " Invalid text processing type specified." to it.

        Returns:
            Dict: The same dictionary. It is returned untouched when it has no link;
            otherwise "text" holds the extracted text, or
            "Failed to fetch article content" when the response status is not 200.

        Raises:
            ValueError: If ``processing_type`` is neither 'default' nor 'simple'
                (checked before any request is sent).
        """
        article_url = article.get("link")
        if not article_url:
            return article

        # Reject a bad argument up front rather than after paying for the download.
        if processing_type not in ('default', 'simple'):
            raise ValueError("Invalid processing type specified.")

        res = requests.get(article_url, timeout=self.timeout)
        if res.status_code != 200:
            article["text"] = "Failed to fetch article content"
            return article

        if parse_text_processing_type == 'simple' or processing_type == 'simple':
            # Raw text wins in both cases, so the HTML parse would be thrown away: skip it.
            article["text"] = res.text
        else:
            article["text"] = self._extract_paragraph_text(res.content)

        if parse_text_processing_type not in ('simple', 'default'):
            # Historical behaviour: unknown selectors are flagged inside the text itself.
            article["text"] += " Invalid text processing type specified."

        return article

    def calculate_processing_time(self, num_articles: int, requests_per_second: int) -> float:
        """
        Calculates the expected time to process a given number of articles at a specified rate of requests per second.

        This is simply ``num_articles / requests_per_second`` (in seconds); it does not
        include the time each request itself takes, so real runs last longer.
        ``requests_per_second`` must be non-zero.
        """
        return num_articles / requests_per_second

    def get_last_pt_articles(self, total_limit: int = 10, requests_per_second: int = 10000,
                            parse_text_processing_type = "simple", 
                            verbose=True) -> List[Dict]:
        """Combines listing and parsing of the articles, applying a request rate limit.

        Args:
            total_limit (int): Forwarded to ``list_last_pt_articles``.
            requests_per_second (int): Target rate; ``1 / requests_per_second`` seconds are
                slept between consecutive article downloads. Must be non-zero.
            parse_text_processing_type (str): Forwarded to ``parse_article``.
            verbose (bool): Show a tqdm progress bar when true; stay silent when false.

        Returns:
            List[Dict]: The listed article dicts, each passed through ``parse_article``.
        """
        articles = self.list_last_pt_articles(total_limit=total_limit)
        parsed_articles = []

        # Pause inserted between requests to approximate the requested rate.
        delay = 1 / requests_per_second
        last_index = len(articles) - 1

        # disable= is what actually silences tqdm; total alone does not.
        progress = tqdm(articles, total=len(articles), disable=not verbose)
        for index, article in enumerate(progress):
            parsed_articles.append(
                self.parse_article(article, parse_text_processing_type=parse_text_processing_type)
            )

            # No request follows the final article, so there is nothing to wait for.
            if index < last_index:
                time.sleep(delay)

        return parsed_articles

# Usage example: pick the run settings, then print a rough duration estimate.
wiki_search = WikipediaSearch()

# Settings reused by the fetching cell below.
total_limit = 10
requests_per_second = 4
parse_text_processing_type = "default"
verbose = True

# The estimate is num_articles / requests_per_second, expressed in seconds.
expected_processing_time = wiki_search.calculate_processing_time(
    num_articles=total_limit,
    requests_per_second=requests_per_second,
)
whole_minutes, leftover_seconds = divmod(expected_processing_time, 60)
print(f"Expected processing time: {int(whole_minutes)} minutes and {int(leftover_seconds)} seconds.")


Expected processing time: 0 minutes and 2 seconds.


In [18]:
# Download and parse the newest articles with the settings chosen in the previous cell.
articles = wiki_search.get_last_pt_articles(
    total_limit=total_limit,
    requests_per_second=requests_per_second,
    parse_text_processing_type=parse_text_processing_type,
    verbose=verbose,
)
print(f"Total of {len(articles)} articles fetched and parsed.")

# One summary line per article; the text is cut to its first 100 characters.
for article in articles:
    print(f"Date: {article['date']}, Link: {article['link']}, Text: {article.get('text', 'No content')[:100]}...")

100%|██████████| 10/10 [00:05<00:00,  1.89it/s]

Total of 10 articles fetched and parsed.
Date: 16h43min de 5 de outubro de 2024, Link: https://pt.wikipedia.org/w/index.php?title=Glucano_1,4-alfa-glicosidase&oldid=68762060, Text: A glucana 1,4-α-glicosidase' (Numero EC: 3.2.1.3), anteriormente conhecida por γ-amilase', é uma enz...
Date: 16h10min de 5 de outubro de 2024, Link: https://pt.wikipedia.org/w/index.php?title=Rep%C3%BAblica_de_Labin&oldid=68761894, Text: Labinska Republika (Croata)Repubblica di Albona (Italiano)
Estado não reconhecido
A República de Lab...
Date: 16h10min de 5 de outubro de 2024, Link: https://pt.wikipedia.org/w/index.php?title=Ruyter_Poubel&oldid=68761891, Text: Ruyter de Mendonça Poubel (Duque de Caxias, 16 de julho de 1997), mais conhecido como Ruyter Poubel,...
Date: 16h04min de 5 de outubro de 2024, Link: https://pt.wikipedia.org/w/index.php?title=Ansalonga&oldid=68761865, Text: Ansalonga é uma localidade de Andorra, situada na Paróquia de Ordino. Situada a 1.331 metros de alti...
Date: 16h00min de 5 de


