In [1]:
!pip install wikipedia



In [2]:
!pip install pandas




In [9]:
import wikipedia
import re
import pandas as pd
import json

In [10]:
help(wikipedia)

Help on package wikipedia:

NAME
    wikipedia

PACKAGE CONTENTS
    exceptions
    util
    wikipedia

DATA
    API_URL = 'http://en.wikipedia.org/w/api.php'
    ODD_ERROR_MESSAGE = "This shouldn't happen. Please report on GitHub: g...
    RATE_LIMIT = False
    RATE_LIMIT_LAST_CALL = None
    RATE_LIMIT_MIN_WAIT = None
    USER_AGENT = 'wikipedia (https://github.com/goldsmith/Wikipedia/)'
    geosearch = <wikipedia.util.cache object>
        Do a wikipedia geo search for `latitude` and `longitude`
        using HTTP API described in http://www.mediawiki.org/wiki/Extension:GeoData
        
        Arguments:
        
        * latitude (float or decimal.Decimal)
        * longitude (float or decimal.Decimal)
        
        Keyword arguments:
        
        * title - The title of an article to search for
        * results - the maximum number of results returned
        * radius - Search radius in meters. The value must be between 10 and 10000
    
    languages = <wikipedia.util.c

In [11]:
# Topics Dictionary with their related subtopics
topics = {
    'Health': ['common diseases', 'global health statistics', 'mental health trends'],
    'Environment': ['global warming', 'endangered species', 'deforestation rates'],
    'Technology': ['emerging technologies', 'AI advancements'],
    'Economy': ['stock market performance', 'job markets', 'cryptocurrency trends'],
    'Entertainment': ['music industry', 'popular cultural events', 'streaming platforms'],
    'Sports': ['major sporting events', 'sports analytics'],
    'Politics': ['elections', 'public policy analysis', 'international relations'],
    'Education': ['literacy rates', 'online education trends', 'student loan data'],
    'Travel': ['top tourist destinations', 'airline industry data', 'travel trends'],
    'Food': ['crop yield statistics', 'global hunger', 'food security']
}

In [12]:
def preprocessing(summary):
    # Regex for keeping only alphanumeric characters
    return re.sub(r'[^a-zA-Z0-9 ]+', '', summary)

In [13]:
def wikipedia_data_scrape(main_topic, subtopic, max_docs, min_summary_length, global_visited_url):
    visited_url = set()  # To prevent duplication within a single subtopic
    unique_titles = set()  # To ensure unique content by title
    unique_summaries = set()  # To ensure unique content by summary hash
    docs = []
    short_summary_count = 0
    search_results = wikipedia.search(subtopic, results=500)
    
    for result in search_results:
        if len(docs) >= max_docs:
            break        
        try:
            content = wikipedia.page(result, auto_suggest=False)
            if content.url in visited_url or content.url in global_visited_url:
                continue
            
            cleaned_summary = preprocessing(content.summary)
            
            # Ensure cleaned_summary is not empty
            if not cleaned_summary.strip():
                continue  # Skip null or empty summaries
            
            if len(cleaned_summary) < min_summary_length:
                short_summary_count += 1
                if short_summary_count / max_docs > 0.05:  # Only allow 5% of summaries to be short
                    continue
            
            # Uniqueness check based on title and summary
            if content.title in unique_titles or hash(cleaned_summary) in unique_summaries:
                continue
            
            # Append document with correct field types
            docs.append({
                'title': str(content.title),
                'revision_id': str(content.revision_id),  # Ensure revision_id is a string
                'summary': {'text_en': cleaned_summary},  # Ensure the field is marked text_en
                'url': content.url,
                'topic': main_topic
            })
            
            # Update uniqueness sets
            unique_titles.add(content.title)
            unique_summaries.add(hash(cleaned_summary))
            visited_url.add(content.url)
            global_visited_url.add(content.url)
        
            # Process linked pages
            if len(docs) < max_docs:
                for link in content.links:
                    if len(docs) >= max_docs:
                        break
                    
                    try:
                        linked_content = wikipedia.page(link, auto_suggest=False)
                        if linked_content.url in visited_url or linked_content.url in global_visited_url:
                            continue
                        
                        linked_summary = preprocessing(linked_content.summary)
                        
                        if not linked_summary.strip():
                            continue  # Skip null or empty summaries
                        
                        if len(linked_summary) < min_summary_length:
                            short_summary_count += 1
                            if short_summary_count / max_docs > 0.05:
                                continue
                        
                        # Uniqueness check based on title and summary
                        if linked_content.title in unique_titles or hash(linked_summary) in unique_summaries:
                            continue
                        
                        docs.append({
                            'title': str(linked_content.title),
                            'revision_id': str(linked_content.revision_id),
                            'summary': {'text_en': linked_summary},
                            'url': linked_content.url,
                            'topic': main_topic
                        })

                        unique_titles.add(linked_content.title)
                        unique_summaries.add(hash(linked_summary))
                        visited_url.add(linked_content.url) 
                        global_visited_url.add(linked_content.url)
                    
                    except wikipedia.exceptions.DisambiguationError:
                        continue
                    except wikipedia.exceptions.PageError:
                        continue
        except wikipedia.exceptions.DisambiguationError as e:
            print(f"DisambiguationError for {result}: {e}")
        except wikipedia.exceptions.PageError as e:
            print(f"PageError for {result}: {e}")
    
    # to ensure all summaries are non-empty
    for doc in docs:
        assert doc['summary']['text_en'].strip(), f"Summary is null or empty for revision ID: {doc['revision_id']}"

    return docs


In [14]:
def all_topics_scrape(topics):
    data = {}
    
    for main_topic, subtopics in topics.items():
        data[main_topic] = []  
        global_visited_url = set() # to prevent duplication within subtopics under a single topic
        for subtopic in subtopics:
            print(f"Fetching started data for subtopic: {subtopic} under topic: {main_topic}")
            documents = wikipedia_data_scrape(main_topic, subtopic, 3000, 200, global_visited_url) # For each subtopic within a single topic this function will scrape 300 docs
            data[main_topic].extend(documents)  
            print(f"Fetching of {len(documents)} documents completed for subtopic: {subtopic}")
    
    return data


In [15]:
all_scraped_data = all_topics_scrape(topics) # Total 70000 docs will be scraped and preprocessed

Fetching started data for subtopic: common diseases under topic: Health




  lis = BeautifulSoup(html).find_all('li')


Fetching of 3000 documents completed for subtopic: common diseases
Fetching started data for subtopic: global health statistics under topic: Health
Fetching of 3000 documents completed for subtopic: global health statistics
Fetching started data for subtopic: mental health trends under topic: Health
Fetching of 3000 documents completed for subtopic: mental health trends
Fetching started data for subtopic: global warming under topic: Environment
DisambiguationError for Global warming (disambiguation): "Global warming (disambiguation)" may refer to: 
global surface temperature
global surface temperature
ocean heat content
ocean temperature
sea surface temperature
Earth's Energy Imbalance
Global Warming (Pitbull album)
Global Warming (Sonny Rollins album)
Global Warming: The Signs and The Science
Global Warming: What You Need to Know
From Mars to Sirius
Continent
All pages with titles beginning with Global warming
All pages with titles containing Global warming
DisambiguationError for Uns

ConnectTimeout: HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?prop=extracts&explaintext=&exintro=&titles=Education+in+Bahrain&format=json&action=query (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x13a1472d0>, 'Connection to en.wikipedia.org timed out. (connect timeout=None)'))

In [None]:
with open('wikipedia_scraped_data.json', 'w', encoding='utf-8') as f:
    json.dump(all_scraped_data, f, ensure_ascii=False, indent=4)  # saving the data in a json file

print("Data saved to 'wikipedia_scraped_data.json'.")