## Retrieving Wikipedia data

## Defining the tokenization function

In [71]:
import os
import nltk
from nltk.tokenize import word_tokenize

# Download the Punkt tokenizer data packages (`punkt` and `punkt_tab`);
# presumably these are the resources `word_tokenize` needs at runtime.
nltk.download('punkt')
nltk.download('punkt_tab')

def nb_tokens(text):
    """Return how many tokens NLTK's ``word_tokenize`` produces for ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        The length of the token list returned by ``word_tokenize``.
    """
    # Swap in a different tokenizer here if a different counting scheme
    # is needed; only the count is returned, the tokens are discarded.
    return len(word_tokenize(text))

[nltk_data] Downloading package punkt to /home/ongin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ongin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Creating a Wikipedia API instance

In [72]:
import wikipediaapi
# Client for the English-language Wikipedia; the user agent string
# identifies this project and a contact address.
wikipedia = wikipediaapi.Wikipedia(
    language='en',
    user_agent='KnowledgeGraph/0.0.2 (honglin@duck.com)',
)

# Title of the root article that the rest of the notebook starts from.
topic = "Solar_cell"

### Root page summary

In [73]:
import textwrap

# Look up the root article for the chosen topic.
page = wikipedia.page(topic)

if not page.exists():
    print("Page not found!")
else:
    # Show the title, the summary wrapped at 80 columns, and the
    # summary's token count.
    print(f"Title: {page.title}")
    print(f"Summary: {textwrap.fill(page.summary, width=80)}\n")
    print(f"Number of tokens: {nb_tokens(page.summary)}")

Title: Solar cell
Summary: A solar cell, also known as a photovoltaic cell (PV cell), is an electronic
device that converts the energy of light directly into electricity by means of
the photovoltaic effect. It is a type of photoelectric cell, a device whose
electrical characteristics (such as current, voltage, or resistance) vary when
it is exposed to light. Individual solar cell devices are often the electrical
building blocks of photovoltaic modules, known colloquially as "solar panels".
Almost all commercial PV cells consist of crystalline silicon, with a market
share of 95%. Cadmium telluride thin-film solar cells account for the remainder.
The common single-junction silicon solar cell can produce a maximum open-circuit
voltage of approximately 0.5 to 0.6 volts. Photovoltaic cells may operate under
sunlight or artificial light. In addition to producing solar power, they can be
used as a photodetector (for example infrared detectors), to detect light or
other electromagnetic radiati

### Collecting URLs and fetching content into documents

In [74]:
# Show the full URL of the root article
print(page.fullurl)

# Get all the links on the page; `links` is iterated later to fetch each linked article
links = page.links
print(f"Number of links: {len(links)}")

def safe_file_name(s):
    """Turn an arbitrary title into a string usable as a file name.

    Spaces become underscores; every remaining character is kept only if
    it is alphabetic, a digit, or one of ``.``, ``_``, ``-``. All other
    characters are dropped.

    Args:
        s: The string to sanitize (e.g. a page title).

    Returns:
        The sanitized string. It may be empty if no character survives.
    """
    allowed_punctuation = ('.', '_', '-')
    underscored = s.replace(' ', '_')
    kept = [
        ch
        for ch in underscored
        if ch.isalpha() or ch.isdigit() or ch in allowed_punctuation
    ]
    return ''.join(kept)

def file_exists_and_has_content(file_path):
    """Return True when ``file_path`` is an existing, non-empty regular file.

    The check is done on file metadata only, so it never decodes the file
    contents and cannot fail on bytes that are not valid UTF-8.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True if the path is a regular file whose size is greater than zero;
        False if it is missing, is a directory, is empty, or cannot be
        inspected.
    """
    try:
        # isfile() rejects directories: a directory at this path is not a
        # usable document and must not be reported as existing content.
        return os.path.isfile(file_path) and os.path.getsize(file_path) > 0
    except OSError:
        # The file disappeared or became inaccessible between the two
        # checks; treat it the same as a missing file.
        return False

# Directory where each fetched article is stored as a .txt file;
# created up front, with no error if it already exists.
output_dir = './documents/'
os.makedirs(output_dir, exist_ok=True)

def save_document(page):
    """Write a Wikipedia page to ``output_dir`` as a text file, once.

    The file is named after the sanitized page title and contains the
    title, the URL and the page text. If a non-empty file with that name
    already exists, nothing is fetched or written.

    Args:
        page: A page object exposing ``title``, ``fullurl`` and ``text``.

    Returns:
        None.
    """
    file_name = safe_file_name(page.title)
    file_path = os.path.join(output_dir, f"{file_name}.txt")

    # Existing non-empty files act as a cache: skip pages already saved.
    if file_exists_and_has_content(file_path):
        return

    # Build the whole document BEFORE opening the file. Reading the page
    # attributes may trigger a network fetch (Wikipedia-API appears to load
    # them lazily); if that fails after the file was opened, a partial file
    # would be left behind and later mistaken for a complete cached copy.
    document = (
        f"Title: {page.title}\n\n"
        f"URL: {page.fullurl}\n\n"
        f"Content:\n{page.text}\n"
    )

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(document)

# Save the root topic page itself before following its links
save_document(page)

https://en.wikipedia.org/wiki/Solar_cell
Number of links: 568


In [75]:

from tqdm import tqdm
import time

# Fetch every article linked from the root page, in sorted title order.
for link in tqdm(sorted(links), desc="Fetching Wikipedia articles"):
    # Use a dedicated name instead of rebinding `page`: `page` must keep
    # pointing at the root topic article so that re-running earlier cells
    # (which read `page.links`, `page.summary`, ...) still works.
    linked_page = wikipedia.page(link)
    if linked_page.exists() and linked_page.fullurl:
        save_document(linked_page)
        # Pause between articles, presumably to avoid hammering the API.
        time.sleep(0.5)

Fetching Wikipedia articles: 100%|██████████| 568/568 [14:19<00:00,  1.51s/it]
