In [3]:
pip install wikipedia-api

Note: you may need to restart the kernel to use updated packages.


In [4]:
import wikipediaapi
import spacy
import pandas as pd
from collections import defaultdict

# Initialize the Wikipedia API with a custom user agent - change user to your identifier e.g. email
# The string is passed to the client as the 'User-Agent' entry of the `headers` argument.
user_agent = "MicrobialTermExtractor/1.0 (user)"
# NOTE(review): in wikipedia-api >= 0.6.0 the first positional parameter is presumably
# `user_agent` rather than the language code, so 'en' may not be read as the language
# there (the language would fall back to its default) - confirm against the installed version.
wiki_wiki = wikipediaapi.Wikipedia('en', headers={'User-Agent': user_agent})

In [5]:
# Titles of the Wikipedia articles (microbe-related) to download, in fetch order
pages = [
    "Marine microorganisms",
    "marine microbiome",
    "Marine viruses",
    "Marine bacteria",
    "Bacterioplankton",
    "Bacterial motility",
    "Marine prokaryotes",
    "Marine archaea",
    "Marine protists",
    "Marine fungi",
    "Mycoplankton",
    "Marine microanimals",
    "Ichthyoplankton",
    "Marine primary production",
    "Algae",
    "Marine microplankton",
    "Marine microbenthos",
    "Sea ice microbial communities",
    "Hydrothermal vent microbial communities",
    "deep biosphere",
    "microbial dark matter",
]

In [6]:
# Fetch content from the Wikipedia pages
# Each article's text is collected separately and joined with a blank line
# afterwards. Appending the texts directly to one string would glue the last
# word of one article to the first word of the next, producing bogus tokens
# in the later term extraction.
page_texts = []
for page_title in pages:
    page = wiki_wiki.page(page_title)
    if page.exists():
        print(f"Fetching content from Wikipedia page: {page_title}")
        page_texts.append(page.text)
    else:
        # Missing titles are reported and skipped rather than aborting the run.
        print(f"Page {page_title} does not exist on Wikipedia.")
content = "\n\n".join(page_texts)

# Save the fetched content to a text file (optional)
with open("wikipedia_microbes.txt", "w", encoding="utf-8") as file:
    file.write(content)

Fetching content from Wikipedia page: Marine microorganisms
Fetching content from Wikipedia page: marine microbiome
Fetching content from Wikipedia page: Marine viruses
Fetching content from Wikipedia page: Marine bacteria
Fetching content from Wikipedia page: Bacterioplankton
Fetching content from Wikipedia page: Bacterial motility
Fetching content from Wikipedia page: Marine prokaryotes
Fetching content from Wikipedia page: Marine archaea
Fetching content from Wikipedia page: Marine protists
Fetching content from Wikipedia page: Marine fungi
Fetching content from Wikipedia page: Mycoplankton
Page Marine microanimals does not exist on Wikipedia.
Fetching content from Wikipedia page: Ichthyoplankton
Fetching content from Wikipedia page: Marine primary production
Fetching content from Wikipedia page: Algae
Fetching content from Wikipedia page: Marine microplankton
Page Marine microbenthos does not exist on Wikipedia.
Fetching content from Wikipedia page: Sea ice microbial communities
Fe

In [7]:
# Load the spaCy model
nlp = spacy.load("en_core_web_md")

# The combined text of many articles can exceed spaCy's input-length guard
# (nlp.max_length), in which case nlp(...) raises ValueError. Raise the limit
# just enough to fit this input; the assignment is idempotent on re-run.
nlp.max_length = max(nlp.max_length, len(content))

# Process the fetched content using spaCy.
# The term extraction only reads lexical token attributes (text, is_alpha,
# is_stop), which come from the tokenizer and vocabulary, so the memory-hungry
# parser and NER components are skipped for this call.
doc = nlp(content, disable=["parser", "ner"])


In [8]:
# Extract potential microbial-related terms:
# count each purely alphabetic, non-stop-word token longer than three
# characters, keyed by its lower-cased text.
terms_freq = defaultdict(int)
for token in doc:
    if not token.is_alpha:
        continue
    if token.is_stop:
        continue
    if len(token.text) <= 3:
        continue
    terms_freq[token.text.lower()] += 1

In [None]:
# Materialize the distinct terms as a list and order them alphabetically
all_terms = list(terms_freq)
all_terms.sort()

In [None]:
# Write the terms to a single-column CSV file (no index column)
df = pd.DataFrame({"Microbial-Related Terms": all_terms})
df.to_csv("microbial_terms_wikipedia.csv", index=False)

In [None]:
# Report the number of extracted terms together with the output file name
print(f"Saved {len(all_terms)} microbial-related terms to 'microbial_terms_wikipedia.csv'.")