# Method 1

## A) Using wikipedia

In [1]:
# Import packages
import wikipedia

# Look up the Wikipedia article by its title.
wiki = wikipedia.page('John D. Hunter')

# Start from the page's plain-text content, then strip the '==' heading
# markers and every newline character in one chained expression.
text = wiki.content.replace('==', '').replace('\n', '')

# Drop the final 12 characters. The original note says this makes the string
# end at "$1000"; the offset is presumably tuned to this one article.
text = text[:-12]
print(text)

ModuleNotFoundError: No module named 'wikipedia'

## Using urllib & BeautifulSoup

In [None]:
# Import packages
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Download the raw HTML of the web page. The context manager closes the
# HTTP response as soon as the body has been read.
with urlopen('https://en.wikipedia.org/wiki/Science') as page_response:
    source = page_response.read()

# Make a soup
soup = BeautifulSoup(source, 'lxml')

# Collect paragraph text and section-heading text in ONE pass over the page.
# A CSS selector list yields matches in document order, so every heading stays
# next to the paragraphs of its own section and nothing is dropped. (Pairing
# two separately collected lists with zip() stops at the shorter list and
# ignores where each heading actually sits on the page.)
# NOTE(review): 'mw-headline' is the heading class the original code searched
# for; confirm the live page markup still uses it. If it does not, this loop
# still keeps every paragraph.
paras = []   # paragraph texts only
heads = []   # heading texts only
parts = []   # paragraphs and headings together, in page order
for node in soup.select('p, span.mw-headline'):
    content = node.get_text()
    if node.name == 'p':
        paras.append(content)
    else:
        heads.append(content)
    parts.append(content)
text = ' '.join(parts)

# Drop footnote superscripts in brackets
text = re.sub(r"\[.*?\]+", '', text)

# Remove newline characters, then cut the last 11 characters.
# NOTE(review): the 11-character trim is a hard-coded offset whose purpose for
# this page is not evident from the code — confirm it is still wanted.
text = text.replace('\n', '')[:-11]

[]


# Method 2

In [None]:
# https://en.wikipedia.org/wiki/Python_(programming_language)
# https://www.mediawiki.org/wiki/API:Main_page

# 1. Get a plain text representation of either the entire page or the page "extract" straight from the API with the extracts prop

# Note that this approach only works on MediaWiki sites with the TextExtracts extension. This notably includes Wikipedia, but not some smaller MediaWiki sites like, say, http://www.wikia.com/

# You want to hit a URL like

# https://en.wikipedia.org/w/api.php?action=query&format=json&titles=Bla_Bla_Bla&prop=extracts&exintro&explaintext

# Breaking that down, we've got the following parameters in there (documented at https://www.mediawiki.org/wiki/Extension:TextExtracts#query+extracts):

# action=query, format=json, and titles=Bla_Bla_Bla are all standard MediaWiki API parameters
# prop=extracts makes us use the TextExtracts extension
# exintro limits the response to content before the first section heading
# explaintext makes the extract in the response be plain text instead of HTML
# Then parse the JSON response and extract the extract:

import requests
# Request the plain-text intro of the page through the query API.
# Without a timeout a stalled connection would block the cell forever.
api_reply = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'Python_(programming_language)',
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    },
    timeout=30,
)
# Surface HTTP errors here instead of failing later while decoding the body.
api_reply.raise_for_status()
response = api_reply.json()

# Only one title was requested, so take the first entry of 'pages'.
page = next(iter(response['query']['pages'].values()))
print(page['extract'])


# 2. Get the full HTML of the page using the parse endpoint, parse it, and extract the first paragraph
# MediaWiki has a parse endpoint that you can hit with a URL like https://en.wikipedia.org/w/api.php?action=parse&page=Bla_Bla_Bla to get the HTML of a page. You can then parse it with an HTML parser like lxml (install it first with pip install lxml) to extract the first paragraph.

# For example:

import requests
from lxml import html
# Fetch the rendered HTML of the page from the parse endpoint.
# Without a timeout a stalled connection would block the cell forever.
api_reply = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'parse',
        'page': 'Python_(programming_language)',
        'format': 'json',
    },
    timeout=30,
)
# Surface HTTP errors here instead of failing later while decoding the body.
api_reply.raise_for_status()
response = api_reply.json()

raw_html = response['parse']['text']['*']
document = html.document_fromstring(raw_html)

# Use the first paragraph that has visible text. Indexing '//p' with [0]
# raises IndexError when the page has no <p>, and yields blank output when the
# first <p> is empty (rendered pages may begin with such a placeholder).
first_p = next(
    (p for p in document.xpath('//p') if p.text_content().strip()),
    None,
)
# Compare against None explicitly: an lxml element with no children is falsy.
intro_text = first_p.text_content() if first_p is not None else ''
print(intro_text)

# 3. Parse wikitext yourself
# You can use the query API to get the page's wikitext, parse it using mwparserfromhell (install it first using pip install mwparserfromhell), then reduce it down to human-readable text using strip_code. strip_code doesn't work perfectly at the time of writing (as shown clearly in the example below) but will hopefully improve.
import requests
import mwparserfromhell
# Fetch the latest revision's wikitext through the query API.
# Without a timeout a stalled connection would block the cell forever.
api_reply = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'Python_(programming_language)',
        'prop': 'revisions',
        'rvprop': 'content'
    },
    timeout=30,
)
# Surface HTTP errors here instead of failing later while decoding the body.
api_reply.raise_for_status()
response = api_reply.json()

# Only one title was requested, so take the first entry of 'pages'.
page = next(iter(response['query']['pages'].values()))
# The wikitext of the first returned revision is stored under the '*' key.
wikicode = page['revisions'][0]['*']

# Parse the wikitext and print it reduced to human-readable text.
parsed_wikicode = mwparserfromhell.parse(wikicode)
print(parsed_wikicode.strip_code())

## Methods
> **[Two simple ways to scrape text from Wikipedia in Python](https://gist.github.com/zluvsand/6c1b2c8b5098d88081bf5988138019f2)**

> **[GetWiki.py](https://gist.github.com/scionoftech/0f35d5e231be2cf46823d774023268b6)**
