This script scrapes some of the articles listed on the https://en.wikipedia.org/w/index.php?title=Category:All_NPOV_disputes page, which contains all articles whose neutrality is disputed. It saves the article texts into an XML file (and the matching labels into a second, ground-truth XML file), which can then be processed by https://gitlab.com/mattiasostmar/discoursediversity to identify correlations between the diversity of the discourse and NPOV measures. NPOV is set to FALSE here.


In [None]:
import wikipediaapi
import requests
from bs4 import BeautifulSoup
import time
import numpy as np

import xml.etree.ElementTree as ET

# prettify XML
def indent(elem, level=0):
    """Add newline/indent whitespace to ``elem``'s subtree, in place.

    Only ``text`` and ``tail`` values that are missing or whitespace-only
    are overwritten; real content is left untouched.  Two spaces are used
    per nesting level.

    Args:
        elem: Element whose subtree is re-indented.
        level: Nesting depth of ``elem``; 0 for the outermost element.
    """
    newline_pad = "\n" + "  " * level

    def is_blank(value):
        # True for None, the empty string and whitespace-only strings.
        return not value or not value.strip()

    children = list(elem)
    if not children:
        # A childless element only gets a tail, and only below the top level.
        if level and is_blank(elem.tail):
            elem.tail = newline_pad
        return

    if is_blank(elem.text):
        # Text precedes the first child, which sits one level deeper.
        elem.text = newline_pad + "  "
    if is_blank(elem.tail):
        elem.tail = newline_pad

    for child in children:
        indent(child, level + 1)

    # The last child's tail precedes this element's closing tag, so it is
    # pulled back to this element's own depth.
    last_child = children[-1]
    if is_blank(last_child.tail):
        last_child.tail = newline_pad

# Main function
def parse_wiki(start_url, start_item, iterations, lastcount):
    """Scrape consecutive category listing pages into ``root`` and ``rootgt``.

    Each pass downloads the listing at ``start_url + start_item``, collects
    the ``href`` of every anchor found inside an ``<li>`` element, keeps a
    fixed slice of those links and loads each linked article through
    ``wikipediaapi``.  For every article an ``<article>`` element holding the
    page text is appended to the module-level ``root`` element, and a
    matching ``<article npov="false">`` element is appended to the
    module-level ``rootgt`` element.  The title of the last article of a
    pass becomes the start item of the next pass.  Both trees are
    re-indented with ``indent`` once scraping has finished.

    Args:
        start_url: Listing URL ending in ``pagefrom=``; the (URL-encoded)
            start item is appended to it.
        start_item: Title at which the first listing page starts.
        iterations: Number of follow-up passes after the first one, so up to
            ``iterations + 1`` listing pages are fetched (one pass is always
            made, even for zero or negative values).
        lastcount: ``id`` given to the first article; subsequent articles
            receive consecutive ids.

    Raises:
        requests.RequestException: If a listing page cannot be downloaded
            or answers with an HTTP error status.
    """
    # Imported locally so the function does not rely on a file-level import.
    from urllib.parse import quote, unquote

    # This is approximately where the article links sit among all the
    # <li> links collected from this particular page.
    start_from = 29
    finish_at = 227
    # Every kept href is assumed to start with "/wiki/", which is cut off.
    prefix_len = len('/wiki/')

    # NOTE(review): recent Wikipedia-API releases appear to expect a user
    # agent argument here - confirm against the installed version.
    wk = wikipediaapi.Wikipedia('en', extract_format=wikipediaapi.ExtractFormat.WIKI)

    # NOTE(review): one pass more than `iterations` is made; this matches
    # the pass count of the earlier recursive implementation.
    for _ in range(max(iterations, 0) + 1):
        # Encode the title so characters such as '&' or '#' cannot break
        # the query string it is appended to.
        listing = requests.get(
            start_url + quote(start_item, safe='') + '#mw-pages',
            timeout=30,
        )
        # Fail loudly instead of scraping links out of an error page.
        listing.raise_for_status()

        soup = BeautifulSoup(listing.text, 'lxml')
        hrefs = [
            anchor['href']
            for item in soup.find_all(name='li')
            for anchor in item.find_all('a', href=True)
        ]

        # hrefs may be percent-encoded; decode them so the title handed to
        # wikipediaapi is the plain page title.
        titles = [unquote(href[prefix_len:]) for href in hrefs[start_from:finish_at]]
        if not titles:
            # Nothing usable on this page: stop rather than restart the
            # listing from an empty start item.
            break

        for title in titles:
            page = wk.page(title)
            # Read the title before the text, as the text access is what
            # makes wikipediaapi contact the server.
            page_title = page.title
            article = ET.SubElement(
                root, 'article', {'id': str(lastcount), 'title': page_title}
            )
            ET.SubElement(
                rootgt,
                'article',
                {'id': str(lastcount), 'npov': 'false', 'title': page_title},
            )
            article.text = page.text
            lastcount += 1
            # The last title processed is where the next listing page starts.
            start_item = page_title

        print(start_item)

    indent(root)
    indent(rootgt)
    

    
# Scrape configuration: listing URL (the start item is appended to it),
# the title to start from, the iteration budget handed to parse_wiki and
# the id assigned to the first article.
parse_url = 'https://en.wikipedia.org/w/index.php?title=Category:All_NPOV_disputes&pagefrom='
start_item = 'Albena'
times_to_run = 3
first_article_id = 0

# Root elements that parse_wiki fills: one for the article texts and one
# for the ground truth.
root = ET.Element('articles')
rootgt = ET.Element('articles')

# Run the scraper.
parse_wiki(parse_url, start_item, times_to_run, first_article_id)

# Wrap both roots in ElementTree objects so they can be written out.
tree = ET.ElementTree(root)
treegt = ET.ElementTree(rootgt)

# To inspect the XML output: print(ET.tostring(root).decode())

# Write the articles (with their ids) first, then the ground truth.
for xml_tree, out_path in (
    (tree, 'article-training-wikipedia-short.xml'),
    (treegt, 'ground-truth-wikipedia-short.xml'),
):
    xml_tree.write(out_path, encoding='utf-8', xml_declaration=True)




    
