# Wikipedia metadata extraction

Here we extract metadata about the contents of an XML file containing a dump of the English Wikipedia. The data is saved in three CSV files.

In [1]:
"""
Based on: "Processing Large XML Wikipedia Dumps that won't fit in RAM in Python without Spark" https://www.youtube.com/watch?v=AeRN4zI7Dhk

"""

import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re

# Directory that holds the Wikipedia XML dump; the CSV outputs are written
# to the same directory (every path below is joined onto it).
PATH_WIKI_XML = 'F:/wikipedia-data/'
# File name of the XML dump that is parsed.
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
# File names of the three CSV outputs: articles, redirects and categories.
FILENAME_ARTICLES = 'articles.csv'
FILENAME_REDIRECT = 'redirect.csv'
FILENAME_CATEGORY = 'categories.csv'
# Text encoding used when opening the CSV output files.
ENCODING = "utf-8"

In [2]:
def hms_string(sec_elapsed):
    """Return a duration as a nicely formatted ``H:MM:SS.ss`` string.

    Args:
        sec_elapsed: Elapsed time in seconds (int or float).

    Returns:
        The hours unpadded, the minutes zero-padded to two digits and the
        seconds zero-padded with two decimals, separated by colons.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return f"{hours}:{minutes:>02}:{seconds:>05.2f}"

In [3]:
def strip_tag_name(t):
    """Return an ElementTree tag name without its ``{namespace}`` prefix.

    ElementTree reports namespaced tags as ``{uri}local``; only the part
    after the last ``}`` is kept. Tags without a namespace are returned
    unchanged.

    Args:
        t: The tag string, e.g. ``elem.tag``.

    Returns:
        The local tag name.
    """
    # Work on the argument itself: the previous version overwrote it with the
    # global ``elem.tag``, so the function only worked inside the parse loop.
    # rpartition yields the whole string as the last item when "}" is absent.
    return t.rpartition("}")[2]

In [4]:
# Full paths of the XML dump (input) and of the article, category and
# redirect CSV files (outputs), all located inside PATH_WIKI_XML.
pathWikiXML, pathArticles, pathCategory, pathRedirect = (
    os.path.join(PATH_WIKI_XML, file_name)
    for file_name in (
        FILENAME_WIKI,
        FILENAME_ARTICLES,
        FILENAME_CATEGORY,
        FILENAME_REDIRECT,
    )
)

In [5]:
# Wall-clock reference used to report the total runtime once parsing is done.
start_time = time.time()
# Title of the page currently being parsed; nothing has been read yet.
title = None
# Page tallies: all pages seen, and those written as articles, redirects
# and categories respectively.
totalCount = articleCount = redirectCount = categoryCount = 0

In [6]:
# Compiled once up front: both patterns are applied to the wikitext of every page.
# Category info is contained within the <text> tag, in the format [[Category:Autism| ]].
CATEGORY_LINK_RE = re.compile(r'\[\[Category:(.*?)\]\]')
WIKI_LINK_RE = re.compile(r'\[\[(.*?)\]\]')

# Stream the dump page by page and write one CSV row per page:
#   * namespace 14 pages      -> categories CSV (id, title, parentCategories)
#   * redirects               -> redirect CSV   (id, title, redirect target)
#   * other namespace 0 pages -> articles CSV   (id, title, parentCategories, links)
# Pages in any other namespace are counted in totalCount but not written.
#
# newline="" is what the csv module expects of the files it writes to; it
# leaves the writer's own line terminators untouched.
with open(pathArticles, "w", encoding=ENCODING, newline="") as articlesFH, \
        open(pathCategory, "w", encoding=ENCODING, newline="") as categoryFH, \
        open(pathRedirect, "w", encoding=ENCODING, newline="") as redirectFH:

    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    categoryWriter = csv.writer(categoryFH, quoting=csv.QUOTE_MINIMAL)
    redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)

    articlesWriter.writerow(['id', 'title', 'parentCategories', 'links'])
    categoryWriter.writerow(['id', 'title', 'parentCategories'])
    redirectWriter.writerow(['id', 'title', 'redirect'])

    # The element of the very first event is the document root. It is kept so
    # that finished pages can be detached from it (see root.clear() below).
    root = None

    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        if root is None:
            root = elem
        tname = strip_tag_name(elem.tag)

        if event == 'start':
            # 'start' events are only used to track structure. iterparse does
            # not guarantee that elem.text is populated yet at this point, so
            # all content is read on the matching 'end' event instead.
            if tname == 'page':
                title = ''
                page_id = -1
                redirect = ''
                inrevision = False
                isredirect = False
                ns = 0
                # Placeholders that end up in the CSV only if the page has no
                # <text> element at all.
                parentCategories = 'Not run'
                links = 'Not run'
            elif tname == 'revision':
                # Do not pick up on revision ids (or ids nested in a revision).
                inrevision = True
            # Never clear an element on 'start': content that was already
            # parsed for it would be thrown away before it is read.
            continue

        # From here on event == 'end': the element is complete.
        if tname == 'title':
            title = elem.text
        elif tname == 'id' and not inrevision and elem.text is not None:
            page_id = int(elem.text)
        elif tname == 'redirect':
            redirect = elem.get('title', '')
            isredirect = True
        elif tname == 'ns' and elem.text is not None:
            ns = int(elem.text)
        elif tname == 'text':
            # An empty <text/> has text None; treat it as empty wikitext,
            # which yields empty category and link lists.
            articleText = elem.text or ''
            parentCategories = CATEGORY_LINK_RE.findall(articleText)
            links = WIKI_LINK_RE.findall(articleText)
        elif tname == 'page':
            totalCount += 1

            # The category check comes first, so a redirecting category page
            # is still written to the categories file.
            if ns == 14:
                categoryCount += 1
                categoryWriter.writerow([page_id, title, parentCategories])
            elif isredirect:
                redirectCount += 1
                redirectWriter.writerow([page_id, title, redirect])
            elif ns == 0:
                articleCount += 1
                articlesWriter.writerow([page_id, title, parentCategories, links])

            if totalCount % 100000 == 0:
                print("{:,}".format(totalCount))

            # Detach finished pages from the root; otherwise every processed
            # page stays referenced and memory grows with the size of the dump.
            root.clear()

        # The element has been fully handled; release its content.
        elem.clear()

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")

100,000
200,000
300,000
400,000
500,000
600,000
700,000
800,000
900,000
1,000,000
1,100,000
1,200,000
1,300,000
1,400,000
1,500,000
1,600,000
1,700,000
1,800,000
1,900,000
2,000,000
2,100,000
2,200,000
2,300,000
2,400,000
2,500,000
2,600,000
2,700,000
2,800,000
2,900,000
3,000,000
3,100,000
3,200,000
3,300,000
3,400,000
3,500,000
3,600,000
3,700,000
3,800,000
3,900,000
4,000,000
4,100,000
4,200,000
4,300,000
4,400,000
4,500,000
4,600,000
4,700,000
4,800,000
4,900,000
5,000,000
5,100,000
5,200,000
5,300,000
5,400,000
5,500,000
5,600,000
5,700,000
5,800,000
5,900,000
6,000,000
6,100,000
6,200,000
6,300,000
6,400,000
6,500,000
6,600,000
6,700,000
6,800,000
6,900,000
7,000,000
7,100,000
7,200,000
7,300,000
7,400,000
7,500,000
7,600,000
7,700,000
7,800,000
7,900,000
8,000,000
8,100,000
8,200,000
8,300,000
8,400,000
8,500,000
8,600,000
8,700,000
8,800,000
8,900,000
9,000,000
9,100,000
9,200,000
9,300,000
9,400,000
9,500,000
9,600,000
9,700,000
9,800,000
9,900,000
10,000,000
10,100,000
10,200