In [1]:
"""
Based on: "Processing Large XML Wikipedia Dumps that won't fit in RAM in Python without Spark" https://www.youtube.com/watch?v=AeRN4zI7Dhk

"""

import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re

# Directory that holds the Wikipedia XML dump and receives the generated CSVs.
# Set the WIKI_DATA_DIR environment variable to point somewhere else; when it
# is unset the original hard-coded location is used.
PATH_WIKI_XML = os.environ.get('WIKI_DATA_DIR', 'F:/wikipedia-data/')
# Input: the XML dump file, looked up inside PATH_WIKI_XML.
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
# Outputs: one CSV per kind of page, written inside PATH_WIKI_XML.
FILENAME_ARTICLES = 'articles.csv'
FILENAME_REDIRECT = 'redirect.csv'
FILENAME_CATEGORY = 'categories.csv'
# Text encoding used when writing the CSV files.
ENCODING = "utf-8"

In [2]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``.

    Hours are printed unpadded, minutes are zero-padded to two digits, and
    seconds are zero-padded to width five with two decimals.
    """
    seconds_per_minute = 60
    seconds_per_hour = 60 * 60

    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / seconds_per_minute)
    seconds = sec_elapsed % seconds_per_minute

    return f"{hours}:{minutes:>02}:{seconds:>05.2f}"

In [3]:
def strip_tag_name(t):
    """Return an XML tag name without its ``{namespace}`` prefix.

    ElementTree reports namespaced tags as ``{uri}local``; everything up to
    and including the last ``}`` is dropped. A tag without a ``}`` is
    returned unchanged.

    Args:
        t: The tag string to strip (e.g. ``elem.tag``).

    Returns:
        The local part of the tag name.
    """
    # Operate on the argument itself; the function must not depend on a
    # global ``elem`` happening to exist at call time.
    idx = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

In [4]:
# Resolve the dump file and the three CSV outputs inside the data directory.
pathWikiXML, pathArticles, pathCategory, pathRedirect = (
    os.path.join(PATH_WIKI_XML, filename)
    for filename in (
        FILENAME_WIKI,
        FILENAME_ARTICLES,
        FILENAME_CATEGORY,
        FILENAME_REDIRECT,
    )
)

In [5]:
# Page counters, all starting from zero: every page seen, and the subsets
# written to the articles, redirect and category CSV files.
totalCount = articleCount = redirectCount = categoryCount = 0
# Title of the page currently being parsed; nothing has been parsed yet.
title = None
# Wall-clock reference used to report the total runtime.
start_time = time.time()

In [6]:
# Stream the Wikipedia XML dump once and split its pages into three CSV files:
# category pages (ns == 14), redirects, and regular articles (ns == 0).

# Reset the run state in this cell so that re-running it (for example after an
# interrupted run) does not continue from stale counters or an old start time.
totalCount = 0
articleCount = 0
redirectCount = 0
categoryCount = 0
start_time = time.time()

# Category info is contained within the <text> tag, in the format
# [[Category:Autism| ]]. The patterns are compiled once instead of per page.
categoryPattern = re.compile(r'\[\[Category:(.*?)\]\]')
linkPattern = re.compile(r'\[\[(.*?)\]\]')

# newline='' lets the csv module control line endings itself.
with open(pathArticles, "w", encoding=ENCODING, newline='') as articlesFH, \
        open(pathCategory, "w", encoding=ENCODING, newline='') as categoryFH, \
        open(pathRedirect, "w", encoding=ENCODING, newline='') as redirectFH:

    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    categoryWriter = csv.writer(categoryFH, quoting=csv.QUOTE_MINIMAL)
    redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)

    articlesWriter.writerow(['id', 'title', 'parentCategories', 'links'])
    categoryWriter.writerow(['id', 'title', 'parentCategories'])
    redirectWriter.writerow(['id', 'title', 'redirect'])

    parseEvents = iter(etree.iterparse(pathWikiXML, events=('start', 'end')))
    # The first event is the start of the document root. Keep a handle on it
    # so finished pages can be detached from it; otherwise the root keeps a
    # reference to every page of the dump and memory grows without bound.
    root = next(parseEvents)[1]

    # Defined up front so an <id> seen before the first <page> cannot fail.
    inrevision = False

    for event, elem in parseEvents:
        tname = strip_tag_name(elem.tag)

        if event == 'start':
            # 'start' events are used only for state tracking: element text
            # is not guaranteed to be available until the matching 'end'.
            if tname == 'page':
                title = ''
                pageId = -1
                redirect = False
                inrevision = False
                isredirect = False
                ns = 0
                parentCategories = 'Not run'
                links = 'Not run'
            elif tname == 'revision':
                # Do not pick up on revision id's (or ids nested below them)
                inrevision = True
            continue

        # From here on the event is 'end', so elem.text is complete.
        if tname == 'title':
            title = elem.text
        elif tname == 'id' and not inrevision and elem.text is not None:
            pageId = int(elem.text)
        elif tname == 'redirect':
            redirect = elem.get('title', '')
            isredirect = True
        elif tname == 'ns' and elem.text is not None:
            ns = int(elem.text)
        elif tname == 'text':
            # An empty <text/> has no text; treat it as an empty article.
            articleText = elem.text or ''
            parentCategories = categoryPattern.findall(articleText)
            links = linkPattern.findall(articleText)
        elif tname == 'page':
            totalCount += 1

            if ns == 14:
                categoryCount += 1
                categoryWriter.writerow([pageId, title, parentCategories])
            elif isredirect:
                redirectCount += 1
                redirectWriter.writerow([pageId, title, redirect])
            elif ns == 0:
                articleCount += 1
                articlesWriter.writerow([pageId, title, parentCategories, links])

            if totalCount > 1 and (totalCount % 100000) == 0:
                print("{:,}".format(totalCount))

            # The page is fully processed: free its subtree and drop the
            # root's references to already-handled children.
            elem.clear()
            root.clear()

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")

KeyboardInterrupt: 