Parse Wiki articles while streaming from disk using the Simple API for XML.

In [None]:
!pip install mwparserfromhell

In [None]:
import re
import dill
import xml.sax
import subprocess
import mwparserfromhell
import pandas as pd
import sqlite3 as sql
from bs4 import BeautifulSoup
from tqdm import tqdm
from multiprocessing import Pool

In [None]:
def dump_dill(fname, obj):
    """Serialize ``obj`` with dill and write it to the file ``fname``.

    The file is opened in binary write mode, so existing content is
    replaced. Returns ``None``.
    """
    with open(fname, 'wb') as sink:
        dill.dump(obj, sink)

def load_dill(fname):
    """Read the file ``fname`` and return the object dill deserializes from it.

    NOTE(review): dill is pickle-based, so loading can execute arbitrary
    code; only use this on files from a trusted source.
    """
    with open(fname, 'rb') as source:
        restored = dill.load(source)
    return restored

In [None]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX.

    Captures the character data of the ``<title>`` and ``<text>`` elements
    and, at every closing ``</page>``, appends one ``(title, text)`` tuple
    to ``self._pages`` in document order.
    """

    # Elements whose character data is captured.
    _CAPTURED_TAGS = ('title', 'text')

    def __init__(self):
        super().__init__()
        self._buffer = None       # chunks of the element being captured
        self._values = {}         # captured values of the current page
        self._current_tag = None  # name of the element being captured
        self._pages = []          # collected (title, text) tuples

    def characters(self, content):
        """Characters between opening and closing tags.

        The parser may deliver one element's text in several chunks, so the
        pieces are buffered and only combined when the element closes.
        """
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element."""
        if name == 'page':
            # Start from a clean slate so a page that lacks one of the
            # captured elements cannot inherit it from the previous page.
            self._values = {}
        elif name in self._CAPTURED_TAGS:
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element."""
        if name == self._current_tag:
            # Chunks are contiguous pieces of the same text (split e.g. at
            # entity references or newlines); joining them with a separator
            # would inject characters that are not in the document.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing so text between elements is not buffered.
            self._current_tag = None
            self._buffer = None

        if name == 'page':
            title = self._values.get('title')
            text = self._values.get('text')
            # Skip incomplete pages instead of raising KeyError.
            if title is not None and text is not None:
                self._pages.append((title, text))

In [None]:
# Path of the SQLite database file.
local_db = '/Data/samples/wiki/enwiki_articles_20200520.db'

In [None]:
# Restore the previously saved object from the dill file; presumably the
# collection of input file paths -- verify against the code that wrote it.
file_paths = load_dill('../input/file_paths.dill')

In [None]:
# Bare expression: shows the loaded value as the notebook cell's output.
file_paths

In [None]:
# Bare expression: shows how many entries were loaded.
len(file_paths)

In [None]:
# For every compressed dump file: decompress it with bzcat, stream the XML
# through the SAX handler, clean the wiki markup of each page and append the
# resulting (title, text) rows to the `articles` table of the local database.
#
# `with sqlite3.connect(...)` only manages the transaction and never closes
# the connection, so the connection is closed explicitly in `finally`.
local_conn = sql.connect(local_db)
try:
    for path in file_paths:
        print(path)
        # Create an instance of the handler class
        handler = WikiXmlHandler()

        # Parsing object
        parser = xml.sax.make_parser()

        # Tell the parser to use the custom handler instance
        parser.setContentHandler(handler)

        # The context managers close the input file and wait for the bzcat
        # child process even when parsing fails part-way through.
        with open(path, 'rb') as compressed, \
                subprocess.Popen(['bzcat'],
                                 stdin=compressed,
                                 stdout=subprocess.PIPE) as bzcat:
            for line in bzcat.stdout:
                parser.feed(line)

        if bzcat.returncode != 0:
            print('WARNING: bzcat exited with status {} for {}'.format(
                bzcat.returncode, path))

        # Signal end of input so the parser can finish and release its
        # resources. Pages parsed so far are still stored if the document
        # turns out to be incomplete.
        try:
            parser.close()
        except xml.sax.SAXException as err:
            print('WARNING: incomplete XML in {}: {}'.format(path, err))

        # Pre-sized and filled by index: skipped pages stay None and are
        # removed by dropna() below.
        title_list = [None] * len(handler._pages)
        text_list = [None] * len(handler._pages)
        # Currently never populated (category extraction is disabled).
        category_list = [None] * len(handler._pages)

        for i, article in enumerate(tqdm(handler._pages)):
            text = mwparserfromhell.parse(article[1]).strip_code().strip()
            # Redirect pages carry no article content.
            if text.split(' ')[0] == 'REDIRECT':
                continue
            text = text.replace('|', ' ').replace('\n', ' ')
            text = re.sub(r'\<([^>]+)\>', '', text)  # leftover tags
            text = re.sub(r'http\S+', '', text)      # URLs
            # remove references and everything afterwards
            text = text.split('== See also ==')[0]
            text = re.sub(r'=|\*|"', '', text)       # heading/list markup, quotes

            # Insert a space at every lower-case/upper-case boundary.
            title_list[i] = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", article[0])
            text_list[i] = text

        article_df = pd.DataFrame({'title': title_list, 'text': text_list})
        # One transaction per input file: committed on success, rolled back
        # if the insert raises.
        with local_conn:
            article_df.dropna().reset_index(drop=True).to_sql(
                'articles', local_conn, if_exists='append', index=False)
finally:
    local_conn.close()