# Notebook cell In [56]:
import psycopg2               # PostGIS client
import re                     # Regex
from lxml import etree        # XML parsing
import multiprocessing as mp  # multiprocessor support

# MediaWiki XML dump to read, and the language code stored with every row
# written to the "links" table.
infile = "testdump-m.xml"
lang = "en"

# Namespace prefix of the MediaWiki export schema; it is prepended to tag
# names when elements are looked up in the dump.
ns = '{http://www.mediawiki.org/xml/export-0.10/}'

# drop everything and start over?
drop = True

# Connect to an existing database
conn = psycopg2.connect("dbname=CPT user=carsten")

# Open a cursor to perform database operations
cur = conn.cursor()

# Pre-compiled wiki-link regex.  Groups: (1) link target, (2) the "|"
# separator if present, (3) the alias text after the separator.
# Written as a raw string so the backslashes reach the regex engine without
# relying on Python passing unknown string escapes through unchanged.
# NOTE(review): the second "[" is optional, so "[target]]" matches as well as
# "[[target]]" -- confirm this is intended.
linkpattern = re.compile(r"\[\[?([^]|]*)(\|)?([^]|]*)?\]\]")

if drop:  # drop everything and recreate tables
    cur.execute('DROP TABLE IF EXISTS "links";')
    cur.execute('CREATE TABLE "links" ("from" varchar, "to" varchar, "lang" varchar, "links" integer, "mentions" integer) ;')
    
def _count_references(pagetext, pattern):
    """Count links and textual mentions for every link target in a page.

    pattern must be a compiled regex whose findall() yields
    (target, separator, alias) triples.

    Returns a list of (target, links, mentions) tuples sorted by target:
      * links    -- how many wiki links on the page point at the target
      * mentions -- occurrences of the target text anywhere on the page,
                    plus occurrences of each distinct alias used for it
    """
    link_counts = {}
    aliases_by_target = {}

    for target, _separator, alias in pattern.findall(pagetext):
        if not target:
            # "[[]]" or "[[|alias]]": there is no page to attribute this to.
            continue
        link_counts[target] = link_counts.get(target, 0) + 1

        alias = alias.strip(" ")
        if alias:  # skips empty alias, which does happen...
            # A set counts each alias once per target, no matter how many
            # links use it or in which order they appear.
            aliases_by_target.setdefault(target, set()).add(alias)

    rows = []
    for target in sorted(link_counts):
        target_hits = pagetext.count(target)
        mentions = target_hits
        for alias in aliases_by_target.get(target, ()):
            alias_hits = pagetext.count(alias)
            if alias in target:
                # The alias is a substring of the target, e.g. "Brooklyn"
                # for "Brooklyn, NY": every target occurrence was already
                # counted, so remove those to avoid double counting.
                alias_hits -= target_hits
            mentions += alias_hits
        rows.append((target, link_counts[target], mentions))
    return rows


def findreferences(pagetext, pagetitle, cursor=None, language=None, pattern=None):
    """Insert one "links" row per distinct link target found in pagetext.

    Each row is (pagetitle, target, language, links, mentions); see
    _count_references for how the two counters are computed.  Rows are
    written in alphabetical order of the target, including the last one.

    Parameters:
      pagetext  -- wiki markup of the page
      pagetitle -- title of the page, stored in the "from" column
      cursor    -- object with execute(sql, params); defaults to the
                   module-level ``cur``
      language  -- language code for the row; defaults to module-level ``lang``
      pattern   -- compiled link regex; defaults to module-level ``linkpattern``
    """
    if cursor is None:
        cursor = cur
    if language is None:
        language = lang
    if pattern is None:
        pattern = linkpattern

    for target, links, mentions in _count_references(pagetext, pattern):
        cursor.execute("INSERT INTO links VALUES (%s, %s, %s, %s, %s);",
                       (pagetitle, target, language, links, mentions))


def processpage(page):
    """Extract the link statistics of one <page> element, then free it.

    Reads the page's <title> and <revision>/<text> children and hands their
    text to findreferences().  Pages where either element is missing, or
    whose text is empty (``.text`` is None), are skipped instead of raising.
    The element is cleared afterwards in every case so memory stays bounded
    while the dump is streamed.
    """
    title_node = page.find(ns + 'title')
    text_node = page.find(ns + 'revision/' + ns + 'text')

    # Compare against None explicitly: find() returns None when nothing matches.
    title = title_node.text if title_node is not None else None
    text = text_node.text if text_node is not None else None

    if title is not None and text is not None:
        findreferences(text, title)

    # It's safe to call clear() here because no descendants will be accessed
    page.clear()

    # Also drop the already-processed siblings that precede this page, so the
    # parent does not keep references to them.
    while page.getprevious() is not None:
        del page.getparent()[0]

            
def go(pages):
    """Run processpage() on each element produced by ``pages``.

    ``pages`` is an iterable of (event, element) pairs; the event part is
    ignored and only the element is forwarded.
    """
    for _event, element in pages:
        processpage(element)
        

            
# for the parsing, we follow the approach explained here:
# http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
# Only 'end' events of <page> elements are delivered.
pages = etree.iterparse(infile, events=('end',), tag=ns+'page')

# Multiprocess mode, as explained here, is currently disabled:
# http://sebastianraschka.com/Articles/2014_multiprocessing_intro.html
# pool = mp.Pool(processes=4)
# results = [pool.apply_async(go, args=(x,)) for x in range(1,7)]

try:
    # Actually walk the dump; without this call no page is ever processed.
    go(pages)

    # Make the changes to the database persistent
    conn.commit()
finally:
    # Close communication with the database, even if processing failed
    cur.close()
    conn.close()

# Parenthesised form works as a statement in Python 2 and a call in Python 3.
print("done")

# Cell output: done
