# Generate Training Corpus from Wikipedia Dump

The goal of this notebook is to create an annotated corpus of video game titles using Wikipedia and DBPedia.

For this we need a Wikipedia XML dump (https://en.m.wikipedia.org/wiki/Wikipedia:Database_download).
The Wikipedia dump contains 2 files:
* enwiki-...-articles-multistream.xml.bz2 (this is the actual content)
* enwiki-...-articles-multistream-index.txt.bz2 (a index file containing all page names, an the byte offset of the file-chunk, the page is part of)

The index file can be used to extract the correct file-chunk from the bz2 archive (without need to extract the whole archive):

Lookup page byte offset -> read file chunk -> extract chunk and prase xml content -> find correct xml entry for page


### The general workflow for creating the trainding dataset

1. Get a list of all videogame entities (= wikipedia page names) from dbpedia 
2. Fetch the wikipedia page for each dbpedia entry
3. Parse wikipedia page and check each link if it links to a game entry. If so, annotate the link as a game title.
4. Build a text corpous with all paragraphs containing one or more game title annotations.


In [1]:
import bz2
import bz2file
import lxml
import re
import os
import json
import urllib.parse
from tqdm import tqdm
from bs4 import BeautifulSoup
from SPARQLWrapper import SPARQLWrapper, JSON, JSONLD

### Constants

In [2]:
#DBPEDIA SPARQL ENDPOINT
SPARQL_ENDPOINT = "http://dbpedia.org/sparql/"
VIDEO_GAME = "http://dbpedia.org/ontology/VideoGame"
COMPANY = "http://dbpedia.org/ontology/Company"
DEVICE = "http://dbpedia.org/ontology/Device"
GENRE = "http://dbpedia.org/class/yago/WikicatVideoGameGenres"
REMAKE = "http://dbpedia.org/class/yago/WikicatVideoGameRemakes"
SOFTWARE = "http://dbpedia.org/ontology/Software"

In [3]:
# wikipedia dump files
DUMP_PAGES = "/home/pmuehleder/data/wikipedia/enwiki-20180301-pages-articles-multistream.xml.bz2"
DUMP_INDEX = "/home/pmuehleder/data/wikipedia/enwiki-20180301-pages-articles-multistream-index.txt.bz2"

OUT_DIR = "/home/pmuehleder/data/wikipedia/ner_train"
WIKI_DIR = "/home/pmuehleder/data/wikipedia"

### Wikipedia dump helper functions

In [4]:
def get_byte_positions(wiki_idx):
    """
    Returns byte positions, id and title for a specific entry in the wikipedia index dump file
    """
    start_byte = int(wiki_index[wiki_idx].split(":")[0])
    id_ = int(wiki_index[wiki_idx].split(":")[1])
    title = wiki_index[wiki_idx].split(":")[2]
    
    #find byte position of next section
    offset = 1
    while True:
        entry = wiki_index[wiki_idx+offset]
        end_byte = int(entry.split(":")[0])
        if end_byte != start_byte:
            break
        offset += 1
    
    return start_byte, end_byte, id_, title

In [5]:
def get_chunks():
    """
    Returns byte positions of usable chunks in the wikipedia dump
    """
    current_start_byte = int(wiki_index[0].split(":")[0])
    for entry in wiki_index:
        start_byte = int(entry.split(":")[0])
        
        if start_byte != current_start_byte:
            yield (current_start_byte, start_byte)
            current_start_byte = start_byte

In [6]:
def get_paragraphs(text):
    """
    Yields text paragraphs of a wiki page.
    Removes lists and tables, headlines and paragraphs shorter than 10 tokens.
    """
    paragraphs = text.split("\n")
    for p in paragraphs:
        p = p.strip()
        if len(p)>0:
            if p[0] not in "#|*=:":
                if len(p.split(" ")) > 10:
                    p = clean(p)
                    if "align=" not in p and "[[File:" not in p:
                        yield p

In [7]:
def preprocess(text):
    """
    Removes references (citations, files) and markup text formatting
    """
    text = text.replace("''", "").replace("{{'}}", "'")
    text = re.sub(r'<ref .+?$','', text)
    text = re.sub(r'<ref>.+?</ref>', '',text)
    text = re.sub(r'<ref name[^<]+?/>', '', text)
    text = re.sub(r'<ref name.+?</ref>', '', text)
    text = re.sub(r'<.+?>', '', text)
    
    text = re.sub(r'{{[cC]ite.+?}}', '', text)

    text =text.replace("{{cite web", "")
    #text = re.sub(r'\[\[File.+?\]\]', '', text)
    return text


In [8]:
def clean(text):
    """
    Removes wikipedia auto references ( inside '{{ }}' )
    """
    for sub in  re.finditer(r'{{.+?}}', text):
        original = sub.group(0)
        if len(original.split("|")) > 1:
            repl = original.split("|")[1] 
            if "{{" in repl:
                repl = ""
        else:
            repl = original.replace("{{","")
        repl = repl.replace("}}", "")
        if "date=" in repl:
            repl=""
        text = text.replace(original, repl)

    return text.replace("}}", "").replace("{{", "")

In [9]:
def search_chunk_for_page(term):
    """
    Looks up wikipedia index for page :term: and returns its byte position in wikipedia dump
    """
    for i, entry in enumerate(wiki_index):
        if term in entry:
            #print(entry)

            positions = get_byte_positions(i)
            length = positions[1]-positions[0]
            return (positions[0], length, positions[3])

In [10]:
def get_links(text):
    """
    Searches all internal wikitext markup links in text and 
    yields complete link markup string, entity name and link text
    """
    links = []
    
    for sub in  re.finditer(r'\[\[.+?\]\]', text):
        positions = sub.span()
        original = sub.group(0)
        
        spl = original.split("|")
        if len(spl) > 1:
            entity = spl[0].replace("[[", "")
            text = spl[1].replace("]]", "")
        else:
            entity = text = original.replace("[[","").replace("]]","")

        entity = entity.replace(" ","_")
        links.append( [original, entity, text, positions] ) 
    return sorted(links, key=lambda x: x[3][0])
                
                

In [11]:
def annotate(text):
    """
    checks the links in a 
    """
    annotations = []
    for markup, entity, link_text, positions in get_links(text):
                    
        #text = text.replace(markup, link_text)
        start = text.find(markup)
        end = start+len(link_text)
        
        text = text[:start]+link_text+text[start+len(markup):]
        if is_video_game(entity):   

            annotations.append({
                "type": "Game",
                "name": link_text,
                "start": start,
                "end": end
            })

            
    return text, annotations
    

In [12]:
def is_video_game(entity):
    """
    Queries DBPedia to check if entity is a video game.
    A video game is of type "Videogame" and "Software".
    """
    sparql.setQuery(query.format(entity=entity))
    sparql.setReturnFormat(JSON)
    try:
        results = sparql.query().convert()
    except:
        print(entity)
        return False
    types = [ r["o"]["value"] for r in results["results"]["bindings"] ]

    
    if VIDEO_GAME in types and SOFTWARE in types:
        return True
    else:
        return False

### DBPedia SPARQL setup and helper functions

In [13]:
sparql = SPARQLWrapper(SPARQL_ENDPOINT)

In [14]:
query = """
SELECT * 
WHERE {{
  <http://dbpedia.org/resource/{entity}> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?o.
}}
"""

In [17]:
def get_entities():
    """
    Fetch all entities from DBPedia of type VideoGame and Software
    """
    
    q = """
    SELECT DISTINCT ?ent
    WHERE {{
      ?ent ?p ?o.
      ?o <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/VideoGame>;
         <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Software> .
    }}
    OFFSET {offset}
    LIMIT 10000
    """
    sparql.setQuery(q.format(offset=0))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    entities = [ r["ent"]["value"] for r in results["results"]["bindings"] ]
    
    offset = 0
    while True:
        offset += 10000
        print(offset)
        ent_count = len(entities)
    
        sparql.setQuery(q.format(offset=offset))
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        entities += list(set([ r["ent"]["value"] for r in results["results"]["bindings"] ]))
        if ent_count == len(entities):
            break
    
    print(len(set(entities)))
    return entities

In [18]:
vg_entities = get_entities()

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
134598


In [19]:
vg_entities[:10]
vg_entities_filepath = os.path.join(OUT_DIR, "vg_entities.json")
with open(vg_entities_filepath, "w") as f:
    json.dump(vg_entities, f)


# Read index file data

In [15]:
#read index file
with bz2.BZ2File(DUMP_INDEX) as f:
    wiki_index = f.readlines()

wiki_index = [ x.decode().strip() for x in wiki_index ]

# Create annotated NER training corpus

In [16]:
len([x for x in get_chunks()])

182897

In [None]:
vg_entities_filepath = os.path.join(WIKI_DIR, "vg_entities.json")
with open(vg_entities_filepath) as f:
    vg_entities = json.load(f)

In [None]:

files = os.listdir(OUT_DIR)
files = [ int(f.replace(".json","")) for f in files ]
if len(files):
    start_offset = max(files)
else:
    start_offset = 0

#print(start)

with open(DUMP_PAGES, "rb") as f:
    #for byte_start, byte_end in get_chunks():
    sents = []
    for i, entity in tqdm(enumerate(vg_entities[start_offset:])):
        
        
        #save training data every 100 wiki pages
        if (i+1)%100 == 0:
            fp = os.path.join(OUT_DIR,"{}.json".format(i+1))
            with open(fp, "w") as of:
                json.dump(sents, of, indent=4)
            sents = []        
        
        #check if dbpedia entity is in wikipedia index file and get byte positions of entity
        term = entity.split("/")[-1].replace("_", " ")
        try:
            byte_start, byte_length, page_title = search_chunk_for_page(term)
        except:
            print(">>>")
            print(term)
            print("<<<")
            continue
        
        #read and dezip chunk containing wikipedia page 
        f.seek(byte_start,0)
        compressed = f.read(byte_length)
        data = bz2.decompress(compressed)
        
        #iteratate unzipped wiki chunk until page is found
        soup = BeautifulSoup(data.decode().strip(), "lxml")
        for page in soup.find_all("page"):
            title = page.find("title").text
            if page_title in title:


                text = page.find("text").text
                text = preprocess(text)

                for p in get_paragraphs(text):

                    final_text,annotations = annotate(p)
                    if len(annotations) > 0:

                        sents.append({
                            "text": final_text,
                            "annotations": annotations
                        })
                
                break

        

9it [00:16,  1.81s/it]

Edward_"Edge"_Geraldine


28it [01:01,  2.21s/it]

>>>
Tetris & Dr. Mario
<<<


36it [01:08,  1.91s/it]

"Weird_Al"_Yankovic


41it [01:19,  1.93s/it]