In [24]:
import re
import pickle
import shelve
import mwparserfromhell
from mwparserfromhell.nodes.text import Text
from mwparserfromhell.nodes.wikilink import Wikilink 
import wikitextparser as wtp

import requests
import nltk
from nltk.util import ngrams
import operator
import numpy as np

import time
import operator
import sys
import csv

from scripts.utils import wtpGetLinkAnchor
from scripts.utils_features import get_feature_set

In [25]:
# Wiki language edition to process.
# (When run as a script this could instead come from the command line:
#  sys.argv[1] if one is given, falling back to 'en'.)
lang = 'simple'
# Wiki identifier derived from the language code, e.g. 'simplewiki'.
wiki = '{0}wiki'.format(lang)

In [26]:
# MediaWiki API endpoint of the selected language edition.
API_URL = "https://{0}.wikipedia.org/w/api.php".format(lang)

def parse(title, timeout=30):
    """Fetch the latest revision of a page and parse its wikitext.

    Parameters
    ----------
    title : str
        Title of the page to fetch from ``API_URL``.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    The object returned by ``mwparserfromhell.parse`` for the content of the
    main slot of the most recent revision.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError
        If the API response contains no revision for ``title``.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "rvlimit": 1,
        "titles": title,
        "format": "json",
        "formatversion": "2",
    }
    headers = {"User-Agent": "My-Bot-Name/1.0"}
    # A timeout keeps a stalled connection from blocking the kernel forever.
    req = requests.get(API_URL, headers=headers, params=params, timeout=timeout)
    # Fail on 4xx/5xx here rather than later while decoding the body.
    req.raise_for_status()
    res = req.json()
    page_info = res["query"]["pages"][0]
    if "revisions" not in page_info:
        # Same exception type as the bare lookup would raise, but with a
        # message that names the offending title.
        raise KeyError("no revisions returned for page {!r}".format(title))
    revision = page_info["revisions"][0]
    text = revision["slots"]["main"]["content"]
    return mwparserfromhell.parse(text)

In [27]:
## Open the pre-computed datasets as read-only shelves
def _open_readonly(path_template):
    """Open the shelf at ``path_template`` (formatted with ``lang``) read-only."""
    return shelve.open(path_template.format(lang), flag='r')

# The anchor dictionary (the main data structure)
anchors = _open_readonly("./data/{0}/{0}.anchors.db")
pageids = _open_readonly("./data/{0}/{0}.pageids.db")
redirects = _open_readonly("./data/{0}/{0}.redirects.db")

## word2vec features
word2vec = _open_readonly("./data/{0}/{0}.w2v.filtered.db")
## navigation-vector features
nav2vec = _open_readonly("./data/{0}/{0}.nav.filtered.db")

## Trained link model
import xgboost as xgb
model = xgb.XGBClassifier()  # empty classifier, filled in by load_model below
model.load_model('./data/{0}/{0}.linkmodel.bin'.format(lang))  # read the trained model from disk

In [32]:
# Main decision function.

# For a given page and a piece of text (a candidate mention, e.g. "lipsum"), score every candidate link target.
# Returns the most likely candidate according to the pre-trained link model.
# If its probability is below the given threshold, returns None.
def classify_links(page, text, THRESHOLD):
    """Pick the most probable link target for a mention on a page.

    Parameters
    ----------
    page : str
        Title of the page being processed; passed through to ``get_feature_set``.
    text : str
        The mention; must be a key of ``anchors``.
    THRESHOLD : float
        Minimum model probability required to accept the best candidate.

    Returns
    -------
    tuple or None
        ``(candidate, probability)`` for the highest-scoring candidate, or
        ``None`` if the mention has no candidates or the best probability is
        below ``THRESHOLD``.

    Raises
    ------
    KeyError
        If ``text`` is not present in ``anchors``.
    """
    # Read the shelf entry once: every shelf lookup unpickles the value again.
    candidates = anchors[text]
    # Work with the 10 candidates that have the highest value in the anchor
    # dictionary (presumably link frequencies -- confirm against the data build).
    if len(candidates) > 10:
        candidates = dict(sorted(candidates.items(), key=operator.itemgetter(1), reverse=True)[:10])

    cand_prediction = {}
    for cand in candidates:
        # get the features
        cand_feats = get_feature_set(page, text, cand, anchors, word2vec, nav2vec)
        # Probability of the positive class for this single feature row.
        cand_prediction[cand] = model.predict_proba(np.array(cand_feats).reshape((1, -1)))[0, 1]

    # Nothing to choose from: max() below would raise ValueError on an empty dict.
    if not cand_prediction:
        return None

    # Compute the top candidate
    top_candidate = max(cand_prediction.items(), key=operator.itemgetter(1))

    # Check if the max probability meets the threshold before returning
    if top_candidate[1] < THRESHOLD:
        return None
    return top_candidate

In [33]:
# Article parsing utility.

# For a given page return the list of all existing links and mentions
# To avoid linking what's already linked
def getLinks(wikicode, page_title):
    """Collect the mentions and link targets already present on a page.

    Both returned sets are seeded with the page title (underscores replaced
    by spaces). For every wikilink found in ``wikicode``, the anchor and the
    link returned by ``wtpGetLinkAnchor`` are then added to them.

    Returns
    -------
    tuple of (set, set)
        ``(mentions, links)``.
    """
    title = page_title.replace('_', ' ')
    # The page title itself counts as both a mention and a link.
    mentions, targets = {title}, {title}
    for wikilink in wtp.parse(str(wikicode)).wikilinks:
        target, anchor_text = wtpGetLinkAnchor(wikilink)
        mentions.add(anchor_text)
        targets.add(target)
    return mentions, targets

In [34]:
# Article parsing utility.

# Split a MWPFH node <TEXT> into sentences
# Tokens that terminate a sentence.
SENT_ENDS = [".", "!", "?"]


def tokenize_sentence_split(text):
    """Yield the sentences of ``text`` as space-joined token strings.

    ``text`` is split on newlines and each line is tokenized with
    ``nltk.word_tokenize``. Tokens are accumulated until one of ``SENT_ENDS``
    is seen, at which point the accumulated tokens (terminator included) are
    yielded. Tokens left over at the end of a line are yielded as a final chunk.
    """
    for raw_line in text.split("\n"):
        pending = []
        for token in nltk.word_tokenize(raw_line):
            pending.append(token)
            if token not in SENT_ENDS:
                continue
            yield " ".join(pending)
            pending = []
        # A line that does not end with a terminator still forms a chunk.
        if pending:
            yield " ".join(pending)

In [35]:
# Actual Linking function
def process_page(page, threshold=None):
    """Fetch a page and insert model-recommended links into its wikitext.

    The top-level text nodes of the page are scanned for n-grams, from 10
    tokens down to 1. An n-gram is tested when its lowercased form is a key of
    ``anchors``, is not a substring of an already linked mention, has no
    candidate that is already linked on the page, and was not tested before.
    Accepted candidates are inserted as ``[[target|mention|pr=probability]]``.

    Parameters
    ----------
    page : str
        Title of the page to process.
    threshold : float, optional
        Minimum probability passed to ``classify_links``. When omitted, the
        module-level ``THRESHOLD`` is looked up at call time.

    Returns
    -------
    The parsed page, as returned by ``parse``, with the new links inserted
    into its text nodes.
    """
    if threshold is None:
        # Backward-compatible default: the notebook-level constant.
        threshold = THRESHOLD

    page_wikicode = parse(page)
    linked_mentions, linked_links = getLinks(page_wikicode, page)
    tested_mentions = set()
    for gram_length in range(10, 0, -1):
        for node in page_wikicode.filter(recursive=False):
            if not isinstance(node, Text):
                continue
            # tokenize_sentence_split already splits on newlines. The string
            # passed in is a snapshot, so edits to node.value made below do
            # not affect the sentences still being iterated.
            for sent in tokenize_sentence_split(str(node.value)):
                for gram in ngrams(sent.split(), gram_length):
                    # Keep the original casing for matching/inserting in the
                    # wikitext; the lowercased form is the lookup key.
                    mention_original = ' '.join(gram)
                    mention = mention_original.lower()

                    # Cheap in-memory check first, shelf lookups afterwards.
                    if mention in tested_mentions or mention not in anchors:
                        continue
                    # Skip if it was previously linked (or is part of a link).
                    if any(mention in s for s in linked_mentions):
                        continue
                    # Skip if any of its candidate links is already used.
                    if set(anchors[mention].keys()) & linked_links:
                        continue

                    candidate = classify_links(page, mention, threshold)
                    if candidate is not None:
                        candidate_link, candidate_proba = candidate
                        ############## Critical ##############
                        # Insert the link in the current wikitext. The pattern
                        # rejects text directly preceded by "[[" or "-->" and
                        # text followed by word/space characters and a "]".
                        match = re.compile(r'(?<!\[\[)(?<!-->)\b{}\b(?![\w\s]*[\]\]])'.format(re.escape(mention_original)))
                        replacement = "[[" + candidate_link + "|" + mention_original + "|pr=" + str(candidate_proba) + "]]"
                        # A function replacement inserts the text literally; a
                        # template string would interpret backslashes in it.
                        newval, found = match.subn(lambda _m: replacement, node.value, count=1)
                        node.value = newval
                        ######################################
                        # Book-keeping (done even when nothing was replaced,
                        # as before). Store the link title, not the
                        # (title, probability) tuple, so the candidate check
                        # above can actually match it.
                        linked_mentions.add(mention)
                        linked_links.add(candidate_link)
                    # More book-keeping
                    tested_mentions.add(mention)

    return page_wikicode

In [36]:
# Running the model on a page
# Minimum model probability a candidate needs to be inserted as a link.
THRESHOLD = 0.8

# Other pages that were tried: "Fernand_Léger", "Tarek_Kamel"
page_title = "Shri Yantra"

print("processing:", page_title)
print("\n==========\n")
# perf_counter is a monotonic clock meant for measuring elapsed time.
t1 = time.perf_counter()
result = process_page(page_title)
t2 = time.perf_counter()
print(t2-t1)


processing: Shri Yantra


2.6180663108825684


In [37]:
# Show the resulting wikitext; links inserted by process_page have the form
# [[target|mention|pr=probability]].
print(result)

{{complex|date=June 2012}}
[[Image:SriYantra color.svg|thumb|The Shri Yantra.]]
The '''Shri Yantra''' or '''Sri Chakra''' of [[Tripura Sundari]] is a [[yantra]] or [[mandala]] formed by nine interlocking triangles surrounding a dot in the cetner called a [[bindu]].  Four of these triangles are orientated upright representing [[Shiva]] or the Masculine.  Five of these triangles are inverted triangles represent [[Shakti]] or the Feminine. Because it is composed of nine triangles, it is also known as the ''navayoni chakra''.<ref name=SC>{{cite book|last=Shankaranarayanan|first=S.|title=Sri Chakra|edition=3rd|year=1979|publisher=Dipti Publications}}</ref>

Together the nine triangles are interlaced in such a way as to form [[43 (number)|43]] smaller triangles in a web symbolic of the entire [[Universe|cosmos|pr=0.8924864]] or a [[Uterus|womb|pr=0.99999416]] symbolic of creation. Together they express [[Advaita|Advaita-ism]] or [[non-duality]]. This is surrounded by a lotus of eight petals,

In [19]:
## the crucial thing is to get the correct mentions.