In [248]:
import requests
import urllib
import numpy as np
import math
from bs4 import BeautifulSoup
from collections import Counter

In [2]:
blocklist = [
    "Main_Page",
    "Help:",
    "Special:",
    "Portal:",
    "Talk:",
    "Template:"
]

In [285]:
def parse_wiki_link(link, get_text=False):
    r = requests.get(link)
    
    soup = BeautifulSoup(r.content, 'html.parser')
    # jank way to figure out redirect links
    real_link = soup.find_all('link', {"rel" : "canonical"})[0].get("href").split("#")[0]
    if real_link == link:
        real_link = None

    wiki_content_links = set()
    for link in soup.find_all('a', href=True):
        if link["href"] == "#cite_ref-1":
            break
        clean_link = link["href"]
        clean_link = urllib.parse.unquote(clean_link)
        if clean_link.startswith("/wiki/"):
            wiki_link = clean_link[6:]
            if any(x in wiki_link for x in blocklist):
                continue
            wiki_content_links.add(clean_link)

    if get_text:
        # todo - clean this text
        words = soup.find_all('p')
        return wiki_content_links, real_link, words

    return wiki_content_links, real_link

In [130]:
link = "https://en.wikipedia.org/wiki/Optimal_control_theory"
l, real_link = parse_wiki_link(link)

In [280]:
class UserHistory:
    def __init__(self, user_history):
        self.user_history = user_history
        # user_vists is a list of links in chronological order ascending
        # user_vists[-1] is the current page
        self.outgoing_links = Counter()
        #self.ingoing_links = set()
        self.words = []
        for link in user_history:
            results, real_link, words = parse_wiki_link(link, get_text=True)
            self.words.extend(words)
            self.outgoing_links.update(results)
            #self.ingoing_links.update(parse_wiki_ingoing(link))
        self.already_visited_pages = {link.split("wikipedia.org")[1] for link in user_history}

        # remove self-loops
        for page in self.already_visited_pages:
            if page in self.outgoing_links:
                del self.outgoing_links[page]
        #self.ingoing_links -= already_visited_pages
        
        # self.outgoing_text

        outgoing_links_list = list(self.outgoing_links.values())
        self.mean = np.average(outgoing_links_list)
        self.std = np.std(outgoing_links_list)


In [318]:
def wiki_prefix(suffix):
    # suffix is '/wiki/<article title>'
    return "https://en.wikipedia.org/"+suffix

In [271]:
class Cache:
    def __init__(self, fetch_fn):
        self.dict = dict()
        self.fetch_fn = fetch_fn

    def __call__(self, key, args):
        if key in self.dict:
            return self.dict[key]
        result = self.fetch_fn(args)
        self.dict[key] = result
        return result

def linkcount_fetch(wiki_page):
    # this is so damn slow
    link = f"https://linkcount.toolforge.org/api/?page={wiki_page}&project=en.wikipedia.org"
    r = requests.get(link).json()
    return r["wikilinks"]["all"]

In [281]:
def score_link_similarity(user_history, target):
    # user_history  
    #   incorporate idf (just hyperlinks) -> scrape target/what_links_here (expensive)
    #   or sample 10000 pages and count link frequency and store it somewhere else                       
    #   incorporate ingoing recommendations
    # return score(target | user_history)

    # how many times does target appear in self.outgoing_links
    z_score = (user_history.outgoing_links[target]-user_history.mean)/user_history.std
    return .5 * (math.erf(z_score / 2 ** .5) + 1)

In [291]:
def score_link_text_similarity(user_history, target):
    # user_history  
    #   incorporate idf (text)
    #   incorporate ingoing recommendations
    # return score(target | user_history)

    # references in div class = reflist
    
    target = link_clean(target)
    words = user_history.words
    total = 0
    for i in words:
        i = str(i)
        i = i.lower()
        total += i.count(target)
    return total

def link_clean(target):
    title = target.lower()
    title = title[6:]
    title = title.replace("_", " ")
    return title

In [325]:
def score_coupling_similarity(user_history, target, cache, doc_freq_cache):
    # user_history  
    #   need to download target and scrape it's links
    # pages are similar if their outgoing (ingoing) links have overlap
    if target in cache:
        results, real_link = cache[target]
    else:
        results, real_link = parse_wiki_link(target)
        # TODO swap to MediaWiki API
        cache[target] = [results, real_link]
    if real_link is not None:
        # this is a redirect link
        if "/wiki" + real_link.split("/wiki")[1] in user_history.already_visited_pages:
            # already been to this page, don't re-recommend
            # unless we want to in some cases?
            return -100

    #doc_freq = doc_freq_cache()
    target_outgoing = Counter(results)

    score = 0
    doc_len = sum(v for v in user_history.outgoing_links.values())
    k1 = 0.5
    k3 = 0.5
    b = 0.9
    avg_doc_len = 50 #?
    for link, count in target_outgoing.items():
        query_count = user_history.outgoing_links[link]
        if count == 0 or query_count == 0:
            continue
        page_name = link.split("/wiki/")[1]
        doc_freq = 100 #doc_freq_cache(page_name, page_name)

        norm_qtf = (k3+1)*query_count / (k3 + query_count)
        norm_tf = count * (k1 + 1) / (count + k1*((1-b)+b*(doc_len/avg_doc_len)))
        tf = norm_tf * norm_qtf

        num_links_on_wiki = 1e7
        idf = np.log(num_links_on_wiki / (doc_freq+1))
        score += tf * idf
    #union = sum(v for v in target_outgoing.values()) + sum(v for v in user_history.outgoing_links.values())

    return score

In [None]:
cache = dict()

In [209]:
doc_freq_cache = Cache(linkcount_fetch)

In [286]:
link1 = "https://en.wikipedia.org/wiki/Hamilton–Jacobi–Bellman_equation"
link2 = "https://en.wikipedia.org/wiki/Value_function"
link3 = "https://en.wikipedia.org/wiki/Optimal_control"
user_history = UserHistory([link1, link2, link3])

In [184]:
link1 = "https://en.wikipedia.org/wiki/Bitcoin"
link2 = "https://en.wikipedia.org/wiki/Lightning_Network"
user_history = UserHistory([link1, link2])

In [294]:
def compute_outgoing_scores_baseline(user_history):
    # composite score_link_similarity and score_link_text_similarity
    # (todo: this filters scores, will do re-ranking with coupling similarity, re-ranking with deeper searches, etc)
    weight = 0.01 # to be tuned
    outgoing_scores = dict()
    for link in user_history.outgoing_links:
        outgoing_scores[link] = score_link_similarity(user_history, link) + weight * score_link_text_similarity(user_history, link)
    return outgoing_scores

In [297]:
baseline_scores = compute_outgoing_scores_baseline(user_history)
sorted_baseline_scores = [(k, v) for k, v in sorted(baseline_scores.items(), reverse=True, key=lambda item: item[1])]

In [326]:
def rerank_with_coupling(user_history, baseline_scores, num_rerank):
    new_rankings = dict()
    for target, score in baseline_scores[:num_rerank]:
        new_rankings[target] = score_coupling_similarity(user_history, wiki_prefix(target), cache, doc_freq_cache)
    return new_rankings

In [329]:
results = rerank_with_coupling(user_history, sorted_baseline_scores, 15)

In [330]:
[(k, v) for k, v in sorted(results.items(), reverse=True, key=lambda item: item[1])]

[('/wiki/Dynamic_programming', 158.8045786822883),
 ('/wiki/Bellman_equation', 145.58730318947676),
 ('/wiki/Richard_Bellman', 135.32911922490658),
 ('/wiki/Dimitri_Bertsekas', 124.28184418613871),
 ('/wiki/Control_theory', 122.90093480629272),
 ('/wiki/Differential_equation', 120.13911604660075),
 ('/wiki/Objective_function', 103.56820348844892),
 ('/wiki/Hamiltonian_(control_theory)', 85.61638155045111),
 ('/wiki/Rudolf_E._Kálmán', 64.90274085276133),
 ('/wiki/Dynamical_system', 49.71273767445548),
 ("/wiki/Pontryagin's_maximum_principle", 37.87637156148989),
 ('/wiki/Viscosity_solution', 37.28455325584161),
 ('/wiki/Partial_differential_equation', 27.61818759691971),
 ('/wiki/Riccati_equation', 13.809093798459855),
 ('/wiki/Control_(optimal_control_theory)', 13.809093798459855)]

In [None]:
def recommend(ranks):
    pass

In [None]:
def visualize_history(user_history):
    pass