In [217]:
import requests
import urllib
import numpy as np
from bs4 import BeautifulSoup
from collections import Counter

In [2]:
blocklist = [
    "Main_Page",
    "Help:",
    "Special:",
    "Portal:",
    "Talk:",
    "Template:"
]

In [179]:
def parse_wiki_link(link):
    r = requests.get(link)
    
    soup = BeautifulSoup(r.content, 'html.parser')
    # jank way to figure out redirect links
    real_link = soup.find_all('link', {"rel" : "canonical"})[0].get("href").split("#")[0]
    if real_link == link:
        real_link = None

    wiki_content_links = set()
    for link in soup.find_all('a', href=True):
        clean_link = link["href"]
        clean_link = urllib.parse.unquote(clean_link)
        if clean_link.startswith("/wiki/"):
            wiki_link = clean_link[6:]
            if any(x in wiki_link for x in blocklist):
                continue
            wiki_content_links.add(clean_link)
        
    return wiki_content_links, real_link

In [130]:
link = "https://en.wikipedia.org/wiki/Optimal_control_theory"
l, real_link = parse_wiki_link(link)

In [180]:
class UserHistory:
    def __init__(self, user_history):
        self.user_history = user_history
        # user_vists is a list of links in chronological order ascending
        # user_vists[-1] is the current page
        self.outgoing_links = Counter()
        #self.ingoing_links = set()
        for link in user_history:
            results, real_link = parse_wiki_link(link)
            self.outgoing_links.update(results)
            #self.ingoing_links.update(parse_wiki_ingoing(link))
        self.already_visited_pages = {link.split("wikipedia.org")[1] for link in user_history}

        # remove self-loops
        for page in self.already_visited_pages:
            if page in self.outgoing_links:
                del self.outgoing_links[page]
        #self.ingoing_links -= already_visited_pages
        
        # self.outgoing_text

In [137]:
def wiki_prefix(suffix):
    # suffix is '/wiki/<article title>'
    return "https://en.wikipedia.org/"+link

In [207]:
class Cache:
    def __init__(self, fetch_fn):
        self.dict = dict()
        self.fetch_fn = fetch_fn

    def __call__(self, key, args):
        if key in self.dict:
            return self.dict[key]
        result = self.fetch_fn(args)
        self.dict[key] = result
        return result

def linkcount_fetch(wiki_page):
    # this is so damn slow
    link = f"https://linkcount.toolforge.org/api/?page={wiki_page}&project=en.wikipedia.org"
    r = requests.get(link).json()
    return r["wikilinks"]["all"]

In [88]:
def score_link_similarity(user_history, target):
    # user_history  
    #   incorporate idf (just hyperlinks) -> scrape target/what_links_here (expensive)
    #   or sample 10000 pages and count link frequency and store it somewhere else                       
    #   incorporate ingoing recommendations
    # return score(target | user_history)

    # how many times does target appear in self.outgoing_links
    pass

In [89]:
def score_link_text_similarity(user_history, target):
    # user_history  
    #   incorporate idf (text)
    #   incorporate ingoing recommendations
    # return score(target | user_history)
    pass

In [222]:
def score_coupling_similarity(user_history, target, cache, doc_freq_cache):
    # user_history  
    #   need to download target and scrape it's links
    # pages are similar if their outgoing (ingoing) links have overlap
    if target in cache:
        results, real_link = cache[target]
    else:
        results, real_link = parse_wiki_link(target)
        cache[target] = [results, real_link]
    if real_link is not None:
        # this is a redirect link
        if "/wiki" + real_link.split("/wiki")[1] in user_history.already_visited_pages:
            # already been to this page, don't re-recommend
            # unless we want to in some cases?
            return -100

    #doc_freq = doc_freq_cache()
    target_outgoing = Counter(results)

    score = 0
    doc_len = sum(v for v in user_history.outgoing_links.values())
    k1 = 0.5
    k3 = 0.5
    b = 0.9
    avg_doc_len = 50 #?
    for link, count in target_outgoing.items():
        query_count = user_history.outgoing_links[link]
        if count == 0 or query_count == 0:
            continue
        page_name = link.split("/wiki/")[1]
        doc_freq = doc_freq_cache(page_name, page_name)

        norm_qtf = (k3+1)*query_count / (k3 + query_count)
        norm_tf = count * (k1 + 1) / (count + k1*((1-b)+b*(doc_len/avg_doc_len)))
        tf = norm_tf * norm_qtf

        num_links_on_wiki = 1e7
        idf = np.log(num_links_on_wiki / (doc_freq+1))
        score += tf * idf
    #union = sum(v for v in target_outgoing.values()) + sum(v for v in user_history.outgoing_links.values())

    return score

In [None]:
cache = dict()

In [209]:
doc_freq_cache = Cache(linkcount_fetch)

In [147]:
link1 = "https://en.wikipedia.org/wiki/Hamilton–Jacobi–Bellman_equation"
link2 = "https://en.wikipedia.org/wiki/Value_function"
link3 = "https://en.wikipedia.org/wiki/Optimal_control"
user_history = UserHistory([link1, link2, link3])

In [184]:
link1 = "https://en.wikipedia.org/wiki/Bitcoin"
link2 = "https://en.wikipedia.org/wiki/Lightning_Network"
user_history = UserHistory([link1, link2])

In [229]:
iou_scores = dict()
for link in user_history.outgoing_links.keys():
    iou_scores[link] = score_coupling_similarity(user_history, wiki_prefix(link), cache, doc_freq_cache)
{k: v for k, v in sorted(iou_scores.items(), reverse=True, key=lambda item: item[1])}

{'/wiki/Blockchain#Blockchain_analysis': 446.8843310628075,
 '/wiki/Bitcoin_network#Mining': 431.2871402340079,
 '/wiki/Bitcoin_network#Payment_verification': 431.2871402340079,
 '/wiki/Bitcoin_Cash': 419.15067436413483,
 '/wiki/Peercoin': 405.7886679475003,
 '/wiki/Proof-of-work_system': 397.5773815048087,
 '/wiki/Bitcoin_Gold': 397.45741631690527,
 '/wiki/Ledger_(journal)': 390.8854515109159,
 '/wiki/List_of_people_in_blockchain_technology': 387.1498060394201,
 '/wiki/Blockchain.info': 377.77912996630687,
 '/wiki/BitLicense': 376.39386692372705,
 '/wiki/Bitwala': 373.1601653409947,
 '/wiki/Environmental_impact_of_cryptocurrencies': 325.70287850964627,
 '/wiki/ERC-20': 319.8335247975653,
 '/wiki/Ethereum': 319.8335247975653,
 '/wiki/Digital_currency': 315.02486765102344,
 '/wiki/Virtual_currency': 308.82921330417446,
 '/wiki/Diem_(digital_currency)': 308.3917625231289,
 '/wiki/List_of_SHA-256_crypto_currencies': 307.22545243792393,
 '/wiki/Cryptocurrency_and_security#Bitcoin': 297.619

In [21]:
score_coupling_similarity(user_history, "https://en.wikipedia.org/wiki/Dynamic_programming")

0.08285163776493257

In [22]:
score_coupling_similarity(user_history, "https://en.wikipedia.org/wiki/]")

0.4370860927152318

In [91]:
def compute_outgoing_scores_baseline(user_history):
    # composite score_link_similarity and score_link_text_similarity
    # (todo: this filters scores, will do re-ranking with coupling similarity, re-ranking with deeper searches, etc)
    weight = 1.0 # to be tuned
    outgoing_scores = dict()
    for link in user_history.outgoing_links:
        outgoing_scores[link] = score_link_similarity(user_history, link) + weight * score_link_text_similarity(user_history, link)
    return outgoing_scores

In [135]:
def rerank_with_coupling(user_history, baseline_scores, num_rerank):
    sorted_scores = sort(baseline_scores, 'descending')

    for target, score in sorted_scores[num_rerank]:
        coupling_score = score_coupling_similarity(user_history, target)
        

In [None]:
def recommend(ranks):
    pass

In [None]:
def visualize_history(user_history):
    pass