In [1]:
from class_wikiwrapper import WikiWrapper
import pandas as pd 
import numpy as np
import pickle
import glob
import sys
import re
from time import time, gmtime, strftime
from collections import defaultdict

In [2]:
from class_linkscore import LinkScore
from class_cohm import COHM
from class_annealing import Annealing

In [16]:
def get_results_from_wikipedia(s, wikipedia_wrapper):
    """Return candidate page titles for the mention ``s``.

    Parameters
    ----------
    s : str
        Mention text handed to ``wikipedia_wrapper.get_results``.
    wikipedia_wrapper : object
        Must provide ``get_results(s)`` (iterable of dicts with ``"title"``
        and ``"des"`` keys) and ``get_iolinks(title, ides=True)`` (dict with
        a ``"links"`` list of the same kind of dicts).

    Returns
    -------
    list of str
        Titles in result order. A result whose ``"des"`` flag equals 1 is
        replaced by the titles of its links whose ``"des"`` flag equals 0.
    """
    # NOTE(review): "des" presumably flags disambiguation pages -- confirm
    # against WikiWrapper.
    titles = []
    for hit in wikipedia_wrapper.get_results(s):
        hit_title = hit["title"]
        if hit["des"] != 1:
            titles.append(hit_title)
            continue
        # Flagged result: expand it one level and keep only unflagged links.
        linked = wikipedia_wrapper.get_iolinks(hit_title, ides=True)["links"]
        titles.extend(page["title"] for page in linked if page["des"] == 0)
    return titles

def clf_NIL(size, str_sim, ctx_sim):
    """Rule-based NIL decision: 1 means NIL, 0 means linkable.

    Parameters
    ----------
    size : int
        Number of candidates; zero candidates is always NIL.
    str_sim : float
        String-similarity score of the mention against its candidates.
    ctx_sim : float
        Context-similarity score of the mention against its candidates.

    Returns
    -------
    int
        1 if the mention is classified as NIL, otherwise 0.
    """
    if size == 0:
        return 1
    if str_sim >= 0.8:
        # Strong string match is accepted regardless of context.
        return 0
    # A weaker string match demands more context support:
    # 0.1 for str_sim in [0.4, 0.8), 0.5 below that.
    ctx_threshold = 0.1 if str_sim >= 0.4 else 0.5
    return 1 if ctx_sim <= ctx_threshold else 0
            
def NIL(s, candidates_s):
    """Classify the mention ``s`` as NIL (1) or linkable (0).

    Scores ``s`` against every candidate with the module-level ``linkscore``
    object, takes the best string score (``"max"`` entry) and the best
    context score (``"union"`` entry), and passes them with the candidate
    count to ``clf_NIL``.

    Parameters
    ----------
    s : str
        Mention text.
    candidates_s : sequence of str
        Candidate titles for ``s``.

    Returns
    -------
    int
        Whatever ``clf_NIL`` returns for the aggregated scores.
    """
    # Seeding with 0 keeps max() defined when there are no candidates.
    str_scores = [0]
    ctx_scores = [0]
    for candidate in candidates_s:
        str_scores.append(linkscore.get_single_str_score(s, candidate)["max"])
        ctx_scores.append(linkscore.get_single_ctx_score(s, candidate)["union"])
    return clf_NIL(len(candidates_s), max(str_scores), max(ctx_scores))

def get_prepros(reference, WW, nil_list=None):
    """Build the candidate space for the keywords that can be linked.

    Parameters
    ----------
    reference : sequence of str
        Keywords to process, in order.
    WW : object
        Must provide ``get_results(keyword)`` returning a (possibly empty)
        list of dicts with ``"title"`` and ``"des"`` keys.
    nil_list : list of str, optional
        Keywords to skip. It is extended in place with every keyword for
        which ``WW.get_results`` returns nothing.

    Returns
    -------
    reference_mask : numpy.ndarray
        One float per entry of ``reference``: 1 if the keyword was kept,
        0 otherwise.
    base : numpy.ndarray
        The kept keywords, in order.
    space : list of numpy.ndarray
        For each kept keyword, the titles of its results with ``des == 0``.
        Kept as a plain list rather than converted to a single array.
    """
    if nil_list is None:
        nil_list = []

    reference_mask = np.ones(len(reference))
    kept_keywords = []
    space = []

    for idx, keyword in enumerate(reference):
        if keyword in nil_list:
            reference_mask[idx] = 0
            continue

        hits = WW.get_results(keyword)
        if not hits:
            # Nothing found: remember the keyword as NIL for the caller.
            reference_mask[idx] = 0
            nil_list.append(keyword)
            continue

        titles = [hit["title"] for hit in hits if hit["des"] == 0]
        if len(titles) > 1:
            space.append(np.array(titles))
            kept_keywords.append(keyword)
        else:
            # Zero or one usable candidate: the keyword is masked out.
            reference_mask[idx] = 0

    base = np.array(kept_keywords)

    return reference_mask, base, space

def get_link(reference, space, reference_mask, X):
    """Map every keyword to its selected candidate, or NaN if masked out.

    Parameters
    ----------
    reference : sequence of str
        All keywords, aligned with ``reference_mask``.
    space : sequence of sequences
        Candidate lists, one per kept keyword, in order.
    reference_mask : sequence
        Truthy where the keyword has an entry in ``space``.
    X : sequence of int
        For each entry of ``space``, the index of the chosen candidate.

    Returns
    -------
    dict
        ``keyword -> chosen candidate`` for kept keywords and
        ``keyword -> numpy.nan`` for the rest.
    """
    links = {}
    cursor = 0  # position in space / X; advances only for kept keywords
    for pos, keep in enumerate(reference_mask):
        if not keep:
            links[reference[pos]] = np.nan
            continue
        links[reference[pos]] = space[cursor][X[cursor]]
        cursor += 1
    return links

In [17]:
# Input mentions to be linked. Despite the variable name, each string here is
# passed as the mention `s` to get_results_from_wikipedia / NIL in later
# cells. Lower-case and capitalised spellings are kept as separate entries and
# therefore become separate dictionary keys downstream.
entity_named = [
    'artificial intelligence',
    'cognition',
    'controlled study',
    'diagnostic imaging',
    'human',
    'procedures',
    'randomized controlled trial',
    'retrospective study',
    'x-ray computed tomography',
    'Artificial Intelligence',
    'Cognition',
    'COVID-19',
    'Humans',
    'Retrospective Studies',
    'Tomography, X-Ray Computed'
]

In [6]:
# Single WikiWrapper instance shared by the candidate lookup, get_prepros and
# COHM in the cells that follow.
WW = WikiWrapper()

In [7]:
%%time
# Look up the candidate titles of every mention once and keep them by mention.
candidates_entity_named = {}
for mention in entity_named:
    candidates_entity_named[mention] = get_results_from_wikipedia(mention, WW)
print("candidates entity named:", candidates_entity_named)

candidates entity named: {'artificial intelligence': ['Artificial intelligence', 'Artificial general intelligence', 'Generative artificial intelligence', 'Hallucination (artificial intelligence)', 'Artificial intelligence art', 'A.I. Artificial Intelligence', 'History of artificial intelligence', 'Applications of artificial intelligence', 'Swarm intelligence', 'Artificial intelligence in healthcare', 'Ethics of artificial intelligence', 'Explainable artificial intelligence', 'Friendly artificial intelligence', 'Symbolic artificial intelligence', 'Timeline of artificial intelligence'], 'cognition': ['Cognition', 'Elephant cognition', 'Animal cognition', 'Embodied cognition', 'Need for cognition', 'Nootropic', 'Epistemic cognition', 'Dynamicism', 'Computational cognition', 'Metacognition', 'Evolution of cognition', 'Consciousness and Cognition', 'Perseverative cognition', 'Distributed cognition', 'Paranoia'], 'controlled study': ['Scientific control', 'Placebo-controlled study', 'Case–co

In [8]:
# COHM is built on the shared WikiWrapper and LinkScore on top of COHM.
# NIL() reads `linkscore` as a module-level global, so this cell has to run
# before NIL() is called.
cohm = COHM(WW)
linkscore = LinkScore(cohm)

In [18]:
%%time
# NIL flag (1 = NIL, 0 = linkable) for every mention, keyed by mention.
nil_tags_entity_named = {
    mention: NIL(mention, candidates)
    for mention, candidates in candidates_entity_named.items()
}
nil_tags_entity_named

CPU times: user 0 ns, sys: 427 µs, total: 427 µs
Wall time: 430 µs


{'artificial intelligence': 0,
 'cognition': 0,
 'controlled study': 0,
 'diagnostic imaging': 0,
 'human': 0,
 'procedures': 0,
 'randomized controlled trial': 0,
 'retrospective study': 0,
 'x-ray computed tomography': 0,
 'Artificial Intelligence': 0,
 'Cognition': 0,
 'COVID-19': 0,
 'Humans': 0,
 'Retrospective Studies': 0,
 'Tomography, X-Ray Computed': 0}

In [19]:
# Keywords in their original order, and the subset already tagged as NIL.
keywords = list(nil_tags_entity_named)
list_NIL = [mention for mention, is_nil in nil_tags_entity_named.items() if is_nil == 1]
# get_prepros may append further keywords to list_NIL in place.
keywords_mask, base_, space_ = get_prepros(keywords, WW, list_NIL)
keywords_mask, base_, space_

(array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 array(['artificial intelligence', 'cognition', 'controlled study',
        'diagnostic imaging', 'human', 'procedures',
        'randomized controlled trial', 'retrospective study',
        'x-ray computed tomography', 'Artificial Intelligence',
        'Cognition', 'COVID-19', 'Humans', 'Retrospective Studies',
        'Tomography, X-Ray Computed'], dtype='<U27'),
 [array(['Artificial intelligence', 'Artificial general intelligence',
         'Generative artificial intelligence',
         'Hallucination (artificial intelligence)',
         'Artificial intelligence art', 'A.I. Artificial Intelligence',
         'History of artificial intelligence',
         'Applications of artificial intelligence', 'Swarm intelligence',
         'Artificial intelligence in healthcare',
         'Ethics of artificial intelligence',
         'Explainable artificial intelligence',
         'Friendly artificial intelligence',
         

In [20]:
# Weights a1, a2, a3 for LinkScore, parsed from a space-separated string.
a = "0.32653061224489793 0.36734693877551017 0.30612244897959195".split(" ")
a1, a2, a3 = map(float, a)
linkscore.add_params(dict(a1=a1, a2=a2, a3=a3))
print(linkscore.params)

{'a1': 0.32653061224489793, 'a2': 0.36734693877551017, 'a3': 0.30612244897959195}


In [27]:
%%time
# Run the annealing search over the candidate space and turn the selected
# indices into a keyword -> title mapping.
#
# The uniform draws fed to MCMC come from a seeded generator so that
# Restart & Run All reproduces the same U and V. Previously they came from the
# unseeded global np.random state and changed on every run.
# NOTE(review): if Annealing draws additional random numbers internally, those
# are not covered by this seed -- confirm in class_annealing.
ANNEALING_SEED = 42

best_linking = []
if len(base_) > 0:
    sample_size = 10000
    annealing = Annealing(keywords, base_, space_, linkscore, nf=sample_size, beta=lambda n: n ** (5 / 8))

    rng = np.random.default_rng(ANNEALING_SEED)
    U = rng.uniform(size=sample_size)
    V = rng.uniform(size=sample_size)
    annealing.MCMC(U, V, save_rate=sample_size)

    best_linking = annealing.X

# With an empty base_ every mask entry is 0, so best_linking is never indexed
# and all keywords map to NaN.
linking = get_link(keywords, space_, keywords_mask, best_linking)
linking

[15, 15, 15, 15, 13, 13, 15, 15, 15, 15, 15, 15, 15, 15, 15]
CPU times: user 624 ms, sys: 13 µs, total: 624 ms
Wall time: 622 ms


{'artificial intelligence': 'Artificial intelligence',
 'cognition': 'Cognition',
 'controlled study': 'Scientific control',
 'diagnostic imaging': 'Medical imaging',
 'human': 'Human',
 'procedures': 'Surgery',
 'randomized controlled trial': 'Randomized controlled trial',
 'retrospective study': 'Retrospective cohort study',
 'x-ray computed tomography': 'CT scan',
 'Artificial Intelligence': 'Artificial intelligence',
 'Cognition': 'Cognition',
 'COVID-19': 'COVID-19',
 'Humans': 'Human',
 'Retrospective Studies': 'Retrospective cohort study',
 'Tomography, X-Ray Computed': 'CT scan'}