In [1]:
from askem_extractions.data_model import AttributeCollection, AnchoredEntity
import json
from collections import defaultdict
import torch
import html


* 'schema_extra' has been renamed to 'json_schema_extra'


In [3]:
# Load the extraction collection that later cells in this notebook index and display.
extractions = AttributeCollection.from_json('extractions_page4.json')

In [4]:
def load_variables(path:str) -> dict:
	"""Load a variable/alias table from a JSON file and decode HTML entities.

	The file must contain a JSON object whose keys are variable names and
	whose values are lists of alternative spellings of that variable.
	Any key or alias that starts with a numeric character reference
	(``&#...``) is passed through ``html.unescape``; all other strings are
	kept verbatim.

	Args:
		path: Location of the JSON file.

	Returns:
		A dict mapping each (sanitized) variable name to the list of its
		(sanitized) aliases. Duplicate aliases are dropped and the remaining
		ones keep the order in which they first appear in the file, so the
		result is identical from run to run.
	"""
	# JSON is UTF-8 by specification; do not depend on the platform default,
	# which would garble symbols such as 'β' on non-UTF-8 locales.
	with open(path, encoding='utf-8') as f:
		data = json.load(f)

	def _sanitize(text):
		# Only strings that begin with a numeric entity are decoded.
		return html.unescape(text) if text.startswith('&#') else text

	new_data = dict()
	for k, vs in data.items():
		# dict.fromkeys removes duplicates like a set would, but keeps
		# first-seen order instead of hash order.
		new_data[_sanitize(k)] = list(dict.fromkeys(_sanitize(v) for v in vs))

	return new_data

# Preview the sanitized variable/alias table for one equation file.
load_variables('variables/page 7/eqn6.json')

{'β': ['β', 'beta'],
 'σ': ['sigma', 'σ'],
 'r': ['r'],
 'I_t': ['I_t', 'I_{t}'],
 'σ_it^t': ['σ_it^t', 'σ_{it^{t}}'],
 't': ['t'],
 'G': ['G'],
 'S': ['S'],
 'i': ['i'],
 'p': ['p'],
 'N': ['N'],
 'S_t': ['S_{t}', 'S_t'],
 'I': ['I']}

In [14]:
def extraction_index(extractions:AttributeCollection):
	"""Map every anchored entity in a collection to the strings that refer to it.

	Each attribute's payload is inspected; payloads that are not
	``AnchoredEntity`` instances are ignored. For the remaining ones, the
	names of all mentions and the texts of all descriptions are gathered.

	Args:
		extractions: Collection whose ``attributes`` are scanned.

	Returns:
		A ``defaultdict(set)`` keyed by entity, holding the set of mention
		names and description texts. Entities with neither get no entry.
	"""
	index = defaultdict(set)
	for attribute in extractions.attributes:
		payload = attribute.payload
		if not isinstance(payload, AnchoredEntity):
			continue

		labels = [mention.name for mention in payload.mentions]
		labels += [td.description for td in payload.text_descriptions]

		# Touch the defaultdict only when there is something to store, so
		# entities without any label do not show up as empty entries.
		if labels:
			index[payload].update(labels)

	return index

# Preview the entity -> strings index for the `extractions` collection loaded earlier in the notebook.
extraction_index(extractions)

defaultdict(set,
            {AnchoredEntity(id=ID(id='R:22946840'), mentions=[Mention(id=ID(id='T:1309853609'), name='AIDS', extraction_source=TextExtraction(page=1, block=10, surrounding_passage='Mathematical models can help public health interventions by showing the likely outcome of an epidemic .\nIn the book [ 1 ] , the authors covered the concept of mathematical models in epidemiology for certain diseases such as HIV / AIDS , influenza , dengue fever , Zika virus , etc.\nThe paper [ 2 ] investigated the impact of case-area targeted intervention and its effect on reducing cholera transmission .', char_start=1503, char_end=1507, document_reference=ID(id='Anewcomparativestudyonthegeneralfractional modelofCOVID-19withisolationandquarantine effects.pdf')), provenance=Provenance(method='Skema TR Pipeline rules', timestamp=datetime.datetime(2023, 9, 15, 20, 32, 28, 994331)))], text_descriptions=[TextDescription(id=ID(id='T:109968174'), description='influenza', grounding=[Grounding(groun

In [6]:
def revert_index(index):
	"""Invert a one-to-many mapping.

	Args:
		index: Mapping from a key to an iterable of values.

	Returns:
		A ``defaultdict(list)`` mapping each value to the list of keys under
		which it was found, in the iteration order of ``index``.
	"""
	inverted = defaultdict(list)
	# Flatten to (value, key) pairs first, then group by value.
	pairs = ((value, key) for key, values in index.items() for value in values)
	for value, key in pairs:
		inverted[value].append(key)
	return inverted

# Preview the inverted view (alias -> variable names) of one equation file.
revert_index(load_variables('variables/page 7/eqn6.json'))

defaultdict(list,
            {'β': ['β'],
             'beta': ['β'],
             'sigma': ['σ'],
             'σ': ['σ'],
             'r': ['r'],
             'I_t': ['I_t'],
             'I_{t}': ['I_t'],
             'σ_it^t': ['σ_it^t'],
             'σ_{it^{t}}': ['σ_it^t'],
             't': ['t'],
             'G': ['G'],
             'S': ['S'],
             'i': ['i'],
             'p': ['p'],
             'N': ['N'],
             'S_{t}': ['S_t'],
             'S_t': ['S_t'],
             'I': ['I']})

In [19]:
from typing import List, Tuple
from sentence_transformers import SentenceTransformer, util


def align_texts(sources: List[str], targets: List[str], threshold: float, model) -> List[Tuple[str, str, torch.Tensor]]:
	"""Pair up source and target strings whose embeddings are similar enough.

	Both lists are embedded with ``model.encode`` and every source is
	compared with every target by cosine similarity.

	Args:
		sources: Strings to align from.
		targets: Strings to align to.
		threshold: Minimum cosine similarity (inclusive) for a pair to be kept.
		model: Object exposing ``encode(list_of_str)``; the caller in this
			notebook passes a ``SentenceTransformer``.

	Returns:
		A list of ``(source, target, similarity)`` triples, one per pair
		whose similarity is ``>= threshold``. ``similarity`` is a
		zero-dimensional tensor; call ``.item()`` to get a Python float.
		Returns an empty list when either input list is empty.
	"""
	# Nothing can match, and encoding an empty batch would produce an
	# embedding matrix the similarity routine cannot multiply.
	if len(sources) == 0 or len(targets) == 0:
		return []

	with torch.no_grad():
		s_embs = model.encode(sources)
		t_embs = model.encode(targets)

	similarities = util.pytorch_cos_sim(s_embs, t_embs)

	# Convert the (row, col) hits to plain ints so the Python lists are
	# indexed with ordinary integers rather than tensor elements.
	hits = (similarities >= threshold).nonzero().tolist()

	return [(sources[row], targets[col], similarities[row, col]) for row, col in hits]



In [41]:
import dataclasses
from dataclasses import dataclass

@dataclass
class LinkedElement:
	"""A single link between an equation variable and a text extraction.

	Instances are built by ``link_variables_to_extractions`` and converted to
	plain dicts with ``dataclasses.asdict`` before being serialized.
	"""
	# Variable name from the equation file.
	element: str
	# String from the extraction index that matched one of the variable's aliases.
	linked_str: str
	# JSON-compatible dump of the linked extraction entity (a dict, not a string).
	extraction: dict
	# Cosine similarity between the matched strings.
	score: float

def link_variables_to_extractions(equation_path, extractions_path, threshold=0.6, model_name="sentence-transformers/all-MiniLM-L6-v2"):
	"""Link the variables of an equation to text extractions by string similarity.

	Every alias of every variable in the equation file is compared with
	every mention name / description of every anchored entity in the
	extractions file. Each sufficiently similar pair yields one link per
	(variable, entity) combination that shares those strings.

	Args:
		equation_path: JSON file read by ``load_variables``.
		extractions_path: JSON file read by ``AttributeCollection.from_json``.
		threshold: Minimum cosine similarity for two strings to be linked.
		model_name: Name of the sentence-transformers model used for embedding.

	Returns:
		A JSON string (indented by 2) holding a list of objects with the
		keys ``element``, ``linked_str``, ``extraction`` and ``score``.
	"""
	model = SentenceTransformer(model_name)
	eqn = load_variables(equation_path)
	extractions = AttributeCollection.from_json(extractions_path)
	extractions_ix = extraction_index(extractions)

	# Invert both indexes so they are keyed by the surface string:
	# alias -> variable names, and mention/description -> entities.
	inverted_eq = revert_index(eqn)
	inverted_ex = revert_index(extractions_ix)

	matches = align_texts(
		sources=list(inverted_eq.keys()),
		targets=list(inverted_ex.keys()),
		threshold=threshold,
		model=model,
	)

	ret = list()
	for src_str, tgt_str, sim in matches:
		score = sim.item()
		# Serialize each matched entity once per match instead of once per
		# (variable, entity) pair; asdict() below copies the dict, so the
		# links do not end up sharing one object.
		dumped_entities = [entity.model_dump(mode='json') for entity in inverted_ex[tgt_str]]
		for src in inverted_eq[src_str]:
			for dumped in dumped_entities:
				link = LinkedElement(
					element=src,
					linked_str=tgt_str,
					extraction=dumped,
					score=score,
				)
				ret.append(dataclasses.asdict(link))

	return json.dumps(ret, indent=2)

# Run the full linking pipeline on the sample files; print() is used so the JSON string renders with its indentation.
print(link_variables_to_extractions('variables/page 7/eqn6.json', 'extractions_page4.json'))

[
  {
    "element": "r",
    "linked_str": "r",
    "extraction": {
      "id": {
        "id": "R:954242467"
      },
      "mentions": [
        {
          "id": {
            "id": "T:-704063786"
          },
          "name": "r",
          "extraction_source": {
            "page": 5,
            "block": 7,
            "surrounding_passage": "According to the CDa formula ( 1 ) , the expression 0 t represents the general left Caputo fractional derivative , and a shows the fractional order .\nAlso , to ensure that the above-mentioned fractional equations 1 are dimensionally matched on both sides , the coefficient r1a , including the auxiliary parameter r , is considered [ 41 ] .",
            "char_start": 1028,
            "char_end": 1029,
            "document_reference": {
              "id": "Anewcomparativestudyonthegeneralfractional modelofCOVID-19withisolationandquarantine effects.pdf"
            }
          },
          "provenance": {
            "method": "Skema TR Pi