In [31]:
import getpass
import os

import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaModel

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [32]:
# Load the RoBERTa tokenizer and encoder from the local ./cache directory,
# then move the encoder onto the selected device.
tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained('./cache')
model: RobertaModel = RobertaModel.from_pretrained('./cache')
model = model.to(device)
model  # show the module tree as the cell output

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [33]:
def embed_text(text: str) -> np.ndarray:
    """Embed ``text`` as one vector by mean-pooling the encoder's token states.

    The text is tokenized (truncated to at most 512 tokens), moved to
    ``device`` and passed through the module-level ``model`` with gradient
    tracking disabled. The last hidden states of the first sequence in the
    batch are then averaged over the token axis.

    Args:
        text: The string to embed.

    Returns:
        A 1-D NumPy array with the token-averaged last hidden state.
    """
    encoded = tokenizer(
        text=text,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Only one text is encoded, so take batch item 0 and average over tokens.
    pooled = hidden_states[0].mean(dim=0)
    return pooled.cpu().numpy()
# Smoke check: a short sentence should embed to a (768,) vector.
embed_text('Hello, my dog is cute').shape

(768,)

In [38]:
import llama_index.graph_stores
import llama_index.graph_stores.neo4j
# Connection settings are read from the environment so that no credential is
# stored in the notebook. If NEO4J_PASSWORD is unset, the password is asked
# for interactively instead.
# NOTE(review): the password that used to be hardcoded here should be rotated.
gs = llama_index.graph_stores.neo4j.Neo4jGraphStore(
    url=os.environ.get('NEO4J_URL', 'bolt://localhost:7687'),
    username=os.environ.get('NEO4J_USERNAME', 'neo4j'),
    password=os.environ.get('NEO4J_PASSWORD') or getpass.getpass('Neo4j password: '),
)
# get_schema() takes a boolean `refresh` flag, not a node label; it always
# returns the schema of the whole graph. refresh=True forces a re-read.
print(gs.get_schema(refresh=True))

Node properties are the following:
Paper {title: STRING, abs: STRING, arxivId: STRING, citedNum: INTEGER},Author {name: STRING},Code {url: STRING, rating: INTEGER},Method {desc: STRING, name: STRING},Task {desc: STRING, name: STRING}
Relationship properties are the following:

The relationships are the following:
(:Paper)-[:APPLIES]->(:Method),(:Paper)-[:PERFORMS]->(:Task),(:Paper)-[:PROPOSES]->(:Method),(:Paper)-[:CITES]->(:Paper),(:Author)-[:WRITES]->(:Paper)


In [41]:
def embed_paper(arxiv_id: str) -> np.ndarray:
    """Embed the paper with the given arXiv id from its title and abstract.

    The ``Paper`` node is fetched from the graph store ``gs``. Its ``title``
    and ``abs`` properties are joined with a space, newlines are replaced by
    spaces, surrounding whitespace is stripped, and the result is passed to
    ``embed_text``.

    Args:
        arxiv_id: Value of the ``arxivId`` property to look up.

    Returns:
        The embedding returned by ``embed_text`` for the combined text.

    Raises:
        ValueError: If no ``Paper`` node has this ``arxivId``.
    """
    rows = gs.query(
        "MATCH (p:Paper) WHERE p.arxivId = $arxiv_id RETURN p",
        {'arxiv_id': arxiv_id},
    )
    if len(rows) < 1:
        raise ValueError(f'Paper {arxiv_id} not found')
    # Only the first match is used.
    record = rows[0]['p']
    combined = record['title'] + ' ' + record['abs']
    flattened = combined.replace('\n', ' ')
    return embed_text(flattened.strip())


# Smoke check with an arXiv id stored in the graph; expects a (768,) vector.
embed_paper("1706.03762").shape

(768,)

In [43]:
from typing import Literal


def embed_task_or_method(name: str, type: Literal['task', 'method'] | None = None) -> np.ndarray:
    result = gs.query(f"MATCH (n:{type.capitalize()}) WHERE n.name = $name RETURN n" if type else
                      "MATCH (n) WHERE n.name = $name RETURN n",
                      {
        'name': name
    })
    if len(result) == 0:
        raise ValueError(f'{type.capitalize()} {name} not found')
    node = result[0]['n']
    name, desc = node['name'], node['desc']
    text = name + ' ' + desc
    return embed_text(text.replace('\n', ' ').strip())

# Smoke check: look up a node by name without a type; expects a (768,) vector.
embed_task_or_method('Text-To-Speech Synthesis').shape

(768,)