## Match graph data format to GraphRAG

## Load Llama-expanded triples

In [2]:
from datasets import Dataset

# Load the saved Hugging Face dataset from disk and print its summary
# (feature names and row count).
dataset = Dataset.load_from_disk(
    "/projects/JHA/shared/dataset/head_predictions_filtered_tails_dataset"
)
print(dataset)

Dataset({
    features: ['head', 'predictions', 'text', 'tails'],
    num_rows: 5714
})


In [13]:
import pandas as pd
# Drop every column except the two needed to form (head, tail) pairs.
dataset = dataset.remove_columns(
    [name for name in dataset.column_names if name not in ['head', 'tails']]
)

# Collect each distinct (head, tail) pair once; tails that are empty or
# whitespace-only are skipped.
unique_triples = {
    (record["head"], candidate)
    for record in dataset
    for candidate in record["tails"]
    if candidate.strip()
}

# Tabulate the pairs and write them out as a two-column CSV.
df = pd.DataFrame(unique_triples, columns=["head", "tail"])
csv_path = "/projects/JHA/shared/graph/pubmed/unique_triples.csv"
df.to_csv(csv_path, index=False)

print(len(df))
print(f"CSV file saved at: {csv_path}")


7728
CSV file saved at: /projects/JHA/shared/graph/pubmed/unique_triples.csv


In [8]:
import networkx as nx
import pandas as pd
import uuid
import csv
import os

# Input triple files and the output directory for this run.
root = '/projects/JHA/shared/graph/pubmed'
injected_path = os.path.join(root, 'injections_train.csv')
expanded_path = os.path.join(root, 'unique_triples.csv')
output_path = os.path.join(root, 'injected_expanded')
# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, avoiding a separate check-then-create step that can race.
os.makedirs(output_path, exist_ok=True)

# Load the injected triples (the 'root', 'relation' and 'tail' columns are read below).
df = pd.read_csv(injected_path)

# Directed multigraph: parallel edges are allowed, so one (head, tail) pair can
# carry several edges that differ only in their 'rel' attribute.
G = nx.MultiDiGraph()

def add_edge_if_not_exists(graph, u, v, key=None, **attr):
    """Add an edge u -> v unless an identical one is already present.

    An existing edge counts as identical when its whole attribute dict
    equals ``attr``. Edges between the same endpoints with different
    attributes are kept side by side.

    Args:
        graph: Graph object providing ``has_edge``, ``get_edge_data`` and
            ``add_edge`` (a ``networkx.MultiDiGraph`` in this notebook).
        u: Source node.
        v: Target node.
        key: Optional edge key forwarded to ``graph.add_edge``.
        **attr: Edge attributes, e.g. ``rel="isa"``.

    Returns:
        None. The graph is modified in place.
    """
    if graph.has_edge(u, v):
        # Mapping of edge key -> attribute dict for every u -> v edge.
        parallel_edges = graph.get_edge_data(u, v)
        if any(existing == attr for existing in parallel_edges.values()):
            return
    # No edge with these exact attributes yet, so create one.
    graph.add_edge(u, v, key=key, **attr)

# Insert every injected triple, storing its relation on the edge.
for _, record in df.iterrows():
    src, label, dst = record['root'], record['relation'], record['tail']
    add_edge_if_not_exists(G, src, dst, rel=label)

# Insert the expanded (head, tail) pairs; they all share one fixed relation.
df2 = pd.read_csv(expanded_path)
for _, record in df2.iterrows():
    add_edge_if_not_exists(G, record['head'], record['tail'], rel='brings about')

# Entities table: one row per node, with a fresh UUID, a sequential id, the
# node name as both title and description, and the node's degree.
nodes_data = [
    {
        "id": str(uuid.uuid4()),
        "human_readable_id": position,
        "title": name,
        "description": name,
        "degree": G.degree(name),
    }
    for position, name in enumerate(G.nodes())
]
entities_df = pd.DataFrame(nodes_data)

# Relationships table: one row per edge. The description is the sentence
# "<source> <relation> <target>", with the relation 'isa' spelled out as 'is a'.
edges_data = []
for position, (src, dst, attrs) in enumerate(G.edges(data=True)):
    label = attrs.get("rel", "")
    phrase = 'is a' if label == 'isa' else label
    edges_data.append({
        "id": str(uuid.uuid4()),
        "human_readable_id": position,
        "source": src,
        "target": dst,
        "combined_degree": G.degree(src) + G.degree(dst),
        "description": f"{src} {phrase} {dst}",
    })
relationships_df = pd.DataFrame(edges_data)

# Write both tables as parquet files in the output directory.
for frame, filename in ((entities_df, 'entities.parquet'),
                        (relationships_df, 'relationships.parquet')):
    frame.to_parquet(os.path.join(output_path, filename), index=False)
print('saved')

In [11]:
# Report the node count and the edge count, one per line.
print(len(G.nodes()), len(G.edges()), sep="\n")

8043
12388


In [9]:
from openai import OpenAI
import pandas as pd
import lancedb
from sentence_transformers import SentenceTransformer

# Load the embedding model. trust_remote_code=True lets the model repository's
# own Python files run locally (see the download warnings in the cell output).
# NOTE(review): the nomic-embed-text-v1 model card describes task prefixes such
# as "search_document: "; none is added here -- confirm this is intended.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

# Embed the 'description' column of the entities table written earlier.
entities_df = pd.read_parquet(os.path.join(output_path, 'entities.parquet'))
sentences = list(entities_df['description'])
print(len(sentences))
final_embeddings = []
batch_size = 50000
# Step over the real index range only. The previous upper bound of
# len(sentences) + 1 produced an extra, empty batch whenever the sentence
# count was 0 or an exact multiple of batch_size.
for i in range(0, len(sentences), batch_size):
    print(i)
    # Slicing clamps at the end of the list, so the last batch is simply shorter.
    batch = sentences[i:i + batch_size]
    response = model.encode(batch)
    final_embeddings.extend(response)

import json
print(f"Total embeddings generated: {len(final_embeddings)}")

# One embedding per entity row, in the same order as 'description'.
entities_df['vector'] = final_embeddings

# Create a new column "attributes" holding a JSON string with the title
entities_df['attributes'] = entities_df['title'].apply(lambda title: json.dumps({"title": title}))
print()
# Keep only the columns stored in the vector table; 'description' becomes 'text'.
final_entities_df = entities_df[['id', 'description', 'vector', 'attributes']].rename(columns={'description': 'text'})

# Connect to (or create) a LanceDB database and save the DataFrame.
db = lancedb.connect(os.path.join(output_path, 'lancedb'))
table = db.create_table("default-entity-description", final_entities_df, mode="overwrite")

print("Entities have been embedded and saved to LanceDB successfully.")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/70.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/103k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
!!!!!!!!!!!!megablocks not available, using torch.matmul instead


pytorch_model.bin:   0%|          | 0.00/547M [00:00<?, ?B/s]

  state_dict = loader(resolved_archive_file)
<All keys matched successfully>


tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

8043
0
Total embeddings generated: 8043

Entities have been embedded and saved to LanceDB successfully.


[2025-03-03T16:42:11Z WARN  lance::dataset::write::insert] No existing dataset at /projects/JHA/shared/graph/pubmed/injected_expanded/lancedb/default-entity-description.lance.lance, it will be created


## Extract Llama-filtered triples

In [7]:
import networkx as nx
import pandas as pd
import uuid
import csv
import os

# Input triple files and the output directory for the filtered run.
root = '/projects/JHA/shared/graph/pubmed'
injected_path = os.path.join(root, 'injections_train.csv')
expanded_path = os.path.join(root, 'unique_triples.csv')
output_path = os.path.join(root, 'injected_expanded_filtered')
filtered_path = os.path.join(root, 'new_triples_with_responses.csv')
# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, avoiding a separate check-then-create step that can race.
os.makedirs(output_path, exist_ok=True)

# Load the injected triples (the 'root', 'relation' and 'tail' columns are read below).
df = pd.read_csv(injected_path)

# Directed multigraph: parallel edges are allowed, so one (head, tail) pair can
# carry several edges that differ only in their 'rel' attribute.
G = nx.MultiDiGraph()

def add_edge_if_not_exists(graph, u, v, key=None, **attr):
    """Add an edge u -> v unless an identical one is already present.

    An existing edge counts as identical when its whole attribute dict
    equals ``attr``. Edges between the same endpoints with different
    attributes are kept side by side.

    Args:
        graph: Graph object providing ``has_edge``, ``get_edge_data`` and
            ``add_edge`` (a ``networkx.MultiDiGraph`` in this notebook).
        u: Source node.
        v: Target node.
        key: Optional edge key forwarded to ``graph.add_edge``.
        **attr: Edge attributes, e.g. ``rel="isa"``.

    Returns:
        None. The graph is modified in place.
    """
    if graph.has_edge(u, v):
        # Mapping of edge key -> attribute dict for every u -> v edge.
        parallel_edges = graph.get_edge_data(u, v)
        if any(existing == attr for existing in parallel_edges.values()):
            return
    # No edge with these exact attributes yet, so create one.
    graph.add_edge(u, v, key=key, **attr)

# Insert every injected triple, storing its relation on the edge.
for _, record in df.iterrows():
    src, label, dst = record['root'], record['relation'], record['tail']
    add_edge_if_not_exists(G, src, dst, rel=label)

# Load the filtered triples, drop the 'response' column and keep only the
# rows whose 'valid' flag is True.
df = pd.read_csv(filtered_path).drop(columns=['response'])
df = df[df['valid'] == True]
# The surviving (head, tail) pairs all share one fixed relation.
for _, record in df.iterrows():
    add_edge_if_not_exists(G, record['head'], record['tail'], rel='brings about')

# Entities table: one row per node, with a fresh UUID, a sequential id, the
# node name as both title and description, and the node's degree.
nodes_data = [
    {
        "id": str(uuid.uuid4()),
        "human_readable_id": position,
        "title": name,
        "description": name,
        "degree": G.degree(name),
    }
    for position, name in enumerate(G.nodes())
]
entities_df = pd.DataFrame(nodes_data)

# Relationships table: one row per edge. The description is the sentence
# "<source> <relation> <target>", with the relation 'isa' spelled out as 'is a'.
edges_data = []
for position, (src, dst, attrs) in enumerate(G.edges(data=True)):
    label = attrs.get("rel", "")
    phrase = 'is a' if label == 'isa' else label
    edges_data.append({
        "id": str(uuid.uuid4()),
        "human_readable_id": position,
        "source": src,
        "target": dst,
        "combined_degree": G.degree(src) + G.degree(dst),
        "description": f"{src} {phrase} {dst}",
    })
relationships_df = pd.DataFrame(edges_data)

# Write both tables as parquet files in the output directory.
for frame, filename in ((entities_df, 'entities.parquet'),
                        (relationships_df, 'relationships.parquet')):
    frame.to_parquet(os.path.join(output_path, filename), index=False)
print('saved')

saved


In [10]:
from openai import OpenAI
import pandas as pd
import lancedb
from sentence_transformers import SentenceTransformer

# Load the embedding model. trust_remote_code=True lets the model repository's
# own Python files run locally.
# NOTE(review): the nomic-embed-text-v1 model card describes task prefixes such
# as "search_document: "; none is added here -- confirm this is intended.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

# Embed the 'description' column of the entities table written earlier.
entities_df = pd.read_parquet(os.path.join(output_path, 'entities.parquet'))
sentences = list(entities_df['description'])
print(len(sentences))
final_embeddings = []
batch_size = 50000
# Step over the real index range only. The previous upper bound of
# len(sentences) + 1 produced an extra, empty batch whenever the sentence
# count was 0 or an exact multiple of batch_size.
for i in range(0, len(sentences), batch_size):
    print(i)
    # Slicing clamps at the end of the list, so the last batch is simply shorter.
    batch = sentences[i:i + batch_size]
    response = model.encode(batch)
    final_embeddings.extend(response)

import json
print(f"Total embeddings generated: {len(final_embeddings)}")

# One embedding per entity row, in the same order as 'description'.
entities_df['vector'] = final_embeddings

# Create a new column "attributes" holding a JSON string with the title
entities_df['attributes'] = entities_df['title'].apply(lambda title: json.dumps({"title": title}))
print()
# Keep only the columns stored in the vector table; 'description' becomes 'text'.
final_entities_df = entities_df[['id', 'description', 'vector', 'attributes']].rename(columns={'description': 'text'})

# Connect to (or create) a LanceDB database and save the DataFrame.
db = lancedb.connect(os.path.join(output_path, 'lancedb'))
table = db.create_table("default-entity-description", final_entities_df, mode="overwrite")

print("Entities have been embedded and saved to LanceDB successfully.")



  state_dict = loader(resolved_archive_file)
<All keys matched successfully>


6664
0
Total embeddings generated: 6664

Entities have been embedded and saved to LanceDB successfully.
