## Injected KG to GraphRAG format

In [2]:
import networkx as nx
import pandas as pd
import uuid
import csv
import os

# Input triples CSV and the output directory for the GraphRAG-format artifacts.
root = '/projects/JHA/shared/graph/pubmed'
graph_path = os.path.join(root, 'injections_train.csv')
output_path = os.path.join(root, 'injected')
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_path, exist_ok=True)

# Load the triples; the 'root', 'relation' and 'tail' columns are read below.
# NOTE(review): default NA parsing turns cells such as "NA" or "null" into NaN —
# confirm that no entity or relation name in this file is affected.
df = pd.read_csv(graph_path)

# Directed multigraph: several parallel edges (one per distinct relation) may
# connect the same ordered pair of nodes.
G = nx.MultiDiGraph()

def add_edge_if_not_exists(graph, u, v, key=None, **attr):
    """Add an edge ``u -> v`` carrying ``attr`` unless an identical one exists.

    The edges already stored between ``u`` and ``v`` are fetched with
    ``graph.get_edge_data(u, v)``; when any of the returned attribute
    dictionaries equals ``attr`` exactly, the call is a no-op. Otherwise the
    edge is created with ``graph.add_edge(u, v, key=key, **attr)``.

    Args:
        graph: Object exposing ``has_edge``, ``get_edge_data`` and ``add_edge``
            (this notebook passes a ``networkx.MultiDiGraph``).
        u: Source node.
        v: Target node.
        key: Optional edge key forwarded unchanged to ``add_edge``.
        **attr: Edge attributes; also the criterion for duplicate detection.

    Returns:
        None.
    """
    if graph.has_edge(u, v):
        parallel_edges = graph.get_edge_data(u, v)
        # A duplicate is an existing edge whose attribute dict matches exactly.
        if any(existing == attr for existing in parallel_edges.values()):
            return
    graph.add_edge(u, v, key=key, **attr)

# Build the graph: one edge per distinct (root, relation, tail) triple.
# Walking the three columns in lockstep avoids building a Series object for
# every row, which is what makes DataFrame.iterrows() slow on large frames.
for head, relation, tail in zip(df['root'], df['relation'], df['tail']):
    add_edge_if_not_exists(G, head, tail, rel=relation)

# Degree of every node as reported by G.degree(), computed once and reused for
# both tables instead of being queried again per node and twice per edge.
node_degree = dict(G.degree())

# Entities table: one record per graph node. The node name doubles as title
# and description; 'id' is a fresh random UUID on every run.
nodes_data = [
    {
        "id": str(uuid.uuid4()),
        "human_readable_id": human_readable_id,
        "title": node,
        "description": node,
        "degree": node_degree[node],
    }
    for human_readable_id, node in enumerate(G.nodes())
]
entities_df = pd.DataFrame(nodes_data)

# Relationships table: one record per graph edge.
edges_data = []
for human_readable_id, (source, target, data) in enumerate(G.edges(data=True)):
    rel = data.get("rel", "")
    # Spell the 'isa' relation as words so the description reads as a sentence.
    if rel == 'isa':
        rel = 'is a'
    edges_data.append({
        "id": str(uuid.uuid4()),
        "human_readable_id": human_readable_id,
        "source": source,
        "target": target,
        "combined_degree": node_degree[source] + node_degree[target],
        "description": f"{source} {rel} {target}",
    })
relationships_df = pd.DataFrame(edges_data)

# Save both tables as parquet files in the output directory.
entities_df.to_parquet(os.path.join(output_path, 'entities.parquet'), index=False)
relationships_df.to_parquet(os.path.join(output_path, 'relationships.parquet'), index=False)


In [3]:
from openai import OpenAI
import pandas as pd
import lancedb
from sentence_transformers import SentenceTransformer

import json

# Sentence-embedding model used for the entity descriptions.
# NOTE(review): trust_remote_code=True lets the model repository supply its own
# Python code; keep the model id pointed at a trusted source.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

entities_df = pd.read_parquet(os.path.join(output_path, 'entities.parquet'))
sentences = list(entities_df['description'])
print(len(sentences))

# Embed the descriptions chunk by chunk. The range stops at len(sentences), so
# no empty trailing chunk is sent to the model when the count is an exact
# multiple of batch_size (or zero); slicing already clamps the last chunk.
final_embeddings = []
batch_size = 50000
for i in range(0, len(sentences), batch_size):
    print(i)
    batch = sentences[i:i + batch_size]
    response = model.encode(batch)
    final_embeddings.extend(response)

print(f"Total embeddings generated: {len(final_embeddings)}")

# One embedding per entity row, in the same order as 'description'.
entities_df['vector'] = final_embeddings

# "attributes" holds a JSON string of the form {"title": <entity title>}.
entities_df['attributes'] = entities_df['title'].apply(lambda title: json.dumps({"title": title}))
print()
final_entities_df = entities_df[['id', 'description', 'vector', 'attributes']].rename(columns={'description': 'text'})

# Connect to (or create) a LanceDB database and save the DataFrame.
# The table name carries no ".lance" suffix: with the suffix in the name the
# dataset was written to "...default-entity-description.lance.lance" (see the
# run log), and the other sections of this notebook use the plain name.
db = lancedb.connect(os.path.join(output_path, 'lancedb'))
table = db.create_table("default-entity-description", final_entities_df, mode="overwrite")

print("Entities have been embedded and saved to LanceDB successfully.")

!!!!!!!!!!!!megablocks not available, using torch.matmul instead
  state_dict = loader(resolved_archive_file)
<All keys matched successfully>


4534
0
Total embeddings generated: 4534

Entities have been embedded and saved to LanceDB successfully.


[2025-03-01T02:23:39Z WARN  lance::dataset::write::insert] No existing dataset at /projects/JHA/shared/graph/pubmed/injected/lancedb/default-entity-description.lance.lance, it will be created


## Injected KG (lim2) to GraphRAG format

In [1]:
import networkx as nx
import pandas as pd
import uuid
import csv
import os

# Input triples CSV and the output directory for the GraphRAG-format artifacts.
root = '/projects/JHA/shared/graph/pubmed'
graph_path = os.path.join(root, '32k_lim2_rel32.csv')
output_path = os.path.join(root, 'injected_32k')
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_path, exist_ok=True)

# Load the triples; the 'root', 'relation' and 'tail' columns are read below.
# NOTE(review): default NA parsing turns cells such as "NA" or "null" into NaN —
# confirm that no entity or relation name in this file is affected.
df = pd.read_csv(graph_path)

# Directed multigraph: several parallel edges (one per distinct relation) may
# connect the same ordered pair of nodes.
G = nx.MultiDiGraph()

def add_edge_if_not_exists(graph, u, v, key=None, **attr):
    """Add an edge ``u -> v`` carrying ``attr`` unless an identical one exists.

    The edges already stored between ``u`` and ``v`` are fetched with
    ``graph.get_edge_data(u, v)``; when any of the returned attribute
    dictionaries equals ``attr`` exactly, the call is a no-op. Otherwise the
    edge is created with ``graph.add_edge(u, v, key=key, **attr)``.

    Args:
        graph: Object exposing ``has_edge``, ``get_edge_data`` and ``add_edge``
            (this notebook passes a ``networkx.MultiDiGraph``).
        u: Source node.
        v: Target node.
        key: Optional edge key forwarded unchanged to ``add_edge``.
        **attr: Edge attributes; also the criterion for duplicate detection.

    Returns:
        None.
    """
    if graph.has_edge(u, v):
        parallel_edges = graph.get_edge_data(u, v)
        # A duplicate is an existing edge whose attribute dict matches exactly.
        if any(existing == attr for existing in parallel_edges.values()):
            return
    graph.add_edge(u, v, key=key, **attr)

# Build the graph: one edge per distinct (root, relation, tail) triple.
# Walking the three columns in lockstep avoids building a Series object for
# every row, which is what makes DataFrame.iterrows() slow on large frames.
for head, relation, tail in zip(df['root'], df['relation'], df['tail']):
    add_edge_if_not_exists(G, head, tail, rel=relation)

# Degree of every node as reported by G.degree(), computed once and reused for
# both tables instead of being queried again per node and twice per edge.
node_degree = dict(G.degree())

# Entities table: one record per graph node. The node name doubles as title
# and description; 'id' is a fresh random UUID on every run.
nodes_data = [
    {
        "id": str(uuid.uuid4()),
        "human_readable_id": human_readable_id,
        "title": node,
        "description": node,
        "degree": node_degree[node],
    }
    for human_readable_id, node in enumerate(G.nodes())
]
entities_df = pd.DataFrame(nodes_data)

# Relationships table: one record per graph edge.
edges_data = []
for human_readable_id, (source, target, data) in enumerate(G.edges(data=True)):
    rel = data.get("rel", "")
    # Spell the 'isa' relation as words so the description reads as a sentence.
    if rel == 'isa':
        rel = 'is a'
    edges_data.append({
        "id": str(uuid.uuid4()),
        "human_readable_id": human_readable_id,
        "source": source,
        "target": target,
        "combined_degree": node_degree[source] + node_degree[target],
        "description": f"{source} {rel} {target}",
    })
relationships_df = pd.DataFrame(edges_data)

# Save both tables as parquet files in the output directory.
entities_df.to_parquet(os.path.join(output_path, 'entities.parquet'), index=False)
relationships_df.to_parquet(os.path.join(output_path, 'relationships.parquet'), index=False)


In [2]:
from openai import OpenAI
import pandas as pd
import lancedb
from sentence_transformers import SentenceTransformer

import json

# Sentence-embedding model used for the entity descriptions.
# NOTE(review): trust_remote_code=True lets the model repository supply its own
# Python code; keep the model id pointed at a trusted source.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

entities_df = pd.read_parquet(os.path.join(output_path, 'entities.parquet'))
sentences = list(entities_df['description'])
print(len(sentences))

# Embed the descriptions chunk by chunk. The range stops at len(sentences), so
# no empty trailing chunk is sent to the model when the count is an exact
# multiple of batch_size (or zero); slicing already clamps the last chunk.
final_embeddings = []
batch_size = 50000
for i in range(0, len(sentences), batch_size):
    print(i)
    batch = sentences[i:i + batch_size]
    response = model.encode(batch)
    final_embeddings.extend(response)

print(f"Total embeddings generated: {len(final_embeddings)}")

# One embedding per entity row, in the same order as 'description'.
entities_df['vector'] = final_embeddings

# "attributes" holds a JSON string of the form {"title": <entity title>}.
entities_df['attributes'] = entities_df['title'].apply(lambda title: json.dumps({"title": title}))
print()
final_entities_df = entities_df[['id', 'description', 'vector', 'attributes']].rename(columns={'description': 'text'})

# Connect to (or create) a LanceDB database and save the DataFrame,
# replacing any table of the same name.
db = lancedb.connect(os.path.join(output_path, 'lancedb'))
table = db.create_table("default-entity-description", final_entities_df, mode="overwrite")

print("Entities have been embedded and saved to LanceDB successfully.")

!!!!!!!!!!!!megablocks not available, using torch.matmul instead
  state_dict = loader(resolved_archive_file)
<All keys matched successfully>


43970
0
Total embeddings generated: 43970



[2025-03-18T18:09:37Z WARN  lance::dataset::write::insert] No existing dataset at /projects/JHA/shared/graph/pubmed/injected_32k/lancedb/default-entity-description.lance.lance, it will be created


Entities have been embedded and saved to LanceDB successfully.


## Expanded KG (lim2) to GraphRAG format

In [56]:
import networkx as nx
import pandas as pd
import uuid
import csv
import os

# Input triples CSV and the output directory for the GraphRAG-format artifacts.
root = '/projects/JHA/shared/graph/pubmed'
graph_path = os.path.join(root, 'expanded_true_triples.csv')
output_path = os.path.join(root, 'expanded_32k')
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_path, exist_ok=True)

# Only the literal 'NULL' is treated as missing; pandas' default NA markers are
# switched off so strings such as "NA" or "null" survive as text.
df = pd.read_csv(graph_path, na_values=['NULL'], keep_default_na=False)

# How often each entity appears as the head ('root') and as the 'tail' of a triple.
root_degree = df['root'].value_counts()
tail_degree = df['tail'].value_counts()

# Degree of an entity = appearances as head + appearances as tail.
all_entities = set(root_degree.index).union(set(tail_degree.index))
combined_degree = {entity: root_degree.get(entity, 0) + tail_degree.get(entity, 0) for entity in all_entities}
# Highest degree first. Ties are broken by entity name so the ordering (and the
# human_readable_id derived from it) does not depend on set iteration order,
# which changes between interpreter runs for strings.
sorted_combined_degree = dict(
    sorted(combined_degree.items(), key=lambda item: (-item[1], str(item[0])))
)


In [57]:
# Entities table: one record per entity, following the iteration order of
# sorted_combined_degree. The entity name doubles as title and description;
# 'id' is a fresh random UUID on every run.
nodes_data = [
    {
        "id": str(uuid.uuid4()),
        "human_readable_id": position,
        "title": entity,
        "description": entity,
        "degree": entity_degree,
    }
    for position, (entity, entity_degree) in enumerate(sorted_combined_degree.items())
]

entities_df = pd.DataFrame(nodes_data)
# Write the entities table as a parquet file in the output directory.
entities_df.to_parquet(os.path.join(output_path, 'entities.parquet'), index=False)


In [59]:
# Relationships table: one record per row of the triples frame.
# The columns are walked in lockstep (together with the frame's index, which
# supplies human_readable_id) instead of DataFrame.iterrows(), which builds a
# Series per row and is slow on a frame of this size.
edges_data = []
for index, head, relation, tail in zip(df.index, df['root'], df['relation'], df['tail']):
    # Spell the 'isa' relation as words so the description reads as a sentence.
    if relation == 'isa':
        relation = 'is a'
    edges_data.append({
        "id": str(uuid.uuid4()),
        "human_readable_id": index,
        "source": head,
        "target": tail,
        # Sum of the two endpoint degrees; raises KeyError if an endpoint is
        # absent from sorted_combined_degree.
        "combined_degree": sorted_combined_degree[head] + sorted_combined_degree[tail],
        "description": f"{head} {relation} {tail}",
    })

relationships_df = pd.DataFrame(edges_data)

# Write the relationships table as a parquet file in the output directory.
relationships_df.to_parquet(os.path.join(output_path, 'relationships.parquet'), index=False)

In [60]:
from openai import OpenAI
import pandas as pd
import lancedb
from sentence_transformers import SentenceTransformer

import json

# Sentence-embedding model used for the entity descriptions.
# NOTE(review): trust_remote_code=True lets the model repository supply its own
# Python code; keep the model id pointed at a trusted source.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

entities_df = pd.read_parquet(os.path.join(output_path, 'entities.parquet'))
sentences = list(entities_df['description'])
print(len(sentences))

# Embed the descriptions chunk by chunk. The range stops at len(sentences), so
# no empty trailing chunk is sent to the model when the count is an exact
# multiple of batch_size (or zero); slicing already clamps the last chunk.
final_embeddings = []
batch_size = 50000
for i in range(0, len(sentences), batch_size):
    print(i)
    batch = sentences[i:i + batch_size]
    response = model.encode(batch)
    final_embeddings.extend(response)

print(f"Total embeddings generated: {len(final_embeddings)}")

# One embedding per entity row, in the same order as 'description'.
entities_df['vector'] = final_embeddings

# "attributes" holds a JSON string of the form {"title": <entity title>}.
entities_df['attributes'] = entities_df['title'].apply(lambda title: json.dumps({"title": title}))
print()
final_entities_df = entities_df[['id', 'description', 'vector', 'attributes']].rename(columns={'description': 'text'})

# Connect to (or create) a LanceDB database and save the DataFrame,
# replacing any table of the same name.
db = lancedb.connect(os.path.join(output_path, 'lancedb'))
table = db.create_table("default-entity-description", final_entities_df, mode="overwrite")

print("Entities have been embedded and saved to LanceDB successfully.")

!!!!!!!!!!!!megablocks not available, using torch.matmul instead
  state_dict = loader(resolved_archive_file)
<All keys matched successfully>


389476
0
50000
100000
150000
200000
250000
300000
350000
Total embeddings generated: 389476



[2025-03-19T21:26:13Z WARN  lance::dataset::write::insert] No existing dataset at /projects/JHA/shared/graph/pubmed/expanded_32k/lancedb/default-entity-description.lance, it will be created


Entities have been embedded and saved to LanceDB successfully.
