## Install llmgraph

In [None]:
# Install llmgraph from pypi (https://pypi.org/project/llmgraph/)
# (Ignore any dependency resolver issues on Google Colab, they're fine)
%pip -q install  llmgraph

In [None]:
# Display installed llmgraph version
%pip list | grep llmgraph

llmgraph                         1.2.3


## Imports

In [None]:
import IPython
import os
import getpass
from pathlib import Path

## Enter your OpenAI API Key

In [None]:
# Set OPENAI_API_KEY from user input (hidden in UI via getpass function)
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key")

Enter your OpenAI API Key··········


## Run llmgraph command

In [None]:
!llmgraph --help

[1m                                                                                                    [0m
[1m [0m[1;33mUsage: [0m[1mllmgraph [OPTIONS] ENTITY_TYPE ENTITY_WIKIPEDIA[0m[1m                                            [0m[1m [0m
[1m                                                                                                    [0m
 Create knowledge graphs with LLMs                                                                  
                                                                                                    
[2m╭─[0m[2m Arguments [0m[2m─────────────────────────────────────────────────────────────────────────────────────[0m[2m─╮[0m
[2m│[0m [31m*[0m    entity_type           [1;33mTEXT[0m  Entity type (e.g. movie) [2m[default: None][0m [2;31m[required][0m             [2m│[0m
[2m│[0m [31m*[0m    entity_wikipedia      [1;33mTEXT[0m  Full wikipedia link to root entity [2m[default: None][0m [2;31m[required][0m   

In [None]:
!zip  -r /content/_output/concepts-chemical.zip /content/_output/concepts-chemical

updating: content/_output/concepts-chemical/ (stored 0%)
  adding: content/_output/concepts-chemical/per--and-polyfluoroalkyl-substances/ (stored 0%)
  adding: content/_output/concepts-chemical/per--and-polyfluoroalkyl-substances/concepts-chemical_per--and-polyfluoroalkyl-substances_v1.2.3_gpt-4o-mini_level4_incl_unprocessed.html (deflated 84%)
  adding: content/_output/concepts-chemical/per--and-polyfluoroalkyl-substances/concepts-chemical_per--and-polyfluoroalkyl-substances_v1.2.3_gpt-4o-mini_level3.gexf (deflated 87%)
  adding: content/_output/concepts-chemical/per--and-polyfluoroalkyl-substances/concepts-chemical_per--and-polyfluoroalkyl-substances_v1.2.3_gpt-4o-mini_level4_fully_connected.html (deflated 83%)
  adding: content/_output/concepts-chemical/per--and-polyfluoroalkyl-substances/concepts-chemical_per--and-polyfluoroalkyl-substances_v1.2.3_gpt-4o-mini_level5_incl_unprocessed.html (deflated 84%)
  adding: content/_output/concepts-chemical/per--and-polyfluoroalkyl-substances/

In [None]:
# Run llmgraph
# Note: valid `entity_type` values are found here: https://github.com/dylanhogg/llmgraph/blob/main/llmgraph/prompts.yaml
!llmgraph concepts-chemical https://en.wikipedia.org/wiki/Per-_and_polyfluoroalkyl_substances --levels 1 --llm-model gpt-4o-mini --llm-temp 0.0 --no-allow-user-input

Running with [33mentity_type[0m=[32m'concepts-chemical'[0m, 
[33mentity_wikipedia[0m=[32m'https://en.wikipedia.org/wiki/Per-_and_polyfluoroalkyl_substances'[0m, 
[33mentity_root[0m=[32m'Per- and polyfluoroalkyl substances'[0m, [33mcustom_entity_root[0m=[3;91mFalse[0m, [33mlevels[0m=[1;36m1[0m, 
[33mllm_model[0m=[32m'gpt-4o-mini'[0m, [33mllm_temp[0m=[1;36m0[0m[1;36m.0[0m, [33moutput_folder[0m=[32m'./_output/'[0m
[?25l[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ][2KProcessing [1;32mPer- and polyfluoroalkyl substances[0m [1m([0mlevel [1;36m1[0m, total tokens [1;36m0[0m[1m)[0m
[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ][2KProcessing level 1: [35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ 

## Locate the output files

In [None]:
# Collect the llmgraph artefacts (fully-connected HTML views and GraphML
# exports) found under the _output folder.
html_files = []
graphml_files = []
for root, dirs, files in os.walk("_output"):
  if dirs:
    # Only directories without sub-directories are scanned
    continue
  html_files.extend(str(Path(root) / f) for f in files if f.endswith("fully_connected.html"))
  graphml_files.extend(str(Path(root) / f) for f in files if f.endswith(".graphml"))
html_files = sorted(html_files)
graphml_files = sorted(graphml_files)

# Fail with an actionable message instead of a bare IndexError when nothing
# has been generated yet (e.g. the llmgraph run cell was skipped or failed).
if not html_files or not graphml_files:
  raise FileNotFoundError(
      "No llmgraph output found under '_output': expected at least one "
      "'*fully_connected.html' file and one '*.graphml' file. "
      "Run the llmgraph command cell first."
  )

# Use the lexicographically last path of each kind
html_file = html_files[-1]
graphml_file = graphml_files[-1]

print(html_file)
print(graphml_file)

_output/concepts-chemical/per--and-polyfluoroalkyl-substances/concepts-chemical_per--and-polyfluoroalkyl-substances_v1.2.3_gpt-4o-mini_level5_fully_connected.html
_output/concepts-chemical/per--and-polyfluoroalkyl-substances/concepts-chemical_per--and-polyfluoroalkyl-substances_v1.2.3_gpt-4o-mini_level5.graphml


In [None]:
# Uncomment these lines to download the graph html (or find it in the file tree on the left)
# from google.colab import files
# files.download(html_file)

## Display the network

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network

In [None]:
# Load graphml file.
# Every later cell (embeddings, random walks, link prediction) refers to the
# graph as `G`, so bind that name here. `G3` is kept as an alias of the same
# object for anything that still uses the old name.
G = nx.read_graphml(graphml_file)
G3 = G
# Example of loading one specific file instead:
# G = nx.read_graphml("_output/concepts-general/large-language-model/concepts-general_large-language-model_v1.2.1_level3.graphml")

# Create pyvis network for displaying
nt = Network(height="800px", width="100%", directed=True, cdn_resources="remote", notebook=True)
nt.from_nx(G)
nt.force_atlas_2based(
    spring_strength=0.03
)

In [None]:
# Display pyvis network
nt.save_graph("llmgraph.html")
IPython.display.HTML(filename="llmgraph.html")

In [None]:
# Display pyvis network
nt.save_graph("llmgraph.html")
IPython.display.HTML(filename="llmgraph.html")

In [None]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Downloading node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Installing collected packages: node2vec
Successfully installed node2vec-0.5.0


In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [None]:

from sentence_transformers import SentenceTransformer, util

# Load the pretrained 'stsb-roberta-large' sentence encoder
model = SentenceTransformer('stsb-roberta-large')

# Embed the graph's node names; `concept` fixes the node order that the rows
# and columns of the similarity matrix below follow.
concept = list(G.nodes)
# .cpu() moves the embeddings to host memory so .numpy() can be called below
embeddings = model.encode(concept, convert_to_tensor=True).cpu()

# Pairwise cosine similarity between all node-name embeddings, as a NumPy
# array: entry [i, j] compares concept[i] with concept[j].
similarity_matrix = util.pytorch_cos_sim(embeddings, embeddings).numpy()


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
similarity_matrix

array([[1.0000004 , 0.79460144, 0.5092505 , ..., 0.37320125, 0.40201074,
        0.3823853 ],
       [0.79460144, 0.99999994, 0.43906438, ..., 0.3429673 , 0.34737134,
        0.42329293],
       [0.5092505 , 0.43906438, 0.99999994, ..., 0.3131227 , 0.28395653,
        0.20818084],
       ...,
       [0.37320125, 0.34296736, 0.3131227 , ..., 0.9999999 , 0.8555348 ,
        0.28827834],
       [0.4020108 , 0.34737134, 0.28395656, ..., 0.8555348 , 1.        ,
        0.33780807],
       [0.3823853 , 0.42329293, 0.20818084, ..., 0.28827834, 0.33780807,
        0.99999994]], dtype=float32)

In [None]:
from node2vec import Node2Vec


In [None]:
class BiasedNode2Vec(Node2Vec):
    """Node2Vec subclass adding a hand-rolled random walk weighted by a similarity matrix.

    Extra keyword arguments (removed before delegating to ``Node2Vec``):
        similarity_matrix: square array of pairwise node similarities whose
            row/column order matches ``list(graph.nodes)``. ``None`` (default)
            gives every candidate step the same weight.
        verbose: when true, print one line per edge-weight lookup. Off by
            default because a single walk performs many lookups.

    All remaining positional/keyword arguments are forwarded unchanged to
    ``Node2Vec.__init__``.
    """

    def __init__(self, *args, **kwargs):
        self.similarity_matrix = kwargs.pop('similarity_matrix', None)
        self.verbose = kwargs.pop('verbose', False)

        # Take the node order from the graph handed to this instance (keyword
        # `graph` or first positional argument) rather than from a notebook
        # global, so the matrix indices always refer to the graph being walked.
        graph = kwargs.get('graph', args[0] if args else None)
        self.cids = list(graph.nodes) if graph is not None else []
        # node -> row/column position, replacing repeated O(n) list.index scans
        self._node_index = {node: idx for idx, node in enumerate(self.cids)}

        super().__init__(*args, **kwargs)

    def get_edge_weight(self, source, target, prev_target=None):
        """Return the unnormalised weight for stepping from ``source`` to ``target``.

        Returns 1 when no similarity matrix was supplied or when there is no
        previous node; otherwise the reciprocal of a similarity entry.
        """
        if self.verbose:
            print(f"Getting edge weight for {source} -> {target} (prev_target={prev_target})")
        if self.similarity_matrix is None or prev_target is None:
            return 1
        # Use the similarity scores as weights for edges
        source_idx = self._node_index[source]
        target_idx = self._node_index[target]
        prev_target_idx = self._node_index[prev_target]
        p = self.similarity_matrix[source_idx, prev_target_idx]
        q = self.similarity_matrix[source_idx, target_idx]
        # NOTE(review): unless `target` is `source` itself (a self-loop) the
        # result is 1 / p, which does not depend on `target`, so all neighbours
        # of a node end up equally likely. Confirm whether the intended test
        # was `target == prev_target`.
        # NOTE(review): a zero or negative similarity yields an infinite or
        # negative weight, which np.random.choice rejects as a probability.
        return 1 / p if source != target else 1 / q

    def biased_random_walk(self, walk_length, start_node):
        """Walk up to ``walk_length`` nodes from ``start_node``; return the visited nodes.

        The first step is uniform over the neighbours; later steps are drawn
        with probabilities proportional to ``get_edge_weight``. The walk ends
        early at a node without neighbours.
        """
        walk = [start_node]

        while len(walk) < walk_length:
            current_node = walk[-1]
            neighbors = list(self.graph.neighbors(current_node))
            if not neighbors:
                # Dead end: nothing to step to
                break
            if len(walk) == 1:
                # No previous node yet, so pick uniformly
                walk.append(np.random.choice(neighbors))
                continue
            prev_node = walk[-2]
            weights = np.array([self.get_edge_weight(current_node, neighbor, prev_node) for neighbor in neighbors])
            probabilities = weights / weights.sum()
            walk.append(np.random.choice(neighbors, p=probabilities))

        return walk

In [None]:
G.nodes['Polyfluoroalkyl substances']

{'name': 'Polyfluoroalkyl substances',
 'level': 4,
 'wikipedia_link': 'https://en.wikipedia.org/wiki/Polyfluoroalkyl_substances',
 'wikipedia_canonical': 'Per-_and_polyfluoroalkyl_substances',
 'wikipedia_normalized': 'Per- and polyfluoroalkyl substances',
 'wikipedia_resp_code': 200,
 'wikipedia_content': 'Per- and polyfluoroalkyl substances are a group of synthetic organofluorine chemical compounds that have multiple fluorine atoms attached to an alkyl chain; there are 7 million such chemicals according to PubChem. PFAS came into use after the invention of Teflon in 1938 to make fluoropolymer coatings and products that resist heat, oil, stains, grease, and water. They are now used in products including waterproof fabric such as Nylon, yoga pants, carpets, shampoo, feminine hygiene products, mobil',
 'processed': 2,
 'node_count': 42,
 'label': 'Polyfluoroalkyl substances',
 'title': "42. <a href='https://en.wikipedia.org/wiki/Polyfluoroalkyl_substances' target='_blank'>Polyfluoroalk

In [None]:
import numpy as np
biased_node2vec = BiasedNode2Vec(graph=G, dimensions=64, walk_length=10, num_walks=100, workers=1, similarity_matrix=similarity_matrix)

# Generate random walks
node = 'Per- and polyfluoroalkyl substances'  # Starting node for the walk
walk_length = 10  # Length of each walk
# biased_random_walk draws every step with np.random.choice; seeding NumPy's
# global RNG right before the call makes the walk (and therefore the LLM prompt
# built from it) reproducible across runs.
np.random.seed(42)
walk = biased_node2vec.biased_random_walk(walk_length, node)

print("Biased Random Walk:", walk)

Computing transition probabilities:   0%|          | 0/399 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 100/100 [00:00<00:00, 375.80it/s]

Getting edge weight for Dioxins -> Polychlorinated biphenyl (prev_target=Per- and polyfluoroalkyl substances
Getting edge weight for Dioxins -> Furan (prev_target=Per- and polyfluoroalkyl substances
Getting edge weight for Dioxins -> Polycyclic aromatic hydrocarbon (prev_target=Per- and polyfluoroalkyl substances
Getting edge weight for Dioxins -> Organochlorine pesticide (prev_target=Per- and polyfluoroalkyl substances
Getting edge weight for Dioxins -> Heavy metal (prev_target=Per- and polyfluoroalkyl substances
Getting edge weight for Heavy metal -> Toxic metal (prev_target=Dioxins
Getting edge weight for Heavy metal -> Metalloid (prev_target=Dioxins
Getting edge weight for Heavy metal -> Transition metal (prev_target=Dioxins
Getting edge weight for Heavy metal -> Heavy metal poisoning (prev_target=Dioxins
Getting edge weight for Heavy metal -> Environmental pollution (prev_target=Dioxins
Getting edge weight for Transition metal -> Lanthanide (prev_target=Heavy metal
Getting edge we




In [None]:
!pip install --upgrade openai



In [None]:
# prompt: api code for gpt-4o-mini
from openai import OpenAI

# Read the key that the "Enter your OpenAI API Key" cell stored in the
# environment instead of embedding a secret in the notebook.
# NOTE: the key that was previously hardcoded here has been shared together
# with the notebook and must be revoked.
client = OpenAI(
  api_key=os.environ["OPENAI_API_KEY"]
)
# The prompt embeds the whole random walk and asks that its first node appear
# in every topic.
chat_completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": f"create 5 hot topic innovation in chemistry with {walk} in the every topic use {walk[0]} jsut give  json format"},
              {"role": "system", "content": f"json format"}
              ]
)


In [None]:
chat_completion.dict()

{'id': 'chatcmpl-9xnou2V2V8URRwhGq31RGCko5XXRu',
 'choices': [{'finish_reason': 'stop',
   'index': 0,
   'logprobs': None,
   'message': {'content': '```json\n{\n  "hot_topics": [\n    {\n      "topic": "Advanced Remediation Techniques for Per- and Polyfluoroalkyl Substances (PFAS)",\n      "description": "Innovative methods utilizing transition metals for the degradation and detoxification of PFAS in contaminated water sources."\n    },\n    {\n      "topic": "Dioxins: Interaction with Per- and Polyfluoroalkyl Substances",\n      "description": "Research on the synergistic effects of dioxins and PFAS, focusing on their combined toxicity and potential bioremediation strategies."\n    },\n    {\n      "topic": "Heavy Metal Interaction with Per- and Polyfluoroalkyl Substances",\n      "description": "Investigating how heavy metals influence the behavior and toxicology of PFAS in environmental systems and human health."\n    },\n    {\n      "topic": "Per- and Polyfluoroalkyl Substances 

In [None]:
G.nodes.data()


NodeDataView({'Per- and polyfluoroalkyl substances': {'name': 'Per- and polyfluoroalkyl substances', 'level': 1, 'wikipedia_link': 'https://en.wikipedia.org/wiki/Per-_and_polyfluoroalkyl_substances', 'wikipedia_canonical': 'Per-_and_polyfluoroalkyl_substances', 'wikipedia_normalized': 'Per- and polyfluoroalkyl substances', 'wikipedia_resp_code': 200, 'wikipedia_content': 'Per- and polyfluoroalkyl substances are a group of synthetic organofluorine chemical compounds that have multiple fluorine atoms attached to an alkyl chain; there are 7 million such chemicals according to PubChem. PFAS came into use after the invention of Teflon in 1938 to make fluoropolymer coatings and products that resist heat, oil, stains, grease, and water. They are now used in products including waterproof fabric such as Nylon, yoga pants, carpets, shampoo, feminine hygiene products, mobil', 'processed': 2, 'node_count': 0, 'label': 'Per- and polyfluoroalkyl substances', 'title': "0. <a href='https://en.wikipedi

In [None]:
!pip install torch torch-geometric networkx


Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.5.3


In [None]:
import torch
import torch_geometric
from torch_geometric.utils import from_networkx
import networkx as nx



# Build the PyG data object from the NetworkX graph
data = from_networkx(G)

# Node features: a single float column holding each node's 'size' attribute
node_sizes = [attrs['size'] for _, attrs in G.nodes(data=True)]
data.x = torch.tensor(node_sizes, dtype=torch.float).view(-1, 1)

# Edge features: a single float column holding each edge's 'similarity' attribute
edge_similarities = [attrs['similarity'] for _, _, attrs in G.edges(data=True)]
edge_attr = torch.tensor(edge_similarities, dtype=torch.float).view(-1, 1)
data.edge_attr = edge_attr


In [None]:
from torch_geometric.utils import negative_sampling

# Positive edges (edges that exist in the graph)
# NOTE: this reads data.edge_index, which is overwritten a few lines below, so
# re-running this cell would treat the combined (positive + negative) edges as
# positives.
positive_edges = data.edge_index

# Generate negative edges (edges that don't exist in the graph); as many
# samples are requested as there are positive edges.
negative_edges = negative_sampling(edge_index=positive_edges, num_nodes=data.num_nodes, num_neg_samples=positive_edges.size(1))

# Combine positive and negative edges (concatenated along the edge dimension,
# positives first)
combined_edge_index = torch.cat([positive_edges, negative_edges], dim=-1)
# NOTE(review): GCNLinkPredictor.forward passes data.edge_index to encode(), so
# after this assignment the sampled negatives are also used as graph structure
# for message passing, not only as scoring targets. Confirm this is intended.
data.edge_index = combined_edge_index

# Labels for edges: 1 for positive edges, 0 for negative edges (same order as
# combined_edge_index)
edge_labels = torch.cat([torch.ones(positive_edges.size(1)), torch.zeros(negative_edges.size(1))], dim=0)


In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GATConv


class GCNLinkPredictor(nn.Module):
    """Two stacked GCN layers producing node embeddings, plus a linear pair scorer."""

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        # Maps the concatenation of two node embeddings to a single raw score
        self.lin = nn.Linear(2 * hidden_channels, 1)

    def encode(self, x, edge_index):
        """Return node embeddings: conv1 -> ReLU -> conv2 over ``edge_index``."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def decode(self, z, edge_index):
        """Return one raw score (logit) per column of ``edge_index``."""
        src, dst = edge_index
        pair_features = torch.cat([z[src], z[dst]], dim=-1)
        return self.lin(pair_features)

    def forward(self, data):
        """Encode with ``data.edge_index`` and score those same edges."""
        embeddings = self.encode(data.x, data.edge_index)
        return self.decode(embeddings, data.edge_index)

# Initialize model
model = GCNLinkPredictor(in_channels=1, hidden_channels=16)

# Example forward pass
pred = model(data)


In [None]:
# BCE-with-logits loss on the raw scores; Adam over all model parameters
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

model.train()
for epoch in range(500):  # one full-batch update per epoch
    optimizer.zero_grad()

    # Score every edge currently in data.edge_index and flatten to match edge_labels
    pred = model(data)
    logits = pred.view(-1)
    loss = criterion(logits, edge_labels)

    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')


Epoch 1, Loss: 0.722249448299408
Epoch 2, Loss: 0.7895192503929138
Epoch 3, Loss: 0.7161275148391724
Epoch 4, Loss: 0.7084832787513733
Epoch 5, Loss: 0.7320275902748108
Epoch 6, Loss: 0.7199879288673401
Epoch 7, Loss: 0.6973623037338257
Epoch 8, Loss: 0.6916326880455017
Epoch 9, Loss: 0.7004224061965942
Epoch 10, Loss: 0.7059733271598816
Epoch 11, Loss: 0.7011034488677979
Epoch 12, Loss: 0.6920285224914551
Epoch 13, Loss: 0.687255322933197
Epoch 14, Loss: 0.6893223524093628
Epoch 15, Loss: 0.6938940286636353
Epoch 16, Loss: 0.6954166889190674
Epoch 17, Loss: 0.6927077770233154
Epoch 18, Loss: 0.6887975931167603
Epoch 19, Loss: 0.6871020197868347
Epoch 20, Loss: 0.6883867383003235
Epoch 21, Loss: 0.6906952261924744
Epoch 22, Loss: 0.6916317939758301
Epoch 23, Loss: 0.6904987692832947
Epoch 24, Loss: 0.6884781122207642
Epoch 25, Loss: 0.6872402429580688
Epoch 26, Loss: 0.6874776482582092
Epoch 27, Loss: 0.6885176301002502
Epoch 28, Loss: 0.6891462802886963
Epoch 29, Loss: 0.6887428164482

In [None]:
model.eval()
with torch.no_grad():
    pred = model(data)
    # Threshold the sigmoid probabilities at 0.5 to obtain hard 0/1 labels
    pred_labels = torch.sigmoid(pred).gt(0.5).float()

    # Share of edges whose predicted label equals the true label
    correct = pred_labels.view(-1).eq(edge_labels).sum().item()
    accuracy = correct / edge_labels.size(0)
    print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 59.75%


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score

model.eval()
with torch.no_grad():
    pred = model(data)
    # Hard 0/1 labels from the sigmoid probabilities, thresholded at 0.5
    pred_labels = torch.sigmoid(pred).gt(0.5).float()

    # Accuracy: share of edges whose predicted label equals the true label
    correct = pred_labels.view(-1).eq(edge_labels).sum().item()
    accuracy = correct / edge_labels.size(0)
    print(f'Accuracy: {accuracy * 100:.2f}%')

    # scikit-learn works on NumPy arrays
    pred_labels_np = pred_labels.cpu().numpy()
    edge_labels_np = edge_labels.cpu().numpy()

    # Precision, recall and F1 of the hard labels against the true labels
    precision = precision_score(edge_labels_np, pred_labels_np)
    recall = recall_score(edge_labels_np, pred_labels_np)
    f1 = f1_score(edge_labels_np, pred_labels_np)

    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1-Score: {f1:.2f}')

    # AUC-ROC is computed from the probabilities, not the thresholded labels
    pred_proba_np = torch.sigmoid(pred).cpu().numpy()
    auc_roc = roc_auc_score(edge_labels_np, pred_proba_np)

    print(f'AUC-ROC: {auc_roc:.2f}')


Accuracy: 59.75%
Precision: 0.60
Recall: 0.59
F1-Score: 0.59
AUC-ROC: 0.64


In [None]:
model.eval()
with torch.no_grad():
    # Forward pass to get predictions
    pred = model(data)

    # Apply sigmoid to get probabilities
    pred_probs = torch.sigmoid(pred).view(-1)

    # Apply a threshold to get binary predictions
    threshold = 0.5
    pred_labels = (pred_probs > threshold).float()

    # Calculate accuracy
    correct = (pred_labels == edge_labels).sum().item()
    total = edge_labels.size(0)
    accuracy = correct / total
    print(f'Accuracy: {accuracy * 100:.2f}%')

    # True positives, false positives, true negatives, false negatives
    TP = ((pred_labels == 1) & (edge_labels == 1)).sum().item()
    FP = ((pred_labels == 1) & (edge_labels == 0)).sum().item()
    TN = ((pred_labels == 0) & (edge_labels == 0)).sum().item()
    FN = ((pred_labels == 0) & (edge_labels == 1)).sum().item()

    # Precision (0.0 when nothing was predicted positive)
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0

    # Recall (0.0 when there are no positive labels)
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0

    # F1-Score. Stored as `f1` (the name the scikit-learn metrics cell also
    # uses) so that the `f1_score` function imported from sklearn.metrics is
    # not overwritten by a float.
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1-Score: {f1:.2f}')


Accuracy: 59.75%
Precision: 0.60
Recall: 0.59
F1-Score: 0.59


In [None]:
import itertools

# Ensure the model is in evaluation mode
model.eval()

# Candidate links: every unordered pair of node positions (i < j)
all_node_pairs = torch.tensor(list(itertools.combinations(range(data.num_nodes), 2)), dtype=torch.long).t()

# Filter out the edges that already exist in the graph. `positive_edges` holds
# the real edges; data.edge_index also contains the sampled negative edges by
# now and would wrongly mark those pairs as existing.
existing_edges = set(map(tuple, positive_edges.t().tolist()))
non_existing_pairs = [pair for pair in all_node_pairs.t().tolist() if tuple(pair) not in existing_edges]

# Convert the non-existing pairs back to a (2, num_candidates) tensor;
# view(-1, 2) keeps that shape even when there are no candidates.
non_existing_pairs = torch.tensor(non_existing_pairs, dtype=torch.long).view(-1, 2).t()

# Predict edge existence probabilities. Node embeddings are computed over the
# same edge set the model was trained with (data.edge_index); only the decoder
# sees the candidate pairs. Passing the candidates to the model as
# `edge_index` would run message passing over the pairs being scored.
with torch.no_grad():
    z = model.encode(data.x, data.edge_index)
    pred_probs = torch.sigmoid(model.decode(z, non_existing_pairs)).view(-1)

# Keep only high-confidence candidates
high_conf_mask = pred_probs > 0.99
sorted_indices_tmp = pred_probs[high_conf_mask]

# Sort the kept candidates by probability in descending order. argsort yields
# positions inside the filtered tensor, so map them back to positions in
# pred_probs / non_existing_pairs before indexing those tensors.
high_conf_indices = torch.nonzero(high_conf_mask).view(-1)
sorted_indices = high_conf_indices[sorted_indices_tmp.argsort(descending=True)]
predicted_edges = non_existing_pairs[:, sorted_indices].t()  # one (source, target) row per kept candidate
predicted_edges_probs = pred_probs[sorted_indices]


In [None]:
float(pred_probs[sorted_indices[0]])

0.9999991655349731

In [None]:
predicted_edges[0]

tensor([317, 337])

In [None]:
# Map node positions (as used in predicted_edges) back to node names. Built
# here so this cell does not depend on a cell further down the notebook.
id2name = {i: name for i, name in enumerate(G.nodes)}

# Show the names of the highest-ranked predicted edge only (prints nothing
# when there are no predicted edges)
for i in predicted_edges[:1]:
  print(id2name[int(i[0])],id2name[int(i[1])])

Amine Catalyst


In [None]:
id2name = {i: name for i, name in enumerate(G.nodes)}

In [None]:
id2name[0]

'Per- and polyfluoroalkyl substances'

In [None]:
# @title Default title text
# Add predicted new edges to the graph

PREVIEW_EDGES = 10  # how many added edges to echo; printing all of them floods the output

G2 = G.copy()
for i in range(len(predicted_edges)):
  source_name = id2name[int(predicted_edges[i][0])]
  target_name = id2name[int(predicted_edges[i][1])]
  if i < PREVIEW_EDGES:
    print(source_name, target_name)
  # The predicted probability is stored as the edge weight
  G2.add_edge(source_name, target_name, weight=float(pred_probs[sorted_indices[i]]))
print(f"{len(predicted_edges)} predicted edges written to G2")

# Now visualize the graph
# import matplotlib.pyplot as plt

# pos = nx.spring_layout(G2)  # or any other layout you prefer
# nx.draw(G, pos, with_labels=True, node_color='lightblue', edge_color='gray')
# plt.show()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Polycyclic aromatic hydrocarbon Nonmetal
Polycyclic aromatic hydrocarbon Alloy
Toxic metal BPA-free plastics
Polycyclic aromatic hydrocarbon Lead poisoning
Polycyclic aromatic hydrocarbon Periodic table
Polycyclic aromatic hydrocarbon Lanthanide
Polycyclic aromatic hydrocarbon Actinide
Polycyclic aromatic hydrocarbon Alkaline earth metal
Polycyclic aromatic hydrocarbon Mercury poisoning
Polycyclic aromatic hydrocarbon Cadmium poisoning
Polycyclic aromatic hydrocarbon Arsenic poisoning
Polycyclic aromatic hydrocarbon Air pollution
Polycyclic aromatic hydrocarbon Toxicology
Polycyclic aromatic hydrocarbon Post-transition metal
Polycyclic aromatic hydrocarbon Soil contamination
Polycyclic aromatic hydrocarbon Climate change
Polycyclic aromatic hydrocarbon Aniline
Polycyclic aromatic hydrocarbon Toxic waste
Polycyclic aromatic hydrocarbon Water pollution
Polycyclic aromatic hydrocarbon Cresol
Polycyclic aromatic hydrocarbon S

In [None]:

from sentence_transformers import SentenceTransformer, util

# Load the pretrained 'stsb-roberta-large' sentence encoder.
# NOTE: this rebinds the notebook-global name `model`, which the link-prediction
# cells above bind to the GCNLinkPredictor instance.
model = SentenceTransformer('stsb-roberta-large')

# Embed the node names of the augmented graph G2; `concept2` fixes the node
# order that the rows and columns of the similarity matrix below follow.
concept2 = list(G2.nodes)
# .cpu() moves the embeddings to host memory so .numpy() can be called below
embeddings2 = model.encode(concept2, convert_to_tensor=True).cpu()

# Pairwise cosine similarity between all node-name embeddings, as a NumPy
# array: entry [i, j] compares concept2[i] with concept2[j].
similarity_matrix2 = util.pytorch_cos_sim(embeddings2, embeddings2).numpy()


In [None]:
class BiasedNode2Vec(Node2Vec):
    """Node2Vec subclass adding a hand-rolled random walk weighted by a similarity matrix.

    This definition replaces the ``BiasedNode2Vec`` defined earlier in the
    notebook.

    Extra keyword arguments (removed before delegating to ``Node2Vec``):
        similarity_matrix2: square array of pairwise node similarities whose
            row/column order matches ``list(graph.nodes)``. ``similarity_matrix``
            is accepted as an alternative spelling. When neither is given,
            every candidate step gets the same weight.
        verbose: when true, print one line per edge-weight lookup. Off by
            default because a single walk performs many lookups.

    All remaining positional/keyword arguments are forwarded unchanged to
    ``Node2Vec.__init__``.
    """

    def __init__(self, *args, **kwargs):
        matrix = kwargs.pop('similarity_matrix2', None)
        fallback_matrix = kwargs.pop('similarity_matrix', None)
        self.similarity_matrix = matrix if matrix is not None else fallback_matrix
        self.verbose = kwargs.pop('verbose', False)

        # Take the node order from the graph handed to this instance (keyword
        # `graph` or first positional argument) rather than from a notebook
        # global, so the matrix indices always refer to the graph being walked.
        graph = kwargs.get('graph', args[0] if args else None)
        self.cids = list(graph.nodes) if graph is not None else []
        # node -> row/column position, replacing repeated O(n) list.index scans
        self._node_index = {node: idx for idx, node in enumerate(self.cids)}

        super().__init__(*args, **kwargs)

    def get_edge_weight(self, source, target, prev_target=None):
        """Return the unnormalised weight for stepping from ``source`` to ``target``.

        Returns 1 when no similarity matrix was supplied or when there is no
        previous node; otherwise the reciprocal of a similarity entry.
        """
        if self.verbose:
            print(f"Getting edge weight for {source} -> {target} (prev_target={prev_target})")
        if self.similarity_matrix is None or prev_target is None:
            return 1
        # Use the similarity scores as weights for edges
        source_idx = self._node_index[source]
        target_idx = self._node_index[target]
        prev_target_idx = self._node_index[prev_target]
        p = self.similarity_matrix[source_idx, prev_target_idx]
        q = self.similarity_matrix[source_idx, target_idx]
        # NOTE(review): unless `target` is `source` itself (a self-loop) the
        # result is 1 / p, which does not depend on `target`, so all neighbours
        # of a node end up equally likely. Confirm whether the intended test
        # was `target == prev_target`.
        # NOTE(review): a zero or negative similarity yields an infinite or
        # negative weight, which np.random.choice rejects as a probability.
        return 1 / p if source != target else 1 / q

    def biased_random_walk(self, walk_length, start_node):
        """Walk up to ``walk_length`` nodes from ``start_node``; return the visited nodes.

        The first step is uniform over the neighbours; later steps are drawn
        with probabilities proportional to ``get_edge_weight``. The walk ends
        early at a node without neighbours.
        """
        walk = [start_node]

        while len(walk) < walk_length:
            current_node = walk[-1]
            neighbors = list(self.graph.neighbors(current_node))
            if not neighbors:
                # Dead end: stop the walk. (The previous body held a stray
                # `con` here, which raised NameError when reached.)
                break
            if len(walk) == 1:
                # No previous node yet, so pick uniformly
                walk.append(np.random.choice(neighbors))
                continue
            prev_node = walk[-2]
            weights = np.array([self.get_edge_weight(current_node, neighbor, prev_node) for neighbor in neighbors])
            probabilities = weights / weights.sum()
            walk.append(np.random.choice(neighbors, p=probabilities))

        return walk

In [None]:
import numpy as np
biased_node2vec = BiasedNode2Vec(graph=G2, dimensions=64, walk_length=10, num_walks=200, workers=1, similarity_matrix2=similarity_matrix2)

# Generate random walks
node = 'Per- and polyfluoroalkyl substances'  # Starting node for the walk
walk_length = 20  # Length of each walk
# biased_random_walk draws every step with np.random.choice; seeding NumPy's
# global RNG right before the call makes the walk (and therefore the LLM prompt
# built from it) reproducible across runs.
np.random.seed(42)
walk = biased_node2vec.biased_random_walk(walk_length, node)

print("Biased Random Walk:", walk)

Computing transition probabilities:   0%|          | 0/399 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 200/200 [00:53<00:00,  3.76it/s]

Biased Random Walk: ['Per- and polyfluoroalkyl substances', 'Chemical concentration']





In [None]:
# prompt: api code for gpt-4o-mini
from openai import OpenAI

# Read the key that the "Enter your OpenAI API Key" cell stored in the
# environment instead of embedding a secret in the notebook.
# NOTE: the key that was previously hardcoded here has been shared together
# with the notebook and must be revoked.
client = OpenAI(
  api_key=os.environ["OPENAI_API_KEY"]
)
# The prompt embeds the whole random walk and asks that its first node appear
# in every topic.
chat_completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": f"create 5 hot topic innovation in chemistry for essay with {walk} in the every topic use {walk[0]} jsut give  json format"},
              {"role": "system", "content": f"json format"}
              ]
)


In [None]:
chat_completion.dict()

{'id': 'chatcmpl-9xozr0pJgyJBF3SYwaMgXkvRO0DWh',
 'choices': [{'finish_reason': 'stop',
   'index': 0,
   'logprobs': None,
   'message': {'content': '```json\n{\n  "innovations": [\n    {\n      "topic": "Development of Biodegradable Alternatives to Per- and Polyfluoroalkyl Substances",\n      "description": "Research is focused on finding biodegradable substances to replace per- and polyfluoroalkyl substances (PFAS) used in various applications, reducing environmental persistence and health risks.",\n      "related_terms": ["trichloroethane", "emulsifier"]\n    },\n    {\n      "topic": "Advanced Detection Methods for Per- and Polyfluoroalkyl Substances in Water Sources",\n      "description": "Innovative analytical techniques are being developed for the detection and quantification of PFAS in water supplies, improving public health monitoring.",\n      "related_terms": ["trichloroethane", "emulsifier"]\n    },\n    {\n      "topic": "Novel Remediation Technologies for Sites Contamin