In [2]:
# Mount Google Drive at /content/drive so later cells can read the data files stored there.
# This cell only works inside Google Colab (it imports google.colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%%capture
%%bash
# Install the third-party packages imported by later cells (sentence_transformers, datasets).
# The installs are unpinned; %%capture hides the pip output.
pip install sentence-transformers
pip install datasets

In [4]:
from sentence_transformers import SentenceTransformer

import math
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse import csr_matrix
import json
from tqdm import tqdm
import torch
from torch import Tensor
from collections import defaultdict
from typing import List


def load_jsonl(filepaths):
    """
    Read one or more JSON Lines files into a single flat list.

    :param filepaths: Iterable of file paths. A single path (str, bytes or os.PathLike)
        is also accepted and treated as a one-element list.
    :return: List with one decoded JSON value per non-blank line, ordered by file and
        then by line within each file.
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    import os

    # A bare string would otherwise be iterated character by character.
    if isinstance(filepaths, (str, bytes, os.PathLike)):
        filepaths = [filepaths]

    data = []
    for filepath in filepaths:
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                # Blank lines (e.g. a trailing newline at end of file) are not records.
                if not line.strip():
                    continue
                data.append(json.loads(line))
    return data


def separate_qa_helper(node, depth, msg_dict):
    """
    Walk a message tree depth-first and sort message texts into per-role lists.

    A node without a "text" key is skipped together with its whole subtree.

    :param node: Mapping with "role" and "text" keys and, optionally, a "replies" list of child nodes.
    :param depth: Counter that is incremented and handed to each recursive call; it does not
        influence the result.
    :param msg_dict: Mapping whose "user_messages" / "assistant_messages" lists are appended to
        in place (prompter texts go to the former, assistant texts to the latter).
    :return: None; results are accumulated in msg_dict.
    """
    if "text" not in node:
        return

    role = node["role"]
    if role == "prompter":
        target_key = "user_messages"
    elif role == "assistant":
        target_key = "assistant_messages"
    else:
        # Any other role contributes no text but its replies are still visited.
        target_key = None

    if target_key is not None:
        msg_dict[target_key].append(str(node["text"]))

    child_depth = depth + 1
    if "replies" in node:
        for child in node["replies"]:
            separate_qa_helper(child, child_depth, msg_dict)


def store_qa_data_separate(trees, data):
    """
    Collect per-role message texts from every tree that has a "prompt" root.

    :param trees: Iterable of mappings; those with a "prompt" key are walked with
        separate_qa_helper, starting at tree["prompt"].
    :param data: Accumulator passed to separate_qa_helper and filled in place.
    :return: Tuple (data, message_list) where message_list holds, in order, the entries
        of trees that have no "prompt" key.
    """
    without_prompt = []
    for position, tree in enumerate(trees):
        if "prompt" not in tree.keys():
            without_prompt.append(tree)
            continue
        # The enumerate position is used as the starting depth value.
        separate_qa_helper(tree["prompt"], position, data)
    return data, without_prompt


def group_qa_helper(node, depth, msg_pairs):
    """
    Walk a message tree depth-first and pair every prompter message with each of its direct replies.

    A node without a "text" key is skipped together with its whole subtree. A direct reply
    without a "text" key produces no pair (previously it raised KeyError).

    :param node: Mapping with "role" and "text" keys and, optionally, a "replies" list of child nodes.
    :param depth: Counter that is incremented and handed to each recursive call; it does not
        influence the result.
    :param msg_pairs: List extended in place with {"instruct": ..., "answer": ...} dicts. For a
        given node all of its pairs are appended before any reply subtree is visited.
    :return: None; results are accumulated in msg_pairs.
    """
    if "text" not in node:
        return

    # A missing or null "replies" entry is treated as "no replies".
    replies = node.get("replies") or []

    if node.get("role") == "prompter":
        instruct = str(node["text"])
        for reply in replies:
            # Mirror the guard used for the node itself: text-less replies cannot form a pair.
            if "text" in reply:
                msg_pairs.append({"instruct": instruct, "answer": str(reply["text"])})

    for reply in replies:
        group_qa_helper(reply, depth + 1, msg_pairs)


def store_qa_data_paired(trees, data: List):
    """
    Collect instruct/answer pairs from every tree that has a "prompt" root.

    :param trees: Iterable of mappings; those with a "prompt" key are walked with
        group_qa_helper, starting at tree["prompt"].
    :param data: List passed to group_qa_helper and extended in place.
    :return: Tuple (data, message_list) where message_list holds, in order, the entries
        of trees that have no "prompt" key.
    """
    without_prompt = []
    for position, tree in enumerate(trees):
        if "prompt" not in tree.keys():
            without_prompt.append(tree)
            continue
        # The enumerate position is used as the starting depth value.
        group_qa_helper(tree["prompt"], position, data)
    return data, without_prompt


def load_data(filepaths: List[str], paired=False):
    """
    Load message trees from JSONL files and flatten them into a DataFrame of sentences.

    :param filepaths: Paths handed to load_jsonl.
    :param paired: If truthy, each sentence is "<instruct> <answer>" for one prompter/reply
        pair. Otherwise the sentences are all prompter texts followed by all assistant texts.
    :return: Tuple (frame, message_list). frame has columns "id" (running index) and "query"
        (the sentence); message_list holds the loaded records that had no "prompt" key.
    """
    trees = load_jsonl(filepaths)

    if paired:
        pairs, message_list = store_qa_data_paired(trees, [])
        sents = [f"{pair['instruct']} {pair['answer']}" for pair in pairs]
    else:
        by_role, message_list = store_qa_data_separate(trees, defaultdict(list))
        sents = by_role["user_messages"] + by_role["assistant_messages"]

    rows = list(enumerate(sents))
    frame = pd.DataFrame(rows, columns=["id", "query"])
    return frame, message_list

In [5]:
def embed_data(data, key='query', model_name='all-MiniLM-L6-v2', cores=1, gpu=False, batch_size=128):
    """
    Embed the text in one column of a DataFrame with a SentenceTransformer model.

    Each distinct sentence is encoded once and the result is mapped back onto every row,
    so duplicated rows do not cost extra encoding work.

    :param data: DataFrame (or mapping of Series) holding the text column.
    :param key: Name of the column to embed.
    :param model_name: Model identifier passed to SentenceTransformer.
    :param cores: 1 encodes in-process; any other value starts a multi-process pool and
        splits the unique sentences into roughly `cores` chunks.
    :param gpu: Only used when cores != 1. If True the pool is started with devices=None
        (library default device selection) instead of `cores` CPU workers.
    :param batch_size: Batch size passed to the encoder.
    :return: numpy array with one embedding per row of data[key], in row order.
    """
    print('Embedding data')
    model = SentenceTransformer(model_name)
    print('Model loaded')

    sentences = data[key].tolist()
    # Plain list of strings: the encoder API is documented for str / list[str] input.
    unique_sentences = data[key].unique().tolist()
    print('Unique sentences', len(unique_sentences))

    if cores == 1:
        embeddings = model.encode(unique_sentences, show_progress_bar=True, batch_size=batch_size)
    else:
        # devices=None lets the library choose the devices itself.
        devices = None if gpu else ['cpu'] * cores

        # Start the multi-process pool on multiple devices
        print('Multi-process pool starting')
        pool = model.start_multi_process_pool(devices)
        print('Multi-process pool started')

        try:
            # At least 1 so an empty input does not yield a zero chunk size.
            chunk_size = max(1, math.ceil(len(unique_sentences) / cores))

            # Compute the embeddings using the multi-process pool
            embeddings = model.encode_multi_process(
                unique_sentences, pool, batch_size=batch_size, chunk_size=chunk_size
            )
        finally:
            # Always shut the worker processes down, even if encoding failed.
            model.stop_multi_process_pool(pool)

    print("Embeddings computed")

    # Fan the unique embeddings back out to the original (possibly duplicated) rows.
    mapping = dict(zip(unique_sentences, embeddings))
    embeddings = np.array([mapping[sentence] for sentence in sentences])

    return embeddings

def gaussian_kernel_torch(embs_a, embs_b, sigma=1.0):
    """
    Computes the Gaussian kernel matrix between two sets of embeddings using PyTorch.

    Entry (i, j) is exp(-||a_i - b_j||^2 / (2 * sigma^2)), using the Euclidean distance
    returned by torch.cdist.

    :param embs_a: Tensor or array-like of shape (batch_size_a, embedding_dim) containing the first set of embeddings.
    :param embs_b: Tensor or array-like of shape (batch_size_b, embedding_dim) containing the second set of embeddings.
    :param sigma: Width of the Gaussian kernel.
    :return: Tensor of shape (batch_size_a, batch_size_b) containing the Gaussian kernel matrix.
        Its dtype is the promoted dtype of the two inputs, or the default float dtype when
        neither input is floating point.
    """
    embs_a = torch.as_tensor(embs_a)
    embs_b = torch.as_tensor(embs_b)

    # torch.cdist needs floating-point inputs of one dtype; promote so that integer or
    # mixed float32/float64 inputs work instead of raising.
    common_dtype = torch.promote_types(embs_a.dtype, embs_b.dtype)
    if not (common_dtype.is_floating_point or common_dtype.is_complex):
        common_dtype = torch.get_default_dtype()
    embs_a = embs_a.to(common_dtype)
    embs_b = embs_b.to(common_dtype)

    # Compute the pairwise distances between the embeddings
    dist_matrix = torch.cdist(embs_a, embs_b)

    # Compute the Gaussian kernel matrix
    kernel_matrix = torch.exp(-dist_matrix ** 2 / (2 * sigma ** 2))

    return kernel_matrix

def prune_ref_docs(qa_embs, ref_embs, ref_docs, threshold=0.1, sigma=0.5):
    """
    Drops weakly-related documents from the reference embeddings and the list of reference
    documents, and then recomputes the adjacency matrix for the documents that remain.

    A reference document's score is the sum of its Gaussian-kernel similarities to all QA
    embeddings. A document is kept when its score is strictly greater than
    threshold * (highest score).

    Parameters:
    qa_embs (numpy array): The embedding matrix of QA pairs.
    ref_embs (numpy array): The embedding matrix of reference sentences.
    ref_docs (list): The list of reference documents, aligned with the rows of ref_embs.
    threshold (float): Fraction of the best score a document must exceed to be kept.
    sigma (float): Width of the Gaussian kernel (defaults to the previously hard-coded 0.5).

    Returns:
    pruned_ref_embs (numpy array): The pruned embedding matrix of reference sentences.
    pruned_ref_docs (list): The pruned list of reference documents.
    pruned_A (torch tensor): The adjacency matrix between all QA embeddings (rows) and the
        kept reference documents (columns).
    """

    # Compute the initial adjacency matrix with full reference embeddings
    A = gaussian_kernel_torch(qa_embs, ref_embs, sigma=sigma)
    print(f'Before: {A.shape}')

    # dim=0 sums over the QA rows, giving one score per reference document (column)
    ref_scores = torch.sum(A, dim=0)

    # Identify the indexes of the relevant documents, as plain Python ints
    keep = torch.where(ref_scores > threshold * ref_scores.max())[0].tolist()

    # Keep only the rows of the reference embeddings that belong to relevant documents
    pruned_ref_embs = ref_embs[keep]

    # Update the list of reference documents
    pruned_ref_docs = [ref_docs[i] for i in keep]

    # Recompute the adjacency matrix with pruned reference embeddings
    pruned_A = gaussian_kernel_torch(qa_embs, pruned_ref_embs, sigma=sigma)
    print(f'After: {pruned_A.shape}')
    return pruned_ref_embs, pruned_ref_docs, pruned_A

def prune_ref_and_qa(qa_embs, ref_embs, ref_docs, qa_docs, threshold=0.7, qthreshold=0.7):
    """
    Prunes the reference documents with prune_ref_docs and then drops the QA entries that
    are only weakly related to the reference documents that were kept.

    A QA entry's score is the sum of its kernel similarities to the kept reference documents.
    An entry is kept when its score is strictly greater than qthreshold * (highest score).

    Parameters:
    qa_embs (numpy array): The embedding matrix of QA pairs.
    ref_embs (numpy array): The embedding matrix of reference sentences.
    ref_docs (list): The list of reference documents, aligned with the rows of ref_embs.
    qa_docs (list): The list of QA documents, aligned with the rows of qa_embs.
    threshold (float): Fraction of the best reference score a reference document must exceed.
    qthreshold (float): Fraction of the best QA score a QA entry must exceed.

    Returns:
    pruned_ref_embs (numpy array): The pruned embedding matrix of reference sentences.
    pruned_ref_docs (list): The pruned list of reference documents.
    pruned_qa_embs (numpy array): The pruned embedding matrix of QA pairs.
    pruned_qa_docs (list): The pruned list of QA documents.
    pruned_A (torch tensor): The adjacency matrix returned by prune_ref_docs. It is computed
        from the unpruned qa_embs, so its rows still cover every QA entry and do NOT line up
        with pruned_qa_embs / pruned_qa_docs.
    """

    # Reference-side pruning is shared with prune_ref_docs (it also prints the
    # 'Before:' / 'After:' shapes).
    pruned_ref_embs, pruned_ref_docs, pruned_A = prune_ref_docs(
        qa_embs, ref_embs, ref_docs, threshold=threshold
    )

    # dim=1 sums over the kept reference columns, giving one score per QA entry (row)
    qa_scores = torch.sum(pruned_A, dim=1)

    # Identify the indexes of the relevant QA pairs, as plain Python ints
    keep = torch.where(qa_scores > qthreshold * qa_scores.max())[0].tolist()

    # Keep only the rows of the QA embeddings that belong to relevant entries
    pruned_qa_embs = qa_embs[keep]

    # Update the list of QA documents
    pruned_qa_docs = [qa_docs[i] for i in keep]
    print(len(pruned_qa_docs))

    return pruned_ref_embs, pruned_ref_docs, pruned_qa_embs, pruned_qa_docs, pruned_A

In [16]:
# JSONL export(s) to load. Only the first file is active; the other export paths are
# left commented out at the end of the line.
filepaths = ["/content/drive/MyDrive/oasst_user_data/2023-02-07_oasst_prod.jsonl"]#, "/content/drive/MyDrive/oasst_user_data/2023-02-08_oasst_spam.jsonl", "/content/drive/MyDrive/oasst_user_data/2023-02-09_oasst_prod.jsonl","/content/drive/MyDrive/oasst_user_data/2023-02-10_oasst_prod.jsonl"]

In [17]:
# paired=True: every row's `query` is one prompter message joined with one of its direct replies.
# `message_list` receives the loaded records that had no "prompt" key.
data, message_list = load_data(filepaths, True)
sents = data['query'].to_list()
data

Unnamed: 0,id,query
0,0,Why is the sky blue? The sky appears blue to h...
1,1,Can you explain it simpler so i can follow alo...
2,2,Explain it to me in a single sentence in simpl...
3,3,Explain it to me in a single sentence in simpl...
4,4,Ohh ok. But why does it seem to be orange or r...
...,...,...
7458,7458,"Write a haiku about a pirate ship. Rigid sail,..."
7459,7459,Write a haiku about a pirate ship. Sails billo...
7460,7460,Why do Haikus have specific number of syllable...
7461,7461,Thank you. Could you write a haiku about love ...


In [34]:
# NOTE: this rebinds `data` (a DataFrame in the load_data cell above) to a plain list of
# JSON records, so cells that expect the DataFrame depend on execution order.
data = load_jsonl(["/content/drive/MyDrive/mm_cot_rag_data_synth/gt4sd_2.jsonl"])
# Print the 'example' field of every record for a quick visual check.
for d in data:
    print(d['example'])


Use the GT4SD/molecular_properties Hugging Face API to predict molecular properties from SMILES notation.

Input: SMILES notation of molecule: CCOc1ccc(O)cc1

Call API 1: <work>api_call(target='gt4sd', input='molecular_properties', parameters={'smiles_notation': 'CCOc1ccc(O)cc1'})</work>
Display Output 1: [The retrieved output from API 1 shows the predicted molecular properties of the molecule specified by the input SMILES notation, such as its solubility, boiling point, and melting point.]
Use the GT4SD/molecular_properties Hugging Face API to predict molecular properties from SMILES notation.

    Input: SMILES notation of molecule: C1CCCCC1

    Call API 1: api_call(target='gt4sd', input='molecular_properties', parameters={'smiles_notation': 'C1CCCCC1'})
    Display Output 1: The retrieved output from API 1 shows the predicted molecular properties of the molecule specified by the input SMILES notation, such as its solubility, boiling point, and melting point.
Use the GT4SD/molecular

In [36]:
# Rebuild `sents` and the `data` DataFrame (columns: id, query) from the 'example' field
# of the records currently bound to `data`.
sents_idx = [(i, sent['example'] ) for i, sent in enumerate(data)]
sents = [sent for (i, sent) in sents_idx]
data = pd.DataFrame(sents_idx, columns=['id', 'query'])

In [43]:
from datasets import load_dataset

# Load the BeIR/scidocs corpus and merge each document's title and text into one string.
refs = load_dataset("BeIR/scidocs", "corpus")
refs = pd.DataFrame(refs['corpus'], columns=['_id', 'title', 'text'])
refs['merged_text'] = refs['title'] + ' ' + refs['text']

# Drop the original title and text columns
refs = refs.drop(['title', 'text'], axis=1)

# Reorder the columns if necessary
refs = refs[['_id', 'merged_text']]

# Extra hand-written reference document describing GT4SD.
new_doc = {"_id": 23, 'merged_text': """The GT4SD (Generative Toolkit for Scientific Discovery) is an open-source platform to accelerate hypothesis generation in the scientific discovery process. It provides a library for making state-of-the-art generative AI models easier to use.

For full details on the library API and examples see the docs. Almost all pretrained models are also available via gradio-powered web apps on Hugging Face Spaces.

"""}
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame is the
# supported way to add the record.
# NOTE: re-running this cell on its own appends the document again, because `refs` is rebound.
refs = pd.concat([refs, pd.DataFrame([new_doc])], ignore_index=True)
ref_docs = refs['merged_text'].to_list()



  0%|          | 0/1 [00:00<?, ?it/s]

In [44]:
# Embed the QA sentences and the reference documents with the default model
# (single process, default batch size), then print both shapes as a sanity check.
qa_embs = embed_data(data, key="query")
ref_embs = embed_data(refs, key='merged_text')
print(qa_embs.shape)
print(ref_embs.shape)

Embedding data
Model loaded
Unique sentences 25


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings computed
Embedding data
Model loaded
Unique sentences 25657


Batches:   0%|          | 0/201 [00:00<?, ?it/s]

Embeddings computed
(30, 384)
(25658, 384)


In [45]:
# Keep only the reference docs whose summed kernel similarity over all QA rows exceeds
# 0.8 x the best document's sum.
pruned_ref_embs, pruned_ref_docs, pruned_A = prune_ref_docs(qa_embs, ref_embs, ref_docs, threshold=0.8)

Before: torch.Size([30, 25658])
After: torch.Size([30, 4])


In [46]:
# Display the reference documents that survived pruning.
pruned_ref_docs

['Discovering Molecular Functional Groups Using Graph Convolutional Neural Networks Functional groups (FGs) serve as a foundation for analyzing chemical properties of organic molecules. Automatic discovery of FGs will impact various fields of research, including medicinal chemistry, by reducing the amount of lab experiments required for discovery or synthesis of new molecules. Here, we investigate methods based on graph convolutional neural networks (GCNNs) for localizing FGs that contribute to specific chemical properties. Molecules are modeled as undirected graphs with atoms as nodes and bonds as edges. Using this graph structure, we trained GCNNs in a supervised way on experimentally-validated molecular training sets to predict specific chemical properties, e.g., toxicity. Upon learning a GCNN, we analyzed its activation patterns to automatically identify FGs using four different methods: gradient-based saliency maps, Class Activation Mapping (CAM), gradient-weighted CAM (Grad-CAM),

In [22]:
# Prune reference docs (threshold=0.9) and then QA entries (qthreshold=0.3).
# NOTE(review): the recorded output below (7463 x 25657) does not match the shapes printed
# by the embedding cell above (30 and 25658 rows), and this cell's execution count (22) is
# lower than the preceding cells'. It looks like it was last run against a different
# `qa_embs` / `sents` state; confirm the intended inputs before relying on it.
pruned_ref_embs, pruned_ref_docs, pruned_qa_embs, pruned_qa_docs, pruned_A = prune_ref_and_qa(qa_embs, ref_embs, ref_docs, sents, threshold=0.9, qthreshold=0.3)

Before: torch.Size([7463, 25657])
After: torch.Size([7463, 184])
1928


In [23]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_best_ref(qa_docs, ref_docs, qa_embs, ref_embs):
    """
    For every QA embedding, pick the reference document with the highest cosine similarity.

    :param qa_docs: Not used by this function; kept in the signature for callers.
    :param ref_docs: Sequence of reference documents, aligned with the rows of ref_embs.
    :param qa_embs: Iterable of QA embedding vectors (each must support .reshape).
    :param ref_embs: 2-D array of reference embeddings.
    :return: List with one entry of ref_docs per QA embedding, in the order of qa_embs.
    """
    def _closest_doc(qa_emb):
        # cosine_similarity expects 2-D input, so the single vector becomes a 1-row matrix.
        scores = cosine_similarity(qa_emb.reshape(1, -1), ref_embs)
        return ref_docs[np.argmax(scores)]

    return [_closest_doc(qa_emb) for qa_emb in qa_embs]

In [None]:
# For every QA sentence, look up its most similar document among the pruned references and
# print the two side by side. This uses the unpruned `qa_embs` / `sents`, not the pruned QA set.
best_refs = find_best_ref(sents, pruned_ref_docs, qa_embs, pruned_ref_embs)
for br, qa in zip(best_refs, sents):
    print('\n\n\n\n\n')
    print(f'QA: {qa}')
    print()
    print(f'Most similar ref_doc {br}')
    print('\n\n\n\n\n')