In [4]:
from sentence_transformers import SentenceTransformer

import math
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse import csr_matrix
import json
from tqdm import tqdm
import torch
from torch import Tensor
from collections import defaultdict
from typing import List
from ..src import message_tree_loading, similarity_functions


In [16]:
# Input files for this run; the other exports are kept here but disabled.
filepaths = [
    "/content/drive/MyDrive/oasst_user_data/2023-02-07_oasst_prod.jsonl",
    # "/content/drive/MyDrive/oasst_user_data/2023-02-08_oasst_spam.jsonl",
    # "/content/drive/MyDrive/oasst_user_data/2023-02-09_oasst_prod.jsonl",
    # "/content/drive/MyDrive/oasst_user_data/2023-02-10_oasst_prod.jsonl",
]

In [17]:
# Load everything listed in `filepaths`; the helper returns a (table, message list) pair.
data, message_list = message_tree_loading.load_data(filepaths, True)

# Plain Python list of the 'query' column, used later as the sentence list.
sents = data["query"].tolist()

# Leave the table as the last expression so the notebook renders it.
data

Unnamed: 0,id,query
0,0,Why is the sky blue? The sky appears blue to h...
1,1,Can you explain it simpler so i can follow alo...
2,2,Explain it to me in a single sentence in simpl...
3,3,Explain it to me in a single sentence in simpl...
4,4,Ohh ok. But why does it seem to be orange or r...
...,...,...
7458,7458,"Write a haiku about a pirate ship. Rigid sail,..."
7459,7459,Write a haiku about a pirate ship. Sails billo...
7460,7460,Why do Haikus have specific number of syllable...
7461,7461,Thank you. Could you write a haiku about love ...


In [34]:
# Rebind `data` to the records read from the GT4SD jsonl file, then print
# the 'example' field of each record for a quick visual check.
data = message_tree_loading.load_jsonl(
    ["/content/drive/MyDrive/mm_cot_rag_data_synth/gt4sd_2.jsonl"]
)
for record in data:
    print(record['example'])


Use the GT4SD/molecular_properties Hugging Face API to predict molecular properties from SMILES notation.

Input: SMILES notation of molecule: CCOc1ccc(O)cc1

Call API 1: <work>api_call(target='gt4sd', input='molecular_properties', parameters={'smiles_notation': 'CCOc1ccc(O)cc1'})</work>
Display Output 1: [The retrieved output from API 1 shows the predicted molecular properties of the molecule specified by the input SMILES notation, such as its solubility, boiling point, and melting point.]
Use the GT4SD/molecular_properties Hugging Face API to predict molecular properties from SMILES notation.

    Input: SMILES notation of molecule: C1CCCCC1

    Call API 1: api_call(target='gt4sd', input='molecular_properties', parameters={'smiles_notation': 'C1CCCCC1'})
    Display Output 1: The retrieved output from API 1 shows the predicted molecular properties of the molecule specified by the input SMILES notation, such as its solubility, boiling point, and melting point.
Use the GT4SD/molecular

In [36]:
# Pull the 'example' text out of every record, pair each text with its
# position, and wrap the pairs in a two-column (id, query) DataFrame.
# NOTE: `data` is rebound from the record list to the DataFrame here, so this
# cell cannot be re-run without first re-running the cell that loads the records.
sents = [record['example'] for record in data]
sents_idx = list(enumerate(sents))
data = pd.DataFrame(sents_idx, columns=['id', 'query'])

In [43]:
from datasets import load_dataset

# Load the BeIR/scidocs "corpus" configuration and turn its 'corpus' split
# into a DataFrame. The raw dataset object gets its own name so that `refs`
# only ever refers to the DataFrame of reference documents.
scidocs_corpus = load_dataset("BeIR/scidocs", "corpus")
refs = pd.DataFrame(scidocs_corpus['corpus'], columns=['_id', 'title', 'text'])

# Title and body are embedded together as a single text field.
refs['merged_text'] = refs['title'] + ' ' + refs['text']

# Keep only the id and the merged text (this drops 'title' and 'text').
refs = refs[['_id', 'merged_text']]

# Hand-written reference document describing GT4SD, added to the corpus.
# NOTE(review): `_id` is an int here — confirm it matches the id type used by
# the corpus rows and does not collide with an existing id.
new_doc = {"_id": 23, 'merged_text': """The GT4SD (Generative Toolkit for Scientific Discovery) is an open-source platform to accelerate hypothesis generation in the scientific discovery process. It provides a library for making state-of-the-art generative AI models easier to use.

For full details on the library API and examples see the docs. Almost all pretrained models are also available via gradio-powered web apps on Hugging Face Spaces.

"""}

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add the extra row while renumbering the index.
refs = pd.concat([refs, pd.DataFrame([new_doc])], ignore_index=True)
ref_docs = refs['merged_text'].to_list()



  0%|          | 0/1 [00:00<?, ?it/s]

In [44]:
# Embed the query texts and the reference texts with the shared helper,
# then report both embedding shapes (one per line).
qa_embs = similarity_functions.embed_data(data, key="query")
ref_embs = similarity_functions.embed_data(refs, key='merged_text')
print(qa_embs.shape, ref_embs.shape, sep="\n")

Embedding data
Model loaded
Unique sentences 25


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings computed
Embedding data
Model loaded
Unique sentences 25657


Batches:   0%|          | 0/201 [00:00<?, ?it/s]

Embeddings computed
(30, 384)
(25658, 384)


In [45]:
# Prune the reference set against the query embeddings at threshold 0.8.
# The helper returns the kept embeddings, the kept documents, and a third
# value bound to `pruned_A`.
pruned_ref_embs, pruned_ref_docs, pruned_A = similarity_functions.prune_ref_docs(
    qa_embs,
    ref_embs,
    ref_docs,
    threshold=0.8,
)

Before: torch.Size([30, 25658])
After: torch.Size([30, 4])


In [46]:
# Display the reference documents returned by the pruning call.
pruned_ref_docs

['Discovering Molecular Functional Groups Using Graph Convolutional Neural Networks Functional groups (FGs) serve as a foundation for analyzing chemical properties of organic molecules. Automatic discovery of FGs will impact various fields of research, including medicinal chemistry, by reducing the amount of lab experiments required for discovery or synthesis of new molecules. Here, we investigate methods based on graph convolutional neural networks (GCNNs) for localizing FGs that contribute to specific chemical properties. Molecules are modeled as undirected graphs with atoms as nodes and bonds as edges. Using this graph structure, we trained GCNNs in a supervised way on experimentally-validated molecular training sets to predict specific chemical properties, e.g., toxicity. Upon learning a GCNN, we analyzed its activation patterns to automatically identify FGs using four different methods: gradient-based saliency maps, Class Activation Mapping (CAM), gradient-weighted CAM (Grad-CAM),

In [22]:
# Prune references (threshold 0.9) and queries (qthreshold 0.3) in one call.
# This rebinds pruned_ref_embs / pruned_ref_docs / pruned_A if they were
# already set by an earlier pruning cell.
(
    pruned_ref_embs,
    pruned_ref_docs,
    pruned_qa_embs,
    pruned_qa_docs,
    pruned_A,
) = similarity_functions.prune_ref_and_qa(
    qa_embs,
    ref_embs,
    ref_docs,
    sents,
    threshold=0.9,
    qthreshold=0.3,
)

Before: torch.Size([7463, 25657])
After: torch.Size([7463, 184])
1928


In [23]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_best_ref(qa_docs, ref_docs, qa_embs, ref_embs):
    """Return, for every query embedding, the most cosine-similar reference doc.

    Args:
        qa_docs: Unused; kept so existing call sites keep working.
        ref_docs: Sequence of reference documents, index-aligned with the
            rows of ``ref_embs``.
        qa_embs: 2-D array-like of query embeddings, one row per query.
        ref_embs: 2-D array-like of reference embeddings, one row per document.

    Returns:
        A list with one entry per row of ``qa_embs``: the element of
        ``ref_docs`` whose embedding has the highest cosine similarity to that
        query. Ties resolve to the lowest reference index. An empty
        ``qa_embs`` yields an empty list.

    Raises:
        ValueError: if there are queries but no reference embeddings, or if
            the inputs are not 2-D with matching embedding width.
    """
    # No queries means nothing to match, regardless of the references.
    if len(qa_embs) == 0:
        return []

    qa_mat = np.asarray(qa_embs, dtype=float)
    ref_mat = np.asarray(ref_embs, dtype=float)
    if ref_mat.ndim != 2 or ref_mat.shape[0] == 0:
        raise ValueError("ref_embs must be a non-empty 2-D array of embeddings")
    if qa_mat.ndim != 2 or qa_mat.shape[1] != ref_mat.shape[1]:
        raise ValueError("qa_embs must be 2-D with the same width as ref_embs")

    # Normalise the references once instead of once per query. Zero vectors
    # keep a divisor of 1 so they stay all-zero instead of becoming NaN.
    ref_norms = np.linalg.norm(ref_mat, axis=1, keepdims=True)
    ref_norms[ref_norms == 0] = 1.0
    ref_unit = ref_mat / ref_norms

    # Scaling a query by its (positive) norm does not change which reference
    # scores highest, so the queries are left un-normalised. Scoring in
    # batches bounds the size of the temporary similarity matrix.
    batch_size = 4096
    best_refs = []
    for start in range(0, qa_mat.shape[0], batch_size):
        scores = qa_mat[start:start + batch_size] @ ref_unit.T
        best_refs.extend(ref_docs[idx] for idx in scores.argmax(axis=1))
    return best_refs

In [None]:
# Match every sentence to its closest pruned reference document, then print
# each (query, reference) pair surrounded by blank-line padding.
best_refs = find_best_ref(sents, pruned_ref_docs, qa_embs, pruned_ref_embs)
padding = '\n\n\n\n\n'
for best_doc, query in zip(best_refs, sents):
    print(
        padding,
        f'QA: {query}',
        '',
        f'Most similar ref_doc {best_doc}',
        padding,
        sep='\n',
    )