In [None]:
# installs
#pip install langchain tiktoken openai langchainhub chromadb 

In [1]:
import requests
import json
from bs4 import BeautifulSoup
import os

In [2]:
## Search settings: change the query and the date range as needed
query    = "single cell RNA sequencing"
print(type(query))

start_date = "2010-01-01" ## "YYYY-MM-DD"
end_date   = "2023-12-30"

# bioRxiv "details" endpoint for the chosen date interval
url = f"https://api.biorxiv.org/details/biorxiv/{start_date}/{end_date}"

# Query-string parameters sent with the request.
# NOTE(review): confirm against the bioRxiv API docs that this endpoint honors
# 'q' and 'num_results'; if it does not, the results are not filtered by `query`.
params = {
    'q': query,  # your search query
    'num_results': 100  # number of results to return
}

# Send the request; the timeout keeps the cell from hanging forever
# if the server never answers.
response = requests.get(url, params=params, timeout=30)




<class 'str'>


In [None]:
# Download the full text of every published article returned by the API call
# and save each one as articles/<doi with '/' replaced by '_'>.txt
if response.status_code == 200:
    number_of_articles = 100  # keep at most this many articles from the response
    data = response.json()

    # Extract article information (adjust the key if the response structure differs)
    articles = data['collection'][:number_of_articles]

    # Create the output directory; exist_ok avoids the check-then-create race
    os.makedirs('articles', exist_ok=True)

    for article in articles:
        # Skip entries whose 'published' field is the placeholder string "NA"
        if article['published'] == "NA":
            continue

        print(f"Title: {article['title']}")

        doi = article['doi']
        # Replace / with _ in DOI to make it a valid filename
        filename = doi.replace('/', '_') + '.txt'
        filepath = os.path.join('articles', filename)

        full_text_url = f"https://www.biorxiv.org/content/{doi}.full"

        try:
            # Use a dedicated name here: reusing `response` would overwrite the
            # API response and break this cell when it is run a second time.
            article_response = requests.get(full_text_url, timeout=30)
            if article_response.status_code == 200:
                soup = BeautifulSoup(article_response.content, 'html.parser')
                article_text = soup.get_text(separator=' ', strip=True)

                # Write the extracted text to a file inside the articles folder
                with open(filepath, 'w', encoding='utf-8') as file:
                    file.write(article_text)
                print(f"Text written to {filepath}")
            else:
                print(f"Skipped {doi}: HTTP {article_response.status_code}")

        except Exception as e:
            # Best effort: report the problem and carry on with the next article
            print(f"Problem reading the article: {e}")
else:
    print(f"bioRxiv API request failed: HTTP {response.status_code}")

In [3]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

In [4]:
# Read every saved article: each .txt file matched by the glob inside
# ./articles/ is opened with TextLoader and becomes one document.
loader = DirectoryLoader(
    './articles/',
    glob="./*.txt",
    loader_cls=TextLoader,
)

documents = loader.load()

In [5]:
# Cut the loaded articles into smaller chunks (chunk_size=1000, no overlap)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(documents)

In [6]:
# Number of chunks the splitter produced from the loaded articles
len(texts)

2833

In [7]:
# Look at a single chunk (index 50) to see what one small piece of an article looks like
texts[50]

Document(page_content='their uniqueness in the immunological groups. Mice infection, histology and cytokine detection To assess the susceptibility of C57BL/6 mice to gastrointestinal colonization and/or infection with different S. cerevisiae isolates, C57BL/6 female mice (6-8 weeks old) were maintained under specific pathogen-free conditions at the Animal Facility of University of Perugia (Perugia, Italy). Homozygous Il17a-/-, Ifng-/- and Il10-/- mice, on a C57BL/6 background, were bred under specific pathogen-free conditions ( Fig. 3D and E and Supplemental Fig. 10). Experiments were performed according to the Italian Approved Animal Welfare Assurance A 245/2011-B. For gastrointestinal infection, 10 8 yeast cells were injected intragastrically. Mice were monitored for fungal growth at 3 and 7 days after intragastric inoculation in different organs, such as the esophagus, stomach, ileum, and colon, and for dissemination, liver and kidneys. Colony forming units (log10 CFU) were assessed

In [None]:
import os
from dotenv import load_dotenv
import chromadb

# Pull variables defined in a local .env file into the environment
load_dotenv()

# The OpenAI key is read from the 'openai_api_key' environment variable
OPENAI_API_KEY = os.getenv('openai_api_key')

# OpenAI embeddings are used here; a SentenceTransformer embedding function
# (for example model_name="all-MiniLM-L6-v2") is an open-source alternative.
embedding_function = OpenAIEmbeddings(
    model='text-embedding-ada-002',
    openai_api_key=OPENAI_API_KEY,
)

# Embed the chunks and load them into Chroma
db = Chroma.from_documents(texts, embedding_function)


In [9]:
# Retrieval helper built on the Chroma store `db`
def retriever(query, num_chunks):
    """Return the `num_chunks` stored chunks most similar to `query`.

    Args:
        query: Text to search the vector store with.
        num_chunks: How many chunks to ask the store for (passed as "k").

    Returns:
        The documents returned by the store's retriever for `query`.
    """
    search_options = {"k": num_chunks}
    doc_retriever = db.as_retriever(search_kwargs=search_options)
    return doc_retriever.get_relevant_documents(query)

In [10]:
def print_sources(documents):
    """Print the DOI of each distinct article the given chunks came from.

    The DOI is recovered from the file name stored in each document's
    ``metadata['source']``: the directory part and the final extension are
    dropped and every '_' is turned back into '/', mirroring how the
    download step named the files.

    Args:
        documents: Iterable of objects with a ``metadata`` mapping that holds
            a 'source' file path. Entries without a source are skipped.

    Returns:
        None. One line per distinct DOI is printed, in first-seen order.
    """
    seen_dois = set()  # DOIs that have already been printed

    for doc in documents:
        metadata = getattr(doc, 'metadata', None) or {}
        source = metadata.get('source')
        if not source:
            continue

        # Take the last path component so that 'articles/x.txt',
        # './articles/x.txt', absolute paths and backslash-separated paths
        # all resolve to the same file name.
        filename = source.replace('\\', '/').rsplit('/', 1)[-1]

        # Drop only the final extension; the DOI itself contains dots
        stem = filename.rsplit('.', 1)[0] if '.' in filename else filename

        # Replace underscore with slash to get the DOI back
        doi = stem.replace('_', '/')

        if doi not in seen_dois:
            print( f'Relevant Docs: Doi: {doi} ')
            seen_dois.add(doi)

In [11]:
# NOTE: this import rebinds the name `OpenAI`, which an earlier cell imported
# from langchain.llms; from here on `OpenAI` is the openai package's client class.
from openai import OpenAI

# The API key is read from the 'openai_api_key' environment variable
OPENAI_API_KEY = os.getenv('openai_api_key')

# Client object used for the chat-completion requests
client = OpenAI(api_key=OPENAI_API_KEY)

In [12]:
def answer_question(content, question):
    """Ask the chat model to answer `question` from the supplied `content`.

    Args:
        content: Retrieved text, sent as the first user message.
        question: The question, sent as the second user message.

    Returns:
        The message text of the first completion choice, or None when the
        API call (or reading its result) raised; the error is printed.
    """
    # The system prompt is spelled out line by line so that its whitespace is
    # explicit: each line is indented by 17 spaces and the first three
    # sentences end with a trailing space.
    pad = " " * 17
    system_prompt = (
        "\n"
        + pad + "You are a helpful research assistant. \n"
        + pad + "You will be given a Content and a Question. Answer the Question by rephrasing and smoothening the Content. \n"
        + pad + "Do not add any information but what is in the documents. \n"
        + pad + "If your output do not meaningfully answer the question, say irrelevant documents.\n"
        + pad
    )

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": content},
        {"role": "user", "content": question},
    ]

    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo-1106", #"gpt-4-1106-preview"
            messages=conversation,
            temperature=1  # creativity
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(f"An error occurred: {e}")
        return None




In [13]:
def get_content(query, num_chunks=5):
    """Build the "Content: ..." context string for a query.

    Retrieves the chunks most similar to `query` and joins their text with a
    blank line between chunks, so the last word of one chunk does not run
    into the first word of the next.

    Args:
        query: Text used to search the vector store.
        num_chunks: Number of chunks to retrieve (default 5).

    Returns:
        A string that starts with "Content: " followed by the chunk texts.
        When nothing is retrieved the result is just "Content: ".
    """
    docs = retriever(query, num_chunks=num_chunks)
    # Ignore results that carry no text
    chunks = [doc.page_content for doc in docs if getattr(doc, 'page_content', None)]
    return "Content: " + "\n\n".join(chunks)

In [14]:
# Asking a new question
subject =  "enzymatic pre-amplification"
question = f"Question: what do you know about {subject}?"
# Retrieve context for this question; `query` still holds the bioRxiv search
# string from the download step and must not be used here.
content = get_content(question)
answer_question(content, question)

'The process of enzymatic pre-amplification in single-cell RNA-Seq lowers the data quality and results in unavoidable loss of sequence information due to significant truncation of the 5’ region of the transcript. Additionally, the small amount of mRNA present in a single cell makes the number of obtainable reads per cell much smaller than that obtainable from bulk samples, making rare transcripts harder to detect. These challenges present hurdles in single-cell RNA-Seq, impacting the overall effectiveness of the sequencing platforms.'

In [15]:
# Asking a new question
subject =  "airplane"
question = f"Question: what do you know about {subject}?"
# Retrieve context for this question; `query` still holds the bioRxiv search
# string from the download step and must not be used here.
content = get_content(question)
answer_question(content, question)

'Irrelevant documents.'