## Get the data

In [1]:
import getpass
import os
import sys
# Use the OpenAI API key already present in the environment; prompt for it
# (without echoing) only when it is missing. This keeps the secret out of the
# notebook source and avoids overwriting an exported key with an empty string.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from langchain.document_loaders import DataFrameLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

from langchain.chains.chat_vector_db.prompts import CONDENSE_QUESTION_PROMPT
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders import DataFrameLoader
import json
from langchain.vectorstores import Pinecone
# LLM wrapper
from langchain.chat_models import ChatOpenAI
from langchain import OpenAI

from langchain import SerpAPIWrapper, LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationSummaryBufferMemory
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
# Helper function for printing docs
import textwrap

def pretty_text(text, width=100):
    """Print ``text`` wrapped to at most ``width`` columns, keeping line breaks.

    Each line of ``text`` is wrapped on its own, so paragraph breaks and
    numbered lists survive. Passing the whole string to a single
    ``textwrap.wrap`` call would turn every newline into a space and merge
    all paragraphs into one block.

    Args:
        text: The string to print.
        width: Maximum length of each printed line. Defaults to 100.
    """
    for paragraph in text.splitlines():
        # textwrap.wrap returns [] for a blank line; fall back to one empty
        # string so the blank line is still printed.
        for line in textwrap.wrap(paragraph, width=width) or [""]:
            print(line)


def pretty_print_docs(docs):
    """Print every document's ``page_content`` under a numbered heading.

    Documents are numbered from 1 and separated by a rule of 100 dashes.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))


### Load the index

In [3]:
def load_embedding_db(index_name):
    """Load a FAISS vector store that was saved locally under ``index_name``.

    A new ``OpenAIEmbeddings`` instance is handed to ``FAISS.load_local``
    together with ``index_name``.

    Args:
        index_name: Location passed straight through to ``FAISS.load_local``.

    Returns:
        Whatever ``FAISS.load_local`` returns for that location.
    """
    # Function-scoped import, kept from the original cell.
    from langchain.vectorstores import FAISS

    return FAISS.load_local(index_name, OpenAIEmbeddings())

# Load the saved FAISS index and expose it as a retriever that requests the
# 100 nearest chunks per query.
db = load_embedding_db("faiss_index_1000_200_1000papers")

# NOTE(review): "include_metadata" is forwarded as a search kwarg; confirm the
# FAISS store actually uses it.
retriever = db.as_retriever(
    search_kwargs={
        "k": 100,
        "include_metadata": True,
    },
)
retriever


VectorStoreRetriever(vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x160a5f390>, search_type='similarity', search_kwargs={'k': 100, 'include_metadata': True})

In [4]:

from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor, LLMChainFilter
from langchain.retrievers.document_compressors import EmbeddingsFilter

# Embedding model used by the relevance filter below.
embeddings = OpenAIEmbeddings()

# Optional pipeline stages that are currently disabled:
# splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100, separator=". ")
# redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)

# The only active stage: an embeddings-based filter with a 0.76 similarity threshold.
relevant_filter = EmbeddingsFilter(
    embeddings=embeddings,
    similarity_threshold=0.76,
)
pipeline_compressor = DocumentCompressorPipeline(transformers=[relevant_filter])

# Wrap the base retriever so its results pass through the pipeline above.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=pipeline_compressor,
)

from langchain.prompts import PromptTemplate

# Answer prompt for the "Dr. Origins" persona. The template has two
# placeholders: {context} and {question}. The text is sent to the model
# verbatim, so whitespace and wording inside the literal are part of the
# experiment and must not be reformatted.
prompt_template = """
You are Dr. Origins, a specialist in Galactic Astronomy. Your expertise lies in reading and critically interpreting astronomy papers to generate innovative, research-based ideas. 
Every idea should commence with "I propose...".

Guidelines:
1. Base your ideas on scientifically recognized theories and principles.
2. Your ideas should be feasibly verifiable and provide avenues for further exploration or research in Galactic Astronomy.
3. Abstain from making overly speculative claims or assertions that cannot be empirically tested.
4. Always accurately reference established theories, observational data, or universally accepted astronomical concepts. Do not misrepresent or fabricate scientific references. If you are unsure about a reference, do not use it.
5. Clearly distinguish your ideas from referenced material. Explain how the referenced research inspired your idea.
6. Learn from feedback. Improve and adjust your proposal according to received input.
7. Use less than 250 words.

In response to a human query, generate an informed, precise, and critical response, ensuring your answer's clarity and originality. 

Context: {context}
Human: {question}
Dr. Origins: """


# Prompt object built from the template above; it is later passed to
# load_qa_chain as the `prompt` argument.
DRC_PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Per-document wrapper: each document is rendered with its citation followed
# by its page content, framed by start/end markers.
doc_template = (
    "--- document start ---\n"
    "citation: {citation}\n"
    "content:{page_content}\n"
    "--- document end ---\n"
)

ASTRO_DOC_PROMPT = PromptTemplate(
    input_variables=["page_content", "citation"],
    template=doc_template,
)

from langchain.chains import TransformChain, LLMChain, SimpleSequentialChain

model_name = "gpt-4"
TEMP = 0.7

# Two separate chat-model instances: a low-temperature one for condensing the
# follow-up question, and one at TEMP for writing the answer.
llm_qg = ChatOpenAI(model_name=model_name, temperature=0.2)
llm = ChatOpenAI(model_name=model_name, temperature=TEMP)

# Question generator uses its own model instance (llm_qg).
question_generator = LLMChain(prompt=CONDENSE_QUESTION_PROMPT, llm=llm_qg)

# "stuff" chain: documents are formatted with ASTRO_DOC_PROMPT and answered
# with DRC_PROMPT.
doc_chain = load_qa_chain(
    llm,
    chain_type="stuff",
    prompt=DRC_PROMPT,
    document_prompt=ASTRO_DOC_PROMPT,
)

# Conversation memory stored under "chat_history"; the chain's "answer"
# output is what gets recorded.
memory = ConversationSummaryBufferMemory(
    llm=llm,
    memory_key="chat_history",
    output_key="answer",
    return_messages=True,
)

app_retriever = compression_retriever

chain = ConversationalRetrievalChain(
    retriever=app_retriever,
    question_generator=question_generator,
    combine_docs_chain=doc_chain,
    memory=memory,
    max_tokens_limit=7000,
    return_source_documents=True,
)

In [9]:
# Feedback-style query asking the chain to revise an earlier proposal.
# NOTE(review): the text contains `",      "question": "` in the middle of the
# string, which looks like a leftover from a pasted JSON record. It is sent to
# the model as-is; confirm whether it should be removed before re-running.
query = """
The proposal to investigate the vertical distribution of stars in the
Milky Way's disk using Gaia data and spectroscopic surveys has potential but needs to address some
limitations and weaknesses. These include: 1. Providing a clear methodology for data integration,
considering the complex and often incompatible selection functions of different surveys. 2. Defining
the sample selection criteria to ensure the reliability of the results. 3. Addressing the
uncertainties in determining individual stellar metal abundances and proper motions. 4.
Disentangling the contributions of in-situ star formation and external accretion events, considering
the complexity of the Galactic disk's structure and the interplay between internal and external
processes. 5. Providing a detailed description of how the results will be compared with simulations
for validating the findings and testing theories of Galactic disk formation.",      "question": "Can
you revise the proposal to address these limitations and provide a more detailed methodology,
including data integration, sample selection criteria, handling uncertainties, disentangling
contributions of different processes, and comparing results with simulations to ensure the validity
and reliability of the results?
"""
# Run the conversational chain; `result` is read by the saving cell below
# through its "answer" and "source_documents" entries.
result = chain({"question": query})

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 8454d8a839514f605fce6bc8db8f217e in your message.).


In [10]:
import json
import random

def process_and_save_result(result, name):
    """Print the chain's answer and write the answer and source metadata to JSON.

    Args:
        result: Mapping with an ``"answer"`` entry and a
            ``"source_documents"`` iterable whose items expose ``.metadata``.
        name: Suffix for the two output files, ``answer_{name}.json`` and
            ``metadata_{name}.json``, opened with relative paths.
    """
    # Printed for reference only; the number is not used in the file names.
    run_tag = random.randint(1000, 10000)
    print(f"Random Number: {run_tag}")

    # Show the answer, then persist it.
    reply = result["answer"]
    pretty_text(reply)
    with open(f'answer_{name}.json', 'w') as answer_file:
        json.dump(reply, answer_file)

    # Collect the metadata of every source document and persist it.
    source_metadata = [doc.metadata for doc in result["source_documents"]]
    with open(f'metadata_{name}.json', 'w') as metadata_file:
        json.dump(source_metadata, metadata_file)

# Print and save the answer from the query above; the second argument becomes
# the suffix of the answer_*.json and metadata_*.json file names.
process_and_save_result(result, '1000_07_e4_a2')

Random Number: 5699
I propose an investigation of the vertical distribution of stars in the Milky Way's disk using Gaia
data and spectroscopic surveys, such as APOGEE, GALAH, Gaia-ESO, and LAMOST, with an enhanced
methodology to address limitations and improve the overall reliability of the results.  1. Data
Integration: Combine the Gaia data, including parallaxes, proper motions, and radial velocities,
with complementary information from spectroscopic surveys to create a comprehensive dataset for
analysis. This will provide a more complete picture of the stellar distribution and kinematics in
the Milky Way's disk (Gaia Collaboration et al. 2018; Yang et al. 2020).  2. Sample Selection
Criteria: Establish stringent sample selection criteria to minimize biases and ensure a
representative sample of stars across different regions of the Milky Way's disk (Boubert et al.
2020). This may include selecting stars based on their spectral type, distance from the Sun,
apparent magnitude, and spat

In [None]:
def prepare_dataframe(df):
    """Return the ``Content``, ``citation`` and ``meta_key`` columns of ``df``.

    The input frame is left untouched. Earlier, this function cast
    ``df['ArxivID']`` to ``str`` in place, which changed the caller's frame
    even though that column is not part of the returned selection.

    Args:
        df: DataFrame containing at least the three selected columns.

    Returns:
        A new DataFrame with exactly ``Content``, ``citation`` and
        ``meta_key``, in that order, independent of ``df``.

    Raises:
        KeyError: If any of the three columns is missing.
    """
    # .copy() so later edits to the result never write through to `df`.
    return df[['Content', 'citation', 'meta_key']].copy()

def prepare_and_load_df(df_path):
    """Read the CSV at ``df_path`` and keep Content, citation and meta_key.

    ``ArxivID`` is cast to ``str`` before the selection, so the file must
    contain that column even though it is not part of the returned frame.

    Args:
        df_path: Path handed to ``pd.read_csv``.

    Returns:
        DataFrame with the columns ``Content``, ``citation`` and
        ``meta_key``, in that order.
    """
    wanted = ['Content', 'citation', 'meta_key']
    papers = pd.read_csv(df_path).assign(
        ArxivID=lambda frame: frame['ArxivID'].astype(str)
    )
    return papers[wanted]

# Load the sample CSV of papers, reduced to Content / citation / meta_key.
# NOTE(review): the name `df` is assigned again in the temperature-sweep cell
# (`df = pd.DataFrame(results)`), which replaces this frame.
df = prepare_and_load_df('papers/df_arxiv_100_sample.csv')

In [None]:
def generate_result_for_temperature(
    temp,
    query,
    model_name="gpt-4",
    question_temperature=0.3,
    max_tokens_limit=7500,
):
    """Build a fresh conversational retrieval chain and run ``query`` once.

    A new memory object and chain are created on every call, so calls do not
    share conversation state. Documents are retrieved through the
    notebook-level ``compression_retriever``.

    Args:
        temp: Sampling temperature of the answering model.
        query: Question sent to the chain.
        model_name: Model used by the answering LLM. Defaults to "gpt-4".
        question_temperature: Temperature of the question-condensing LLM.
            Defaults to 0.3.
        max_tokens_limit: ``max_tokens_limit`` given to the chain.
            Defaults to 7500.

    Returns:
        dict with keys ``temp``, ``history``, ``question``, ``result`` (the
        answer text) and ``meta_key`` (list of ``meta_key`` metadata values
        of the returned source documents).

    Raises:
        KeyError: If a source document has no ``meta_key`` in its metadata.
    """
    # NOTE(review): the question generator is created without a model name, so
    # it uses the library default rather than `model_name` - confirm intended.
    llm_qg = ChatOpenAI(temperature=question_temperature)
    llm = ChatOpenAI(temperature=temp, model_name=model_name)

    question_generator = LLMChain(llm=llm_qg, prompt=CONDENSE_QUESTION_PROMPT)
    doc_chain = load_qa_chain(
        llm,
        chain_type="stuff",
        prompt=DRC_PROMPT,
        document_prompt=ASTRO_DOC_PROMPT,
    )

    memory = ConversationSummaryBufferMemory(
        llm=llm,
        memory_key="chat_history",
        return_messages=True,
        output_key="answer",
    )
    app_retriever = compression_retriever

    chain = ConversationalRetrievalChain(
        retriever=app_retriever,
        question_generator=question_generator,
        combine_docs_chain=doc_chain,
        memory=memory,
        return_source_documents=True,
        max_tokens_limit=max_tokens_limit,
    )

    # Run the query once against the freshly built chain.
    result = chain({"question": query})

    # One meta_key per returned source document.
    meta_keys = [item.metadata['meta_key'] for item in result['source_documents']]

    # `history` is derived from the length of the returned chat history
    # (len // 2 - 1), exactly as before.
    data = {'temp': temp,
            'history': len(result['chat_history'])//2-1,
            'question': result['question'],
            'result': result['answer'],
            'meta_key': meta_keys}

    return data

# Temperatures of the answering model to compare.
temperatures = [0.1, 0.3, 0.5, 0.7, 0.9]  

# Single open-ended prompt used for every temperature.
# NOTE(review): this rebinds `query`, which an earlier cell used for the
# proposal-revision prompt.
query = """Drawing from the literature you have access to, propose a novel idea in Galactic Astronomy that can be tested with current or future observations."""

# One result dict per temperature, in the order of `temperatures`.
results = []

# Each call builds its own chain; the loop also rebinds the notebook-level
# names `temp` and `result` (`result` previously held the chain output that
# process_and_save_result consumed).
for temp in temperatures:
    result = generate_result_for_temperature(temp, query)
    results.append(result)

# Tabulate the sweep: one row per temperature.
# NOTE(review): this replaces the `df` loaded from the papers CSV earlier.
df = pd.DataFrame(results)
