In [23]:
import os
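# Option A: run `export OPENAI_API_KEY=sk-YOUR_OPENAI_API_KEY...` in the shell before starting Jupyter.
# Option B (used below): read the OpenAI API key and the Postgres settings from a local .env file.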
from dotenv import load_dotenv, find_dotenv
# Load the .env file located three directories above the working directory.
dotenv_path = os.path.abspath('../../../.env')
_ = load_dotenv(dotenv_path)

# OpenAI credential; a missing variable raises KeyError right here.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

# PostgreSQL connection settings, all read as strings from the environment.
PG_DB_PW = os.environ['POSTGRES_DB_PASSWORD']
host = os.environ['POSTGRES_DB_HOST']
port = os.environ['POSTGRES_DB_PORT']
user = os.environ['POSTGRES_DB_USER']
password = PG_DB_PW  # same POSTGRES_DB_PASSWORD value, exposed under both names
dbname = os.environ['POSTGRES_DB_DBNAME']


In [75]:
import openai
import os
import pandas as pd
import numpy as np
import json
import tiktoken
import psycopg2
import ast
import pgvector
import math
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector

from IPython.display import Markdown, display

import pandas as pd

from langchain.vectorstores.pgvector import PGVector
from langchain.sql_database import SQLDatabase
from langchain_experimental.sql import SQLDatabaseChain
from langchain_openai import ChatOpenAI, OpenAI
from langchain.chains import RetrievalQA



In [38]:
# Build the PostgreSQL URI used both by psycopg2 and by the LangChain PGVector store.
# The user name and password are percent-encoded so that reserved characters
# such as '@', ':', '/' or '#' inside a credential cannot corrupt the URI;
# credentials without reserved characters come out unchanged.
# For servers that require TLS, append "?sslmode=require" to the URI.
from urllib.parse import quote

CONNECTION_STRING = (
    f"postgresql://{quote(user, safe='')}:{quote(password, safe='')}"
    f"@{host}:{port}/{dbname}"
)

# Name of the PGVector collection used throughout this notebook.
collection_name = "arxiv"


In [31]:
import psycopg2
# Open a connection from the individual settings (keyword form, no URI involved).
conn = psycopg2.connect(host=host, port=port, user=user, password=PG_DB_PW, database=dbname)

In [78]:
# Connect to the PostgreSQL (Timescale) database through the URI form of the
# settings; this rebinds `conn` and creates the cursor `cur`.
conn = psycopg2.connect(CONNECTION_STRING)
cur = conn.cursor()

# Enable the pgvector extension in this database unless it already exists.
cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
conn.commit()

In [45]:
# Load the chunked paper text; the first CSV column is used as the row index.
embedding_df = pd.read_csv(
    '../../data-ingest/data/embeddings/test_embedding.csv',
    index_col=0,
)

In [46]:
# Display the loaded frame (columns shown below: title, content, tokens, embeddings,
# plus a leftover "Unnamed: 0" column).
embedding_df

Unnamed: 0,title,content,tokens,embeddings
0,pdf3.pdf,Detection of stellar light from quasar host ga...,1017,"[-0.008913620375096798, -0.003685270668938756,..."
1,pdf3.pdf,"der Universit ¨at Heidelberg (ITA), Albert-Ueb...",867,"[-0.027547599747776985, 0.016496682539582253, ..."
2,pdf3.pdf,that the host galaxies are massive (stellar ma...,662,"[-0.050405103713274, 0.011267776601016521, 0.0..."
3,pdf3.pdf,profile13whose parameters include the position...,625,"[-0.022945208474993706, 0.004072774201631546, ..."
4,pdf3.pdf,"range [−1,−0.3], the stellar age is within the...",635,"[-0.019858600571751595, 0.015721391886472702, ..."
...,...,...,...,...
128,pdf1.pdf,"in Do et al. (2019), with a peak flux density ...",544,"[-0.014840332791209221, 0.00358420517295599, 0..."
129,pdf1.pdf,light curves. varying values of α. See Figure ...,683,"[-0.008777388371527195, 0.013306329026818275, ..."
130,pdf1.pdf,1.028±0.044 1.044±0.029 1.013±0.037 0.995±0.03...,2237,"[-0.015937522053718567, 0.023344896733760834, ..."
131,pdf1.pdf,... 0.005±0.003 0.003±0.002 18.0 0.001±0.001 0...,1186,"[-0.012408621609210968, -0.006417050026357174,..."


In [47]:
#load documents from Pandas dataframe for insertion into database
from langchain.document_loaders import DataFrameLoader

# Turn the DataFrame rows into LangChain documents. `content` is the column
# whose text becomes each document's page content (the text that is embedded).
loader = DataFrameLoader(
    embedding_df,
    page_content_column='content',
)
docs = loader.load()


In [48]:
from langchain_openai import OpenAIEmbeddings
# Embedding client that is handed to the PGVector store further down.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment loaded above — confirm.
embeddings = OpenAIEmbeddings()


In [49]:
# Inspect the text column that is used as page content for the documents.
embedding_df['content']

0      Detection of stellar light from quasar host ga...
1      der Universit ¨at Heidelberg (ITA), Albert-Ueb...
2      that the host galaxies are massive (stellar ma...
3      profile13whose parameters include the position...
4      range [−1,−0.3], the stellar age is within the...
                             ...                        
128    in Do et al. (2019), with a peak flux density ...
129    light curves. varying values of α. See Figure ...
130    1.028±0.044 1.044±0.029 1.013±0.037 0.995±0.03...
131    ... 0.005±0.003 0.003±0.002 18.0 0.001±0.001 0...
132    of the pre-2019 (blue), 2019 (black), and post...
Name: content, Length: 133, dtype: object

In [53]:
from langchain.vectorstores.pgvector import DistanceStrategy

# Create the vector store from the loaded documents, using cosine distance for
# similarity and the shared connection URI / collection name.
# NOTE(review): with pre_delete_collection=False a re-run of this cell
# presumably appends to the existing collection rather than replacing it — confirm.
db = PGVector.from_documents(
    connection_string=CONNECTION_STRING,
    collection_name=collection_name,
    documents=docs,
    embedding=embeddings,
    distance_strategy=DistanceStrategy.COSINE,
    pre_delete_collection=False,
)

In [59]:
# Exploratory: dump the PGVector class namespace (module, docstring, methods) for reference.
PGVector.__dict__

mappingproxy({'__module__': 'langchain_community.vectorstores.pgvector',
              '__doc__': '`Postgres`/`PGVector` vector store.\n\n    To use, you should have the ``pgvector`` python package installed.\n\n    Args:\n        connection_string: Postgres connection string.\n        embedding_function: Any embedding function implementing\n            `langchain.embeddings.base.Embeddings` interface.\n        embedding_length: The length of the embedding vector. (default: None)\n            NOTE: This is not mandatory. Defining it will prevent vectors of\n            any other size to be added to the embeddings table but, without it,\n            the embeddings can\'t be indexed.\n        collection_name: The name of the collection to use. (default: langchain)\n            NOTE: This is not the name of the table, but the name of the collection.\n            The tables will be created when initializing the store (if not exists)\n            So, make sure the user has the right permiss

In [17]:
from langchain.schema import Document

# Question used to probe the vector store for semantically similar chunks.
query = "What is a quasar?"

# Retrieve the k=3 closest documents. Note: this rebinds `docs` to the hits.
docs = db.similarity_search(query, k=3)


In [18]:
# Show the documents returned by the similarity search.
docs

[Document(page_content='Quasars and the Intergalactic Medium at Cosmic Dawn Xiaohui Fan,1Eduardo Ba˜ nados,2and Robert A. Simcoe3 1Steward Observatory, University of Arizona, 933 North Cherry Avenue, Tucson, AZ 85721, USA; email: xfan@arizona.edu 2Max-Planck-Institut f¨ ur Astronomie, K¨ onigstuhl 17, D-69117 Heidelberg, Germany; email: banados@mpia.de 3MIT Kavli Institute for Astrophysics and Space Research, 77 Massachusetts Ave., Cambridge, MA 02139, USA; email: simcoe@space.mit.edu Annual Review of Astronomy and Astrophysics 2023. AA:1–58 https://doi.org/10.1146/((please add article doi)) Copyright ©2023 by the author(s). All rights reservedKeywords quasar, supermassive black hole, galaxy evolution, cosmic reionization Abstract Quasars at cosmic dawn provide powerful probes of the formation and growth of the earliest supermassive black holes (SMBHs) in the uni- verse, their connections to galaxy and structure formation, and the evolution of the intergalactic medium (IGM) at the epoc

In [60]:
# Wrap the vector store as a retriever; search_kwargs asks for 3 results (k=3).
retriever = db.as_retriever(search_kwargs={"k": 3})


In [47]:
# Chat model used to answer questions; temperature is fixed at 0.0.
llm = ChatOpenAI(model='gpt-3.5-turbo-16k', temperature=0.0)


In [48]:
from langchain.chains import RetrievalQA
# Retrieval-QA chain of type "stuff" built from the retriever and the chat
# model; verbose=True prints the chain start/finish markers.
qa_stuff = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    verbose=True,
)


In [49]:
# Ask a question through the retrieval chain and render the answer as Markdown.
query = "What is a quasar?"

# `invoke` replaces the deprecated `Chain.run`. RetrievalQA takes the question
# under the "query" key and returns the answer text under "result".
response = qa_stuff.invoke({"query": query})["result"]

from IPython.display import Markdown, display
display(Markdown(response))


  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


A quasar, short for "quasi-stellar radio source," is a highly energetic and luminous astronomical object. It is believed to be powered by the accretion of mass onto a supermassive black hole at the center of a galaxy. Quasars emit enormous amounts of energy across the electromagnetic spectrum, from radio waves to X-rays. They are among the most distant and ancient objects in the universe, providing valuable insights into the early stages of galaxy formation and the evolution of the intergalactic medium.

In [50]:
# Second chain: same llm / retriever / chain type, but it also returns the
# retrieved source documents so the answer can be shown with its sources.
qa_stuff_with_sources = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
)

query = "What is the evidence for an enhanced accretion episode from Sgr A* in 2019?"

# The chain now returns a dict (answer under "result", retrieved chunks under
# "source_documents"), so the input is passed as a dict too. `invoke` is used
# because calling the chain object directly is deprecated.
responses = qa_stuff_with_sources.invoke({"query": query})


  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [52]:
# Split the retrieved chunks into their text bodies and their metadata dicts,
# both in retrieval order.
source_documents = responses["source_documents"]
source_content = [document.page_content for document in source_documents]
source_metadata = [document.metadata for document in source_documents]

# Construct a single string with the LLM output followed by the source titles
def construct_result_with_sources(response=None):
    """Return the chain's answer followed by a "Sources used:" list of titles.

    Args:
        response: Mapping with a "result" string and a "source_documents" list
            whose items expose ``metadata['title']``. Defaults to the
            notebook-level ``responses`` produced by the latest chain call.

    Returns:
        str: The answer, a "Sources used:" heading, then every distinct source
        title once, in retrieval order. When there are no source documents,
        only the answer and the heading are returned.

    Raises:
        KeyError: If a source document's metadata has no 'title' entry.
    """
    if response is None:
        response = responses  # notebook global set by the preceding chain call

    # Several retrieved chunks may share a title; dict.fromkeys drops the
    # repeats while keeping first-seen order.
    titles = dict.fromkeys(
        doc.metadata['title'] for doc in response["source_documents"]
    )

    result = response['result']
    result += "\n\n"
    result += "Sources used:"
    for title in titles:
        result += "\n\n"
        result += title
        result += "\n\n"
    # Return only after the loop: returning inside it stopped after the first
    # source and produced None when there were no sources at all.
    return result

# Render the answer together with its source list as Markdown in the cell output.
display(Markdown(construct_result_with_sources()))


The evidence for an enhanced accretion episode from Sgr A* in 2019 includes the following:

1. Increase in Mean Luminosity: The mean luminosity of Sgr A* increased by a factor of approximately 3 in 2019 compared to historical measurements.

2. Higher Variance: The light curves of Sgr A* in 2019 showed higher variance compared to other time periods examined, indicating a greater average luminosity correlated with more extreme variability.

3. Elevated Flux Densities: Both faint and bright flux densities of Sgr A* in 2019 were significantly elevated compared to pre- and post-2019 observations.

4. Concentration of Bright Events: Many observatories measured a concentration of bright events from Sgr A* in 2019, indicating a transient increase in accretion activity.

5. Multi-Wavelength Observations: Observations in multiple wavelengths, including near-infrared, radio, and X-rays, also showed heightened activity in 2019, supporting the idea of a physical disturbance in the accretion flow.

These pieces of evidence suggest that the enhanced accretion episode in 2019 was caused by a temporary increase in accretion onto Sgr A*, possibly due to the delayed accretion of tidally-stripped gas from the object G2 in 2014.

Sources used:

pdf1.pdf



In [53]:
query = "What distinguishes a quasar from a normal black hole?"

# The chain returns a dict (answer under "result", retrieved chunks under
# "source_documents"). `invoke` is used because calling the chain object
# directly is deprecated.
responses = qa_stuff_with_sources.invoke({"query": query})

# Split the retrieved chunks into text bodies and metadata dicts.
source_documents = responses["source_documents"]
source_content = [doc.page_content for doc in source_documents]
source_metadata = [doc.metadata for doc in source_documents]

# Construct a single string with the LLM output followed by the source titles
def construct_result_with_sources(response=None):
    """Return the chain's answer followed by a "Sources used:" list of titles.

    Args:
        response: Mapping with a "result" string and a "source_documents" list
            whose items expose ``metadata['title']``. Defaults to the
            notebook-level ``responses`` produced by the latest chain call.

    Returns:
        str: The answer, a "Sources used:" heading, then every distinct source
        title once, in retrieval order. When there are no source documents,
        only the answer and the heading are returned.

    Raises:
        KeyError: If a source document's metadata has no 'title' entry.
    """
    if response is None:
        response = responses  # notebook global set by the preceding chain call

    # Several retrieved chunks may share a title; dict.fromkeys drops the
    # repeats while keeping first-seen order.
    titles = dict.fromkeys(
        doc.metadata['title'] for doc in response["source_documents"]
    )

    result = response['result']
    result += "\n\n"
    result += "Sources used:"
    for title in titles:
        result += "\n\n"
        result += title
        result += "\n\n"
    # Return only after the loop: returning inside it stopped after the first
    # source and produced None when there were no sources at all.
    return result

# Render the answer together with its source list as Markdown in the cell output.
display(Markdown(construct_result_with_sources()))




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


A quasar is a type of black hole, but what distinguishes it from a normal black hole is its high luminosity. Quasars are extremely bright and energetic, emitting large amounts of radiation across the electromagnetic spectrum. This high luminosity is caused by the accretion of mass onto the black hole. As matter falls into the black hole's gravitational pull, it forms an accretion disk around the black hole, releasing a tremendous amount of energy in the process. This energy is what makes quasars so luminous and visible from great distances. In contrast, a normal black hole that is not actively accreting matter would not emit significant amounts of radiation and would not be visible as a quasar.

Sources used:

pdf3.pdf



In [42]:
# db = PGVector.from_documents(
#     embedding=embeddings,
#     documents=docs,
#     collection_name=collection_name,
#     connection_string=CONNECTION_STRING,
#     pre_delete_collection=False,
# )


In [43]:
# Build the PGVector store through its constructor; documents are added to it
# in a separate step further down.
store = PGVector(
    connection_string=CONNECTION_STRING,
    embedding_function=embeddings,
    collection_name=collection_name,
)

In [44]:
# Show the store object's repr to confirm it was constructed.
store

<langchain_community.vectorstores.pgvector.PGVector at 0x11f8c4680>

In [67]:
# Insert the full document set into the store and show the generated ids.
# The documents are re-read from `loader` rather than taken from `docs`,
# because `docs` is rebound earlier in the notebook to the three
# similarity-search hits; on a top-to-bottom run only those would be added.
# NOTE(review): the same documents were already written to this collection by
# PGVector.from_documents, so this presumably stores them a second time — confirm.
store.add_documents(loader.load())

['a9f8d750-bf1c-11ee-8de9-9801a78f9833',
 'a9f8d908-bf1c-11ee-8de9-9801a78f9833',
 'a9f8d96c-bf1c-11ee-8de9-9801a78f9833',
 'a9f8d9a8-bf1c-11ee-8de9-9801a78f9833',
 'a9f8d9e4-bf1c-11ee-8de9-9801a78f9833',
 'a9f8da2a-bf1c-11ee-8de9-9801a78f9833',
 'a9f8da66-bf1c-11ee-8de9-9801a78f9833',
 'a9f8daa2-bf1c-11ee-8de9-9801a78f9833',
 'a9f8dade-bf1c-11ee-8de9-9801a78f9833',
 'a9f8db24-bf1c-11ee-8de9-9801a78f9833',
 'a9f8db60-bf1c-11ee-8de9-9801a78f9833',
 'a9f8db9c-bf1c-11ee-8de9-9801a78f9833',
 'a9f8dbd8-bf1c-11ee-8de9-9801a78f9833',
 'a9f8dc1e-bf1c-11ee-8de9-9801a78f9833',
 'a9f8dc5a-bf1c-11ee-8de9-9801a78f9833',
 'a9f8dc96-bf1c-11ee-8de9-9801a78f9833',
 'a9f8ec68-bf1c-11ee-8de9-9801a78f9833',
 'a9f8ecd6-bf1c-11ee-8de9-9801a78f9833',
 'a9f8ed26-bf1c-11ee-8de9-9801a78f9833',
 'a9f8ed6c-bf1c-11ee-8de9-9801a78f9833',
 'a9f8edb2-bf1c-11ee-8de9-9801a78f9833',
 'a9f8edf8-bf1c-11ee-8de9-9801a78f9833',
 'a9f8ee3e-bf1c-11ee-8de9-9801a78f9833',
 'a9f8ee7a-bf1c-11ee-8de9-9801a78f9833',
 'a9f8eec0-bf1c-

In [72]:
# Rebuild the retriever on top of `store` (no explicit k this time) and
# recreate the chat model with temperature 0.0.
retriever = store.as_retriever()
llm = ChatOpenAI(model='gpt-3.5-turbo-16k', temperature=0.0)


In [73]:
# Retrieval-QA chain that returns the retrieved source documents alongside
# the answer, built on the current `retriever` and `llm`.
qa_stuff_with_sources = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
)

query = "What is the evidence for an enhanced accretion episode from Sgr A* in 2019?"

# The chain returns a dict (answer under "result", retrieved chunks under
# "source_documents"), so the input is passed as a dict too. `invoke` is used
# because calling the chain object directly is deprecated.
responses = qa_stuff_with_sources.invoke({"query": query})


  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [76]:
# Split the retrieved chunks into their text bodies and their metadata dicts,
# both in retrieval order.
source_documents = responses["source_documents"]
source_content = [document.page_content for document in source_documents]
source_metadata = [document.metadata for document in source_documents]

# Construct a single string with the LLM output followed by the source titles
def construct_result_with_sources(response=None):
    """Return the chain's answer followed by a "Sources used:" list of titles.

    Args:
        response: Mapping with a "result" string and a "source_documents" list
            whose items expose ``metadata['title']``. Defaults to the
            notebook-level ``responses`` produced by the latest chain call.

    Returns:
        str: The answer, a "Sources used:" heading, then every distinct source
        title once, in retrieval order. When there are no source documents,
        only the answer and the heading are returned.

    Raises:
        KeyError: If a source document's metadata has no 'title' entry.
    """
    if response is None:
        response = responses  # notebook global set by the preceding chain call

    # Several retrieved chunks may share a title; dict.fromkeys drops the
    # repeats while keeping first-seen order.
    titles = dict.fromkeys(
        doc.metadata['title'] for doc in response["source_documents"]
    )

    result = response['result']
    result += "\n\n"
    result += "Sources used:"
    for title in titles:
        result += "\n\n"
        result += title
        result += "\n\n"
    # Return only after the loop: returning inside it stopped after the first
    # source and produced None when there were no sources at all.
    return result

# Render the answer together with its source list as Markdown in the cell output.
display(Markdown(construct_result_with_sources()))

The evidence for an enhanced accretion episode from Sgr A* in 2019 comes from a study that re-calibrated and re-analyzed all of the Keck Observatory Sgr A* imaging observations from 2005-2022. The study found that the mean luminosity of Sgr A* increased by a factor of approximately 3 in 2019 compared to previous years. The light curves from 2019 also had higher variance than in all other time periods examined. Additionally, the study observed a maximum flux of 5.6 mJy in 2019, compared to a maximum pre-2019 flux of 2.0 mJy. These findings suggest that the activity in 2019 was caused by a temporary increase in accretion onto Sgr A*. One possible explanation for this increase is the delayed accretion of tidally-stripped gas from the gaseous object G2 in 2014.

Sources used:

pdf1.pdf

