In [5]:
import os
# Run export OPENAI_API_KEY=sk-YOUR_OPENAI_API_KEY...
# Get openAI api key by reading local .env file
import dotenv
dotenv_path = dotenv.find_dotenv('.env.config')
_ = dotenv.load_dotenv(dotenv_path)
OPENAI_API_KEY  = os.environ['OPENAI_API_KEY']
PG_DB_PW = os.environ['POSTGRES_DB_PASSWORD']
host = os.environ['POSTGRES_DB_HOST']
port = os.environ['POSTGRES_DB_PORT']
user = os.environ['POSTGRES_DB_USER']
password = os.environ['POSTGRES_DB_PASSWORD']
dbname = os.environ['POSTGRES_DB_DBNAME']
collection_name = os.environ['PGVECTOR_COLLECTION_NAME']



In [7]:
import pandas as pd
import arxivscraper.arxivscraper as ax
import arxiv
import tiktoken
import psycopg2
import pgvector
import openai
import re
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector, DistanceStrategy


In [9]:
date_from="2024-01-22"
date_until="2024-01-23"

# # Construct the default API client.
# client = arxiv.Client()

scraper = ax.Scraper(
    category="physics:astro-ph",
    date_from=date_from,
    date_until=date_until,
    t=10,
    filters={"abstract": ["quasar"]},
)
output = scraper.scrape()
cols = ("id", "title", "categories", "abstract", "doi", "created", "updated", "authors")
df = pd.DataFrame(output, columns=cols)
df['id_no_period'] = df.id.apply(lambda x: x.replace(".",""))

fetching up to  1000 records...
fetching is completed in 5.3 seconds.
Total number of records 9


In [None]:
DataFrame(eval(data))


In [3]:
scraper = ax.Scraper(
    category="physics:astro-ph",
    date_from="2024-01-22",
    date_until="2024-01-29",
    t=10,
    filters={"abstract": ["quasar"]},
)
output = scraper.scrape()
cols = ("id", "title", "categories", "abstract", "doi", "created", "updated", "authors")
df = pd.DataFrame(output, columns=cols)
df['id_no_period'] = df.id.apply(lambda x: x.replace(".",""))

fetching up to  1000 records...
fetching is completed in 10.4 seconds.
Total number of records 24


In [4]:
from os import listdir
from os.path import isfile, join
from pypdf import PdfReader
import tempfile

DEFAULT_PDF_DIRPATH = '/data/docs/'

def read_pdf(file_path):
    pdf_reader = PdfReader(file_path)
    text = ""

    for page in pdf_reader.pages:
        extracted_text = page.extract_text()
        if extracted_text:  # Check if text is extracted successfully
            text += extracted_text + "\n"  # Append text of each page

    return text

In [5]:
# Construct the default API client.
client = arxiv.Client()

df['content'] = None

for index, row in df.iterrows():
    paper = next(arxiv.Client().results(arxiv.Search(id_list=[row['id']])))
    # Download the PDF to a specified directory with a custom filename.
    pdf_filename = row['id_no_period']+'.pdf'

    with tempfile.TemporaryDirectory() as tmpdirname:
        paper.download_pdf(dirpath=tmpdirname, filename=pdf_filename)
        tmp_filepath = join(tmpdirname, pdf_filename)
        df.loc[index, 'content'] = read_pdf(tmp_filepath)
        


In [9]:
# drop columns that have a lot of nas, then drop all rows with remaining nas
df = df.drop(['doi', 'updated'], axis=1)
df = df.dropna()
df = df.reset_index(drop=True)

In [69]:
df.map(lambda x: re.sub('[^a-zA-Z0-9 <,>.?/:;"|\-_=+]+', ' ', str(x)))

Unnamed: 0,id,title,categories,abstract,created,authors,id_no_period,content
0,2209.09345,the extent of intergalactic metal enrichment f...,astro-ph.ga,one of the key processes driving galaxy evolut...,2022-09-19,"natsuko yamaguchi , steven r. furlanetto , ...",220909345,"MNRAS 000, 1 ?? 2021 Preprint 22 January 2024..."
1,2303.16933,musequbes: mapping the distribution of neutral...,astro-ph.ga,"we present a detailed study of cool, neutral g...",2023-03-29,"sayak dutta , sowgat muzahid , joop schaye ...",230316933,"MNRAS 000, 1 22 2023 Preprint 26 January 202..."
2,2306.05448,little red dots: an abundant population of fai...,astro-ph.ga astro-ph.co,characterising the prevalence and properties o...,2023-06-08,"jorryt matthee , rohan p. naidu , gabriel b...",230605448,"Draft version January 24, 2024 Typeset using L..."
3,2306.13219,vadar: varstrometry for dual agn using radio i...,astro-ph.ga,binary and dual active galactic nuclei agn a...,2023-06-22,"emma schwartzman , tracy e. clarke , kristi...",230613219,"Draft version January 26, 2024 Typeset using L..."
4,2308.01505,"the million quasars milliquas catalogue, v8",astro-ph.ga,"announcing the final release, v8, of the milli...",2023-08-02,eric wim flesch,230801505,"Version December 8, 2023 Preprint typeset usin..."
5,2308.06951,a model-independent method to determine h_0 ...,astro-ph.co,absolute distances from strong lensing can anc...,2023-08-14,"xiaolei li , ryan e. keeley , arman shafiel...",230806951,"Draft version January 8, 2024 Typeset using L ..."
6,2308.0726,a search for high-redshift direct-collapse bla...,astro-ph.ga astro-ph.co astro-ph.he,direct-collapse black holes dcbhs of mass s...,2023-08-14,"armin nabizadeh , erik zackrisson , fabio p...",230807260,Astronomy Astrophysics manuscript no. main E...
7,2308.08851,toward a direct measurement of the cosmic acce...,astro-ph.co,"in this work, we report the first result from ...",2023-08-17,"jiangang kang , chang-zhi lu , tongjie zhan...",230808851,Prepared for submission to JCAP Toward a direc...
8,2401.09981,a census of photometrically selected little re...,astro-ph.ga,observations with the james webb space telesco...,2024-01-18,"vasily kokorev , karina i. caputi , jenny e...",240109981,"Draft version January 25, 2024 Typeset using L..."
9,2401.10328,chronicling the reionization history at 6 les...,astro-ph.co astro-ph.ga,the spectra of high-redshift z gtrsim 6 quas...,2024-01-18,"dominika urov kov , anna-christina eilers ,...",240110328,"Draft version January 22, 2024 Typeset using L..."


In [11]:
df.to_csv('data/arxiv.csv')

In [12]:
df = pd.read_csv('data/arxiv.csv', index_col=0)

In [13]:
df.head()

Unnamed: 0,id,title,categories,abstract,created,authors,id_no_period,content
0,2209.09345,the extent of intergalactic metal enrichment f...,astro-ph.ga,one of the key processes driving galaxy evolut...,2022-09-19,"['natsuko yamaguchi', 'steven r. furlanetto', ...",220909345,"MNRAS 000, 1–??(2021) Preprint 22 January 2024..."
1,2303.16933,musequbes: mapping the distribution of neutral...,astro-ph.ga,"we present a detailed study of cool, neutral g...",2023-03-29,"['sayak dutta', 'sowgat muzahid', 'joop schaye...",230316933,"MNRAS 000, 1–22 (2023) Preprint 26 January 202..."
2,2306.05448,little red dots: an abundant population of fai...,astro-ph.ga astro-ph.co,characterising the prevalence and properties o...,2023-06-08,"['jorryt matthee', 'rohan p. naidu', 'gabriel ...",230605448,"Draft version January 24, 2024\nTypeset using ..."
3,2306.13219,vadar: varstrometry for dual agn using radio i...,astro-ph.ga,binary and dual active galactic nuclei (agn) a...,2023-06-22,"['emma schwartzman', 'tracy e. clarke', 'krist...",230613219,"Draft version January 26, 2024\nTypeset using ..."
4,2308.01505,"the million quasars (milliquas) catalogue, v8",astro-ph.ga,"announcing the final release, v8, of the milli...",2023-08-02,['eric wim flesch'],230801505,"Version December 8, 2023\nPreprint typeset usi..."


In [14]:
#load documents from Pandas dataframe for insertion into database
from langchain.document_loaders import DataFrameLoader

# page_content_column is the column name in the dataframe to create embeddings for
loader = DataFrameLoader(df, page_content_column = 'content')
docs = loader.load()

In [15]:
docs[0].metadata

{'id': 2209.09345,
 'title': 'the extent of intergalactic metal enrichment from galactic winds during   the cosmic dawn',
 'categories': 'astro-ph.ga',
 'abstract': 'one of the key processes driving galaxy evolution during the cosmic dawn is supernova feedback. this likely helps regulate star formation inside of galaxies, but it can also drive winds that influence the large-scale intergalactic medium. here, we present a simple semi-analytic model of supernova-driven galactic winds and explore the contributions of different phases of galaxy evolution to cosmic metal enrichment in the high-redshift (z > 6) universe. we show that models calibrated to the observed galaxy luminosity function at z~6-8 have filling factors ~1% at z~6 and ~0.1% at z~12, with different star formation prescriptions providing about an order of magnitude uncertainty. despite the small fraction of space filled by winds, these scenarios predict an upper limit to the abundance of metal-line absorbers in quasar spectr

In [35]:
df.columns

Index(['id', 'title', 'categories', 'abstract', 'created', 'authors',
       'id_no_period', 'content'],
      dtype='object')

In [36]:
# Helper functions to help us create the embeddings

# Helper func: calculate number of tokens
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    if not string:
        return 0
    # Returns the number of tokens in a text string
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

# Helper function: calculate length of essay
def get_essay_length(essay):
    word_list = essay.split()
    num_words = len(word_list)
    return num_words

# Helper function: calculate cost of embedding num_tokens
# Assumes we're using the text-embedding-ada-002 model
# See https://openai.com/pricing
def get_embedding_cost(num_tokens):
    return num_tokens/1000*0.0001

# Helper function: calculate total cost of embedding all content in the dataframe
def get_total_embeddings_cost(df):
    total_tokens = 0
    for i in range(len(df.index)):
        text = df['content'][i]
        token_len = num_tokens_from_string(text)
        total_tokens = total_tokens + token_len
    total_cost = get_embedding_cost(total_tokens)
    return total_cost



In [26]:
# Initialize the RecursiveCharacterTextSplitter with the desired parameters
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(encoding_name='cl100k_base',
                                                                chunk_size=1024,  # Maximum size of chunks to return
                                                                chunk_overlap=50,)

chunked_documents = splitter.split_documents(docs)

In [45]:
chunked_documents.__len__()

561

In [44]:
chunked_documents[0:10]

[Document(page_content='MNRAS 000, 1–??(2021) Preprint 22 January 2024 Compiled using MNRAS L ATEX style file v3.0\nThe extent of intergalactic metal enrichment from galactic winds during\nthe Cosmic Dawn\nNatsuko Yamaguchi,1,2★Steven R. Furlanetto1, & A. C. Trapp,1\n1Department of Physics and Astronomy, University of California Los Angeles, CA, 90095-1562, USA\n2Cahill Center for Astronomy and Astrophysics, California Institute of Technology, Pasadena CA 91125, USA\nAccepted XXX. Received YYY; in original form ZZZ\nABSTRACT\nOne of the key processes driving galaxy evolution during the Cosmic Dawn is supernova feedback. This likely helps regulate\nstarformationinsideofgalaxies,butitcanalsodrivewindsthatinfluencethelarge-scaleintergalacticmedium.Here,wepresent\na simple semi-analytic model of supernova-driven galactic winds and explore the contributions of different phases of galaxy\nevolution to cosmic metal enrichment in the high-redshift (𝑧≳6)Universe. We show that models calibrated 

In [29]:
get_total_embeddings_cost(df)

0.0525512

In [30]:
CONNECTION_STRING = f"postgresql://{user}:{password}@{host}:{port}/{dbname}"
 
conn = psycopg2.connect(CONNECTION_STRING)
cur = conn.cursor()
cur.execute("CREATE EXTENSION IF NOT EXISTS vector");
conn.commit()

In [31]:
embeddings = OpenAIEmbeddings()

In [32]:
db = PGVector.from_documents(
    documents=chunked_documents,
    embedding=embeddings,
    collection_name=collection_name,
    connection_string=CONNECTION_STRING,
    distance_strategy=DistanceStrategy.COSINE,
    pre_delete_collection=True # change this if you don't want to reset the db!!
)

### Testing the QA

In [48]:
# Create retriever from database
# We specify the number of results we want to retrieve (k=3)
retriever = db.as_retriever(
    search_kwargs={"k": 3}
    )

llm = ChatOpenAI(temperature = 0.0, model = 'gpt-3.5-turbo-16k')

from langchain.chains import RetrievalQA
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever,
    verbose=True,
)


In [49]:
query =  "What is a quasar?"

response = qa_stuff.run(query)

from IPython.display import Markdown, display
display(Markdown(response))

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


A quasar is a highly energetic and luminous active galactic nucleus (AGN) that is powered by the accretion of matter onto a supermassive black hole at the center of a galaxy. Quasars emit large amounts of electromagnetic radiation across the entire spectrum, from radio waves to X-rays. They are among the most distant and brightest objects in the universe and can be observed even at cosmological distances. Quasars played a significant role in the early universe and provide valuable insights into the formation and evolution of galaxies and supermassive black holes.

In [54]:
# New chain to return context and sources
qa_stuff_with_sources = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
)

query =  "What are the defining characteristics of a dual quasar?"

# To run the query, we use a different syntax since we're returning more than just the response text
responses = qa_stuff_with_sources({"query": query})

source_documents = responses["source_documents"]
source_content = [doc.page_content for doc in source_documents]
source_metadata = [doc.metadata for doc in source_documents]

# Construct a single string with the LLM output and the source titles and urls
def construct_result_with_sources():
    result = responses['result']
    result += "\n\n"
    result += "Sources used:"
    for i in range(len(source_content)):    
        result += "\n\n"
        result += source_metadata[i]['title']
        result += "\n\n"
        return result

display(Markdown(construct_result_with_sources()))




[1m> Entering new RetrievalQA chain...[0m



[1m> Finished chain.[0m


The defining characteristics of a dual quasar are:

1. Two Quasars: A dual quasar refers to a system where two quasars are observed in close proximity to each other. Quasars are extremely luminous and active galactic nuclei powered by supermassive black holes at the centers of galaxies.

2. Kiloparsec-scale Separation: The two quasars in a dual quasar system are typically separated by a distance of a few kiloparsecs. This separation indicates that they are physically associated and not just a chance alignment.

3. Radio Emission: Dual quasars often exhibit radio emission. Radio observations can help identify and confirm the presence of dual quasars by detecting the emission from the jets or lobes associated with the active black holes.

4. Astrometric Excess Noise: Astrometric excess noise refers to the excess variability in the positions of the quasars as measured by astrometric observations. This excess noise can be an indicator of the presence of a dual quasar system.

It is important to note that the identification of dual quasars requires careful analysis and confirmation through multiple observations and techniques.

Sources used:

a dynamical measure of the black hole mass in a quasar 11 billion years   ago



In [8]:
import datetime as DT



In [7]:
today = DT.date.today().strftime('%Y-%m-%d')
today

'2024-02-02'

In [9]:
get_today_and_week_ago()

('2024-02-02', '2024-01-26')