In [23]:
import os
# Run export OPENAI_API_KEY=sk-YOUR_OPENAI_API_KEY...
# Get openAI api key by reading local .env file
from dotenv import load_dotenv, find_dotenv
dotenv_path = os.path.abspath('../../../.env')
_ = load_dotenv(dotenv_path)
OPENAI_API_KEY  = os.environ['OPENAI_API_KEY']
PG_DB_PW = os.environ['POSTGRES_DB_PASSWORD']
host= os.environ['POSTGRES_DB_HOST']
port= os.environ['POSTGRES_DB_PORT']
user= os.environ['POSTGRES_DB_USER']
password= os.environ['POSTGRES_DB_PASSWORD']
dbname= os.environ['POSTGRES_DB_DBNAME']

In [24]:
import psycopg2
import openai
import sys
import os

from langchain_openai import OpenAIEmbeddings
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Generator,
    Iterable,
    List,
    Optional,
    Tuple,
    Type,
)

from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores.pgvector import PGVector, DistanceStrategy
from langchain_core.documents.base import Document

import os.path as osp
    
# from src.config import settings

# OPENAI_API_KEY  = settings.OPENAI_API_KEY
# PG_DB_PW = settings.POSTGRES_DB_PASSWORD
# host= settings.POSTGRES_DB_HOST
# port= settings.POSTGRES_DB_PORT
# user= settings.POSTGRES_DB_USER
# password= settings.POSTGRES_DB_PASSWORD
# dbname= settings.POSTGRES_DB_DBNAME

In [64]:
def get_connection_str():
    return f"postgresql://{user}:{password}@{host}:{port}/{dbname}"
 
def get_conn_cur():
    conn = psycopg2.connect(get_connection_str())
    cur = conn.cursor()
    cur.execute("CREATE EXTENSION IF NOT EXISTS vector");
    conn.commit()
    return conn, cur

def get_langchain_docs_from_df(df):
    loader = DataFrameLoader(df, page_content_column = 'content')
    return loader.load()

class VectorStore:
    """
    Description here
    """

    def __init__(self,
                 collection_name='arxiv',
                 connection_string:str = get_connection_str(),
                 embedding_function=OpenAIEmbeddings(),
                 distance_strategy=DistanceStrategy.COSINE):
        self.collection_name=collection_name
        self.connection_string=connection_string
        self.embedding_function=embedding_function
        self.distance_strategy=distance_strategy

    def connect_to_existing_vs_collection(self, collection_name=None):
        if collection_name is None:
            collection_name = self.collection_name
        try:
            self.store = PGVector(
                collection_name=collection_name,
                connection_string=self.connection_string,
                embedding_function=self.embedding_function,
            )
        except Exception as e:
            print(e)
    
    def init_from_documents(self, docs:List[Document], reset_collection=False):
        self.store = self.store.from_documents(documents=docs,
                                  embedding=self.embedding_function,
                                  collection_name=self.collection_name,
                                  connection_string=self.connection_string,
                                  distance_strategy=self.DistanceStrategy,
                                  pre_delete_collection=reset_collection)   

    def add_texts(self, texts:Iterable[str],
                  metadatas: Optional[List[dict]] = None,
                  ids: Optional[List[str]] = None,):
        self.store.add_texts(texts, metadatas, ids)

    def delete_collection(self):
        self.store.delete_collection()
    

from pypdf import PdfReader


        



In [119]:
from os import listdir
from os.path import isfile, join
from pypdf import PdfReader
import openai
import os
import pandas as pd
import numpy as np
import json
import tiktoken
from langchain.document_loaders import DataFrameLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_transformers.openai_functions import create_metadata_tagger
from langchain_openai import ChatOpenAI

# client = openai.OpenAI(api_key = OPENAI_API_KEY)


def read_pdf(file_path):
    pdf_reader = PdfReader(file_path)
    text = ""

    for page in pdf_reader.pages:
        extracted_text = page.extract_text()
        if extracted_text:  # Check if text is extracted successfully
            text += extracted_text + "\n"  # Append text of each page

    return text

# Helper functions to help us create the embeddings

# Helper func: calculate number of tokens
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    if not string:
        return 0
    # Returns the number of tokens in a text string
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

# Helper function: calculate length of essay
def get_essay_length(essay):
    word_list = essay.split()
    num_words = len(word_list)
    return num_words

# Helper function: calculate cost of embedding num_tokens
# Assumes we're using the text-embedding-ada-002 model
# See https://openai.com/pricing
def get_embedding_cost(num_tokens):
    return num_tokens/1000*0.0001

# Helper function: calculate total cost of embedding all content in the dataframe
def get_total_embeddings_cost(df):
    total_tokens = 0
    for i in range(len(df.index)):
        text = df['content'][i]
        token_len = num_tokens_from_string(text)
        total_tokens = total_tokens + token_len
    total_cost = get_embedding_cost(total_tokens)
    return total_cost


# Create new list with small content chunks to not hit max token limits
# Note: the maximum number of tokens for a single request is 8191
# https://openai.com/docs/api-reference/requests

# Split up the text into token sizes of around 512 tokens
def pdf_chunker(df):
    new_list = []
    for i in range(len(df.index)):
        text = df['content'][i]
        token_len = num_tokens_from_string(text)
        if token_len <= 512:
            new_list.append([df['title'][i], df['content'][i], token_len])
        else:
            # add content to the new list in chunks
            start = 0
            ideal_token_size = 512
            # 1 token ~ 3/4 of a word
            ideal_size = int(ideal_token_size // (4/3))
            end = ideal_size
            #split text by spaces into words
            words = text.split()

            #remove empty spaces
            words = [x for x in words if x != ' ']

            total_words = len(words)
            
            #calculate iterations
            chunks = total_words // ideal_size
            if total_words % ideal_size != 0:
                chunks += 1
            
            new_content = []
            for j in range(chunks):
                if end > total_words:
                    end = total_words
                new_content = words[start:end]
                new_content_string = ' '.join(new_content)
                new_content_token_len = num_tokens_from_string(new_content_string)
                if new_content_token_len > 0:
                    new_list.append([df['title'][i], new_content_string,  new_content_token_len])
                start += ideal_size
                end += ideal_size
    return new_list


DEFAULT_PDF_DIRPATH = '../../data-ingest/data/docs/'

def pdfs_to_docs(pdf_dirpath=DEFAULT_PDF_DIRPATH):
    file_l = [join(pdf_dirpath, f) for f in listdir(pdf_dirpath) if isfile(join(pdf_dirpath, f))]
    doc_l = []
    for filename in file_l:
        loader = PyPDFLoader(filename)
        pages = loader.load_and_split()
        doc_l.extend(pages)
    return doc_l

    # # chunked_pdf_l = pdf_chunker(pdf_df)
    # #load documents from Pandas dataframe for insertion into database
    # # page_content_column is the column name in the dataframe to create embeddings for
    # loader = DataFrameLoader(pdf_df, page_content_column = 'content')
    # docs = loader.load()

    # return docs
# schema = {
#     "properties": {
#         "article_title": {"type": "string"},
#         "authors": {"type": "string"},
#         "publish_date": {"type": "string"},
#     },
#     "required": ["article_title", "authors", "publish_date"],
# }

# def enhance_docs_metadata(docs, schema):
#     # Must be an OpenAI model that supports functions
#     llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
#     document_transformer = create_metadata_tagger(metadata_schema=schema, llm=llm)
#     enhanced_docs = document_transformer.transform_documents(docs)
#     return enhanced_docs





In [112]:
doc_l = pdfs_to_docs()

In [120]:
doc_l

[Document(page_content='Detection of stellar light from quasar host galaxies at\nredshifts above 6\nXuheng Ding1,2,*, Masafusa Onoue3,1,4,*, John D. Silverman1,2,5, Yoshiki Matsuoka6,\nTakuma Izumi7,8, Michael A. Strauss9, Knud Jahnke4, Camryn L. Phillips9, Junyao Li10,\nMarta Volonteri11, Zoltan Haiman12,13, Irham Taufik Andika14,15, Kentaro Aoki16, Shunsuke\nBaba17, Rebekka Bieri18, Sarah E. I. Bosman4, Connor Bottrell1,2, Anna-Christina Eilers19,\nSeiji Fujimoto20, Melanie Habouzit21,4, Masatoshi Imanishi7,22, Kohei Inayoshi3, Kazushi\nIwasawa23,24, Nobunari Kashikawa5,25, Toshihiro Kawaguchi26, Kotaro Kohno27,25,\nChien-Hsiu Lee28, Alessandro Lupi29, Jianwei Lyu30, Tohru Nagao6, Roderik Overzier31,\nJan-Torge Schindler32, Malte Schramm33, Kazuhiro Shimasaku5,25, Yoshiki Toba7,34, Benny\nTrakhtenbrot35, Maxime Trebitsch36, Tommaso Treu37, Hideki Umehata38,39, Bram P.\nVenemans32, Marianne Vestergaard30,40, Fabian Walter4, Feige Wang30, and Jinyi Yang30\n1Kavli Institute for the Phys

In [118]:
[doc_l[0].metadata['page']==0]

[Document(page_content='Detection of stellar light from quasar host galaxies at\nredshifts above 6\nXuheng Ding1,2,*, Masafusa Onoue3,1,4,*, John D. Silverman1,2,5, Yoshiki Matsuoka6,\nTakuma Izumi7,8, Michael A. Strauss9, Knud Jahnke4, Camryn L. Phillips9, Junyao Li10,\nMarta Volonteri11, Zoltan Haiman12,13, Irham Taufik Andika14,15, Kentaro Aoki16, Shunsuke\nBaba17, Rebekka Bieri18, Sarah E. I. Bosman4, Connor Bottrell1,2, Anna-Christina Eilers19,\nSeiji Fujimoto20, Melanie Habouzit21,4, Masatoshi Imanishi7,22, Kohei Inayoshi3, Kazushi\nIwasawa23,24, Nobunari Kashikawa5,25, Toshihiro Kawaguchi26, Kotaro Kohno27,25,\nChien-Hsiu Lee28, Alessandro Lupi29, Jianwei Lyu30, Tohru Nagao6, Roderik Overzier31,\nJan-Torge Schindler32, Malte Schramm33, Kazuhiro Shimasaku5,25, Yoshiki Toba7,34, Benny\nTrakhtenbrot35, Maxime Trebitsch36, Tommaso Treu37, Hideki Umehata38,39, Bram P.\nVenemans32, Marianne Vestergaard30,40, Fabian Walter4, Feige Wang30, and Jinyi Yang30\n1Kavli Institute for the Phys

In [106]:
enhanced_doc_l = enhance_docs_metadata(doc_l, schema)

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 4097 tokens. However, your messages resulted in 17345 tokens (17293 in the messages, 52 in the functions). Please reduce the length of the messages or functions.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [71]:
pdfs_to_docs()

[Document(page_content='Detection of stellar light from quasar host galaxies at\nredshifts above 6\nXuheng Ding1,2,*, Masafusa Onoue3,1,4,*, John D. Silverman1,2,5, Yoshiki Matsuoka6,\nTakuma Izumi7,8, Michael A. Strauss9, Knud Jahnke4, Camryn L. Phillips9, Junyao Li10,\nMarta Volonteri11, Zoltan Haiman12,13, Irham Taufik Andika14,15, Kentaro Aoki16, Shunsuke\nBaba17, Rebekka Bieri18, Sarah E. I. Bosman4, Connor Bottrell1,2, Anna-Christina Eilers19,\nSeiji Fujimoto20, Melanie Habouzit21,4, Masatoshi Imanishi7,22, Kohei Inayoshi3, Kazushi\nIwasawa23,24, Nobunari Kashikawa5,25, Toshihiro Kawaguchi26, Kotaro Kohno27,25,\nChien-Hsiu Lee28, Alessandro Lupi29, Jianwei Lyu30, Tohru Nagao6, Roderik Overzier31,\nJan-Torge Schindler32, Malte Schramm33, Kazuhiro Shimasaku5,25, Yoshiki Toba7,34, Benny\nTrakhtenbrot35, Maxime Trebitsch36, Tommaso Treu37, Hideki Umehata38,39, Bram P.\nVenemans32, Marianne Vestergaard30,40, Fabian Walter4, Feige Wang30, and Jinyi Yang30\n1Kavli Institute for the Phys

In [None]:
# Must be an OpenAI model that supports functions
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

document_transformer = create_metadata_tagger(metadata_schema=schema, llm=llm)

In [68]:
pdfs_to_docs()

0.009539


[Document(page_content='Detection of stellar light from quasar host galaxies at\nredshifts above 6\nXuheng Ding1,2,*, Masafusa Onoue3,1,4,*, John D. Silverman1,2,5, Yoshiki Matsuoka6,\nTakuma Izumi7,8, Michael A. Strauss9, Knud Jahnke4, Camryn L. Phillips9, Junyao Li10,\nMarta Volonteri11, Zoltan Haiman12,13, Irham Taufik Andika14,15, Kentaro Aoki16, Shunsuke\nBaba17, Rebekka Bieri18, Sarah E. I. Bosman4, Connor Bottrell1,2, Anna-Christina Eilers19,\nSeiji Fujimoto20, Melanie Habouzit21,4, Masatoshi Imanishi7,22, Kohei Inayoshi3, Kazushi\nIwasawa23,24, Nobunari Kashikawa5,25, Toshihiro Kawaguchi26, Kotaro Kohno27,25,\nChien-Hsiu Lee28, Alessandro Lupi29, Jianwei Lyu30, Tohru Nagao6, Roderik Overzier31,\nJan-Torge Schindler32, Malte Schramm33, Kazuhiro Shimasaku5,25, Yoshiki Toba7,34, Benny\nTrakhtenbrot35, Maxime Trebitsch36, Tommaso Treu37, Hideki Umehata38,39, Bram P.\nVenemans32, Marianne Vestergaard30,40, Fabian Walter4, Feige Wang30, and Jinyi Yang30\n1Kavli Institute for the Phys

In [65]:
vs = VectorStore()

In [66]:
vs.connect_to_vs_by_collection_name()

AttributeError: 'VectorStore' object has no attribute 'connect_to_vs_by_collection_name'

In [59]:
vs.delete_collection()

In [54]:
import pandas as pd

text_df = pd.read_csv('../data/text/pdf_texts.csv', index_col=0)
text_df

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,title,content
0,pdf3.pdf,Detection of stellar light from quasar host ga...
1,pdf2.pdf,Quasars and the\nIntergalactic Medium at\nCosm...
2,pdf1.pdf,"Draft version August 22, 2023\nTypeset using L..."


In [62]:
vs.add_texts(text_df.content)

AttributeError: 'VectorStore' object has no attribute 'store'

In [None]:
vs.