In [1]:
import openai
import os
import pandas as pd
import numpy as np
import json
import tiktoken
import psycopg2
import ast
import pgvector
import math
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector
from dotenv import load_dotenv, find_dotenv


In [2]:
from dotenv import load_dotenv, find_dotenv

dotenv_path = os.path.abspath('../../../.env')
_ = load_dotenv(dotenv_path)
OPENAI_API_KEY  = os.environ['OPENAI_API_KEY']

In [3]:
client = openai.OpenAI(api_key = OPENAI_API_KEY)


In [4]:
from pypdf import PdfReader

def read_pdf(file_path):
    pdf_reader = PdfReader(file_path)
    text = ""

    for page in pdf_reader.pages:
        extracted_text = page.extract_text()
        if extracted_text:  # Check if text is extracted successfully
            text += extracted_text + "\n"  # Append text of each page

    return text


In [5]:
from os import listdir
from os.path import isfile, join

pdf_dirpath = '../../data-ingest/data/docs/'
file_l = [f for f in listdir(pdf_dirpath) if isfile(join(pdf_dirpath, f))]
file_l

['pdf3.pdf', 'pdf2.pdf', 'pdf1.pdf']

In [6]:
content_l = []
for filename in file_l:
    filepath = os.path.join(pdf_dirpath, filename)
    pdf_text = read_pdf(filepath)
    content_l.append(pdf_text)
pdf_df = pd.DataFrame({'title': file_l, 'content': content_l})

In [7]:
pdf_df

Unnamed: 0,title,content
0,pdf3.pdf,Detection of stellar light from quasar host ga...
1,pdf2.pdf,Quasars and the\nIntergalactic Medium at\nCosm...
2,pdf1.pdf,"Draft version August 22, 2023\nTypeset using L..."


In [8]:
pdf_df.to_csv('../data/text/pdf_texts.csv')

In [9]:
# Helper functions to help us create the embeddings

# Helper func: calculate number of tokens
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    if not string:
        return 0
    # Returns the number of tokens in a text string
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

# Helper function: calculate length of essay
def get_essay_length(essay):
    word_list = essay.split()
    num_words = len(word_list)
    return num_words

# Helper function: calculate cost of embedding num_tokens
# Assumes we're using the text-embedding-ada-002 model
# See https://openai.com/pricing
def get_embedding_cost(num_tokens):
    return num_tokens/1000*0.0001

# Helper function: calculate total cost of embedding all content in the dataframe
def get_total_embeddings_cost(df):
    total_tokens = 0
    for i in range(len(df.index)):
        text = df['content'][i]
        token_len = num_tokens_from_string(text)
        total_tokens = total_tokens + token_len
    total_cost = get_embedding_cost(total_tokens)
    return total_cost



In [10]:
get_total_embeddings_cost(pdf_df)

0.0096287

In [11]:

# Create new list with small content chunks to not hit max token limits
# Note: the maximum number of tokens for a single request is 8191
# https://openai.com/docs/api-reference/requests

# Split up the text into token sizes of around 512 tokens
def pdf_chunker(df):
    new_list = []
    for i in range(len(df.index)):
        text = df['content'][i]
        token_len = num_tokens_from_string(text)
        if token_len <= 512:
            new_list.append([df['title'][i], df['content'][i], token_len])
        else:
            # add content to the new list in chunks
            start = 0
            ideal_token_size = 512
            # 1 token ~ 3/4 of a word
            ideal_size = int(ideal_token_size // (4/3))
            end = ideal_size
            #split text by spaces into words
            words = text.split()

            #remove empty spaces
            words = [x for x in words if x != ' ']

            total_words = len(words)
            
            #calculate iterations
            chunks = total_words // ideal_size
            if total_words % ideal_size != 0:
                chunks += 1
            
            new_content = []
            for j in range(chunks):
                if end > total_words:
                    end = total_words
                new_content = words[start:end]
                new_content_string = ' '.join(new_content)
                new_content_token_len = num_tokens_from_string(new_content_string)
                if new_content_token_len > 0:
                    new_list.append([df['title'][i], new_content_string,  new_content_token_len])
                start += ideal_size
                end += ideal_size
    return new_list


In [12]:
# Helper function: get embeddings for a text
def get_embeddings(text):
   response = openai.Embedding.create(
       model="text-embedding-ada-002",
       input = text.replace("\n"," ")
   )
   embedding = response['data'][0]['embedding']
   return embedding


In [13]:
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [14]:
chunked_pdf_l = pdf_chunker(pdf_df)

In [15]:
chunked_pdf_l

[['pdf3.pdf',
  'Detection of stellar light from quasar host galaxies at redshifts above 6 Xuheng Ding1,2,*, Masafusa Onoue3,1,4,*, John D. Silverman1,2,5, Yoshiki Matsuoka6, Takuma Izumi7,8, Michael A. Strauss9, Knud Jahnke4, Camryn L. Phillips9, Junyao Li10, Marta Volonteri11, Zoltan Haiman12,13, Irham Taufik Andika14,15, Kentaro Aoki16, Shunsuke Baba17, Rebekka Bieri18, Sarah E. I. Bosman4, Connor Bottrell1,2, Anna-Christina Eilers19, Seiji Fujimoto20, Melanie Habouzit21,4, Masatoshi Imanishi7,22, Kohei Inayoshi3, Kazushi Iwasawa23,24, Nobunari Kashikawa5,25, Toshihiro Kawaguchi26, Kotaro Kohno27,25, Chien-Hsiu Lee28, Alessandro Lupi29, Jianwei Lyu30, Tohru Nagao6, Roderik Overzier31, Jan-Torge Schindler32, Malte Schramm33, Kazuhiro Shimasaku5,25, Yoshiki Toba7,34, Benny Trakhtenbrot35, Maxime Trebitsch36, Tommaso Treu37, Hideki Umehata38,39, Bram P. Venemans32, Marianne Vestergaard30,40, Fabian Walter4, Feige Wang30, and Jinyi Yang30 1Kavli Institute for the Physics and Mathematics

In [16]:
# Create embeddings for each piece of content
for i in range(len(chunked_pdf_l)):
   text = chunked_pdf_l[i][1]
   embedding = get_embedding(text)
   chunked_pdf_l[i].append(embedding)


In [19]:
# Create a new dataframe from the list
embedding_df = pd.DataFrame(chunked_pdf_l, columns=['title', 'content', 'tokens', 'embeddings'])

In [20]:
embedding_df

Unnamed: 0,title,content,tokens,embeddings
0,pdf3.pdf,Detection of stellar light from quasar host ga...,1037,"[-0.0059149968437850475, -0.006691891700029373..."
1,pdf3.pdf,"22Department of Astronomy, School of Science, ...",847,"[-0.022671174257993698, 0.023965613916516304, ..."
2,pdf3.pdf,"3.4× 1010 M⊙, respectively), compact, and disk...",634,"[-0.045199356973171234, 0.013202028349041939, ..."
3,pdf3.pdf,a point-like quasar and an extended host. The ...,603,"[-0.04193825647234917, 0.0002854476042557508, ..."
4,pdf3.pdf,"of young and relatively old stars, and thus to...",613,"[-0.01969219371676445, 0.014046339318156242, -..."
...,...,...,...,...
133,pdf1.pdf,"by known stars. In the remaining images above,...",925,"[0.002194813219830394, 0.0010228499304503202, ..."
134,pdf1.pdf,0.006 0.104 ± 0.012 0.142 ± 0.030 ... 0.006 ± ...,1023,"[-0.024804944172501564, 0.011800343170762062, ..."
135,pdf1.pdf,15.0 0.003 ± 0.002 0.009 ± 0.003 0.019 ± 0.003...,1061,"[-0.01906454563140869, -0.009697528555989265, ..."
136,pdf1.pdf,together. 24 Table 9.Applied confusion correct...,1107,"[-0.016655320301651955, -0.006832951679825783,..."


In [21]:
embedding_filepath = '../../data-ingest/data/embeddings/test_embedding.csv'
embedding_df.to_csv(embedding_filepath)

In [22]:
embedding_df

Unnamed: 0,title,content,tokens,embeddings
0,pdf3.pdf,Detection of stellar light from quasar host ga...,1037,"[-0.0059149968437850475, -0.006691891700029373..."
1,pdf3.pdf,"22Department of Astronomy, School of Science, ...",847,"[-0.022671174257993698, 0.023965613916516304, ..."
2,pdf3.pdf,"3.4× 1010 M⊙, respectively), compact, and disk...",634,"[-0.045199356973171234, 0.013202028349041939, ..."
3,pdf3.pdf,a point-like quasar and an extended host. The ...,603,"[-0.04193825647234917, 0.0002854476042557508, ..."
4,pdf3.pdf,"of young and relatively old stars, and thus to...",613,"[-0.01969219371676445, 0.014046339318156242, -..."
...,...,...,...,...
133,pdf1.pdf,"by known stars. In the remaining images above,...",925,"[0.002194813219830394, 0.0010228499304503202, ..."
134,pdf1.pdf,0.006 0.104 ± 0.012 0.142 ± 0.030 ... 0.006 ± ...,1023,"[-0.024804944172501564, 0.011800343170762062, ..."
135,pdf1.pdf,15.0 0.003 ± 0.002 0.009 ± 0.003 0.019 ± 0.003...,1061,"[-0.01906454563140869, -0.009697528555989265, ..."
136,pdf1.pdf,together. 24 Table 9.Applied confusion correct...,1107,"[-0.016655320301651955, -0.006832951679825783,..."


In [24]:
import os
from dotenv import load_dotenv, find_dotenv
dotenv_path = os.path.abspath('../../../.env')
_ = load_dotenv(dotenv_path)
OPENAI_API_KEY  = os.environ['OPENAI_API_KEY']
PG_DB_PW = os.environ['POSTGRES_DB_PASSWORD']
host= os.environ['POSTGRES_DB_HOST']
port= os.environ['POSTGRES_DB_PORT']
user= os.environ['POSTGRES_DB_USER']
password= os.environ['POSTGRES_DB_PASSWORD']
dbname= os.environ['POSTGRES_DB_DBNAME']

In [25]:
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from pgvector.sqlalchemy import Vector
import numpy as np

Base = declarative_base()
N_DIM = 1536

class TextEmbedding(Base):
    __tablename__ = 'text_embeddings'
    id = Column(Integer, primary_key=True, autoincrement=True)
    content = Column(String)
    embedding = Vector(N_DIM)


  Base = declarative_base()


In [26]:
pg_db_url = f"postgresql+psycopg2://{user}:{PG_DB_PW}@{host}:{port}/{dbname}"

# Connect to PostgreSQL
engine = create_engine(pg_db_url)
Base.metadata.create_all(engine)

# Create a session
Session = sessionmaker(bind=engine)
session = Session()

OperationalError: (psycopg2.OperationalError) connection to server at "192.168.5.158", port 5432 failed: Operation timed out
	Is the server running on that host and accepting TCP/IP connections?

(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [31]:
def insert_embeddings(embeddings):
    for embedding in embeddings:
        new_embedding = TextEmbedding(embedding=embedding)
        session.add(new_embedding)
    session.commit()

In [35]:
insert_embeddings(embedding_df['embeddings'])

0      [-0.008913620375096798, -0.003685270668938756,...
1      [-0.027547599747776985, 0.016496682539582253, ...
2      [-0.050405103713274, 0.011267776601016521, 0.0...
3      [-0.022945208474993706, 0.004072774201631546, ...
4      [-0.019858600571751595, 0.015721391886472702, ...
                             ...                        
128    [-0.014840332791209221, 0.00358420517295599, 0...
129    [-0.008777388371527195, 0.013306329026818275, ...
130    [-0.015937522053718567, 0.023344896733760834, ...
131    [-0.012408621609210968, -0.006417050026357174,...
132    [-0.018339229747653008, -0.0001886128593469038...
Name: embeddings, Length: 133, dtype: object

In [None]:
# Setup database
db = SQLDatabase.from_uri(
    f"postgresql+psycopg2://{user}:{PG_DB_PW}@{host}:{port}/{dbname}",
)