In [7]:
import openai
import os
import pandas as pd
import numpy as np
import json
import tiktoken
import psycopg2
import ast
import pgvector
import math
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector
from dotenv import load_dotenv, find_dotenv


In [8]:
dotenv_path = os.path.abspath('../../../.env')
_ = load_dotenv(dotenv_path)
OPENAI_API_KEY  = os.environ['OPENAI_API_KEY']

In [9]:
client = openai.OpenAI(api_key = OPENAI_API_KEY)


In [10]:
from pypdf import PdfReader

def read_pdf(file_path):
    pdf_reader = PdfReader(file_path)
    text = ""

    for page in pdf_reader.pages:
        extracted_text = page.extract_text()
        if extracted_text:  # Check if text is extracted successfully
            text += extracted_text + "\n"  # Append text of each page

    return text


In [11]:
from os import listdir
from os.path import isfile, join

pdf_dirpath = '../../data-ingest/data/docs/'
file_l = [f for f in listdir(pdf_dirpath) if isfile(join(pdf_dirpath, f))]
file_l

['pdf3.pdf', 'pdf2.pdf', 'pdf1.pdf']

In [12]:
content_l = []
for filename in file_l:
    filepath = os.path.join(pdf_dirpath, filename)
    pdf_text = read_pdf(filepath)
    content_l.append(pdf_text)
pdf_df = pd.DataFrame({'title': file_l, 'content': content_l})

In [13]:
pdf_df

Unnamed: 0,title,content
0,pdf3.pdf,Detection of stellar light from quasar host ga...
1,pdf2.pdf,Quasars and the\nIntergalactic Medium at\nCosm...
2,pdf1.pdf,"Draft version August 22, 2023\nTypeset using L..."


In [14]:
# Helper functions to help us create the embeddings

# Helper func: calculate number of tokens
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    if not string:
        return 0
    # Returns the number of tokens in a text string
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

# Helper function: calculate length of essay
def get_essay_length(essay):
    word_list = essay.split()
    num_words = len(word_list)
    return num_words

# Helper function: calculate cost of embedding num_tokens
# Assumes we're using the text-embedding-ada-002 model
# See https://openai.com/pricing
def get_embedding_cost(num_tokens):
    return num_tokens/1000*0.0001

# Helper function: calculate total cost of embedding all content in the dataframe
def get_total_embeddings_cost(df):
    total_tokens = 0
    for i in range(len(df.index)):
        text = df['content'][i]
        token_len = num_tokens_from_string(text)
        total_tokens = total_tokens + token_len
    total_cost = get_embedding_cost(total_tokens)
    return total_cost



In [15]:
get_total_embeddings_cost(pdf_df)

0.009539

In [16]:

# Create new list with small content chunks to not hit max token limits
# Note: the maximum number of tokens for a single request is 8191
# https://openai.com/docs/api-reference/requests

# Split up the text into token sizes of around 512 tokens
def pdf_chunker(df):
    new_list = []
    for i in range(len(df.index)):
        text = df['content'][i]
        token_len = num_tokens_from_string(text)
        if token_len <= 512:
            new_list.append([df['title'][i], df['content'][i], token_len])
        else:
            # add content to the new list in chunks
            start = 0
            ideal_token_size = 512
            # 1 token ~ 3/4 of a word
            ideal_size = int(ideal_token_size // (4/3))
            end = ideal_size
            #split text by spaces into words
            words = text.split()

            #remove empty spaces
            words = [x for x in words if x != ' ']

            total_words = len(words)
            
            #calculate iterations
            chunks = total_words // ideal_size
            if total_words % ideal_size != 0:
                chunks += 1
            
            new_content = []
            for j in range(chunks):
                if end > total_words:
                    end = total_words
                new_content = words[start:end]
                new_content_string = ' '.join(new_content)
                new_content_token_len = num_tokens_from_string(new_content_string)
                if new_content_token_len > 0:
                    new_list.append([df['title'][i], new_content_string,  new_content_token_len])
                start += ideal_size
                end += ideal_size
    return new_list


In [17]:
# Helper function: get embeddings for a text
def get_embeddings(text):
   response = openai.Embedding.create(
       model="text-embedding-ada-002",
       input = text.replace("\n"," ")
   )
   embedding = response['data'][0]['embedding']
   return embedding


In [18]:
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [19]:
chunked_pdf_l = pdf_chunker(pdf_df)

In [20]:
# Create embeddings for each piece of content
for i in range(len(chunked_pdf_l)):
   text = chunked_pdf_l[i][1]
   embedding = get_embedding(text)
   chunked_pdf_l[i].append(embedding)


In [21]:
# Create a new dataframe from the list
embedding_df = pd.DataFrame(chunked_pdf_l, columns=['title', 'content', 'tokens', 'embeddings'])

In [22]:
embedding_df

Unnamed: 0,title,content,tokens,embeddings
0,pdf3.pdf,Detection of stellar light from quasar host ga...,1017,"[-0.008913620375096798, -0.003685270668938756,..."
1,pdf3.pdf,"der Universit ¨at Heidelberg (ITA), Albert-Ueb...",867,"[-0.027547599747776985, 0.016496682539582253, ..."
2,pdf3.pdf,that the host galaxies are massive (stellar ma...,662,"[-0.050405103713274, 0.011267776601016521, 0.0..."
3,pdf3.pdf,profile13whose parameters include the position...,625,"[-0.022945208474993706, 0.004072774201631546, ..."
4,pdf3.pdf,"range [−1,−0.3], the stellar age is within the...",635,"[-0.019858600571751595, 0.015721391886472702, ..."
...,...,...,...,...
128,pdf1.pdf,"in Do et al. (2019), with a peak flux density ...",544,"[-0.014840332791209221, 0.00358420517295599, 0..."
129,pdf1.pdf,light curves. varying values of α. See Figure ...,683,"[-0.008777388371527195, 0.013306329026818275, ..."
130,pdf1.pdf,1.028±0.044 1.044±0.029 1.013±0.037 0.995±0.03...,2237,"[-0.015937522053718567, 0.023344896733760834, ..."
131,pdf1.pdf,... 0.005±0.003 0.003±0.002 18.0 0.001±0.001 0...,1186,"[-0.012408621609210968, -0.006417050026357174,..."


In [25]:
embedding_filepath = '../../data-ingest/data/embeddings/test_embedding.csv'
embedding_df.to_csv(embedding_filepath)

In [24]:
embedding_df

Unnamed: 0,title,content,tokens,embeddings
0,pdf3.pdf,Detection of stellar light from quasar host ga...,1017,"[-0.008913620375096798, -0.003685270668938756,..."
1,pdf3.pdf,"der Universit ¨at Heidelberg (ITA), Albert-Ueb...",867,"[-0.027547599747776985, 0.016496682539582253, ..."
2,pdf3.pdf,that the host galaxies are massive (stellar ma...,662,"[-0.050405103713274, 0.011267776601016521, 0.0..."
3,pdf3.pdf,profile13whose parameters include the position...,625,"[-0.022945208474993706, 0.004072774201631546, ..."
4,pdf3.pdf,"range [−1,−0.3], the stellar age is within the...",635,"[-0.019858600571751595, 0.015721391886472702, ..."
...,...,...,...,...
128,pdf1.pdf,"in Do et al. (2019), with a peak flux density ...",544,"[-0.014840332791209221, 0.00358420517295599, 0..."
129,pdf1.pdf,light curves. varying values of α. See Figure ...,683,"[-0.008777388371527195, 0.013306329026818275, ..."
130,pdf1.pdf,1.028±0.044 1.044±0.029 1.013±0.037 0.995±0.03...,2237,"[-0.015937522053718567, 0.023344896733760834, ..."
131,pdf1.pdf,... 0.005±0.003 0.003±0.002 18.0 0.001±0.001 0...,1186,"[-0.012408621609210968, -0.006417050026357174,..."


In [27]:
import os
from dotenv import load_dotenv, find_dotenv
dotenv_path = os.path.abspath('../../../.env')
_ = load_dotenv(dotenv_path)
OPENAI_API_KEY  = os.environ['OPENAI_API_KEY']
PG_DB_PW = os.environ['PG_DB_PW']
host= os.environ['HOST']
port= os.environ['PORT']
user= os.environ['USER']
password= os.environ['PG_DB_PW']
dbname= os.environ['DBNAME']

In [29]:
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from pgvector.sqlalchemy import Vector
import numpy as np

Base = declarative_base()
N_DIM = 1536

class TextEmbedding(Base):
    __tablename__ = 'text_embeddings'
    id = Column(Integer, primary_key=True, autoincrement=True)
    content = Column(String)
    embedding = Vector(N_DIM)


  Base = declarative_base()


In [30]:
pg_db_url = f"postgresql+psycopg2://{user}:{PG_DB_PW}@{host}:{port}/{dbname}"

# Connect to PostgreSQL
engine = create_engine(pg_db_url)
Base.metadata.create_all(engine)

# Create a session
Session = sessionmaker(bind=engine)
session = Session()

In [31]:
def insert_embeddings(embeddings):
    for embedding in embeddings:
        new_embedding = TextEmbedding(embedding=embedding)
        session.add(new_embedding)
    session.commit()

In [35]:
insert_embeddings(embedding_df['embeddings'])

0      [-0.008913620375096798, -0.003685270668938756,...
1      [-0.027547599747776985, 0.016496682539582253, ...
2      [-0.050405103713274, 0.011267776601016521, 0.0...
3      [-0.022945208474993706, 0.004072774201631546, ...
4      [-0.019858600571751595, 0.015721391886472702, ...
                             ...                        
128    [-0.014840332791209221, 0.00358420517295599, 0...
129    [-0.008777388371527195, 0.013306329026818275, ...
130    [-0.015937522053718567, 0.023344896733760834, ...
131    [-0.012408621609210968, -0.006417050026357174,...
132    [-0.018339229747653008, -0.0001886128593469038...
Name: embeddings, Length: 133, dtype: object

In [None]:
# Setup database
db = SQLDatabase.from_uri(
    f"postgresql+psycopg2://{user}:{PG_DB_PW}@{host}:{port}/{dbname}",
)