# Install Libraries

In [14]:
!pip install langchain --quiet
!pip install pdfminer.six
!pip install pillow-heif
!pip install opencv-python-headless
!pip install pikepdf unstructured_inference pytesseract
!pip install -U langchain-openai
!pip install openai --quiet
!pip install pdf2image --quiet
!pip install tiktoken --quiet
!pip install unstructured --quiet
!pip install python-dotenv



In [None]:
from langchain.document_loaders import OnlinePDFLoader

# Network note for S2 workspaces: outbound access must be allowed before this cell
# can download anything. The original note asks for a firewall exception for
# https://contentserver.adobe.com.
# NOTE(review): the PDF below is hosted on www.sjsu.edu — confirm which host
# actually needs to be allow-listed.
PDF_URL = "https://www.sjsu.edu/writingcenter/docs/handouts/Research%20Papers%20in%20the%20Sciences.pdf"

# Fetch the PDF and load it; `data` is the list of loaded documents that the
# following cells inspect and split.
loader = OnlinePDFLoader(PDF_URL)
data = loader.load()

: 

# Check how many documents and characters are in the PDF

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sum the characters of every loaded document instead of indexing data[0]:
# this does not raise on an empty load and does not ignore extra documents.
# For a single document the printed number is the same as before.
total_chars = sum(len(doc.page_content) for doc in data)

print(f"You have {len(data)} document(s) in your data")
print(f"There are {total_chars} characters in your document")

: 

# Now let's split the data into chunks before we turn them into embeddings

In [None]:
# Split the loaded document(s) into chunks of at most chunk_size, with no
# overlap requested between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

# Report how many chunks the splitter produced. These are chunks, not PDF pages,
# so the message says "chunks".
print(f"You have {len(texts)} chunks")

# Let's convert these chunks into embeddings and then store them in a SingleStore table

## Let's create the table first

In [None]:
%%sql
-- Select the database, drop any previous my_book table, and recreate it empty.
-- WARNING. This discards all rows already stored in my_book.
USE reader_data;
DROP TABLE IF EXISTS my_book;
-- One row per text chunk. The insert cell fills id with the chunk index,
-- text with the chunk content, and embedding with the packed vector.
CREATE TABLE IF NOT EXISTS my_book (
    id INT PRIMARY KEY, 
    text TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci, 
    embedding BLOB
);

## Connect to OpenAI API

In [30]:
import os

import openai
from dotenv import load_dotenv

# Read the local .env file so that any variables it defines are available
# through the process environment.
load_dotenv('.env')

# Look up the OpenAI API key in the environment.
# This is None when OPENAI_API_KEY is not set; no fallback value is defined here.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [None]:
# Debug aid: show the attribute dictionary of the first chunk.
print(vars(texts[0]))

In [None]:
# NOTE(review): the wildcard import is what brings create_engine into scope here;
# prefer an explicit import once it is confirmed no other cell relies on the
# other names it exposes.
from sqlalchemy import *

# NOTE(review): connection_url is not defined in any visible cell — presumably
# it is provided by the hosting notebook environment; confirm before running elsewhere.
# Build the engine, then open the single connection that the later insert cell
# uses and eventually closes.
db_connection = create_engine(connection_url)
sql_connection = db_connection.connect()

In [None]:
from langchain_openai import OpenAIEmbeddings

# Create the embeddings client used to turn text chunks into vectors,
# authenticating with the key read from the environment.
embedder = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Now let's add the embeddings to the my_book table. We truncate the table first so that re-running this cell starts from an empty table instead of colliding with rows from a previous run.

In [None]:
import json

from sqlalchemy import text

# Parameterized INSERT: values are bound by SQLAlchemy rather than formatted
# into the SQL string. JSON_ARRAY_PACK_F32 receives the embedding as JSON array
# text and produces the value stored in the embedding column.
stmt = """
        INSERT INTO my_book (
            id,
            text,
            embedding
        )
        VALUES (
            :id,
            :text,
            JSON_ARRAY_PACK_F32(:embedding)
        )
    """
insert_stmt = text(stmt)

# Extract the text content of every chunk once, so embeddings and rows are
# built from the same list in the same order.
chunk_texts = [str(document.page_content) for document in texts]

# Embed all chunks with a single embed_documents call instead of one call per
# chunk. Doing this before touching the table means a failed API call leaves
# my_book untouched.
embeddings = embedder.embed_documents(chunk_texts) if chunk_texts else []

try:
    # Run the writes inside an explicit transaction: it commits when the block
    # exits cleanly. Without this, SQLAlchemy 2.x rolls back uncommitted
    # INSERTs when the connection is closed, unless the driver autocommits.
    # NOTE(review): TRUNCATE may be committed implicitly by the server, in which
    # case an error in the loop would not restore the old rows — confirm.
    with sql_connection.begin():
        # Clear the my_book table so ids from a previous run cannot collide.
        sql_connection.execute(text("TRUNCATE TABLE my_book;"))

        # Chunk index doubles as the primary key.
        for i, (text_content, embedding) in enumerate(zip(chunk_texts, embeddings)):
            parameters = {
                'id': int(i),
                'text': text_content,
                # Serialize the vector explicitly as a JSON array.
                'embedding': json.dumps(embedding),
            }
            sql_connection.execute(insert_stmt, parameters)

    # Print a short summary rather than dumping every full vector into the
    # notebook output.
    dimensions = len(embeddings[0]) if embeddings else 0
    print(f"Inserted {len(embeddings)} row(s) with {dimensions}-dimensional embeddings")
finally:
    # Always release the connection, even if embedding or an insert failed.
    # Re-running this cell requires re-running the cell that opens sql_connection.
    sql_connection.close()

In [None]:
%%sql
-- Spot-check one stored row. JSON_ARRAY_UNPACK_F32 is the counterpart of the
-- JSON_ARRAY_PACK_F32 call used when the row was inserted.
-- The insert loop numbers chunks from 0, so id 5 is the sixth chunk. This
-- returns no rows if fewer than six chunks were stored.
USE reader_data;
SELECT JSON_ARRAY_UNPACK_F32(embedding), text
FROM my_book
WHERE id = 5;