# Prepare environment

## Configure Jupyter

* Install libs

In [1]:
%%capture
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
# Graphing capabilities
%pip install plotly matplotlib
# ipython-sql extension, which supplies the %sql / %%sql magics
%pip install ipython-sql
# Easy UI
%pip install gradio

# Create embeddings

* Install libs
* Read the environment variables from the docker-compose file: https://github.com/frtu/jupyter-workbench/blob/master/docker/jupyter-llm/docker-compose.yml#L22

In [2]:
%%capture

# %pip installs into the environment of the running kernel.
%pip install --upgrade tiktoken
# Pinned to the release this notebook was executed with (see the `pip show`
# output below). Later langchain releases relocated the
# `langchain.document_loaders` / `langchain.indexes` imports used further down.
%pip install langchain==0.0.181

In [3]:
# Report which langchain release is installed in the kernel's environment.
%pip show langchain

Name: langchain
Version: 0.0.181
Summary: Building applications with LLMs through composability
Home-page: https://www.github.com/hwchase17/langchain
Author: 
Author-email: 
License: MIT
Location: /opt/conda/lib/python3.10/site-packages
Requires: aiohttp, async-timeout, dataclasses-json, numexpr, numpy, openapi-schema-pydantic, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


## Prepare data processing

In [4]:
def normalize_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines, tabs and repeated spaces all count as whitespace. Leading and
    trailing whitespace is removed as well, because ``str.split()`` without
    arguments yields no empty fields at either end.
    """
    words = text.split()
    return " ".join(words)

# Demo: line breaks and stray spaces in the sample collapse into single spaces.
normalize_text("""
Apple is a corporate structure
 that
 
 is famous
""")

'Apple is a corporate structure that is famous'

In [5]:
import tiktoken

# Embedding model parameters.
# NOTE(review): embedding_model and max_tokens are not referenced by any cell
# visible here — confirm they are used elsewhere or drop them.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

# Tokenizer for the encoding named above; used by get_token below.
encoding = tiktoken.get_encoding(embedding_encoding)

def get_token(text):
    """Tokenize ``text`` after normalizing its whitespace.

    The text is first passed through ``normalize_text`` and the result is
    encoded with the module-level ``encoding``. Returns whatever
    ``encoding.encode`` produces; its length is the token count.
    """
    return encoding.encode(normalize_text(text))

# Demo: tokenize the same sample text used for normalize_text above.
token = get_token("""
Apple is a corporate structure
 that
 
 is famous
""")

# Number of tokens produced for the sample.
len(token)

8

# Document manipulation

## Loading PDF

In [6]:
%%capture

# %pip installs into the environment of the running kernel.
# The unstructured library provides open-source components for pre-processing
# text documents such as PDFs, HTML and Word documents.
%pip install unstructured
# The requirement is quoted so the shell cannot treat the square brackets as a
# glob pattern.
%pip install "unstructured[local-inference]"
# For PDF
%pip install pdf2image

# The AI-native open-source embedding database
%pip install chromadb
# Needed for imports
%pip install azure-core

In [7]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator

In [8]:
# Meta Q4 2022 earnings release; replace with any PDF.
# -nc skips the download when the file already exists in docs/, so re-running
# the cell does not fetch it again; -P saves straight into docs/ instead of
# downloading to the working directory and moving the file afterwards.
!mkdir -p docs
!wget -nc -P docs https://s21.q4cdn.com/399680738/files/doc_financials/2022/q4/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf

--2023-05-28 15:50:04--  https://s21.q4cdn.com/399680738/files/doc_financials/2022/q4/Meta-12.31.2022-Exhibit-99.1-FINAL.pdf
Resolving s21.q4cdn.com (s21.q4cdn.com)... 139.99.62.128, 2402:1f00:8001:580::
Connecting to s21.q4cdn.com (s21.q4cdn.com)|139.99.62.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 185815 (181K) [application/pdf]
Saving to: ‘Meta-12.31.2022-Exhibit-99.1-FINAL.pdf’


2023-05-28 15:50:05 (1.45 MB/s) - ‘Meta-12.31.2022-Exhibit-99.1-FINAL.pdf’ saved [185815/185815]



In [10]:
import os

# Folder holding the PDFs to index (filled by the download cell above).
text_folder = 'docs'

# os.listdir returns every directory entry in arbitrary order, including
# non-PDF items such as .ipynb_checkpoints. Keep only PDFs and sort them so the
# loader list is deterministic and never points at a file the PDF loader
# cannot parse.
pdf_names = sorted(fn for fn in os.listdir(text_folder) if fn.lower().endswith('.pdf'))
loaders = [UnstructuredPDFLoader(os.path.join(text_folder, fn)) for fn in pdf_names]

In [None]:
# Build the vector-store index from every loader prepared above.
# NOTE(review): presumably needs the API credentials supplied through the
# docker-compose environment referenced earlier in this notebook — confirm.
index_creator = VectorstoreIndexCreator()
index = index_creator.from_loaders(loaders)

In [None]:
# Ask the index a factual question about the loaded document; the answer is
# displayed as the cell output.
query = "How much revenue did Meta make in 2022?"
index.query(query)

In [None]:
# Ask an open-ended question about the loaded document; this reuses (and
# overwrites) the `query` variable.
query = "What are meta's biggest risks?"
index.query(query)