# Load PDF content

In [5]:
from langchain.document_loaders.pdf import UnstructuredPDFLoader
from pprint import pprint

In [6]:
# Point the loader at the source PDF and read it. The result is kept in
# `pages`, which the next cell indexes as pages[0].
loader = UnstructuredPDFLoader("./data/book.pdf")
pages = loader.load()

In [7]:
# Persist the text of the first loaded document (only pages[0] is used) so
# it can be cleaned up in a separate step.
# NOTE(review): no explicit encoding is given, so the platform default is
# used here, while the LLM cells further down pass encoding='utf-8' —
# confirm whether this file should match.
content = pages[0].page_content
with open("./data/pdf_raw_output.txt", mode="w") as f:
    f.write(content)

In [8]:
def filter_lines(input_file, output_file):
    """Copy a text file, dropping lines that contain nothing but numbers.

    A line is dropped when every character in it is a digit or whitespace.
    That covers a lone number, a whitespace-separated run of numbers, and
    blank lines. Each kept line is stripped of surrounding whitespace, and
    the kept lines are written joined by single newlines, with no trailing
    newline.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to write. It is opened in 'w'
            mode, so any existing content is replaced.
    """
    def is_numeric_noise(text):
        # True when the line holds no character other than digits/whitespace.
        return all(ch.isdigit() or ch.isspace() for ch in text)

    kept = []
    with open(input_file, 'r') as source:
        for raw_line in source:
            if is_numeric_noise(raw_line):
                continue
            kept.append(raw_line.strip())

    with open(output_file, 'w') as sink:
        sink.write('\n'.join(kept))


# Clean the raw PDF text dump: read it, drop the number-only lines, and
# save the result to a second file in the same directory.
input_file_path = "./data/pdf_raw_output.txt"
output_file_path = "./data/pdf_processed.txt"
filter_lines(input_file=input_file_path, output_file=output_file_path)

# Process content manually

# Send to LLM

In [1]:
from tqdm import tqdm
import add_packages

from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic


In [2]:
# Input text for the LLM step.
file_path = '../data/llm_input.txt'

# Load the whole file into memory as UTF-8 text.
with open(file_path, mode='r', encoding='utf-8') as file:
    content = file.read()

# An empty line ('\n\n') delimits paragraphs. Trim each piece and keep only
# the ones that still contain text after trimming.
paragraphs = [
    chunk for chunk in map(str.strip, content.split('\n\n')) if chunk
]

# Prompt sent to the model once per paragraph. `{text}` is the only
# placeholder and is filled with the paragraph being rewritten.
# A trailing backslash inside the triple-quoted string joins that line with
# the next one, so no newline is emitted at those points (including right
# after the opening quotes and right after `{text}`).
# NOTE(review): the last bullet reads "has starts with" — likely a typo, left
# untouched here because the string is sent to the model verbatim.
# NOTE(review): the first example returns a normal paragraph unchanged, which
# looks at odds with the "condense" instruction — confirm this is intended.
template = """\
Condense/optimize the input text as best as possible while still retaining the \
core meaning of the content by following these steps:

- Remove pronouns (I, you, your, they, we, our, ...), suggestions/invitations/collaborative words (Let's, ...) as much as possible if necessary.
- Rewrite the content to make it as memorable as possible.
- Correct grammar if necessary.
- Change another word if that word is easier to remember and more popular.
- If the input text has starts with markdown heading syntax (#, ##, ### ...) \
or bullet points (-), return the input ("#", "##", "###", "-" included).
 
<examples>
<example>
Input: Normal Paragraph
Output: Normal Paragraph
</example>

<example>
Input: ## HeadingName
Output: ## HeadingName
</example>

<example>
Input: - BulletpointIdea
Output: - BulletpointIdea
</example>
</examples>

Here is the input:
{text}\
"""
# Wrap the raw string in a PromptTemplate so it can be composed into a chain.
prompt_template = PromptTemplate.from_template(template)

# Chat model used for the rewriting; temperature=0 is passed to its
# constructor.
model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
# Alternative backend, kept for quick switching:
# model = ChatAnthropic(model_name="claude-3-haiku-20240307")

# Pipe the formatted prompt into the model.
chain = prompt_template | model

output_file_path = '../data/llm_output.txt'
# The file is opened in append mode: output from earlier runs is kept and
# new results are added after it.
with open(output_file_path, 'a', encoding='utf-8') as f:
    # Iterate the paragraphs directly; tqdm reads the total from the list.
    for paragraph in tqdm(paragraphs):
        result = chain.invoke({"text": paragraph}).content
        # `result` is a single string, so write() is the right call;
        # writelines() would iterate it character by character.
        f.write(result + "\n\n")

100%|██████████| 49/49 [00:39<00:00,  1.24it/s]
