# Load PDF content

In [1]:
from langchain.document_loaders.pdf import UnstructuredPDFLoader
from pprint import pprint

In [3]:
# Build a loader for the source PDF and run the extraction.
# NOTE(review): only pages[0] is read further down, so the loader presumably
# returns the whole document as a single element - confirm against the
# loader's default mode.
loader = UnstructuredPDFLoader("./data/book.pdf")
pages = loader.load()

In [7]:
# Preview the extracted text straight from the loader result. The bare name
# `content` is only assigned in a later cell, so referencing it here raises
# NameError on a fresh kernel (Restart & Run All). The slice keeps the cell
# output small instead of dumping the entire document.
pages[0].page_content[:1000]



In [9]:
# Keep the text of the first loaded element and dump it to disk unchanged so
# the raw extraction can be inspected and post-processed from a file.
content = pages[0].page_content
with open("./data/pdf_raw_output.txt", mode="w") as raw_file:
    raw_file.write(content)

In [10]:
def filter_lines(input_file, output_file, encoding=None):
    """Copy a text file, dropping lines that carry only numbers.

    A line is dropped when every character in it is a digit or whitespace.
    That covers a lone number, a whitespace-separated list of numbers and,
    as a consequence of the same rule, blank / whitespace-only lines.
    Every kept line is stripped of leading and trailing whitespace, and the
    kept lines are written joined by a single newline (no trailing newline).

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to write. It may be the same path as
            ``input_file`` because the input is read completely before the
            output is opened.
        encoding: Text encoding used for both files. ``None`` (the default)
            lets ``open`` pick the platform default, which is what happens
            when no encoding is passed at all.
    """
    def only_digits_or_space(line):
        # all() is vacuously True for an empty string, so empty lines count too.
        return all(char.isdigit() or char.isspace() for char in line)

    # Read and filter everything first; the output is opened only afterwards
    # so that filtering a file in place stays safe.
    with open(input_file, 'r', encoding=encoding) as source:
        filtered_lines = [line.strip() for line in source
                          if not only_digits_or_space(line)]

    with open(output_file, 'w', encoding=encoding) as target:
        target.write('\n'.join(filtered_lines))


# Run the filter over the raw PDF dump and store the cleaned copy next to it.
input_file_path = "./data/pdf_raw_output.txt"
output_file_path = "./data/pdf_processed.txt"
filter_lines(input_file=input_file_path, output_file=output_file_path)

# Process content manually

# Send to LLM

In [1]:
from tqdm import tqdm
import config

from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI


In [2]:
# Text file that serves as the LLM input.
file_path = './data/llm_input.txt'

# Pull the whole file into memory as a single string.
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Two consecutive newlines separate paragraphs. Trim every chunk and keep only
# the ones that still contain text after trimming.
paragraphs = [chunk
              for chunk in map(str.strip, content.split('\n\n'))
              if chunk]

In [3]:
# Candidate model names: gpt-3.5-turbo-0125, gpt-3.5-turbo-instruct
# NOTE(review): `llm` is not referenced by the chain built below (the chain
# uses `model`); it is kept only in case another cell relies on it.
llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)

# Prompt applied to every paragraph; {text} is replaced by the paragraph.
template = """\
Condense/optimize the input text as best as possible while still retaining the \
core meaning of the content by following these steps:

Remove pronouns, adverbs, suggestions/invitations/collaborative words.

If the input text has starts with markdown heading syntax (#, ##, ### ...) \
or bullet points (-), return the input ("#", "##", "###", "-" included). 
Example:
## Heading -> ## Heading
- Bulletpoint -> - Bulletpoint

Here is the input:
{text}"""
prompt_template = PromptTemplate.from_template(template)

# Pipe the filled-in prompt into the chat model.
model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
chain = prompt_template | model

output_file_path = './data/llm_output.txt'
# Open with 'w' so that re-running the cell regenerates the file instead of
# appending a second copy of every paragraph to the previous run's output.
with open(output_file_path, 'w', encoding='utf-8') as f:
    for paragraph in tqdm(paragraphs):
        result = chain.invoke({"text": paragraph}).content
        # One condensed paragraph per block, separated by a blank line.
        f.write(result + "\n\n")

100%|██████████| 205/205 [03:25<00:00,  1.00s/it]
