In [10]:
import os 

import seaborn as sns 

from tqdm import tqdm
from langchain_cohere import ChatCohere
from langchain_core.messages import HumanMessage
from langchain_community.vectorstores import FAISS
from unstructured.partition.pdf import partition_pdf
from langchain_community.embeddings import HuggingFaceEmbeddings
import time

In [2]:
def extract_text(paths) : 
    """Partition every PDF in ``paths`` and return the text of all resulting elements.

    Each file is handed to ``partition_pdf`` with ``by_title`` chunking, table-structure
    inference and image extraction switched on (``extract_image_block_output_dir`` is
    set to ``'out'``).

    Args:
        paths: Sized collection of PDF file paths; ``len(paths)`` feeds the progress bar.

    Returns:
        A flat list holding the ``.text`` of every element, ordered by file and then
        by element within the file.
    """
    collected = []

    progress = tqdm(paths , total = len(paths))

    for pdf_path in progress : 

        elements = partition_pdf(
            filename = pdf_path ,
            extract_images_in_pdf = True ,
            infer_table_structure = True ,
            chunking_strategy = 'by_title' ,
            max_characters = 4000 ,
            new_after_n_chars = 3800 ,
            combine_text_under_n_chars = 2000 ,
            extract_image_block_output_dir = 'out' ,
        )

        # Only the text of each element is kept.
        for element in elements : 
            collected.append(element.text)

    return collected

In [3]:
def norm(lis) : 
    """Min-max scale a collection of numbers into the range [0, 1].

    Args:
        lis: Iterable of numbers.

    Returns:
        A new list in which the smallest input maps to 0.0 and the largest to 1.0.
        An empty input yields an empty list. When every value is identical the
        range is zero, so every value maps to 0.0 instead of raising
        ZeroDivisionError.
    """
    values = list(lis)

    if not values : 
        return []

    # The bounds are loop-invariant, so compute them once rather than per element.
    low = min(values)
    span = max(values) - low

    if span == 0 : 
        return [0.0 for _ in values]

    return [
        (val - low) / span
        for val 
        in values
    ]

In [4]:
# Collect the PDFs to process. os.listdir returns entries in arbitrary order and
# includes every directory entry, so sort for a reproducible run and keep only
# .pdf files (any other entry would otherwise be handed to partition_pdf).
paths = [
    f'PDFs/{pdf}'
    for pdf 
    in sorted(os.listdir('PDFs'))
    if pdf.lower().endswith('.pdf')
]

In [None]:
# Structure-aware chunks: one string per element returned by extract_text.
unstructured_texts = extract_text(paths = paths)

In [6]:
# Baseline chunking: ignore document structure, join all extracted text with
# single spaces and cut the result into fixed-width character windows.
CHUNK_SIZE = 512  # characters per chunk; the final chunk may be shorter

# Kept under its own name so normal_text is only ever a list of chunks.
joined_text = ' '.join(unstructured_texts)

normal_text = [
    joined_text[start : start + CHUNK_SIZE]
    for start 
    in range(0 , len(joined_text) , CHUNK_SIZE)
]

In [11]:
# Read the Cohere key from the environment instead of embedding the secret in the
# notebook; a missing COHERE_API_KEY fails immediately with a KeyError.
# NOTE: the key that used to be hard-coded on this line has been exposed and
# should be revoked.
llm = ChatCohere(cohere_api_key = os.environ['COHERE_API_KEY'])

In [13]:
def get_answer(text , sleep) : 
    """Send ``text`` to the chat model as a single human message and return the reply text.

    Args:
        text: Prompt to send.
        sleep: Seconds to pause after the call has returned, before handing back
            the reply (callers use it to space out successive requests).

    Returns:
        The ``content`` attribute of the model's response.
    """
    messages = [HumanMessage(content = text)]
    response = llm.invoke(messages)
    reply = response.content

    # Pause only once the reply is in hand.
    time.sleep(sleep)

    return reply

In [27]:
def _read_prompt(path) : 
    """Return the full contents of the prompt template at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as the
    text has been read, rather than being left open.
    """
    with open(path) as prompt_file : 
        return prompt_file.read()


# Templates that are later filled in with str.format.
context_testing_prompt = _read_prompt('Assets/Prompts/Context_Testing.txt')
question_asking_prompt = _read_prompt('Assets/Prompts/Question_Asking.txt')
question_checking_prompt = _read_prompt('Assets/Prompts/Question_Checking.txt')

In [None]:
def _context_scores(chunks) : 
    """Score every chunk with the context-testing prompt and return the scores as floats.

    One request is made per chunk, in input order. Each reply is converted with
    ``float``, so a reply that is not a bare number raises ValueError.
    """
    scores = []

    for chunk in tqdm(chunks , total = len(chunks)) : 
        # The 2 second pause accompanies the original note: 40 API Calls per Minute
        reply = get_answer(
            context_testing_prompt.format(chunk) , 
            sleep = 2
        )
        scores.append(float(reply))

    return scores


uncstructured_context_scores = _context_scores(unstructured_texts)

normal_text_context_scores = _context_scores(normal_text)

In [None]:
def _ask_questions(chunks) : 
    """Return one model reply per chunk, produced with the question-asking prompt.

    Requests are issued in input order with a 2 second pause after each one.
    """
    replies = []

    for chunk in tqdm(chunks , total = len(chunks)) : 
        prompt = question_asking_prompt.format(chunk)
        replies.append(get_answer(prompt , sleep = 2))

    return replies


unstructured_questions = _ask_questions(unstructured_texts)

normal_questions = _ask_questions(normal_text)

In [None]:
def _question_scores(chunks , questions) : 
    """Score each (chunk, question) pair with the question-checking prompt.

    Pairs are formed with ``zip``, so iteration stops at the shorter of the two
    inputs; the progress bar total is ``len(chunks)``. Each reply is converted
    with ``float``, so a reply that is not a bare number raises ValueError.
    """
    scores = []
    pairs = zip(chunks , questions)

    for chunk , question in tqdm(pairs , total = len(chunks)) : 
        reply = get_answer(
            question_checking_prompt.format(chunk , question) , 
            sleep = 2
        )
        scores.append(float(reply))

    return scores


unstructured_question_scores = _question_scores(unstructured_texts , unstructured_questions)

normal_question_scores = _question_scores(normal_text , normal_questions)

In [None]:
# Structure-aware chunks: min-max scaled context scores, then question scores.
# Each series is scaled independently before it is plotted.
unstructured_context_curve = norm(uncstructured_context_scores)
sns.lineplot(unstructured_context_curve)

unstructured_question_curve = norm(unstructured_question_scores)
sns.lineplot(unstructured_question_curve)

In [None]:
# Fixed-width chunks: min-max scaled context scores, then question scores.
# Each series is scaled independently before it is plotted.
normal_context_curve = norm(normal_text_context_scores)
sns.lineplot(normal_context_curve)

normal_question_curve = norm(normal_question_scores)
sns.lineplot(normal_question_curve)

In [9]:
# Embedding model shared by both vector stores built below.
embeddings = HuggingFaceEmbeddings(model_name = 'all-MiniLM-L6-v2')

In [13]:
# One FAISS index per chunking strategy, both built with the same embedding model.
unstructured_vectorstore = FAISS.from_texts(
    texts = unstructured_texts , 
    embedding = embeddings , 
)
normal_vectorstore = FAISS.from_texts(
    texts = normal_text , 
    embedding = embeddings , 
)

In [14]:
# Persist both indexes to disk, each in its own folder.
unstructured_vectorstore.save_local(folder_path = 'unstructured_vc')
normal_vectorstore.save_local(folder_path = 'normal_vc')

AttributeError: module 'pdf2image' has no attribute '__version__'

Version-controlled requirements written to version_controlled_requirements.txt
