In [3]:
import os
import openai
import copy

from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI


embeddings = OpenAIEmbeddings()
llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name = llm_name, temperature = 0)

def get_completion(prompt, model = llm_name, temperature = 0):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(model = model,
                                            messages = messages,
                                            temperature = temperature)
    return response.choices[0].message['content']

# document folders
folder_names = ["articles", "patents", "lectures"]
# folder_names = ["articles", "lectures"]

In [None]:
# Converting Docx to Pdf for easier loading

import aspose.words as aw

for folder_name in folder_names:
    os.system(f"rm pdf_docs/{folder_name}/*.pdf")

    for file_name in os.listdir(os.path.join('docx_docs', folder_name)):
        try:
            a = aw.Document(os.path.join('docx_docs', folder_name, file_name))
            a.save(os.path.join('pdf_docs', folder_name, file_name.split('.')[0]+'.pdf'))
        except:
            print(f"Could not convert document {file_name} from {folder_name}")

# TODO: Investigate why The Moon's Rotation 2.docx cannot be converted
os.remove("pdf_docs/articles/The Moon's Rotation 2.pdf")


In [4]:
# Loading PDF documents

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

pages_dict = {}
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 10000, chunk_overlap = 1000)
for folder_name in folder_names:
    loader = PyPDFDirectoryLoader(path = f"pdf_docs/{folder_name}/")
    # TODO: Try other values for the text splitter chunk_size and chunk_overlap
    pages_dict[folder_name] = loader.load_and_split(text_splitter = text_splitter)

In [5]:
# Removing watermarks from the data

strings_to_remove = ['Evaluation Only. Created with Aspose.Words. Copyright 2003-2023 Aspose Pty \nLtd.',
                    'Evaluation Only. Created with Aspose.Words. Copyright 2003-2023 Aspose Pty Ltd.',
                    'Evaluation Only. Created with Aspose.Words. Copyright 2003-2023 Aspose Pty',
                    'Created with an evaluation copy of Aspose.Words. To discover the full versions \nof our APIs please visit: https://products.aspose.com/words/',
                    'Created with an evaluation copy of Aspose.Words. To discover the full versions',
                    'Created with an evaluation copy of Aspose.Words. To discover the full \nversions of our APIs please visit: https://products.aspose.com/words/',
                    'Created with an evaluation copy of Aspose.Words. To discover the full versions of our \nAPIs please visit: https://products.aspose.com/words/',
                    'of our APIs please visit: https://products.aspose.com/words/',
                    'Created with an evaluation copy of Aspose.Words. To discover the \nfull versions of our APIs please visit: \nhttps://products.aspose.com/words/Evaluation',
                    'Only. Created with Aspose.Words. Copyright 2003-2023 \nAspose Pty Ltd.',
                    'Created with an evaluation copy of Aspose.Words. To discover the',
                    'full versions of our APIs please visit: \nhttps://products.aspose.com/words/',
                    ' of our \nAPIs please visit: https://products.aspose.com/words/'
                    ]

for folder_name in folder_names:
    for i, page in enumerate(pages_dict[folder_name]):
        for string in strings_to_remove:
            page.page_content = page.page_content.replace(string, '')
        assert not 'Created with Aspose' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"
        assert not 'Aspose' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"
        assert not 'Evaluation Only' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"
        assert not 'Copyright' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"
        assert not 'https://products.aspose.com/words/' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"

        page.metadata['document_name'] = page.metadata['source'].split('/')[-1].split('.')[0]
        del page.metadata['source']

pages_dict_backup = copy.deepcopy(pages_dict)


In [44]:
# Metadata parsing

import re

metadata_dict = {}
for folder_name in folder_names:
    metadata_dict[folder_name] = {}
    metadata = open(f'metadata/{folder_name}_metadata.txt', 'r').readlines()
    column_names = metadata[0].split(' ')
    for datapoint in metadata[1:]:
        split = datapoint.split('\t')
        length = split[2]
        x = split[1]
        namedate = split[0]
        name = namedate[:-10]
        date = re.search('\d+-\d+-\d+', namedate).group()
        metadata_dict[folder_name][name] = {'document_name': name, 
                                            'date': date, 
                                            f'{column_names[2]}': x}

In [45]:
# Refresh pages

pages_dict = copy.deepcopy(pages_dict_backup)

In [64]:
# Adding metadata

for folder_name in folder_names:
    for datapoint in metadata_dict[folder_name].values():
        pages_found = False
        for i, page in enumerate(pages_dict[folder_name]):
            if page.metadata['document_name'] == datapoint['document_name']:
                pages_found = True
                del page.metadata['document_name']
                page.metadata.update(datapoint)
                page.metadata.update({'document_type': folder_name[:-1]})
                print(i, page.metadata['document_name'])
        if not pages_found:
            # TODO: Investigate why some files are not found
            print(f"Didn't find pages for {datapoint['document_name']}")

332 A New System of Alternate Current Motors and Transformers
333 A New System of Alternate Current Motors and Transformers
334 A New System of Alternate Current Motors and Transformers
335 A New System of Alternate Current Motors and Transformers
336 A New System of Alternate Current Motors and Transformers
337 A New System of Alternate Current Motors and Transformers
338 A New System of Alternate Current Motors and Transformers
339 A New System of Alternate Current Motors and Transformers
340 A New System of Alternate Current Motors and Transformers
341 A New System of Alternate Current Motors and Transformers
342 A New System of Alternate Current Motors and Transformers
Didn't find pages for Inventor Tesla Replies to Dr. Louis Duncan, Explaining His Alternating Current Motor
612 Alternate Current Motors
Didn't find pages for Mr. Nikola Tesla on Alternating Current Motors
Didn't find pages for Mr. Nikola Tesla on the Ganz Alternating-Current Motors
585 The Losses Due to Hysteresis in

In [65]:
# Vectorstore creation

from langchain.vectorstores import Chroma

all_docs = sum([pages_dict[folder_name] for folder_name in folder_names], [])
vectorstore = Chroma.from_documents(
    documents = all_docs,
    embedding = embeddings,
    persist_directory="./chroma_db"
)

In [66]:
# Vectorstore loading

vectorstore = Chroma(persist_directory="./chroma_db", embedding_function = embeddings)

In [80]:
# all_docs
for doc in all_docs:
    # print(doc.metadata)
    if 'document_type' in doc.metadata.keys() and doc.metadata['document_type'] == 'lecture':
        print(doc)
        break

page_content='February 24th, 1893 1 On Light and Other High Frequency Phenomena Delivered before the Franklin Institute, Philadelphia, February 1893, and before the \nNational Electric Light Association, St. Louis, March 1893.Introductory — Some Thoughts on the Eye When we look at the world around us, on Nature, we are impressed with its beauty and \ngrandeur. Each thing we perceive, though it may be vanishingly small, is in itself a world, that \nis, like the whole of the universe, matter and force governed by law, — a world, the \ncontemplation of which fills us with feelings of wonder and irresistibly urges us to ceaseless \nthought and inquiry. But in all this vast world, of all objects our senses reveal to us, the most \nmarvellous, the most appealing to our imagination, appears no doubt a highly developed \norganism, a thinking being. If there is anything fitted to make us admire Nature’s handiwork, it \nis certainly this inconceivable structure, which performs its innumerable mo

In [37]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

metadata_field_info = [
    AttributeInfo(
        name="document_name",
        description="Name of the source document",
        type="string",
    ),
    AttributeInfo(
        name="date",
        description="When the document was created",
        type="string",
    ),
    AttributeInfo(
        name="publisher",
        description="Name of the journal that published the document",
        type="string",
    ),
    AttributeInfo(
        name="registration_number",
        description="Registration number of the patent",
        type="string",
    ),
    AttributeInfo(
        name="source",
        description="Source of the lecture document",
        type="string",
    ),
    AttributeInfo(
        name="document_type",
        description="Type of the document - options are lecture, article, patent",
        type="string",
    )
]

document_content_description = "Document content"

retriever = SelfQueryRetriever.from_llm(
    llm = llm, 
    vectorstore = vectorstore,
    document_contents = document_content_description,
    metadata_field_info = metadata_field_info, 
    verbose=True
)

In [73]:
docs = retriever.get_relevant_documents("What lectures did Nikola Tesla give")



query='Nikola Tesla' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='document_type', value='lecture') limit=None


In [None]:
# Basic Retriever example

from langchain.chains import RetrievalQA

question = "How many poles shoul my electromotor have, and what should I do if I have the wrong number?"
qa_chain = RetrievalQA.from_chain_type(llm = llm, retriever = retriever)
print(qa_chain({"query": question}))

In [83]:
# Retriever with memory example

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

memory = ConversationBufferMemory(memory_key = "chat_history", return_messages = True)
retriever = vectorstore.as_retriever()
qa = ConversationalRetrievalChain.from_llm(llm = llm, retriever = retriever, memory = memory)
question = input()
print(f"Question: {question}")
print(f"Answer: {qa({'question': question})['answer']}")
question = input()
print(f"Question: {question}")
print(f"Answer: {qa({'question': question})['answer']}")

Question: Did Nikola Tesla teach about light?
Answer: Yes, Nikola Tesla did discuss light in his lecture. He mentioned the electromagnetic theory of light and expressed his belief that electromagnetic waves, unless they had the frequency of true light waves, could not produce luminous effects. However, he believed that electrostatic waves could excite luminous radiation.
Question: When did he give that lecture and where?
Answer: Nikola Tesla gave the lecture discussing light at the Ellicot Club in Buffalo on January 12, 1897.
