In [258]:
import os
import openai
import copy

from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI


embeddings = OpenAIEmbeddings()
llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name = llm_name, temperature = 0)

def get_completion(prompt, model = llm_name, temperature = 0):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(model = model,
                                            messages = messages,
                                            temperature = temperature)
    return response.choices[0].message['content']

# document folders
# folder_names = ["articles", "patents", "lectures"]
folder_names = ["articles", "lectures"]

In [None]:
# Converting Docx to Pdf for easier loading

import aspose.words as aw

for folder_name in folder_names:
    os.system(f"rm pdf_docs/{folder_name}/*.pdf")

    for file_name in os.listdir(os.path.join('docx_docs', folder_name)):
        try:
            a = aw.Document(os.path.join('docx_docs', folder_name, file_name))
            a.save(os.path.join('pdf_docs', folder_name, file_name.split('.')[0]+'.pdf'))
        except:
            print(f"Could not convert document {file_name} from {folder_name}")

# TODO: Investigate why The Moon's Rotation 2.docx cannot be converted
os.remove("pdf_docs/articles/The Moon's Rotation 2.pdf")


In [344]:
# Loading PDF documents

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

pages_dict = {}
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 10000, chunk_overlap = 1000)
for folder_name in folder_names:
    loader = PyPDFDirectoryLoader(path = f"pdf_docs/{folder_name}/")
    # TODO: Try other values for the text splitter chunk_size and chunk_overlap
    pages_dict[folder_name] = loader.load_and_split(text_splitter = text_splitter)

In [345]:
# Removing watermarks from the data

strings_to_remove = ['Evaluation Only. Created with Aspose.Words. Copyright 2003-2023 Aspose Pty \nLtd.',
                    'Evaluation Only. Created with Aspose.Words. Copyright 2003-2023 Aspose Pty Ltd.',
                    'Evaluation Only. Created with Aspose.Words. Copyright 2003-2023 Aspose Pty',
                    'Created with an evaluation copy of Aspose.Words. To discover the full versions \nof our APIs please visit: https://products.aspose.com/words/',
                    'Created with an evaluation copy of Aspose.Words. To discover the full versions',
                    'Created with an evaluation copy of Aspose.Words. To discover the full \nversions of our APIs please visit: https://products.aspose.com/words/',
                    'Created with an evaluation copy of Aspose.Words. To discover the full versions of our \nAPIs please visit: https://products.aspose.com/words/',
                    'of our APIs please visit: https://products.aspose.com/words/',
                    'Created with an evaluation copy of Aspose.Words. To discover the \nfull versions of our APIs please visit: \nhttps://products.aspose.com/words/Evaluation',
                    'Only. Created with Aspose.Words. Copyright 2003-2023 \nAspose Pty Ltd.',
                    'Created with an evaluation copy of Aspose.Words. To discover the',
                    'full versions of our APIs please visit: \nhttps://products.aspose.com/words/',
                    ' of our \nAPIs please visit: https://products.aspose.com/words/'
                    ]

for folder_name in folder_names:
    for i, page in enumerate(pages_dict[folder_name]):
        for string in strings_to_remove:
            page.page_content = page.page_content.replace(string, '')
        assert not 'Created with Aspose' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"
        assert not 'Aspose' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"
        assert not 'Evaluation Only' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"
        assert not 'Copyright' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"
        assert not 'https://products.aspose.com/words/' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"

        page.metadata['document_name'] = page.metadata['source'].split('/')[-1].split('.')[0]
        del page.metadata['source']

pages_dict_backup = copy.deepcopy(pages_dict)


In [346]:
# Metadata parsing

import re

metadata_dict = {}
for folder_name in folder_names:
    metadata_dict[folder_name] = {}
    metadata = open(f'metadata/{folder_name}_metadata.txt', 'r').readlines()
    column_names = metadata[0].split(' ')
    for datapoint in metadata[1:]:
        split = datapoint.split('\t')
        length = split[2]
        x = split[1]
        namedate = split[0]
        name = namedate[:-10]
        date = re.search('\d+-\d+-\d+', namedate).group()
        metadata_dict[folder_name][name] = {'document_name': name, 'date': date, f'{column_names[2]}': x}

In [355]:
pages_dict = copy.deepcopy(pages_dict_backup)

In [356]:
for folder_name in folder_names:
    for datapoint in metadata_dict[folder_name].values():
        pages_found = False
        for i, page in enumerate(pages_dict[folder_name]):
            if page.metadata['document_name'] == datapoint['document_name']:
                pages_found = True
                del page.metadata['document_name']
                page.metadata.update(datapoint)
                print(i, page.metadata['document_name'])
        if not pages_found:
            # TODO: Investigate why some files are not found
            print(f"Didn't find pages for {datapoint['document_name']}")

332 A New System of Alternate Current Motors and Transformers
333 A New System of Alternate Current Motors and Transformers
334 A New System of Alternate Current Motors and Transformers
335 A New System of Alternate Current Motors and Transformers
336 A New System of Alternate Current Motors and Transformers
337 A New System of Alternate Current Motors and Transformers
338 A New System of Alternate Current Motors and Transformers
339 A New System of Alternate Current Motors and Transformers
340 A New System of Alternate Current Motors and Transformers
341 A New System of Alternate Current Motors and Transformers
342 A New System of Alternate Current Motors and Transformers
Didn't find pages for Inventor Tesla Replies to Dr. Louis Duncan, Explaining His Alternating Current Motor
612 Alternate Current Motors
Didn't find pages for Mr. Nikola Tesla on Alternating Current Motors
Didn't find pages for Mr. Nikola Tesla on the Ganz Alternating-Current Motors
585 The Losses Due to Hysteresis in

In [60]:
# Metadata extraction and cleanup

# import matplotlib.pyplot as plt

# print(len(pages))
# lens = [len(page.page_content) for page in pages]
# plt.hist(lens, color='lightgreen', ec='black', bins=15)

# TODO: Do something smarter here when image loading is available
pages = [page for page in pages if page.page_content]

# print(len(pages))
# lens = [len(page.page_content) for page in pages]
# plt.hist(lens, color='lightgreen', ec='black', bins=15)


# TODO: Extract metadata and then remove unneeded pages (too short)



# prompt = """Convert the following list of new line sepparated strings to four python lists in format: list = [first, second, ...]. The strings represent The name of an article, date of the article, publisher and the number of pages. Note that date is concatenated with the name. Ignore number in the beginning of the each line. Here is the list: 
# A New System of Alternate Current Motors and Transformers1888-05-01	The Inventions, Researches and Writings of Nikola Tesla	11
# Inventor Tesla Replies to Dr. Louis Duncan, Explaining His Alternating Current Motor1888-06-12	Electrical Review	2
# Alternate Current Motors1888-06-15	The Electrical Engineer	1
# Mr. Nikola Tesla on Alternating Current Motors1889-06-07	The Telegraphic Journal and Electrical Review	3
# Mr. Nikola Tesla on the Ganz Alternating-Current Motors1889-06-07	The Electrician	3
# The Losses Due to Hysteresis in Transformers1890-04-09	The Electrical Engineer	2
# Swinburne's "Hedgehog" Transformer1890-09-24	The Electrical Engineer	2
# """
# response = get_completion(prompt)
# print(response)

In [None]:
# Vectorstore loading

vectorstore = Chroma(persist_directory=os.path.join(".", "docs", "merged_vectorstore"), embedding_function = embeddings)

In [None]:
# Basic Retriever example

from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

question = "How many poles shoul my electromotor have, and what should I do if I have the wrong number?"
qa_chain = RetrievalQA.from_chain_type(llm = llm, retriever = vectorstore.as_retriever())
print(qa_chain({"query": question}))

In [None]:
# Retriever with memory example

from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

memory = ConversationBufferMemory(memory_key = "chat_history", return_messages = True)
retriever = vectorstore.as_retriever()
qa = ConversationalRetrievalChain.from_llm(llm = llm, retriever = retriever, memory = memory)
question = input()
print(f"Question: {question}")
print(f"Answer: {qa({'question': question})['answer']}")
question = input()
print(f"Question: {question}")
print(f"Answer: {qa({'question': question})['answer']}")