In [2]:
import os
import openai

from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI


embeddings = OpenAIEmbeddings()
llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name = llm_name, temperature = 0)

def get_completion(prompt, model = llm_name, temperature = 0):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(model = model,
                                            messages = messages,
                                            temperature = temperature)
    return response.choices[0].message['content']

# document folders
folder_names = ["articles", "patents", "lectures"]

In [None]:
# Converting Docx to Pdf for easier loading

import aspose.words as aw

folder_names = ["patents", "lectures", "articles"]
for folder_name in folder_names:
    os.system(f"rm pdf_docs/{folder_name}/*.pdf")

    for file_name in os.listdir(os.path.join('docx_docs', folder_name)):
        try:
            a = aw.Document(os.path.join('docx_docs', folder_name, file_name))
            a.save(os.path.join('pdf_docs', folder_name, file_name.split('.')[0]+'.pdf'))
        except:
            print(f"Could not convert document {file_name} from {folder_name}")

# TODO: Investigate why The Moon's Rotation 2.docx cannot be converted
os.remove("pdf_docs/articles/The Moon's Rotation 2.pdf")


In [91]:
# Loading PDF documents

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

pages_dict = {}
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 10000, chunk_overlap = 1000)
for folder_name in folder_names:
    loader = PyPDFDirectoryLoader(path = f"pdf_docs/{folder_name}/")
    # TODO: Try other values for the text splitter chunk_size and chunk_overlap
    pages_dict[folder_name] = loader.load_and_split(text_splitter = text_splitter)

In [92]:
# Removing watermarks from the data

strings_to_remove = ['Evaluation Only. Created with Aspose.Words. Copyright 2003-2023 Aspose Pty \nLtd.',
                    'Evaluation Only. Created with Aspose.Words. Copyright 2003-2023 Aspose Pty Ltd.',
                    'Evaluation Only. Created with Aspose.Words. Copyright 2003-2023 Aspose Pty',
                    'Created with an evaluation copy of Aspose.Words. To discover the full versions \nof our APIs please visit: https://products.aspose.com/words/',
                    'Created with an evaluation copy of Aspose.Words. To discover the full versions',
                    'Created with an evaluation copy of Aspose.Words. To discover the full \nversions of our APIs please visit: https://products.aspose.com/words/',
                    'Created with an evaluation copy of Aspose.Words. To discover the full versions of our \nAPIs please visit: https://products.aspose.com/words/',
                    'of our APIs please visit: https://products.aspose.com/words/',
                    'Created with an evaluation copy of Aspose.Words. To discover the \nfull versions of our APIs please visit: \nhttps://products.aspose.com/words/Evaluation',
                    'Only. Created with Aspose.Words. Copyright 2003-2023 \nAspose Pty Ltd.',
                    'Created with an evaluation copy of Aspose.Words. To discover the',
                    'full versions of our APIs please visit: \nhttps://products.aspose.com/words/',
                    ' of our \nAPIs please visit: https://products.aspose.com/words/'
                    ]

for folder_name in folder_names:
    for i, page in enumerate(pages_dict[folder_name]):
        for string in strings_to_remove:
            page.page_content = page.page_content.replace(string, '')
        assert not 'Created with Aspose' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"
        assert not 'Aspose' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"
        assert not 'Evaluation Only' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"
        assert not 'Copyright' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"
        assert not 'https://products.aspose.com/words/' in page.page_content, f"Assert failed for page number {i} with content: {page.page_content} from file {page.metadata}"

        page.metadata['source'] = page.metadata['source'].split('/')[-1]


In [102]:
# Metadata parsing

import re

metadata_dict = {}
for folder_name in folder_names:
    metadata_dict[folder_name] = {}
    metadata = open(f'metadata/{folder_name}_metadata.txt', 'r').readlines()
    column_names = metadata[0].split(' ')
    for datapoint in metadata[1:]:
        split = datapoint.split('\t')
        length = split[2]
        x = split[1]
        namedate = split[0]
        name = namedate[:-10]
        date = re.search('\d+-\d+-\d+', namedate).group()
        metadata_dict[folder_name][name] = {'document_name': name, 'date': date, f'{column_names[2]}': x}

In [113]:
metadata = metadata_dict[folder_names[1]]
metadata
# datapoint = metadata['PROCESS OF AND APPARATUS FOR BALANCING ROTATING MACHINE PARTS']
# print(datapoint['document_name'])

# pages_found = False
# for page in pages_dict[folder_names[1]]:
#     print(page.metadata['source'].split('.')[0])
#     if page.metadata['source'].split('.')[0] == datapoint['document_name']:
#         pages_found = True
#         print('--------------------------------')
#         print(page.metadata)
#         print(datapoint)
#         del page.metadata['source']
#         page.metadata.update(datapoint)
#         print(page.metadata)
#         print(datapoint)
#         print('--------------------------------')

{'COMMUTATOR FOR DYNAMO-ELECTRIC MACHINES': {'document_name': 'COMMUTATOR FOR DYNAMO-ELECTRIC MACHINES',
  'date': '1888-05-15',
  'registration_number': '382,845'},
 'ELECTRIC-ARC LAMP': {'document_name': 'ELECTRIC-ARC LAMP',
  'date': '1886-02-09',
  'registration_number': '335,787'},
 'REGULATOR FOR DYNAMO-ELECTRIC MACHINES': {'document_name': 'REGULATOR FOR DYNAMO-ELECTRIC MACHINES',
  'date': '1886-10-19',
  'registration_number': '350,954'},
 'DYNAMO-ELECTRIC MACHINE': {'document_name': 'DYNAMO-ELECTRIC MACHINE',
  'date': '1889-07-16',
  'registration_number': '406,968'},
 'ELECTRICAL TRANSMISSION OF POWER': {'document_name': 'ELECTRICAL TRANSMISSION OF POWER',
  'date': '1894-01-02',
  'registration_number': '511,915'},
 'ELECTRO-MAGNETIC MOTOR': {'document_name': 'ELECTRO-MAGNETIC MOTOR',
  'date': '1891-12-08',
  'registration_number': '464,666'},
 'METHOD OF CONVERTING AND DISTRIBUTING ELECTRIC CURRENTS': {'document_name': 'METHOD OF CONVERTING AND DISTRIBUTING ELECTRIC CURR

In [95]:
page = pages_dict[folder_names[0]][0]
print(page.page_content)
print(page.metadata)


The New York Herald                                                                                            October 26, 1907Santos-Dumont’s Speed To the Editor of the Herald:—I notice from the Herald of even date (page 8) that Santos-Dumont managed to attain a 
speed of 32 miles instead of 48, as I calculated. I assumed an efficiency of 64 per cent for propeller 
and inclined plane combined. If his motor was driven at full power the efficiency was only 43 per 
cent, and it is more likely to be true than the highest theoretical figure I assumed. I wish that he had 
made a good test and come close to 48 miles, as I calculated to give my theory a better test. But you 
see that the old notions of the hydroplane cannot possibly be true. Santos- Dumont can still win the 
bet if he will follow my suggestion.                                                    Nikola Tesla   New York, Oct. 24, 1907
{'source': 'Letter Santos-Dumont Speed.pdf', 'page': 0}


In [60]:
# Metadata extraction and cleanup

# import matplotlib.pyplot as plt

# print(len(pages))
# lens = [len(page.page_content) for page in pages]
# plt.hist(lens, color='lightgreen', ec='black', bins=15)

# TODO: Do something smarter here when image loading is available
pages = [page for page in pages if page.page_content]

# print(len(pages))
# lens = [len(page.page_content) for page in pages]
# plt.hist(lens, color='lightgreen', ec='black', bins=15)


# TODO: Extract metadata and then remove unneeded pages (too short)



# prompt = """Convert the following list of new line sepparated strings to four python lists in format: list = [first, second, ...]. The strings represent The name of an article, date of the article, publisher and the number of pages. Note that date is concatenated with the name. Ignore number in the beginning of the each line. Here is the list: 
# A New System of Alternate Current Motors and Transformers1888-05-01	The Inventions, Researches and Writings of Nikola Tesla	11
# Inventor Tesla Replies to Dr. Louis Duncan, Explaining His Alternating Current Motor1888-06-12	Electrical Review	2
# Alternate Current Motors1888-06-15	The Electrical Engineer	1
# Mr. Nikola Tesla on Alternating Current Motors1889-06-07	The Telegraphic Journal and Electrical Review	3
# Mr. Nikola Tesla on the Ganz Alternating-Current Motors1889-06-07	The Electrician	3
# The Losses Due to Hysteresis in Transformers1890-04-09	The Electrical Engineer	2
# Swinburne's "Hedgehog" Transformer1890-09-24	The Electrical Engineer	2
# """
# response = get_completion(prompt)
# print(response)

In [41]:
pages[0].metadata

{'source': 'pdf_docs/lectures/On Light and Other High Frequency Phenomena.pdf',
 'page': 0}

In [18]:
b = 0
for i, l in enumerate(lens):
    if l < 100:
        print(pages[i])
        b+=1
print(b)
# print(lens.index(5522))
# pages[668].page_content

page_content='NIKOLA TESLA.Witnesses:E. A. SUNDERLIN,D. D. LORD.' metadata={'source': 'pdf_docs/patents/91_PATENT 685,956 .pdf', 'page': 10}
page_content='Witnesses:ROBT. F. GAYLORD,FRANK E. HARTLEY.' metadata={'source': 'pdf_docs/patents/35_PATENT 416,192 .pdf', 'page': 3}
page_content='Witnesses: R. J. STONEY, Jr., E. P. COFFIN.' metadata={'source': 'pdf_docs/patents/37_PATENT 424,036.pdf', 'page': 3}
page_content='ROBT. F. GAYLORD,PARKER W. PAGE.' metadata={'source': 'pdf_docs/patents/54_PATENT 455,069.pdf', 'page': 3}
page_content='M. LAWSON DYER,G. W. MARTLING.' metadata={'source': 'pdf_docs/patents/84_PATENT 609,248 .pdf', 'page': 3}
page_content='ROBT. F. GAYLORD, FRANK E. HARTLEY.' metadata={'source': 'pdf_docs/patents/36_PATENT 416,193 .pdf', 'page': 2}
page_content='M. LAWSON DYER,RICHARD DONOVAN.' metadata={'source': 'pdf_docs/patents/99_PATENT 685,958 .pdf', 'page': 4}
page_content='Witnesses:ROBT. F. GAYLORD, FRANK B. MURPHY.' metadata={'source': 'pdf_docs/patents/19_PATEN

In [None]:
# Vectorstore loading

vectorstore = Chroma(persist_directory=os.path.join(".", "docs", "merged_vectorstore"), embedding_function = embeddings)

In [None]:
# Basic Retriever example

from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

question = "How many poles shoul my electromotor have, and what should I do if I have the wrong number?"
qa_chain = RetrievalQA.from_chain_type(llm = llm, retriever = vectorstore.as_retriever())
print(qa_chain({"query": question}))

In [None]:
# Retriever with memory example

from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

memory = ConversationBufferMemory(memory_key = "chat_history", return_messages = True)
retriever = vectorstore.as_retriever()
qa = ConversationalRetrievalChain.from_llm(llm = llm, retriever = retriever, memory = memory)
question = input()
print(f"Question: {question}")
print(f"Answer: {qa({'question': question})['answer']}")
question = input()
print(f"Question: {question}")
print(f"Answer: {qa({'question': question})['answer']}")