In [98]:
# Elements loader

from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.document_loaders import DirectoryLoader

# Walk "docx_docs" (including sub-folders) and load every file with the
# unstructured Word loader in "elements" mode using the "fast" strategy.
elements_loader = DirectoryLoader(
    path = "docx_docs",
    recursive = True,
    loader_cls = UnstructuredWordDocumentLoader,
    loader_kwargs = dict(mode = "elements", strategy = "fast"),
)

docs_elements = elements_loader.load()


In [99]:
# Recursive loaders

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.document_loaders import DirectoryLoader

# Load every file under "docx_docs" (including sub-folders) with the
# unstructured Word loader in "single" mode.
single_loader = DirectoryLoader(path = "docx_docs",
                                loader_cls = UnstructuredWordDocumentLoader,
                                loader_kwargs = {'mode' : "single", 'strategy': "fast"},
                                recursive = True)
docs_single = single_loader.load()

# Separators are regular expressions (is_separator_regex = True): a blank
# line, the position right after a newline, the position right after ". ".
# The last pattern is a raw string because "\." is not a valid escape
# sequence in a normal string literal (DeprecationWarning, SyntaxWarning
# from Python 3.12 on); the resulting pattern text is unchanged.
split_separators = ["\n\n", "(?<=\n)", r"(?<=\. )"]

# Each splitter gets its own copy of the separator list.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000,
                                               chunk_overlap = 0,
                                               separators = list(split_separators),
                                               is_separator_regex = True)
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500,
                                                     chunk_overlap = 0,
                                                     separators = list(split_separators),
                                                     is_separator_regex = True)

docs_recursive = text_splitter.split_documents(docs_single)


In [100]:
# NLTK splitter (nothing is loaded here: the documents already loaded into
# `docs_single` are split a second time, with a different splitter)
from langchain.text_splitter import NLTKTextSplitter

# Splitter built with its default settings; no argument is overridden.
nltk_splitter = NLTKTextSplitter()

# Chunks produced from `docs_single`, stored under their own name.
docs_nltk = nltk_splitter.split_documents(docs_single)

In [101]:
# Filename metadata rename

import os

# Replace the 'source' metadata entry (a file path) with a 'filename' entry
# holding only the last path component.
#
# os.path.basename is used instead of split('/') so that paths written with
# the platform's own separator (backslashes on Windows) are handled too; on
# POSIX the result is the same as split('/')[-1].
# pop() with a default makes the cell safe to re-run: documents whose
# 'source' was already removed are simply left as they are.
for docs in (docs_nltk, docs_recursive, docs_single):
    for doc in docs:
        source_path = doc.metadata.pop('source', None)
        if source_path is not None:
            doc.metadata['filename'] = os.path.basename(source_path)

In [102]:
# Remove unneeded metadata from Elements documents

keys_to_delete = ['source', 'file_directory', 'last_modified', 'filetype', 'primary', 'text_as_html', 'emphasized_text_tags', 'emphasized_text_contents']

for doc in docs_elements:
    for unwanted_key in keys_to_delete:
        # pop() with a default does nothing when the key is absent
        doc.metadata.pop(unwanted_key, None)

# Removing unneeded documents: keep only those whose 'category' metadata is
# not one of the listed categories
categories_to_remove = ['PageBreak', 'ListItem', 'Footer', 'Table', 'UncategorizedText', 'Header']
kept_elements = []
for doc in docs_elements:
    if doc.metadata['category'] in categories_to_remove:
        continue
    kept_elements.append(doc)
docs_elements = kept_elements

In [103]:
# Plots

# from matplotlib import pyplot as plt

# plt.figure(1)
# plt.hist([len(d.page_content) for d in docs_nltk], bins = 100)
# plt.grid()
# plt.title("NLTK документи")
# plt.xlabel("број карактера у документу")
# plt.ylabel("број докумената")
# plt.figure(2)
# plt.hist([len(d.page_content) for d in docs_elements], bins = 100)
# plt.grid()
# plt.title("Elements documents")
# plt.xlabel("број карактера у документу")
# plt.ylabel("број докумената")
# plt.figure(3)
# plt.hist([len(d.page_content) for d in docs_recursive], bins = 100)
# plt.grid()
# plt.title("Recursive documents")
# plt.xlabel("број карактера у документу")
# plt.ylabel("број докумената")

In [104]:
# Metadata parsing

import json

# Filled by the parsing loop further down in this cell.
metadata_dict = {}

# Parse the metadata file. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open.
with open('metadata.json') as metadata_file:
    metadatas = json.load(metadata_file)

# Garbled character sequence -> intended typographic quote.
# Entry order matters: the last key is a prefix of the first two, so it has
# to be applied after them.
ascii_replace_dict = {'â€™': '’', 'â€œ': '“', 'â€': '”'}
def replace_non_ascii(s):
    """Return `s` with every key of `ascii_replace_dict` replaced by its value.

    Replacements are applied one after another, in the dictionary's order.
    """
    cleaned = s
    for garbled in ascii_replace_dict:
        cleaned = cleaned.replace(garbled, ascii_replace_dict[garbled])
    return cleaned

# Fill `metadata_dict` with one entry per record found under
# metadatas[2]['data'], keyed by the record's file name without extension.
for metadata in metadatas[2]['data']:
    if metadata['id'] in ['228', '413']:
        # These are duplicates, not needed
        continue
    # Last component of the URL path with its final ".ext" removed
    key = ''.join(metadata['file_url'].split('/')[-1].rsplit('.', 1)[:-1])

    # Replace garbled quote characters in key
    key = replace_non_ascii(key)

    # Make sure that there are no duplicate entries for the same document type
    # (the offending key and id are printed before the assertion fires)
    is_duplicate = key in metadata_dict and metadata['type'] == metadata_dict[key]['type']
    if is_duplicate:
        print(key)
        print(metadata['id'])
    assert not is_duplicate

    # Keep only the populated fields and drop 'file_url', which was needed
    # only to derive the key. The former clean-up loop that deleted None
    # entries from `metadata_dict` is gone: every value stored here is a
    # dict, so it never removed anything, and deleting from a dict while
    # iterating over it would raise RuntimeError.
    metadata_dict[key] = {field: value
                          for field, value in metadata.items()
                          if value is not None and field != 'file_url'}

In [105]:
# Date parsing: replace each entry's 'date' string (fields separated by '-')
# with integer 'year', 'month' and 'day' entries.

for entry in metadata_dict.values():
    date_parts = entry['date'].split('-')
    for field, position in (('year', 0), ('month', 1), ('day', 2)):
        entry[field] = int(date_parts[position])
    del entry['date']

In [106]:
# Adding metadata

def _attach_metadata(docs):
    """Merge the matching `metadata_dict` entry into each document's metadata.

    The lookup key is the document's 'filename' without its final extension.
    The extension is removed by splitting on the last '.', like the
    `metadata_dict` keys are built, rather than by cutting a fixed five
    characters, so the lookup does not depend on the suffix being ".docx".

    Raises AssertionError when a document has no entry in `metadata_dict`.
    """
    for doc in docs:
        filename = doc.metadata['filename']
        stem = filename.rsplit('.', 1)[0]
        assert stem in metadata_dict, f"no metadata entry for {filename!r}"
        doc.metadata.update(metadata_dict[stem])

for docs in (docs_elements, docs_nltk, docs_recursive, docs_single):
    _attach_metadata(docs)

In [107]:
# Summary document

from langchain.document_loaders import TextLoader

file_path = 'summary.txt'

# One line formatter per document type; entries of any other type are left
# out of the summary.
summary_line_for = {
    'article': lambda m: f"    - Article {m['title']} written in {m['year']} published by {m['source']}\n",
    'lecture': lambda m: f"    - Lecture {m['title']} held in {m['year']}\n",
    'patent': lambda m: f"    - Patent {m['title']} filed in {m['year']} with registration number {m['register_num']}\n",
}

# Collect the lines in memory, grouped by type. The previous version re-read
# and rewrote the file once per document and recognised section headers by
# substring ('Articles:' in line), so an entry whose title contained one of
# the header words was itself treated as a header.
summary_entries = {doc_type: [] for doc_type in summary_line_for}
for metadata_value in metadata_dict.values():
    doc_type = metadata_value['type']
    if doc_type in summary_entries:
        summary_entries[doc_type].append(summary_line_for[doc_type](metadata_value))

a_num = len(summary_entries['article'])
l_num = len(summary_entries['lecture'])
p_num = len(summary_entries['patent'])

summary_lines = ['This file is a summary document of all of the articles, lectures and patents\n']
for heading, doc_type in (('Articles', 'article'), ('Lectures', 'lecture'), ('Patents', 'patent')):
    summary_lines.append('\n')
    summary_lines.append(f"{heading} ({len(summary_entries[doc_type])} files):\n")
    # Each new entry used to be inserted directly below its header, so the
    # file listed entries in reverse order; that layout is kept.
    summary_lines.extend(reversed(summary_entries[doc_type]))

with open(file_path, 'w') as file:
    file.writelines(summary_lines)

# Load the summary back as a document and split it like the other texts
doc = TextLoader(file_path).load()

docs_metadata = text_splitter.split_documents(doc)


In [None]:
# Vectorstore creation

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Embedding model referenced by the (disabled) build code below.
embeddings = OpenAIEmbeddings()

# NOTE: the three Chroma.from_documents(...) calls below are commented out,
# so running this cell builds nothing. As written, each call would persist
# one split (plus the summary chunks in `docs_metadata`) to
# ./vectorstore_<name>, in a collection named after the split.

# vectorstore_elements = Chroma.from_documents(
#     collection_name="elements",
#     documents = docs_elements + docs_metadata,
#     embedding = embeddings,
#     persist_directory="./vectorstore_elements"
# )

# vectorstore_nltk = Chroma.from_documents(
#     collection_name="nltk",
#     documents = docs_nltk + docs_metadata,
#     embedding = embeddings,
#     persist_directory="./vectorstore_nltk"
# )

# vectorstore_recursive = Chroma.from_documents(
#     collection_name="recursive",
#     documents = docs_recursive + docs_metadata,
#     embedding = embeddings,
#     persist_directory="./vectorstore_recursive"
# )

In [110]:
# LLM loading

from langchain.chat_models import ChatOpenAI

# Chat model shared by every retriever and chain defined below.
llm_name = "gpt-3.5-turbo"
# temperature = 0 -- presumably chosen to keep answers as repeatable as
# possible across runs; confirm before changing.
llm = ChatOpenAI(model_name = llm_name, temperature = 0)

In [111]:
# Vectorstore loading

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

# One persisted vector store per splitting strategy.
names = ['elements', 'nltk', 'recursive']
all_bases = {}       # split name -> Chroma vector store
all_retrievers = {}  # split name -> {retriever name -> retriever}

for name in names:
    # The stores are built with collection_name set to the split name (see
    # the vectorstore creation cell). The same name has to be given here,
    # otherwise Chroma opens its default collection instead of the one the
    # documents were written to.
    # NOTE(review): if the directories on disk were built without a
    # collection name, drop this argument again.
    all_bases[name] = Chroma(collection_name = name,
                             persist_directory = f"./vectorstore_{name}",
                             embedding_function = embeddings)
    all_retrievers[name] = {}

In [112]:
# BaseRetriever

# Plain vector-store retriever for each split: MMR search, 3 results.
for name in names:
    vector_store = all_bases[name]
    all_retrievers[name]['base'] = vector_store.as_retriever(
        search_kwargs = {'k': 3},
        search_type = 'mmr',
    )

In [113]:
# MultiQueryRetriever

from langchain.retrievers.multi_query import MultiQueryRetriever

# Wrap each split's base retriever in a MultiQueryRetriever driven by `llm`.
for name in names:
    retrievers_for_split = all_retrievers[name]
    retrievers_for_split['multi_query'] = MultiQueryRetriever.from_llm(
        llm = llm,
        retriever = retrievers_for_split['base'],
    )

In [114]:
# ContextualCompressionRetriever

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Pair each split's base retriever with an LLM-based extractor.
for name in names:
    # a separate extractor instance is created for every split
    compressor = LLMChainExtractor.from_llm(llm)
    retrievers_for_split = all_retrievers[name]
    retrievers_for_split['compression_retriever'] = ContextualCompressionRetriever(
        base_retriever = retrievers_for_split['base'],
        base_compressor = compressor,
    )

In [125]:
# SelfQueryRetriever

from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

# Description of the metadata fields the self-query retriever may filter on.
# The declared type should match the type of the stored value, because the
# generated filter compares against it.
metadata_field_info = [
    AttributeInfo(
        name="category",
        description="Category of te text content - possible values are NarrativeText and Title",
        type="string",
    ),
    AttributeInfo(
        name="year",
        description="Year when the document was published",
        type="integer",
    ),
    AttributeInfo(
        name="day",
        description="Day when the document was published",
        type="integer",
    ),
    AttributeInfo(
        name="month",
        description="Month when the document was published",
        type="integer",
    ),
    AttributeInfo(
        name="filename",
        description="Name of the file",
        type="string",
    ),
    AttributeInfo(
        name="id",
        description="Document ID",
        type="string",
    ),
    AttributeInfo(
        name="page_number",
        description="Page number from the original document",
        # unstructured reports page numbers as integers, so a string-typed
        # filter value would never match.
        # NOTE(review): confirm against the metadata actually stored.
        type="integer",
    ),
    AttributeInfo(
        name="register_num",
        description="Patent registration number",
        type="string",
    ),
    AttributeInfo(
        name="source",
        description="Source that published the document",
        type="string",
    ),
    AttributeInfo(
        name="title",
        description="Document title",
        type="string",
    ),
    AttributeInfo(
        name="type",
        description="Type of the document - possible value are article, lecture and patent",
        type="string",
    ),
]

document_content_description = "Document content"

# One self-query retriever per split, working directly on the vector store.
# (The extractor that used to be created on every iteration was never passed
# to anything and has been removed.)
for name in names:
    all_retrievers[name]['self_query_retriever'] = SelfQueryRetriever.from_llm(llm = llm,
                                                                               vectorstore = all_bases[name],
                                                                               document_contents = document_content_description,
                                                                               metadata_field_info = metadata_field_info,
                                                                               verbose=True)

In [116]:
# # ParentDocumentRetriever

# from langchain.retrievers import ParentDocumentRetriever
# from langchain.storage import InMemoryStore

# vectorstore = Chroma(collection_name="split_children", embedding_function=OpenAIEmbeddings())
# store = Chroma(collection_name="split_parents", embedding_function=OpenAIEmbeddings())

# all_retrievers['recursive']['parent_retrieverfdfd'] = ParentDocumentRetriever(
#     vectorstore = vectorstore, 
#     docstore = store, 
#     child_splitter = child_text_splitter,
#     parent_splitter = text_splitter
# )



# # all_retrievers['recursive']['parent_retriever'].add_documents(documents = docs_single, ids = None)

In [117]:
# EnsembleRetriever

from langchain.retrievers import EnsembleRetriever

# Combine all retrievers already registered for a split into one ensemble.
for name in names:
    # Leave out an ensemble built by an earlier run of this cell; otherwise
    # re-running the cell would nest the old ensemble inside the new one.
    member_retrievers = [retriever
                         for retriever_name, retriever in all_retrievers[name].items()
                         if retriever_name != 'ensemble_retriever']
    all_retrievers[name]['ensemble_retriever'] = EnsembleRetriever(retrievers = member_retrievers)

In [118]:
# Prompt template

from langchain import PromptTemplate

# Question-answering prompt with two placeholders, {context} and {question}.
# The leading spaces inside the triple-quoted string are part of the prompt
# text that is sent to the model.
prompt_template = PromptTemplate.from_template(
    """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to
    make up an answer.

    Context:
    {context}

    Question: {question}
    Helpful Answer:"""
)


In [126]:
# Chains definition

from langchain.chains import RetrievalQA

# split name -> {retriever name -> RetrievalQA chain}; every chain uses the
# same LLM and prompt and differs only in its retriever.
all_chains = {}

for name in names:
    chains_for_split = {}
    all_chains[name] = chains_for_split
    for retriever_name in all_retrievers[name]:
        retriever = all_retrievers[name][retriever_name]
        chains_for_split[retriever_name] = RetrievalQA.from_llm(retriever = retriever,
                                                                llm = llm,
                                                                prompt = prompt_template)

In [None]:
# Test

import datetime

# Output file named after the current timestamp, with spaces and colons
# replaced by underscores.
timestamp_slug = str(datetime.datetime.now()).replace(' ', '_').replace(':', '_')
file_name = f"responses_{timestamp_slug}.txt"
queries = ["What did Nikola Tesla think about Mars?",
           "How many patents did Nikola Tesla file between 1890. and 1895.?",
           "How did Nikola Tesla envision the future of travel?",
           "What is Tesla's description of the human eye?",
           "Where was Nikola Tesla born?"]

# Start the file with a timestamp header
with open(file_name, 'w') as file:
    file.write(str(datetime.datetime.now()) + "\n\n")

# Run every query through every chain of every split. The file is reopened
# in append mode for each query.
for q in queries:
    with open(file_name, 'a') as file:
        file.write(f"- Query: {q}\n")
        for name in names:
            file.write(f"   - Split: {name}\n")

            chains_for_split = all_chains[name]
            for k in chains_for_split:
                chain = chains_for_split[k]
                file.write(f"      - Retriever: {k}\n")
                try:
                    # running the chain and writing its answer share one
                    # try block: a failure in either is logged as ERROR
                    file.write(f"         - Response: {chain.run(q)}\n")
                except Exception as e:
                    file.write(f"         - ERROR: {e}\n")
        file.write("\n")

In [204]:
# Results parser

# Parse the graded responses file into `results`: one dict per
# (split, retriever) pair with the keys 'grades', 'split_name', 'retriever'.
#
# The file is read sequentially, in the same split/retriever order in which
# the chains were run. For every pair the next line containing 'Response:'
# or 'ERROR' is looked for:
#   - 'Response:' -> the grade is the first whitespace-separated token of
#     that line, converted to int;
#   - 'ERROR'     -> the grade is 1.
# Passes over all pairs are repeated until the end of the file is reached.
results = []
end = False
# (split name, retriever name) -> entry in `results`. Pairs are matched
# exactly; the former substring test ("name in res['split_name']") would
# confuse names that contain one another.
results_by_key = {}

with open('responses_branimir.txt') as read_file:
    while not end:
        for name in names:
            for chain_name in all_chains[name]:
                result = results_by_key.get((name, chain_name))
                if result is None:
                    result = {'grades': [], 'split_name': name,'retriever': chain_name}
                    results_by_key[(name, chain_name)] = result
                    results.append(result)

                while True:
                    line = read_file.readline()
                    if not line:
                        # end of file: finish this pass, then stop
                        end = True
                        break
                    if 'Response:' in line:
                        result['grades'].append(int(line.split()[0]))
                        break
                    if 'ERROR' in line:
                        result['grades'].append(1)
                        break


In [205]:
# Helper functions

def res_for_split(s):
    """Return the entries of `results` whose 'split_name' equals `s`, in order."""
    return [entry for entry in results if entry['split_name'] == s]

def res_for_retriever(r):
    """Return the entries of `results` whose 'retriever' equals `r`, in order."""
    return [entry for entry in results if entry['retriever'] == r]


In [230]:
# Average grade per split, over all retrievers and queries.
for name in names:
    split_grades = [grade for r in res_for_split(name) for grade in r['grades']]
    # Divide by the number of grades actually parsed instead of a fixed 25,
    # so the average stays correct when queries or retrievers are added or
    # removed. With no grades at all the result is 0.0, as before.
    s = sum(split_grades) / len(split_grades) if split_grades else 0.0
    print(f"{name}: {s}")

elements: 3.88
nltk: 5.16
recursive: 4.68


In [225]:
# Average grade of the self-query retriever across all splits. The divisor
# is the number of grades actually parsed rather than a fixed 15; with no
# grades at all the result is 0.0, as before.
self_query_grades = [grade for r in res_for_retriever('self_query_retriever') for grade in r['grades']]
sum(self_query_grades) / len(self_query_grades) if self_query_grades else 0.0

3.12

In [236]:
# Number of grades equal to 10 in the first self-query entry of `results`
# (only that first entry is inspected, not every split).
res_for_retriever('self_query_retriever')[0]['grades'].count(10)

1

In [229]:
# One report line per (split, retriever) pair: its grades and their mean.
for res in results:
    grades = res['grades']
    average = sum(grades) / len(grades)
    print(f"{res['split_name']} split with {res['retriever']} - grades - {grades} - average grade: {average}")

elements split with base - grades - [1, 1, 2, 1, 9] - average grade: 2.8
elements split with multi_query - grades - [3, 6, 3, 1, 9] - average grade: 4.4
elements split with compression_retriever - grades - [9, 3, 2, 1, 9] - average grade: 4.8
elements split with self_query_retriever - grades - [1, 10, 1, 1, 7] - average grade: 4.0
elements split with ensemble_retriever - grades - [3, 1, 3, 1, 9] - average grade: 3.4
nltk split with base - grades - [4, 3, 5, 10, 6] - average grade: 5.6
nltk split with multi_query - grades - [5, 1, 7, 8, 9] - average grade: 6.0
nltk split with compression_retriever - grades - [2, 2, 8, 8, 6] - average grade: 5.2
nltk split with self_query_retriever - grades - [1, 10, 9, 1, 7] - average grade: 5.6
nltk split with ensemble_retriever - grades - [1, 1, 1, 8, 6] - average grade: 3.4
recursive split with base - grades - [4, 1, 6, 9, 1] - average grade: 4.2
recursive split with multi_query - grades - [4, 1, 10, 2, 9] - average grade: 5.2
recursive split with co