In [1]:
# Elements loader

from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.document_loaders import DirectoryLoader

# Walk docx_docs (including sub-directories) and load every file with the
# Unstructured Word loader in "elements" mode using the "fast" strategy.
elements_loader = DirectoryLoader(
    path="docx_docs",
    loader_cls=UnstructuredWordDocumentLoader,
    loader_kwargs=dict(mode="elements", strategy="fast"),
    recursive=True,
)

docs_elements = elements_loader.load()


In [2]:
# Recursive loaders

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.document_loaders import DirectoryLoader

# Load every Word file under docx_docs (recursively) in "single" mode.
single_loader = DirectoryLoader(
    path="docx_docs",
    loader_cls=UnstructuredWordDocumentLoader,
    loader_kwargs={'mode': "single", 'strategy': "fast"},
    recursive=True,
)
docs_single = single_loader.load()

# Separator patterns shared by both splitters (is_separator_regex=True).
# The last pattern is a raw string: in a normal string literal `\.` is an
# invalid escape sequence, which newer Python versions warn about. The
# resulting pattern text is unchanged.
splitter_separators = ["\n\n", "(?<=\n)", r"(?<=\. )"]

# Each splitter receives its own copy of the separator list.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
    separators=list(splitter_separators),
    is_separator_regex=True,
)
child_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
    separators=list(splitter_separators),
    is_separator_regex=True,
)

docs_recursive = text_splitter.split_documents(docs_single)


In [3]:
# NLTK splitter: builds a second set of chunks from the same documents
from langchain.text_splitter import NLTKTextSplitter

# Constructed with its default settings (no arguments are passed).
nltk_splitter = NLTKTextSplitter()

# Input is docs_single, the "single"-mode documents loaded in the recursive-splitter cell.
docs_nltk = nltk_splitter.split_documents(docs_single)

In [4]:
# Filename metadata rename

import os

# Replace the 'source' path with a bare 'filename' entry on both chunk sets.
# os.path.basename is used instead of splitting on '/' so that paths built
# with the platform's own separator are reduced to the file name as well.
for docs in (docs_nltk, docs_recursive):
    for doc in docs:
        # pop() raises KeyError when 'source' is missing, exactly like the
        # previous index-then-delete sequence did.
        doc.metadata['filename'] = os.path.basename(doc.metadata.pop('source'))

In [5]:
# Remove unneeded metadata from Elements documents

keys_to_delete = ['source', 'file_directory', 'last_modified', 'filetype', 'primary', 'text_as_html', 'emphasized_text_tags', 'emphasized_text_contents']

for doc in docs_elements:
    for key in keys_to_delete:
        # Absent keys are simply skipped.
        doc.metadata.pop(key, None)

# Removing unneeded documents: keep only elements whose category is not listed
categories_to_remove = ['PageBreak', 'ListItem', 'Footer', 'Table', 'UncategorizedText', 'Header']
kept_elements = []
for element in docs_elements:
    if element.metadata['category'] in categories_to_remove:
        continue
    kept_elements.append(element)
docs_elements = kept_elements

In [54]:
# Plots

# from matplotlib import pyplot as plt

# plt.figure(1)
# plt.hist([len(d.page_content) for d in docs_nltk], bins = 100)
# plt.grid()
# plt.title("NLTK документи")
# plt.xlabel("број карактера у документу")
# plt.ylabel("број докумената")
# plt.figure(2)
# plt.hist([len(d.page_content) for d in docs_elements], bins = 100)
# plt.grid()
# plt.title("Elements documents")
# plt.xlabel("број карактера у документу")
# plt.ylabel("број докумената")
# plt.figure(3)
# plt.hist([len(d.page_content) for d in docs_recursive], bins = 100)
# plt.grid()
# plt.title("Recursive documents")
# plt.xlabel("број карактера у документу")
# plt.ylabel("број докумената")

In [6]:
# Metadata parsing

import json

# Maps a cleaned document file name (without extension) to its metadata record.
metadata_dict = {}

# Read the export inside a context manager so the file handle is closed as
# soon as parsing finishes instead of being left open.
with open('metadata.json') as metadata_file:
    metadatas = json.load(metadata_file)

ascii_replace_dict = {'â€™': '’', 'â€œ': '“', 'â€': '”'}
def replace_non_ascii(s):
    """Return `s` with every key of `ascii_replace_dict` replaced by its value.

    The substitutions are applied one after another in the dictionary's
    iteration order, each on the result of the previous one.
    """
    cleaned = s
    for garbled in ascii_replace_dict:
        cleaned = cleaned.replace(garbled, ascii_replace_dict[garbled])
    return cleaned

# NOTE(review): the records are read from index 2 under 'data' — presumably
# the JSON export wraps the table rows this way; confirm against metadata.json.
for metadata in metadatas[2]['data']:
    if metadata['id'] in ['228', '413']:
        # These are duplicates, not needed
        continue

    # Key = last path segment of the file URL with its final extension removed
    key = ''.join(metadata['file_url'].split('/')[-1].rsplit('.', 1)[:-1])

    # Replace the known mis-encoded character sequences in the key
    key = replace_non_ascii(key)

    # Make sure that there are no duplicate entries for the same document type
    is_duplicate = key in metadata_dict and metadata['type'] == metadata_dict[key]['type']
    if is_duplicate:
        print(key)
        print(metadata['id'])
    assert not is_duplicate

    # Store only the populated fields. 'file_url' is left out: it has already
    # been used to derive the key. The stored values are always dicts, so no
    # follow-up pass over metadata_dict is needed to purge None entries.
    metadata_dict[key] = {
        field: value
        for field, value in metadata.items()
        if value is not None and field != 'file_url'
    }

In [7]:
# Adding metadata

# Merge the parsed metadata record into every chunk of all three document sets.
for docs in (docs_elements, docs_nltk, docs_recursive):
    for doc in docs:
        # Drop the extension by splitting on the last dot, which is how the
        # metadata keys are derived from the file URLs, rather than assuming
        # a fixed five-character ".docx" suffix.
        metadata_key = doc.metadata['filename'].rsplit('.', 1)[0]
        assert metadata_key in metadata_dict, f"No metadata entry for {doc.metadata['filename']!r}"
        doc.metadata.update(metadata_dict[metadata_key])

In [121]:
# Summary document

from langchain.document_loaders import TextLoader

file_path = 'summary.txt'


def _format_summary_entry(meta):
    """Return the bullet line describing one metadata record.

    `meta` is one value of `metadata_dict`. Returns None when its 'type' is
    not article, lecture or patent, so such records are left out of the file.
    A missing field for a known type raises KeyError.
    """
    doc_type = meta['type']
    if doc_type == 'article':
        return f"    - Article {meta['title']} written in {meta['date']} published by {meta['source']}\n"
    if doc_type == 'lecture':
        return f"    - Lecture {meta['title']} held in {meta['date']}\n"
    if doc_type == 'patent':
        return f"    - Patent {meta['title']} filed in {meta['date']} with registration number {meta['register_num']}\n"
    return None


# Group the entries by document type in memory. The summary used to be re-read
# and rewritten once per document, locating sections by substring search; a
# title containing "Articles:", "Lectures:" or "Patents:" was then mistaken
# for a section header.
entries_by_type = {'article': [], 'lecture': [], 'patent': []}
for metadata_key in metadata_dict.keys():
    entry = _format_summary_entry(metadata_dict[metadata_key])
    if entry is not None:
        entries_by_type[metadata_dict[metadata_key]['type']].append(entry)

a_num = len(entries_by_type['article'])
l_num = len(entries_by_type['lecture'])
p_num = len(entries_by_type['patent'])

summary_lines = ['This file is a summary document of all of the articles, lectures and patents\n']
for heading, doc_type in (('Articles', 'article'), ('Lectures', 'lecture'), ('Patents', 'patent')):
    section_entries = entries_by_type[doc_type]
    summary_lines.append(f"\n{heading} ({len(section_entries)} files):\n")
    # Each new entry used to be inserted directly under its header, so the
    # last-processed document came first; reversed() keeps that ordering.
    summary_lines.extend(reversed(section_entries))

# Single write of the finished summary.
with open(file_path, 'w') as file:
    file.writelines(summary_lines)

# Load the summary back as a document and chunk it with the recursive splitter
# (text_splitter is defined in the recursive-splitter cell).
doc = TextLoader(file_path).load()

docs_metadata = text_splitter.split_documents(doc)


In [None]:
# Vectorstore creation
# NOTE: only the embedding object is created by this cell as written; the
# three Chroma.from_documents(...) calls that follow are commented out.

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Constructed with no arguments — presumably credentials come from the
# environment; TODO confirm.
embeddings = OpenAIEmbeddings()

# vectorstore_elements = Chroma.from_documents(
#     collection_name="elements",
#     documents = docs_elements + docs_metadata,
#     embedding = embeddings,
#     persist_directory="./vectorstore_elements"
# )

# vectorstore_nltk = Chroma.from_documents(
#     collection_name="nltk",
#     documents = docs_nltk + docs_metadata,
#     embedding = embeddings,
#     persist_directory="./vectorstore_nltk"
# )

# vectorstore_recursive = Chroma.from_documents(
#     collection_name="recursive",
#     documents = docs_recursive + docs_metadata,
#     embedding = embeddings,
#     persist_directory="./vectorstore_recursive"
# )

In [8]:
# LLM loading
# The `llm` object created here is passed to the multi-query, compression and
# self-query retrievers built in later cells.

from langchain.chat_models import ChatOpenAI

# Model identifier kept in a variable so it can be changed in one place.
llm_name = "gpt-3.5-turbo"
# temperature = 0 is set explicitly on the chat model.
llm = ChatOpenAI(model_name = llm_name, temperature = 0)

In [9]:
# Vectorstore loading

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

names = ['elements', 'nltk', 'recursive']
all_bases = {}      # name -> Chroma vector store
all_retrievers = {}  # name -> {retriever kind -> retriever}

for name in names:
    # Open the collection by name. The creation cell builds each store with
    # collection_name set to "elements" / "nltk" / "recursive"; without the
    # argument Chroma opens its default collection instead of the populated one.
    # NOTE(review): assumes the persisted stores were produced by that creation
    # code — confirm the collection names on disk.
    all_bases[name] = Chroma(
        collection_name=name,
        persist_directory=f"./vectorstore_{name}",
        embedding_function=embeddings,
    )
    all_retrievers[name] = {}

In [18]:
# BaseRetriever

for name in names:
    # Plain vector-store retriever using MMR search, limited to 3 results.
    vector_store = all_bases[name]
    base_retriever = vector_store.as_retriever(search_type='mmr', search_kwargs={'k': 3})
    all_retrievers[name]['base'] = base_retriever

In [19]:
# MultiQueryRetriever

from langchain.retrievers.multi_query import MultiQueryRetriever

for name in names:
    # A fresh MMR retriever (no search_kwargs) is wrapped here, not the 'base' one.
    mmr_retriever = all_bases[name].as_retriever(search_type='mmr')
    all_retrievers[name]['multi_query'] = MultiQueryRetriever.from_llm(
        retriever=mmr_retriever,
        llm=llm,
    )

In [20]:
# ContextualCompressionRetriever

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

for name in names:
    # One extractor per store, wrapped around that store's 'base' retriever.
    compressor = LLMChainExtractor.from_llm(llm)
    base_retriever = all_retrievers[name]['base']
    all_retrievers[name]['compression_retriever'] = ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=base_retriever,
    )

In [21]:
# SelfQueryRetriever

from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

# (name, description) of every metadata attribute exposed to the self-query
# retriever. All attributes are declared with type "string".
# NOTE(review): page_number is declared as a string as well — confirm that this
# matches how the value is stored in the vector store metadata.
metadata_field_specs = [
    ("category", "Category of te text content - possible values are NarrativeText and Title"),
    ("date", "Date when the document was published"),
    ("filename", "Name of the file"),
    ("id", "Document ID"),
    ("page_number", "Page number from the original document"),
    ("register_num", "Patent registration number"),
    ("source", "Source that published the document"),
    ("title", "Document title"),
    ("type", "Type of the document - possible value are article, lecture and patent"),
]

metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in metadata_field_specs
]

document_content_description = "Document content"

for name in names:
    # The retriever is built from the LLM and the vector store alone; no
    # document compressor is involved, so none is constructed here.
    all_retrievers[name]['self_query_retriever'] = SelfQueryRetriever.from_llm(
        llm=llm,
        vectorstore=all_bases[name],
        document_contents=document_content_description,
        metadata_field_info=metadata_field_info,
        verbose=True,
    )

In [22]:
# EnsembleRetriever

from langchain.retrievers import EnsembleRetriever

for name in names:
    # Combine every retriever registered for this store, skipping an ensemble
    # left over from a previous run of this cell: otherwise re-running would
    # nest the old ensemble inside the new one.
    member_retrievers = [
        retriever
        for kind, retriever in all_retrievers[name].items()
        if kind != 'ensemble_retriever'
    ]
    all_retrievers[name]['ensemble_retriever'] = EnsembleRetriever(retrievers=member_retrievers)

In [23]:
all_retrievers

{'elements': {'base': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], metadata=None, vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7fd6978c46d0>, search_type='mmr', search_kwargs={}),
  'multi_query': MultiQueryRetriever(tags=None, metadata=None, retriever=VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], metadata=None, vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7fd6978c46d0>, search_type='mmr', search_kwargs={}), llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=PromptTemplate(input_variables=['question'], output_parser=None, partial_variables={}, template='You are an AI language model assistant. Your task is \n    to generate 3 different versions of the given user \n    question to retrieve relevant documents from a vector  database. \n    By generating multiple perspectives on the user question, \n    your goal is to help the user overcome some of the limitations

In [None]:
# Question-answering prompt template with two placeholders, {context} and {query}.
# NOTE(review): the "// i.e ..." annotations sit inside the string literal, so
# they are part of the template text as written — confirm that is intended.
q = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to
make up an answer.

{context} // i.e the pdf text content

Question: {query} // i.e our actualy query, 'Who is the CV about?'
Helpful Answer:"""