In [50]:
# Elements loader

from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.document_loaders import DirectoryLoader

# Walk "docx_docs" (sub-directories included, because recursive=True) and load
# every file found with UnstructuredWordDocumentLoader. The loader is asked for
# "elements" mode with the "fast" strategy.
# NOTE(review): presumably "elements" mode yields one Document per detected
# element (title, paragraph, ...) rather than one per file — confirm.
elements_loader = DirectoryLoader(
    path="docx_docs",
    loader_cls=UnstructuredWordDocumentLoader,
    loader_kwargs={"mode": "elements", "strategy": "fast"},
    recursive=True,
)

docs_elements = elements_loader.load()


In [36]:
# Single-mode loader and recursive character splitters

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.document_loaders import DirectoryLoader

# Load every Word document under "docx_docs" (sub-directories included) in
# "single" mode with the "fast" strategy.
# NOTE(review): presumably "single" mode returns one Document per file — confirm.
single_loader = DirectoryLoader(
    path="docx_docs",
    loader_cls=UnstructuredWordDocumentLoader,
    loader_kwargs={"mode": "single", "strategy": "fast"},
    recursive=True,
)
docs_single = single_loader.load()

# Separator regexes shared by both splitters: a blank line, a position right
# after a newline, a position right after ". ".
# The last pattern is written as a raw string because "\." is not a valid
# Python string escape; the non-raw form only works by accident and emits a
# DeprecationWarning/SyntaxWarning on newer interpreters. The resulting string
# value is unchanged.
SPLIT_SEPARATORS = ["\n\n", "(?<=\n)", r"(?<=\. )"]

# Each splitter receives its own copy of the list so they never share state.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
    separators=list(SPLIT_SEPARATORS),
    is_separator_regex=True,
)
# Smaller-chunk variant; not used in the cells visible here.
child_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
    separators=list(SPLIT_SEPARATORS),
    is_separator_regex=True,
)

docs_recursive = text_splitter.split_documents(docs_single)


In [37]:
# NLTK splitter
from langchain.text_splitter import NLTKTextSplitter

# Splitter built with the library defaults (no chunk size/overlap given here).
# NOTE(review): presumably requires the NLTK tokenizer data to be installed — confirm.
nltk_splitter = NLTKTextSplitter()

# Split the whole-file documents loaded in "single" mode (docs_single comes
# from the previous cell, so that cell must have been run first).
docs_nltk = nltk_splitter.split_documents(docs_single)

In [38]:
# Filename metadata rename

def _source_to_filename(docs):
    """Replace each document's 'source' path with a 'filename' entry.

    The last '/'-separated component of metadata['source'] is stored under
    metadata['filename'] and the 'source' key is removed.

    Documents that already carry a 'filename' are skipped, which makes this
    cell safe to re-run: previously a second run raised KeyError ('source' was
    already deleted) or, once the later "Adding metadata" cell had written an
    archive name into 'source', silently overwrote 'filename' with that value.
    """
    for doc in docs:
        if 'filename' in doc.metadata:
            continue
        doc.metadata['filename'] = doc.metadata['source'].split('/')[-1]
        del doc.metadata['source']


# The same renaming applies to both split collections.
for docs in (docs_nltk, docs_recursive):
    _source_to_filename(docs)


In [51]:
# Remove unneeded metadata from Elements documents

# Metadata fields that are stripped from every element document.
keys_to_delete = [
    'source',
    'file_directory',
    'last_modified',
    'filetype',
    'primary',
    'text_as_html',
    'emphasized_text_tags',
    'emphasized_text_contents',
]

for element_doc in docs_elements:
    for unwanted_key in keys_to_delete:
        # pop with a default drops the key when present and is a no-op otherwise.
        element_doc.metadata.pop(unwanted_key, None)

# Removing unneeded documents: drop every element whose category is listed here.
categories_to_remove = ['PageBreak', 'ListItem', 'Footer', 'Table', 'UncategorizedText', 'Header']

kept_elements = []
for element_doc in docs_elements:
    if element_doc.metadata['category'] in categories_to_remove:
        continue
    kept_elements.append(element_doc)
docs_elements = kept_elements

In [54]:
# Plots

# from matplotlib import pyplot as plt

# plt.figure(1)
# plt.hist([len(d.page_content) for d in docs_nltk], bins = 100)
# plt.grid()
# plt.title("NLTK документи")
# plt.xlabel("број карактера у документу")
# plt.ylabel("број докумената")
# plt.figure(2)
# plt.hist([len(d.page_content) for d in docs_elements], bins = 100)
# plt.grid()
# plt.title("Elements documents")
# plt.xlabel("број карактера у документу")
# plt.ylabel("број докумената")
# plt.figure(3)
# plt.hist([len(d.page_content) for d in docs_recursive], bins = 100)
# plt.grid()
# plt.title("Recursive documents")
# plt.xlabel("број карактера у документу")
# plt.ylabel("број докумената")

In [55]:
# Metadata parsing

import json

# Maps a document key (file name without extension) to its metadata record;
# filled by the parsing loop further down in this cell.
metadata_dict = {}

# Read the metadata export. The context manager closes the file handle
# (the bare open() call leaked it), and the explicit encoding avoids depending
# on the platform default when decoding the JSON text.
with open('metadata.json', encoding='utf-8') as metadata_file:
    metadatas = json.load(metadata_file)

# Garbled character sequences and the punctuation mark each one stands for.
# Order matters: the shortest sequence is a prefix of the other two, so it
# has to stay last.
ascii_replace_dict = {
    'â€™': '’',
    'â€œ': '“',
    'â€': '”',
}


def replace_non_ascii(s):
    """Return `s` with every sequence listed in `ascii_replace_dict` replaced.

    Replacements are applied one after another in the dictionary's insertion
    order; text containing none of the sequences is returned unchanged.
    """
    fixed = s
    for garbled in ascii_replace_dict:
        fixed = fixed.replace(garbled, ascii_replace_dict[garbled])
    return fixed

# Record ids that are skipped because they duplicate other records.
DUPLICATE_IDS = ('228', '413')

# Build metadata_dict: one cleaned record per document key.
# NOTE(review): assumes the third top-level element of the export holds the
# list of records under 'data' — confirm against metadata.json.
for metadata in metadatas[2]['data']:
    if metadata['id'] in DUPLICATE_IDS:
        # These are duplicates, not needed
        continue

    # Key = last path component of the file URL without its extension.
    # A name with no '.' is used as-is (previously it produced an empty key).
    file_name = metadata['file_url'].split('/')[-1]
    stem, dot, _extension = file_name.rpartition('.')
    key = stem if dot else file_name

    # Replace garbled character sequences in the key
    key = replace_non_ascii(key)

    # Make sure that there are no duplicate entries for the same document type.
    # NOTE(review): a record with the same key but a different type silently
    # replaces the earlier one — confirm this is intended.
    existing = metadata_dict.get(key)
    is_duplicate = existing is not None and metadata['type'] == existing['type']
    if is_duplicate:
        print(key)
        print(metadata['id'])
    assert not is_duplicate, f"duplicate metadata entry for {key!r} (id {metadata['id']})"

    # Keep only fields that have a value, and drop 'file_url': it was needed
    # only to derive the key. (The former trailing loop that tried to delete
    # None values iterated the outer dictionary, whose values are never None,
    # so it never did anything and has been removed.)
    metadata_dict[key] = {
        field: value
        for field, value in metadata.items()
        if value is not None and field != 'file_url'
    }

In [57]:
# Adding metadata

def _attach_metadata(docs):
    """Merge the matching metadata_dict record into every document in `docs`.

    The lookup key is the document's 'filename' without its extension, which
    mirrors how the keys of metadata_dict are derived from the file URL.
    The extension is stripped at the last '.', so this no longer assumes a
    five-character ".docx" suffix (the previous `[:-5]` slice mangled the key
    for any other extension).

    Raises AssertionError when a document has no metadata record.
    """
    for doc in docs:
        filename = doc.metadata['filename']
        stem, dot, _extension = filename.rpartition('.')
        key = stem if dot else filename
        assert key in metadata_dict, f"no metadata entry for {filename!r}"
        # update() overwrites any field the document already has under the
        # same name with the value from the metadata record.
        doc.metadata.update(metadata_dict[key])


# The same enrichment applies to all three document collections.
for docs in (docs_elements, docs_nltk, docs_recursive):
    _attach_metadata(docs)

Experiments With Alternate Currents of High Potential and High Frequency (lecture)
Experiments With Alternate Currents of High Potential and High Frequency (lecture)
Experiments With Alternate Currents of High Potential and High Frequency (lecture)
Experiments With Alternate Currents of High Potential and High Frequency (lecture)
Experiments With Alternate Currents of High Potential and High Frequency (lecture)
Experiments With Alternate Currents of High Potential and High Frequency (lecture)
Experiments With Alternate Currents of High Potential and High Frequency (lecture)
Experiments With Alternate Currents of High Potential and High Frequency (lecture)
Experiments With Alternate Currents of High Potential and High Frequency (lecture)
Experiments With Alternate Currents of High Potential and High Frequency (lecture)
Experiments With Alternate Currents of High Potential and High Frequency (lecture)
Experiments With Alternate Currents of High Potential and High Frequency (lecture)
Expe

In [103]:
# Sanity check: show the metadata of the first document of each collection.
for docs_sample in (docs_nltk, docs_elements, docs_recursive):
    print(docs_sample[0].metadata)

{'filename': 'Experiments With Alternate Currents of High Potential and High Frequency (lecture).docx', 'id': '213', 'title': 'Experiments with Alternate Currents of High Potential and High Frequency by Nikola Tesla', 'date': '1892-02-01', 'type': 'lecture', 'source': 'Archives  of the Nikola Tesla Museum'}
{'source': 'Archives  of the Nikola Tesla Museum', 'filename': 'Experiments With Alternate Currents of High Potential and High Frequency (lecture).docx', 'file_directory': 'docx_docs/lectures', 'last_modified': '2023-08-24T11:03:19', 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'page_number': 1, 'emphasized_text_contents': ['Experiments with Alternate Currents of High Potential and High Frequency'], 'emphasized_text_tags': ['b'], 'category': 'Title', 'id': '213', 'title': 'Experiments with Alternate Currents of High Potential and High Frequency by Nikola Tesla', 'date': '1892-02-01', 'type': 'lecture'}
{'filename': 'Experiments With Alternat

In [1]:
# Vectorstore creation

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Embedding model, constructed with default settings.
# NOTE(review): presumably picks up the OpenAI API key from the environment — confirm.
embeddings = OpenAIEmbeddings()

# Directory handed to Chroma as its persistence location.
chroma_dir = "./chroma_db"

# Build the vector store from the cleaned element documents. This depends on
# docs_elements from the earlier cells; the recorded NameError below comes
# from running this cell on a kernel where those cells had not been executed.
vectorstore = Chroma.from_documents(
    documents=docs_elements,
    embedding=embeddings,
    persist_directory=chroma_dir,
)

NameError: name 'docs_elements' is not defined

In [None]:
# TODO: Make a file with the list of files