In [1]:
import qdrant_client
from llama_index.core import SimpleDirectoryReader
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.indices import MultiModalVectorStoreIndex
import yaml


In [2]:
def get_document_metadata(filepath):
    with open(filepath, 'r') as file:
        content = file.read()
        _, front_matter, _ = content.split('---', 2)
        data = yaml.safe_load(front_matter)
    return data

In [3]:
# Create a local Qdrant vector store
client = qdrant_client.QdrantClient(path="qdrant_db")

In [17]:
text_store = QdrantVectorStore(
    client=client, collection_name="text_collection"
)
image_store = QdrantVectorStore(
    client=client, collection_name="image_collection"
)
storage_context = StorageContext.from_defaults(
    vector_store=text_store, image_store=image_store
)

In [39]:
client.delete_collection(collection_name="text_collection")
client.delete_collection(collection_name="image_collection")

True

In [47]:
reader = SimpleDirectoryReader(input_dir="../data/DC-cleaned-md", recursive=True, file_metadata=get_document_metadata)

In [72]:
all_docs = []
for docs in reader.iter_data():
    # <do something with the documents per file>
    print(docs)
    all_docs.extend(docs)

[Document(id_='e83baa61-ba22-49e0-9a30-c9e575857989', embedding=None, metadata={'title': 'Advisory Notes', 'link': 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/gross-floor-area/GFA/Advisory-Notes', 'date': '14 July 2023'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='---\r\ntitle: Advisory Notes\r\nlink: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/gross-floor-area/GFA/Advisory-Notes\r\ndate: 14 July 2023\r\n---\r\n\r', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), Document(id_='7e5bf029-35dc-4019-a366-d799f3b5c2ce', embedding=None, metadata={'title': 'Advisory Notes', 'link': 'https://www.ura.gov.sg/Corporate/Gui

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 2939: character maps to <undefined>

In [79]:
def char_at_byte_index(file_path, byte_index, encoding='utf-8'):
    with open(file_path, 'rb') as file:
        # Seek to the byte index
        file.seek(byte_index)
        
        # Read one byte
        byte = file.read(1)
        
        # Decode the byte to a character
        char = byte.decode(encoding)
        
        return char

# Example usage
file_path = '..\\data\\DC-cleaned-md\\Non-Residential\\Hotel\\Waterbodies.md'
byte_index = 2933  # Replace with the desired byte index
character = char_at_byte_index(file_path, byte_index)
print(f"The character at byte index {byte_index} is: '{character}'")

The character at byte index 2933 is: ' '


In [71]:
import os

def replace_quotes_in_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        updated_content = content.replace('“', '"').replace('”', '"').replace('【', "(").replace('】', ')')
        
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(updated_content)
        
        return f"Processed {file_path}"
    
    except Exception as e:
        return f"Error processing file {file_path}: {e}"

def process_directory(directory_path):
    log = []
    for root, dirs, files in os.walk(directory_path):
        for filename in files:
            if filename.endswith(".md"):
                file_path = os.path.join(root, filename)
                log.append(replace_quotes_in_file(file_path))
    
    return log

# Path to the directory containing your markdown files
directory_path = '../data/DC-cleaned-md'

# Process the directory and log the results
log_entries = process_directory(directory_path)

# Print the log entries
for entry in log_entries:
    print(entry)

Processed ../data/DC-cleaned-md\gross-floor-area.md
Processed ../data/DC-cleaned-md\gross-floor-area\GFA.md
Processed ../data/DC-cleaned-md\gross-floor-area\GFA\Advisory-Notes.md
Processed ../data/DC-cleaned-md\gross-floor-area\GFA\AutomatedTellerMachineandVendingMachineKiosks.md
Processed ../data/DC-cleaned-md\gross-floor-area\GFA\Balconies.md
Processed ../data/DC-cleaned-md\gross-floor-area\GFA\BasementDiaphragmWalls.md
Processed ../data/DC-cleaned-md\gross-floor-area\GFA\BayWindows.md
Processed ../data/DC-cleaned-md\gross-floor-area\GFA\BicycleParkingSpacesandEnd-of-TripFacilities.md
Processed ../data/DC-cleaned-md\gross-floor-area\GFA\CableChambers.md
Processed ../data/DC-cleaned-md\gross-floor-area\GFA\CarParkMotorcycleParksandRelatedFacilities.md
Processed ../data/DC-cleaned-md\gross-floor-area\GFA\Catwalks.md
Processed ../data/DC-cleaned-md\gross-floor-area\GFA\CommunalRoofTerrace.md
Processed ../data/DC-cleaned-md\gross-floor-area\GFA\Covered-greenhouses-farms.md
Processed ../d

In [61]:
documents = reader.load_data()

ScannerError: mapping values are not allowed here
  in "<unicode string>", line 4, column 7:
    date: : 28th July 2023
          ^

In [None]:
client.close()

In [9]:
# Create the MultiModal index
index = MultiModalVectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

In [10]:
test_query = "Compare the GFA of retail space from projects in the pipeline for 4th Quarter 2023 compared to the 1st Quarter 2023"
# generate  retrieval results
retriever = index.as_retriever(similarity_top_k=3)
retrieval_results = retriever.retrieve(test_query)
from llama_index.core.response.notebook_utils import display_source_node
for res_node in retrieval_results:
    display_source_node(res_node, source_length=1000, show_source_metadata=True)

**Node ID:** 9e849dee-19db-4aea-b075-e47c073087f4<br>**Similarity:** 0.8079956237489452<br>**Text:** ---
title: Gross Floor Area (GFA) Handbook
link: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/gross-floor-area
date: 14 July 2023
---<br>**Metadata:** {'title': 'Gross Floor Area (GFA) Handbook', 'link': 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/gross-floor-area', 'date': '14 July 2023'}<br>

**Node ID:** 7dfc889f-6dd8-4081-a4d9-75243457b5fa<br>**Similarity:** 0.7967838638159985<br>**Text:** Gross Floor Area (GFA) Handbook

The principles and illustrations used in this Gross Floor Area (GFA) handbook are not exhaustive in covering all building designs. URA reserves the right to interpret GFA matters based on the specific design of a development proposal, depending on the merits of the proposal.<br>**Metadata:** {'title': 'Gross Floor Area (GFA) Handbook', 'link': 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/gross-floor-area', 'date': '14 July 2023'}<br>