### Process for auditing the Qdrant DB and outputting a list of its documents

In [2]:
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())


# CONFIG: qdrant
# Credentials and endpoint are read from the environment; os.environ.get
# returns None (it does not raise) when a variable is not set.
api_key = os.environ.get("QDRANT_API_KEY")
url = os.environ.get("QDRANT_URL")
# Name of the collection that the scroll cell reads from.
qdrant_collection_name = "ASK_vectorstore"
# Only required for local instance /private/tmp/local_qdrant
# NOTE(review): machine-specific absolute path; none of the cells shown here
# reference qdrant_path.
qdrant_path = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/qdrant"

In [3]:
# Load an instance of the client. WITHOUT LANGCHAIN
# 22.5 sec for cloud
# Running this locally places a lock file in the qdrant directory
from qdrant_client import QdrantClient

# Connect using the `url` and `api_key` values from the config cell, asking
# the client to prefer gRPC.
client = QdrantClient(url,
                      prefer_grpc=True,
                      api_key=api_key,
                      )

'''usage'''
# Fetch the list of collections; the result is kept in `content` but is not
# read by any of the other cells shown here.
content = (client.get_collections())

In [4]:
# Gets all points (i.e., records) and returns them as a 2-tuple
# Tuple[List[types.Record], Optional[types.PointId]]
# 7.1 sec for cloud
#
# A single scroll call returns at most `limit` points together with the offset
# of the next page. Keep scrolling until that offset comes back as None so a
# collection larger than `limit` is not silently truncated.

_records = []
_next_offset = None
while True:
    _page, _next_offset = client.scroll(
        collection_name=qdrant_collection_name,
        limit=100000,
        offset=_next_offset,
        with_payload=True,  # change to False to skip the payload
        with_vectors=False  # change to True to see the vectors
    )
    _records.extend(_page)
    if _next_offset is None:
        break

# Keep the (records, next_page_offset) shape that the cells below index into.
all_records = (_records, _next_offset)

print(f"""all_records is a {type(all_records)} containing {len(all_records)}elements. 
    The first element is a {type(all_records[0])} of length {len(all_records[0])}).
    The second element is a {type(all_records[1])} of length unknown
    """)

# all_records[0][0]  # see the first point

all_records is a <class 'tuple'> containing 2elements. 
    The first element is a <class 'list'> of length 10357).
    The second element is a <class 'NoneType'> of length unknown
    


In [5]:
def create_record_dict(all_records):
    """Map each record's ID to its metadata.

    The input records are left untouched: the returned metadata dictionaries
    are shallow copies, so later cells can add or remove keys without
    altering the scrolled records.

    Args:
        all_records: A 2-tuple whose first element is the list of records
            (the shape produced by ``client.scroll``). Each record must
            expose ``id`` and a ``payload`` mapping with a ``'metadata'``
            entry.

    Returns:
        dict: ``{record.id: metadata}``. When the metadata is a dict, the
        value is a copy of it with any ``'page_content'`` key left out;
        any other metadata value is stored as-is.

    Raises:
        KeyError: If a record's payload has no ``'metadata'`` entry.
    """
    records = all_records[0]
    records_dict = {}

    for record in records:
        metadata = record.payload['metadata']

        # Filter into a new dict instead of deleting from record.payload:
        # the deletion mutated the caller's data and never affected the
        # metadata that is actually returned.
        if isinstance(metadata, dict):
            metadata = {
                key: value
                for key, value in metadata.items()
                if key != 'page_content'
            }

        records_dict[record.id] = metadata

    return records_dict


'''usage'''
# Build the {record id: metadata} lookup that the audit cells below work on.
all_records_dict = create_record_dict(all_records)
print(type(all_records_dict))
# len() counts one entry per distinct record id.
print(
    f"Number of points: {len(all_records_dict)}.  Each point is a vector of floats, that is associated with an ID and a payload")

### Count Number of PDFs

In [35]:
def count_pdfs_in_records(all_records_dict):
    """Count the distinct ``'source'`` values across all records.

    Args:
        all_records_dict: Mapping of record id to a metadata dict; every
            metadata dict must contain a ``'source'`` key.

    Returns:
        int: Number of different sources found.

    Raises:
        KeyError: If a metadata dict has no ``'source'`` key.
    """
    # A set comprehension collapses repeated sources automatically.
    distinct_sources = {
        metadata['source'] for metadata in all_records_dict.values()
    }
    return len(distinct_sources)


'''usage'''
# Report how many distinct sources the collection's records refer to.
print("Number of PDFs:", count_pdfs_in_records(all_records_dict))

Number of PDFs: 245


### List Duplicate PDFs

In [None]:
def find_duplicate_pdfs_in_records(all_records_dict):
    """Return the sources that have at least one repeated (source, page) pair.

    NOTE: a later cell in this notebook defines another function with this
    same name (returning (source, page) pairs); once that cell has run, it
    replaces this definition.

    Args:
        all_records_dict: Mapping of record id to a metadata dict; every
            metadata dict must contain ``'source'`` and ``'page'`` keys.

    Returns:
        list: Each affected source exactly once, in the order in which its
        first repeated pair was encountered, so the listing is the same on
        every run for the same input.

    Raises:
        KeyError: If a metadata dict lacks ``'source'`` or ``'page'``.
    """
    seen_pairs = set()
    # A dict keeps insertion order, so it serves as an ordered set here;
    # converting a plain set to a list gave an order that varied between runs.
    duplicate_sources = {}

    for record in all_records_dict.values():
        source = record['source']
        key = (source, record['page'])

        if key in seen_pairs:
            duplicate_sources[source] = None
        else:
            seen_pairs.add(key)

    return list(duplicate_sources)


'''usage'''
# List every source that has at least one repeated (source, page) pair.
duplicate_pdfs = find_duplicate_pdfs_in_records(all_records_dict)
print("Number of duplicate PDFs:", len(
    duplicate_pdfs), "\r\n\r\nPDFs with Duplicate Pages:")
for pdf in duplicate_pdfs:
    print(pdf)

### List Duplicate PDF Pages

In [None]:
def find_duplicate_pdfs_in_records(all_records_dict):
    """Collect every (source, page) pair that appears on more than one record.

    NOTE: this reuses the name of the function defined in the previous cell
    and replaces it once this cell has run.

    Args:
        all_records_dict: Mapping of record id to a metadata dict; every
            metadata dict must contain ``'source'`` and ``'page'`` keys.

    Returns:
        set: The ``(source, page)`` tuples that were encountered at least
        twice.

    Raises:
        KeyError: If a metadata dict lacks ``'source'`` or ``'page'``.
    """
    first_sightings = set()
    repeated_pairs = set()

    for metadata in all_records_dict.values():
        pair = (metadata['source'], metadata['page'])

        # First time this pair shows up: remember it and move on.
        if pair not in first_sightings:
            first_sightings.add(pair)
            continue

        repeated_pairs.add(pair)

    return repeated_pairs


'''usage'''
# Rebinds duplicate_pdfs: here it holds (source, page) tuples, not the plain
# source names assigned by the previous cell.
duplicate_pdfs = find_duplicate_pdfs_in_records(all_records_dict)
print("List of Duplicate Pages:")
for source, page in duplicate_pdfs:
    print(f"{source}, Page: {page}")

In [57]:
def create_unique_sources_dict(all_records_dict):
    """Keep only the first record encountered for each distinct source.

    Args:
        all_records_dict: Mapping of record id to a metadata dict; every
            metadata dict must contain a ``'source'`` key.

    Returns:
        dict: ``{record_id: metadata}`` with one entry per source, in the
        order the sources were first seen. The metadata objects are the
        same objects as in the input (not copies).

    Raises:
        KeyError: If a metadata dict has no ``'source'`` key.
    """
    # source -> (record_id, metadata) of the first record carrying it;
    # setdefault leaves an existing entry alone, so later records lose.
    first_by_source = {}
    for record_id, metadata in all_records_dict.items():
        first_by_source.setdefault(metadata['source'], (record_id, metadata))

    # Re-key the surviving (record_id, metadata) pairs by record id.
    return dict(first_by_source.values())


'''usage'''
# Reduce the records to one representative entry per source.
unique_sources_dict = create_unique_sources_dict(all_records_dict)
# Bare expression: its value (the number of unique sources) is only visible
# where the environment echoes the last expression, e.g. a notebook cell.
len(unique_sources_dict)

In [30]:
def format_unique_source_dict(unique_sources_dict):
    '''Format the per-source metadata and return it as a dataframe.

    Each row is one metadata dict plus a ``source_short`` column holding the
    source's file name without directory or extension. The ``page`` column
    is dropped when present.

    Args:
        unique_sources_dict: Mapping of record id to a metadata dict; every
            metadata dict must contain a ``'source'`` key. The dicts are
            not modified.

    Returns:
        pandas.DataFrame: One row per entry, without a ``page`` column. An
        empty input gives an empty dataframe.

    Raises:
        KeyError: If a metadata dict has no ``'source'`` key.
    '''
    formatted_list = []

    for metadata in unique_sources_dict.values():
        # Extract the base filename without the extension
        base_filename = os.path.splitext(
            os.path.basename(metadata['source']))[0]

        # Build a new row rather than writing 'source_short' into the
        # caller's metadata dict.
        formatted_list.append({**metadata, 'source_short': base_filename})

    dataframe = pd.DataFrame(formatted_list)

    # errors='ignore' keeps this from raising when there is no 'page' column
    # (empty input, or records that carry no page number).
    return dataframe.drop(columns=['page'], errors='ignore')


'''usage'''
# Turn the one-entry-per-source dictionary into the dataframe that is written
# to Excel further down.
unique_sources_df = format_unique_source_dict(unique_sources_dict)

In [None]:
from datetime import datetime


def write_library_xlsx(unique_sources_df):
    """Write the dataframe to a timestamped Excel file under ``../reports``.

    The file is named ``library_document_list_<timestamp>.xlsx``, where the
    timestamp is the current UTC time formatted as ``%d%b%Y-%H%M``. The
    target directory is created if it does not exist yet.

    Args:
        unique_sources_df: Dataframe to export; it is written without its
            index.

    Returns:
        None
    """
    # Imported here so the function does not depend on which datetime names
    # the surrounding cell happened to import.
    from datetime import datetime, timezone

    # Get the current date and time in Zulu (UTC) time. datetime.utcnow() is
    # deprecated since Python 3.12; an aware UTC datetime formats identically.
    now_utc = datetime.now(timezone.utc)
    timestamp = now_utc.strftime('%d%b%Y-%H%M')

    # Specify the relative path to save the Excel file with the timestamp appended
    file_path = f'../reports/library_document_list_{timestamp}.xlsx'

    # to_excel does not create missing directories, so make sure it exists.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # Save DataFrame to Excel
    unique_sources_df.to_excel(file_path, index=False)


'''usage'''
# Export the document list; the output path is relative to the current
# working directory.
write_library_xlsx(unique_sources_df)