###  This is the process for grabbing the pdfs and extracting metadat to a library catalog

##### It includes adding the metadata we are going to use. The custom metadata list is defined in this code block at the bottom. Custom fields can be defined in the custom_fields list. The functions take the list as an argument and  checks for the additional custom metadata fields and includes them in the xlsx if they are present in the PDF files.
##### THis identifies potential duplicate PDF files, you can compute a hash (e.g., SHA-256) for each file and compare these values. Files with the same hash value are very likely to be duplicates.  
##### With the check_pdf_issues function in place, before attempting to get metadata from a PDF, the script will first check if the PDF has issues like being encrypted or corrupt. If it's encrypted, it will attempt to decrypt using the provided password (in this case, an empty string). If it's corrupt or if there's any other issue, it will log the problem and skip the file.  

In [None]:
# %pip install --upgrade pip
# %pip install ipython
#%pip install pypdf

%pip install bs4
%pip install requests
#%pip install openpyxl
%pip install tabulate


from pypdf import PdfReader, PdfWriter
import os
import hashlib
import pandas as pd
import glob
from datetime import datetime, date, timedelta
from tabulate import tabulate

In [5]:
source_directory = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/PDF_archive"

In [6]:
# Code to create metadata dictionary
# Check if the path exists
def check_dir_exists(source_directory):
    if not os.path.exists(source_directory):
        print(f"Error: The path '{source_directory}' does not exist!")
        raise ValueError(f"The path '{source_directory}' does not exist!")


def compute_pdf_hash(pdf_path):
    '''generate a unique hash for the PDF file'''
    doc_id = hashlib.md5()
    with open(pdf_path, 'rb') as f:
        for block in iter(lambda: f.read(4096), b""):
            doc_id.update(block)
    return doc_id.hexdigest()


def check_pdf_for_issues(pdf_path):
    try:
        pdf = PdfReader(pdf_path)
        if pdf.is_encrypted:
            print(f"Encryption detected for {pdf_path}")
            pdf.decrypt("")
            print(f"All pages accessed: {len(pdf.pages)}")
        return True
    except Exception as e:
        print(f"Issue with {pdf_path}: {e}")
        return False


def get_pdf_metadata(pdf_path):
    '''extract all metadata fields present in the PDF file 
    along with page count and the hash into a dictionary
    '''
    pdf_metadata = {}
    with open(pdf_path, 'rb') as f:
        reader = PdfReader(f)
        existing_pdf_metadata = reader.metadata
        # existing pdf metadata code deleted from here

        # Create the pdf metadata
        file_name = os.path.basename(pdf_path)
        while '.' in file_name:
            # loop to remove mult extensions
            file_name = os.path.splitext(file_name)[0]
        pdf_metadata['title'] = existing_pdf_metadata.get('/Title')
        if not pdf_metadata['title']:
            pdf_metadata['title'] = file_name.replace('_', ' ')
        pdf_metadata['leadership_scope'] = "1_National"
        pdf_metadata['page_count'] = len(reader.pages)  # Add page count
        creation_date_str = existing_pdf_metadata.get(
            '/CreationDate', '')[2:10]
        if creation_date_str:
            created_date = datetime.strptime(creation_date_str, '%Y-%m-%dT%H:%MZ')
        else:
            created_date = date.today()
        pdf_metadata['creation_date'] = created_date.strftime('%Y-%m-%dT%H:%MZ')
        pdf_metadata['effective_date'] = created_date.strftime('%Y-%m-%dT%H:%MZ')
        pdf_metadata['upsert_date'] = date.today().strftime('%Y-%m-%dT%H:%MZ')
        expiration_date = created_date + timedelta(days=365.25*10)
        pdf_metadata['expiration_date'] = expiration_date.strftime('%Y-%m-%dT%H:%MZ')
        pdf_metadata['aux_specific'] = True
        pdf_metadata['public_release'] = True
        pdf_metadata['publication_number'] = pdf_metadata['Title']
        pdf_metadata['source'] = None
        # not curently used. Can be CG Org or Unit Number
        pdf_metadata['organization'] = None
        pdf_metadata['curator'] = "Drew_Wilkins"
        pdf_metadata['document_id'] = compute_pdf_hash(
            pdf_path)  # Compute and add the hash of the PDF
        pdf_metadata['file_name'] = file_name  # add the filename
        # this metadata is needed to write the metadata back to the pdfs
        pdf_metadata['pdf_path'] = pdf_path
    return pdf_metadata

In [7]:
def make_metadata_dict_from_pdfs():
    '''pulls the metadata from all the pdfs into a dataframe with standard formatting
        pdfs in rows and metadata atributes in columns
    '''
    check_dir_exists(source_directory)

    all_pdfs_metadata = {}

    for root, dirs, files in os.walk(source_directory):
        for file in files:
            if file.lower().endswith(".pdf"):
                pdf_path = os.path.join(root, file)
                compute_pdf_hash(pdf_path)
                check_pdf_for_issues(pdf_path)
                pdf_metadata_dict = get_pdf_metadata(pdf_path)
                all_pdfs_metadata[file] = pdf_metadata_dict

    return all_pdfs_metadata


all_pdfs_metadata = make_metadata_dict_from_pdfs()

ValueError: time data '20-10-29' does not match format '%Y%m%d'

In [None]:
print(f"\n\nBased on this dictionary...\n {all_pdfs_metadata}")

In [None]:
#pulls the metadata from all the pdfs into a dataframe with standard formatting
#pdfs in rows and metadata atributes in columns
metadata_preview = pd.DataFrame(all_pdfs_metadata).transpose()

print(f"""Dataframe loaded with metadata for rows, columns: {metadata_preview.shape} \nInspect first row below....\n\n""")
print(f"")
# transpose to pdfs in rows and metadata in columns
# pdfs_df_edit_me = pdfs_df.transpose()

print(f"""INDEX FOR THIS ROW:             {metadata_preview.index[0]}\n""")
print(metadata_preview.iloc[0])


In [None]:
from datetime import datetime

def make_xlsx(all_pdfs_metadata):
    """write dataframe to an Excel file to edit by hand."""
    
    now_utc = datetime.utcnow()
    timestamp = now_utc.strftime('%Y-%m-%dT%H:%MZ')
    file_path = f'../docs/library_catalog/library_doc_catalog_{timestamp}.xlsx'
    
    # Save DataFrame to Excel. Index=True metadata_keys as row 1
    # if Index =True then be sure to pd.read_excel( , index_col=0) when you bring it back in
    all_pdfs_metadata.to_excel(file_path, index=True)

make_xlsx(all_pdfs_metadata)

print(f""" editable excel file has been posted as {file_path}""")