###  This is the process for grabbing the pdfs and extracting metadat to a library catalog

##### It includes adding the metadata we are going to use. The custom metadata list is defined in this code block at the bottom. Custom fields can be defined in the custom_fields list. The functions take the list as an argument and  checks for the additional custom metadata fields and includes them in the xlsx if they are present in the PDF files.
##### THis identifies potential duplicate PDF files, you can compute a hash (e.g., SHA-256) for each file and compare these values. Files with the same hash value are very likely to be duplicates.  
##### With the check_pdf_issues function in place, before attempting to get metadata from a PDF, the script will first check if the PDF has issues like being encrypted or corrupt. If it's encrypted, it will attempt to decrypt using the provided password (in this case, an empty string). If it's corrupt or if there's any other issue, it will log the problem and skip the file.  

## 0. Installs

In [1]:
# %pip install --upgrade pip
%pip install -r requirements.txt
# %pip install ipython
%pip install pypdf

%pip install bs4
%pip install requests
#%pip install openpyxl
%pip install tabulate

Collecting pypdf
  Using cached pypdf-4.0.1-py3-none-any.whl.metadata (7.4 kB)
Using cached pypdf-4.0.1-py3-none-any.whl (283 kB)
Installing collected packages: pypdf
Successfully installed pypdf-4.0.1
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
from pypdf import PdfReader
import os
import hashlib
import pandas as pd
import glob
from datetime import datetime, date, timedelta
from tabulate import tabulate


ImportError: attempted relative import with no known parent package

In [8]:
'''This litle code block is used anytime you want to import a local module from within a Jupyter Notebook. This is required becuase Jupyter treats each cell as a module.'''

import sys
import os

# Navigate up one level from the current notebook's directory to reach the root directory
current_dir = os.path.dirname(os.path.realpath('__file__'))
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

In [9]:
import utils



In [35]:
source_directory = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/holding back until testing is over/"

zulu_format = '%Y-%m-%dT%H:%MZ'

In [36]:
# Code to create metadata dictionary
# Check if the path exists
def check_dir_exists(source_directory):
    if not os.path.exists(source_directory):
        print(f"Error: The path '{source_directory}' does not exist!")
        raise ValueError(f"The path '{source_directory}' does not exist!")


def compute_pdf_hash(pdf_path):
    '''generate a unique hash for the PDF file'''
    doc_id = hashlib.md5()
    with open(pdf_path, 'rb') as f:
        for block in iter(lambda: f.read(4096), b""):
            doc_id.update(block)
    return doc_id.hexdigest()


def check_pdf_for_issues(pdf_path):
    try:
        pdf = PdfReader(pdf_path)
        if pdf.is_encrypted:
            print(f"Encryption detected for {pdf_path}")
            pdf.decrypt("")
            print(f"All pages accessed: {len(pdf.pages)}")
        return True
    except Exception as e:
        print(f"Issue with {pdf_path}: {e}")
        return False


def remove_multiple_extensions(file_name):
    while os.path.splitext(file_name)[1]:
        file_name = os.path.splitext(file_name)[0]
    return file_name


def get_pdf_metadata(pdf_path):
    '''Extract all metadata fields present in the PDF file 
    along with page count and the hash into a dictionary.'''
    with open(pdf_path, 'rb') as f:
        reader = PdfReader(f)
        metadata = reader.metadata
        file_name = remove_multiple_extensions(os.path.basename(pdf_path))

        CreationDate = metadata.get('/CreationDate', '')[2:16]
        creation_date = datetime.strptime(
            CreationDate, '%Y%m%d%H%M%S') if CreationDate else datetime.utcnow()
        expiration_date = creation_date + timedelta(days=365.25 * 10)
        creation_date = creation_date.strftime(zulu_format)
        return {
            'title': metadata.get('/Title', file_name.replace('_', ' ')),
            'leadership_scope': "1_National",
            'page_count': len(reader.pages),
            'creation_date': creation_date,
            'effective_date': creation_date,
            'upsert_date': datetime.utcnow().strftime(zulu_format),
            'expiration_date': expiration_date.strftime(zulu_format),
            'aux_specific': True,
            'public_release': True,
            'publication_number': file_name.replace('_', ' '),
            'source': None,
            'organization': None,
            'curator': "Drew_Wilkins",
            'document_id': compute_pdf_hash(pdf_path),
            'file_name': file_name
        }

In [37]:
def make_metadata_dict_from_pdfs():
    '''pulls the metadata from all the pdfs into a dataframe with standard formatting
        pdfs in rows and metadata atributes in columns
    '''
    check_dir_exists(source_directory)

    all_pdfs_metadata = {}

    for root, dirs, files in os.walk(source_directory):
        for file in files:
            if file.lower().endswith(".pdf"):
                pdf_path = os.path.join(root, file)
                compute_pdf_hash(pdf_path)
                check_pdf_for_issues(pdf_path)
                pdf_metadata_dict = get_pdf_metadata(pdf_path)
                all_pdfs_metadata[file] = pdf_metadata_dict

    return all_pdfs_metadata


all_pdfs_metadata = make_metadata_dict_from_pdfs()

In [38]:
print(all_pdfs_metadata)

{'006_13_CAMPAIGN_TO_ELIMINATE_SEXUAL_ASSAULT___4JUN2013.pdf': {'title': '006 13 CAMPAIGN TO ELIMINATE SEXUAL ASSAULT   4JUN2013', 'leadership_scope': '1_National', 'page_count': 1, 'creation_date': '2019-11-12T14:27Z', 'effective_date': '2019-11-12T14:27Z', 'upsert_date': '2024-01-28T19:51Z', 'expiration_date': '2029-11-12T02:27Z', 'aux_specific': True, 'public_release': True, 'publication_number': '006 13 CAMPAIGN TO ELIMINATE SEXUAL ASSAULT   4JUN2013', 'source': None, 'organization': None, 'curator': 'Drew_Wilkins', 'document_id': '869cc57fab30ca4301670b5bc7cb4096', 'file_name': '006_13_CAMPAIGN_TO_ELIMINATE_SEXUAL_ASSAULT___4JUN2013'}}


In [39]:
# pulls the metadata from all the pdfs into a dataframe
# pdfs in columns and metadata atributes in rows
metadata_preview = pd.DataFrame(all_pdfs_metadata).transpose()

print(
    f"""Dataframe loaded with metadata for rows, columns: {metadata_preview.shape} \nInspect first row below....\n\n""")
print(f"""INDEX FOR THIS ROW:             {metadata_preview.index[0]}\n""")
print(metadata_preview.iloc[0])

Dataframe loaded with metadata for rows, columns: (1, 15) 
Inspect first row below....


INDEX FOR THIS ROW:             006_13_CAMPAIGN_TO_ELIMINATE_SEXUAL_ASSAULT___4JUN2013.pdf

title                 006 13 CAMPAIGN TO ELIMINATE SEXUAL ASSAULT   ...
leadership_scope                                             1_National
page_count                                                            1
creation_date                                         2019-11-12T14:27Z
effective_date                                        2019-11-12T14:27Z
upsert_date                                           2024-01-28T19:51Z
expiration_date                                       2029-11-12T02:27Z
aux_specific                                                       True
public_release                                                     True
publication_number    006 13 CAMPAIGN TO ELIMINATE SEXUAL ASSAULT   ...
source                                                             None
organization               

In [42]:
from datetime import datetime


def make_xlsx(metadata_preview):
    """write dataframe to an Excel file to edit by hand."""

    now_utc = datetime.utcnow()
    file_timestamp = now_utc.strftime('%Y-%m-%dT%H%MZ')
    file_path = f'../docs/library_catalog/library_catalog_{file_timestamp}.xlsx'

    # Save DataFrame to Excel. Index=True metadata_keys as row 1
    # if Index =True then be sure to pd.read_excel( , index_col=0) when you bring it back in
    metadata_preview.to_excel(file_path, index=False)
    print(f""" editable excel file has been posted as {file_path}""")


 editable excel file has been posted as ../docs/library_catalog/library_doc_catalog_2024-01-28T1953Z.xlsx


In [None]:
file_path, last_update_date = utils.get_most_recent_filepath_and_date("library_catalog", "docs/library_catalog/", "xlsx")
try:
    df = pd.read_excel(file_path)
except Exception as e:
    os.write(1, f"Failed to read the Excel file: {e}\n".encode())
    make_xlsx(metadata_preview)