#  Extracts PDF metadata to a library catalog

## 0. Installs

In [None]:
# %pip install --upgrade pip
%pip install -r requirements.txt
# %pip install ipython
%pip install pypdf
%pip install bs4
%pip install requests
#%pip install openpyxl
%pip install tabulate

## 1. Imports

In [1]:
from pypdf import PdfReader
import os
import sys
import logging
import uuid
import pandas as pd
from datetime import datetime, timedelta

In [2]:
'''This little code block is used any time you want to import a local module from within a Jupyter Notebook. This is required because Jupyter treats each cell as a module.'''

# Navigate up one level from the current notebook's directory to reach the root directory.
# Note that '__file__' is a quoted string, not the __file__ variable: realpath() resolves it
# against the current working directory, so dirname() of the result is the working directory.
current_dir = os.path.dirname(os.path.realpath('__file__'))
# One level above the working directory.
parent_dir = os.path.dirname(current_dir)
# Make modules that live in the parent directory importable from this notebook.
sys.path.append(parent_dir)

In [3]:
import utils



## 2. Set Configurations

In [4]:
# Folder that is walked recursively to find the PDFs to catalog.
# NOTE(review): absolute, machine-specific path - the notebook cannot run on another
# machine without editing this value.
pdf_source_directory = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/holding back until testing is over/"
# Folder holding the timestamped library catalog .xlsx files (relative path, ends with a separator).
library_catalog_directory = "../docs/library_catalog/"
# strftime pattern applied to every date written into a metadata row.
zulu_format = '%Y-%m-%dT%H:%MZ'
# Values stamped unchanged onto every newly created metadata row.
leadership_scope = "1_National"
curator = "Wilkins,CA"

## 3. Create the metadata dataframe

In [5]:
# Code to create metadata dictionary

def check_dir_exists(pdf_source_directory):
    '''Fail fast when the source path is missing.

    Returns None when ``pdf_source_directory`` exists. Otherwise prints an
    error message and raises ``ValueError``.
    '''
    if os.path.exists(pdf_source_directory):
        return
    print(f"Error: The path '{pdf_source_directory}' does not exist!")
    raise ValueError(f"The path '{pdf_source_directory}' does not exist!")


def check_pdf_for_issues(pdf_path):
    '''
    Report whether the PDF at ``pdf_path`` can be opened.

    The file is opened with PdfReader. When the reader reports the file as
    encrypted, a decrypt with an empty password is attempted and the page
    count is printed. Any exception raised along the way is printed and
    turned into a ``False`` return value; otherwise ``True`` is returned.
    Acting on the result (for example skipping the file) is up to the caller.
    '''
    try:
        reader = PdfReader(pdf_path)
        if not reader.is_encrypted:
            return True
        print(f"Encryption detected for {pdf_path}")
        reader.decrypt("")
        # Reading the page list is what actually exercises the decrypted content.
        print(f"All pages accessed: {len(reader.pages)}")
    except Exception as problem:
        print(f"Issue with {pdf_path}: {problem}")
        return False
    return True


def remove_multiple_extensions(file_name):
    '''Strip every trailing extension from ``file_name``.

    ``os.path.splitext`` is applied repeatedly until it no longer finds an
    extension, so "report.tar.gz" becomes "report". Any dot-separated suffix
    counts as an extension, so "CI_16500.16A.pdf" becomes "CI_16500".
    '''
    stem, extension = os.path.splitext(file_name)
    while extension:
        stem, extension = os.path.splitext(stem)
    return stem



def get_pdf_metadata(pdf_path):
    '''Build one catalog row (a dict) for the PDF at ``pdf_path``.

    Values come from the PDF's document information dictionary, the page
    count, the file name and the notebook-level configuration
    (``leadership_scope``, ``curator``, ``zulu_format``).

    Fallbacks:
      * A PDF with no document information dictionary is treated as having
        empty metadata instead of raising.
      * A missing or empty ``/Title`` falls back to the file name with
        underscores replaced by spaces.
      * A missing or unparseable ``/CreationDate`` falls back to the current
        UTC time (a message is printed when a value was present but could
        not be parsed).

    Returns:
        dict: the metadata row; keys are the catalog column names.
    '''
    with open(pdf_path, 'rb') as f:
        reader = PdfReader(f)
        # pypdf returns None when the file has no document information dictionary.
        metadata = reader.metadata or {}
        file_name = remove_multiple_extensions(os.path.basename(pdf_path))
        readable_name = file_name.replace('_', ' ')

        # PDF dates look like "D:YYYYMMDDHHmmSS..."; [2:16] drops the "D:" prefix
        # and anything after the seconds.
        # NOTE(review): any UTC offset stored in the PDF is discarded here, yet the
        # value is later formatted with a literal "Z" - confirm that is acceptable.
        raw_creation = str(metadata.get('/CreationDate') or '')[2:16]
        try:
            creation_date = datetime.strptime(raw_creation, '%Y%m%d%H%M%S')
        except ValueError:
            if raw_creation:
                print(
                    f"Unparseable /CreationDate in {pdf_path}; using the current UTC time instead")
            creation_date = datetime.utcnow()

        # NOTE(review): 365.25 * 10 is 3652.5 days, so the expiration's time of day
        # lands 12 hours away from the creation time. Kept as-is so new rows are
        # computed the same way as before.
        expiration_date = creation_date + timedelta(days=365.25 * 10)
        creation_date = creation_date.strftime(zulu_format)
        return {
            # "or" also covers a /Title that is present but empty.
            'title': metadata.get('/Title') or readable_name,
            'leadership_scope': leadership_scope,
            'page_count': len(reader.pages),
            'creation_date': creation_date,
            'effective_date': creation_date,
            'tagged_date': "",
            'upsert_date': datetime.utcnow().strftime(zulu_format),
            'expiration_date': expiration_date.strftime(zulu_format),
            'lifecycle': "",  # set during hand edit
            'aux_specific': True,
            'public_release': True,
            'publication_number': readable_name,
            'source': None,
            'organization': None,
            'curator': curator,
            'document_id': utils.compute_doc_id(pdf_path),
            'file_name': file_name,
            # 'pdf_path': pdf_path,
        }

In [6]:
def make_metadata_dict_from_pdfs():
    '''Collect the metadata of every readable PDF under ``pdf_source_directory``.

    The directory is walked recursively. Files whose name ends with ".pdf"
    (case-insensitive) are checked with ``check_pdf_for_issues``; files that
    fail the check are reported and skipped so one bad PDF cannot abort the
    whole run.

    Returns:
        dict: maps each PDF's file name (with extension) to its metadata dict.
        Because the key is the bare file name, a later file with the same
        name in another subfolder replaces the earlier one; a warning is
        printed when that happens.

    Raises:
        ValueError: if ``pdf_source_directory`` does not exist.
    '''
    check_dir_exists(pdf_source_directory)

    all_pdfs_metadata = {}

    for root, _dirs, files in os.walk(pdf_source_directory):
        for file in files:
            if not file.lower().endswith(".pdf"):
                continue

            pdf_path = os.path.join(root, file)
            if not check_pdf_for_issues(pdf_path):
                print(f"Skipping {pdf_path}")
                continue

            if file in all_pdfs_metadata:
                print(
                    f"Warning: more than one file is named '{file}'; keeping {pdf_path}")

            # get_pdf_metadata computes the document id itself, so no separate
            # call is made here.
            all_pdfs_metadata[file] = get_pdf_metadata(pdf_path)

    return all_pdfs_metadata


# Build the {file name: metadata dict} mapping for the configured source folder
# and print it raw so the extracted values can be inspected.
all_pdfs_metadata = make_metadata_dict_from_pdfs()
print(all_pdfs_metadata)

{'006_13_CAMPAIGN_TO_ELIMINATE_SEXUAL_ASSAULT___4JUN2013.pdf': {'title': '006 13 CAMPAIGN TO ELIMINATE SEXUAL ASSAULT   4JUN2013', 'leadership_scope': '1_National', 'page_count': 1, 'creation_date': '2019-11-12T14:27Z', 'effective_date': '2019-11-12T14:27Z', 'tagged_date': '', 'upsert_date': '2024-01-30T20:32Z', 'expiration_date': '2029-11-12T02:27Z', 'lifecycle': '', 'aux_specific': True, 'public_release': True, 'publication_number': '006 13 CAMPAIGN TO ELIMINATE SEXUAL ASSAULT   4JUN2013', 'source': None, 'organization': None, 'curator': 'Wilkins,CA', 'document_id': UUID('bd221a94-cd9a-50ec-8364-6b4ac201ebc7'), 'file_name': '006_13_CAMPAIGN_TO_ELIMINATE_SEXUAL_ASSAULT___4JUN2013'}}


In [7]:
# Turn the {file name: metadata} mapping into a table:
# one row per PDF, one column per metadata field.

new_metadata_df = pd.DataFrame(all_pdfs_metadata).T
print(
    f"Dataframe created with {len(new_metadata_df.index)} rows, {len(new_metadata_df.columns)} columns")

Dataframe created with 1 rows, 17 columns


## 4. Append New PDF Metadata to Library Catalog and save to a New File

In [8]:
def make_xlsx(df: pd.DataFrame, file_name, directory_name) -> None:
    """Write a dataframe to a new, timestamped Excel file.

    The file is created at ``<directory_name>/<file_name><timestamp>.xlsx``,
    where the timestamp is the current UTC time to the minute. Two calls
    within the same minute therefore write to the same file.

    Args:
        df: the dataframe to export. The index is not written.
        file_name: file name prefix; the timestamp is appended directly to it,
            so include a trailing separator character such as "_" if wanted.
        directory_name: target folder, with or without a trailing path
            separator.
    """
    # The timestamp is taken inside the function so every call gets a fresh
    # file name, which prevents accidental overwrites of older exports.
    timestamp = datetime.utcnow().strftime('%Y-%m-%dT%H%MZ')

    # os.path.join inserts the separator only when directory_name lacks one.
    file_path = os.path.join(directory_name, f'{file_name}{timestamp}.xlsx')

    # index=False: the row index is not saved, so the file can be read back
    # with a plain pd.read_excel(path) without an extra "Unnamed: 0" column.
    df.to_excel(file_path, index=False)
    print(f"""Successfully exported:  {file_path}""")

In [9]:
# Locate the newest library catalog workbook and load it.
# NOTE(review): the lookup helper lives in utils; it presumably picks the most
# recent "library_catalog*.xlsx" file in the folder - confirm against utils.
catalog_file_path, last_update_date = utils.get_most_recent_filepath_and_date(
    "library_catalog", library_catalog_directory, "xlsx")

try:
    most_recent_catalog_df = pd.read_excel(catalog_file_path)
    print(f"Successfully imported:  {catalog_file_path}")
    print(
        f"""Dataframe created with {most_recent_catalog_df.shape[0]} rows, {most_recent_catalog_df.shape[1]} columns""")
except Exception as e:
    # Keep the name defined so later cells can test for "no catalog loaded"
    # instead of failing with a NameError.
    most_recent_catalog_df = None
    os.write(
        1, f"Failed to read the most recent library catalog file: {e}\n".encode())
    os.write(1, f"Cannot append so saving as new catalog file: {e}\n".encode())
    # Same "library_catalog_" prefix as the regular export, so the fallback
    # file follows the existing library_catalog_<timestamp>.xlsx naming.
    make_xlsx(new_metadata_df, "library_catalog_", library_catalog_directory)

Successfully imported:  ../docs/library_catalog/library_catalog_2023-12-19T1900Z.xlsx
Dataframe created with 251 rows, 17 columns


In [10]:
def append_dataframes(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    '''Stack ``df2`` underneath ``df1`` and renumber the rows from 0.

    Both frames must have identical columns in the same order; otherwise a
    ``ValueError`` is raised and nothing is combined.
    '''
    if not df1.columns.equals(df2.columns):
        raise ValueError("DataFrames do not have the same columns")
    return pd.concat([df1, df2], ignore_index=True)


def check_for_duplicates(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    '''Return every row whose ``column_name`` value occurs more than once.

    All occurrences are returned (not just the repeats), with their original
    index labels. The result is empty when the column has no repeated values.
    '''
    return df[df.duplicated(subset=column_name, keep=False)]


def compare_dfs(df1, df2):
    '''a utility that prints which columns each DataFrame has that the other lacks

    Handy for diagnosing why two frames cannot be appended to each other.
    Only column names are compared, never row contents.

    usage
        compare_dfs(new_metadata_df, most_recent_catalog_df)
    '''
    first_cols, second_cols = set(df1.columns), set(df2.columns)
    print(f"Columns unique to the first DataFrame: {first_cols - second_cols}")
    print(f"Columns unique to the second DataFrame: {second_cols - first_cols}")


def append_new_metadata_check_and_export(most_recent_catalog_df, new_metadata_df, directory, file_name):
    '''Append the new metadata rows to the catalog and export the result.

    Steps: append ``new_metadata_df`` below ``most_recent_catalog_df``, warn
    about rows sharing a ``document_id`` (they are still exported), then
    write the combined frame to a new timestamped Excel file.

    Args:
        most_recent_catalog_df: the existing catalog, or None when no catalog
            could be loaded.
        new_metadata_df: the rows to append; must have the same columns.
        directory: folder the new Excel file is written to.
        file_name: file name prefix for the new Excel file.

    Returns:
        The combined DataFrame, or None when there is no catalog to append to
        or a ValueError occurs (for example mismatched columns); the error is
        logged in that case.
    '''
    if most_recent_catalog_df is None:
        logging.error(
            "No existing library catalog was loaded; nothing to append to.")
        return None

    try:
        new_catalog_df = append_dataframes(
            most_recent_catalog_df, new_metadata_df)
        duplicate_rows = check_for_duplicates(new_catalog_df, 'document_id')

        if not duplicate_rows.empty:
            logging.warning(
                "Duplicate document IDs found. Use compare_dfs utility function to find and remove")
            # Logged at WARNING rather than INFO: with the default logging
            # configuration INFO records are dropped, so the offending rows
            # would never be shown.
            logging.warning("\n%s", duplicate_rows)

        make_xlsx(new_catalog_df, file_name, directory)
        os.write(
            1, "Sucessfully appended data to new file. \nAdd/edit the metadata of the appended rows befor ingesting.\n".encode())
        return new_catalog_df

    except ValueError as e:
        logging.error(e)
        return None


# Append the new rows to the loaded catalog and export the combined catalog.
# As the last expression in the cell, the returned value is displayed by the notebook.
append_new_metadata_check_and_export(
    most_recent_catalog_df, new_metadata_df, library_catalog_directory, "library_catalog_")

Successfully exported:  ../docs/library_catalog/library_catalog2024-01-30T2033Z.xlsx
Sucessfully appended data to new file. 
Add/edit the metadata of the appended rows befor ingesting.


Unnamed: 0,title,leadership_scope,page_count,creation_date,effective_date,tagged_date,upsert_date,expiration_date,lifecycle,aux_specific,public_release,publication_number,source,organization,curator,document_id,file_name
0,Auxiliary Awards Primer,1_National,20,2016-02-22T0000Z,2016-02-22T0000Z,2023-10-30T0000Z,2023-10-30T0000Z,2026-02-21T0000Z,upserted,True,True,Microsoft Word - A-PRIMER-FEB16.docx,,,Drew_Wilkins,ffe70479c5f8f753b5b7be77c7a3fda2,AUX-AWARDS-PRIMER-FEB16.pdf
1,COAST GUARD AUXILIARY AIDS TO NAVIGATION PROGRAM,1_National,11,1995-06-05T0000Z,1995-06-05T0000Z,2023-10-30T0000Z,2023-10-31T0000Z,,upserted,True,True,CI 16500.16A,,,Drew_Wilkins,6bc832bbf5802496f1cd9ff98ece22f3,Auxiliary Aids to Navigation Program CI_16500_...
2,Introduction to Marine Safety and Environmenta...,1_National,101,2022-08-17T0000Z,2022-08-17T0000Z,2023-10-30T0000Z,2023-10-32T0000Z,2032-08-16T0000Z,upserted,True,True,,,,Drew_Wilkins,178c47ff5ff6cb3613dafdf11a253ee1,Auxiliary_Intro_to_Marine_Safety_and_Environme...
3,Auxiliary Operations Process Guide Volume II-A...,1_National,195,2023-08-10T0000Z,2023-08-10T0000Z,2023-10-30T0000Z,2023-10-33T0000Z,2033-08-09T0000Z,upserted,True,True,AOPG 16798.32A,,,Drew_Wilkins,a55e809e39b7704cd37de85569f0e1cf,Auxiliary_Operations_Process_Guide_Volume_II-A...
4,Auxiliary Operations Process Guide Volume III-...,1_National,51,2023-08-04T0000Z,2023-08-04T0000Z,2023-10-30T0000Z,2023-10-34T0000Z,2033-08-03T0000Z,upserted,True,True,AOPG 16798.33A,,,Drew_Wilkins,e68f4897623b255d6553976bcfe6ef07,Auxiliary_Operations_Process_Guide_Volume_III-...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,13 19 CHIEF DIRECTOR FINAL ACTION ON NATIONAL ...,1_National,1,2019-10-15T0000Z,2019-10-15T0000Z,2023-10-30T0000Z,2023-10-278T0000Z,2029-10-14T0000Z,upserted,True,True,ALAUX_13 19,cgaux.org,,Drew_Wilkins,442d774a36d629a9094fa5e2d39c6739,13_19_CHIEF_DIRECTOR_FINAL_ACTION_ON_NATIONAL_...
248,06 19 AUXILIARY HUMANITARIAN SERVICE AWARD,1_National,1,2019-10-15T0000Z,2019-10-15T0000Z,2023-10-30T0000Z,2023-10-279T0000Z,2029-10-14T0000Z,upserted,True,True,ALAUX_06 19,cgaux.org,,Drew_Wilkins,8fb7d67a8e84b3ac31a1dff9200b07dd,06_19_AUXILIARY_HUMANITARIAN_SERVICE_AWARD.pdf
249,11 19 CELL PHONE USE ONBOARD AUXILIARY FACILITIES,1_National,3,2019-10-15T0000Z,2019-10-15T0000Z,2023-10-30T0000Z,2023-10-280T0000Z,2029-10-14T0000Z,upserted,True,True,ALAUX_11 19,cgaux.org,,Drew_Wilkins,52f580e4f4fde8cbf9d175451bdb09bd,11_19_CELL_PHONE_USE_ONBOARD_AUXILIARY_FACILIT...
250,01 19 RISK MANAGEMENT TRAINING REQUIREMENTS FO...,1_National,1,2019-10-15T0000Z,2019-10-15T0000Z,2023-10-30T0000Z,2023-10-281T0000Z,2029-10-14T0000Z,upserted,True,True,ALAUX_01 19,cgaux.org,,Drew_Wilkins,eb067d4f3f1cafc66a7c128b7649c1ec,01_19_RISK_MANAGEMENT_TRAINING_REQUIREMENTS_FO...
