### This is the process for collecting the PDFs and getting them into a form that can be chunked and ingested.

##### It includes the custom metadata we are going to use. The custom metadata list is defined in this code block at the bottom. Custom fields can be defined in the custom_fields list. The functions take the list as an argument, check for the additional custom metadata fields, and include them in the CSV if they are present in the PDF files.
##### To identify potential duplicate PDF files, compute a hash (e.g., SHA-256) for each file and compare the values. Files with the same hash value are very likely to be duplicates.  
##### With the check_pdf_issues function in place, before attempting to get metadata from a PDF, the script will first check if the PDF has issues like being encrypted or corrupt. If it's encrypted, it will attempt to decrypt using the provided password (in this case, an empty string). If it's corrupt or if there's any other issue, it will log the problem and skip the file.  
##### The CSV includes a 'Page Count' column showing the number of pages in each PDF.

In [None]:
#%pip install --upgrade pip

#%pip install bs4
#%pip install requests
#%pip install openpyxl
#%pip install tabulate

from pypdf import PdfReader, PdfWriter
import os
import hashlib
import pandas as pd
import glob
from datetime import datetime, date, timedelta
from tabulate import tabulate

In [None]:
# Code to create metadata dictionary 
# Check if the path exists
def check_dir_exists(source_directory):
    """Verify that source_directory exists.

    Returns None when the path exists. Otherwise prints an error line and
    raises ValueError carrying the offending path.
    """
    if os.path.exists(source_directory):
        return
    print(f"Error: The path '{source_directory}' does not exist!")
    raise ValueError(f"The path '{source_directory}' does not exist!")
    


def compute_pdf_hash(pdf_path):
    '''Return the hexadecimal MD5 digest of the file at pdf_path.

    The file is read in 4096-byte chunks so large PDFs are never loaded
    into memory in one piece.
    '''
    digest = hashlib.md5()
    with open(pdf_path, 'rb') as stream:
        while True:
            chunk = stream.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()



def check_pdf_issues(pdf_path):
    '''Report whether the PDF at pdf_path can be opened.

    Encrypted files are decrypted with an empty password and their pages
    are counted. Any exception raised along the way is printed and turns
    the result into False; otherwise the result is True.
    '''
    try:
        reader = PdfReader(pdf_path)
        encrypted = reader.is_encrypted
        if encrypted:
            print(f"Encryption detected for {pdf_path}")
            reader.decrypt("")
            page_total = len(reader.pages)
            print(f"All pages accessed: {page_total}")
    except Exception as problem:
        print(f"Issue with {pdf_path}: {problem}")
        return False
    else:
        return True



def get_pdf_metadata(pdf_path):
    '''Build the metadata dictionary for a single PDF.

    Combines the title and creation date stored in the PDF (when present)
    with the page count, the file name and a set of default values.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        dict mapping metadata field names to their values. The 'pdf_path'
        entry is kept so the metadata can later be written back to the file.
    '''
    pdf_metadata = {}
    with open(pdf_path, 'rb') as f:
        reader = PdfReader(f)
        # reader.metadata is None when the PDF has no document-information
        # dictionary; fall back to an empty mapping so .get() keeps working.
        existing_pdf_metadata = reader.metadata or {}

        # Strip every extension ("report.v2.pdf" -> "report"). The loop stops as
        # soon as splitext finds no extension: os.path.splitext keeps leading
        # dots, so testing for '.' in the name would spin forever on files such
        # as "._scan.pdf".
        file_name = os.path.basename(pdf_path)
        while True:
            stem, extension = os.path.splitext(file_name)
            if not extension:
                break
            file_name = stem

        # Prefer the embedded title; otherwise derive one from the file name.
        pdf_metadata['Title'] = existing_pdf_metadata.get('/Title')
        if not pdf_metadata['Title']:
            pdf_metadata['Title'] = file_name.replace('_', ' ')
        pdf_metadata['LeadershipScope'] = "1_National"
        pdf_metadata['PageCount'] = len(reader.pages)  # Add page count

        # PDF dates look like "D:YYYYMMDDHHmmSS..."; only the date part is used.
        # A missing or malformed value falls back to today's date.
        raw_creation_date = str(existing_pdf_metadata.get('/CreationDate') or '')
        if raw_creation_date.startswith('D:'):
            raw_creation_date = raw_creation_date[2:]
        try:
            created_date = datetime.strptime(raw_creation_date[:8], '%Y%m%d').date()
        except ValueError:
            created_date = date.today()

        pdf_metadata['CreationDate'] = created_date.strftime('%Y-%m-%d')
        pdf_metadata['EffectiveDate'] = created_date.strftime('%Y-%m-%d')
        pdf_metadata['IngestDate'] = date.today().strftime('%Y-%m-%d')
        # Documents expire ten years after their creation date.
        expiration_date = created_date + timedelta(days=365.25*10)
        pdf_metadata['ExpirationDate'] = expiration_date.strftime('%Y-%m-%d')
        pdf_metadata['AuxSpecific'] = True
        pdf_metadata['PublicRelease'] = True
        pdf_metadata['PublicationNumber'] = pdf_metadata['Title']
        pdf_metadata['Source'] = None
        pdf_metadata['Organization'] = None  # not currently used. Can be CG Org or Unit Number
        #pdf_metadata['Curator'] = "Drew_Wilkins"
        #pdf_metadata['DocId'] = compute_pdf_hash(pdf_path) # Compute and add the hash of the PDF
        pdf_metadata['FileName'] = file_name  # file name with all extensions removed
        pdf_metadata['pdf_path'] = pdf_path   # needed to write the metadata back to the pdfs
    return pdf_metadata


In [None]:
def make_metadata_dict_from_pdfs(source_directory=None):
    '''Collect the metadata of every readable PDF below a directory.

    Args:
        source_directory: folder to walk recursively. When omitted, the
            notebook-level variable initial_queue_dir is used.

    Returns:
        dict keyed by PDF file name; each value is the metadata dictionary
        produced by get_pdf_metadata for that file.

    Raises:
        ValueError: if the directory does not exist (from check_dir_exists).
    '''
    if source_directory is None:
        source_directory = initial_queue_dir
    check_dir_exists(source_directory)

    pdfs_metadata = {}
    first_path_by_hash = {}  # content hash -> first path seen with that content

    for root, _dirs, files in os.walk(source_directory):
        for file in files:
            if not file.lower().endswith(".pdf"):
                continue
            pdf_path = os.path.join(root, file)

            # Corrupt or undecryptable files are reported by check_pdf_issues
            # and skipped so one bad PDF cannot abort the whole run.
            if not check_pdf_issues(pdf_path):
                print(f"Skipping {pdf_path}")
                continue

            # Identical hashes mean identical bytes: flag likely duplicates.
            pdf_hash = compute_pdf_hash(pdf_path)
            if pdf_hash in first_path_by_hash:
                print(f"Possible duplicate: {pdf_path} has the same content as {first_path_by_hash[pdf_hash]}")
            else:
                first_path_by_hash[pdf_hash] = pdf_path

            # The result is keyed by bare file name, so a same-named file in
            # another subfolder replaces the earlier entry; make that visible.
            if file in pdfs_metadata:
                print(f"Warning: {pdf_path} replaces an earlier entry with the same file name '{file}'")
            pdfs_metadata[file] = get_pdf_metadata(pdf_path)

    return pdfs_metadata




Check the metadata dictionary before writing to edit_me

In [None]:
# NOTE(review): machine-specific absolute path; point it at the local PDF queue folder before running.
initial_queue_dir = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/PDF_initial_queue"
# Walk initial_queue_dir and build the {file name: metadata dict} mapping.
pdfs_metadata = make_metadata_dict_from_pdfs()

In [None]:
# Dump the raw metadata dictionary for a visual check.
print(f"\n\nBased on this dictionary...\n {pdfs_metadata}")

In [None]:
# Build a DataFrame from the metadata dictionary with standard formatting:
# after the transpose, PDFs are in rows and metadata attributes in columns.
pdfs_df_edit_me = pd.DataFrame(pdfs_metadata).transpose()

print(f"""Dataframe loaded with metadata for rows, columns: {pdfs_df_edit_me.shape} \nInspect first row below....\n\n""")
print(f"")

# Indexing row 0 of an empty frame raises IndexError, so only show the first
# row when at least one PDF was collected.
if pdfs_df_edit_me.empty:
    print("No PDF metadata was collected, so there is no row to inspect.")
else:
    print(f"""INDEX FOR THIS ROW:             {pdfs_df_edit_me.index[0]}\n""")
    print(pdfs_df_edit_me.iloc[0])


In [None]:
from datetime import datetime

def make_xlsx(pdfs_df_edit_me, output_dir='../data/PDF_initial_queue'):
    """Write the metadata DataFrame to a timestamped Excel file for hand editing.

    Args:
        pdfs_df_edit_me: DataFrame to export; its index is written as the
            first column.
        output_dir: folder that receives the file. It is created when missing.

    Returns:
        The path of the Excel file that was written.
    """
    # Local import: the file-level imports do not bring `timezone` into scope.
    from datetime import datetime, timezone

    # Timestamp in Zulu (UTC) time. datetime.utcnow() is deprecated, so an
    # aware "now" is used instead; the formatted text is the same.
    timestamp = datetime.now(timezone.utc).strftime('%d%b%Y-%H%M')

    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, f'pdfs_edit_me_{timestamp}.xlsx')

    # index=True keeps the DataFrame index as the first column, so read it back
    # with pd.read_excel(..., index_col=0).
    pdfs_df_edit_me.to_excel(file_path, index=True)
    return file_path

# Export the metadata table so it can be edited by hand in Excel.
make_xlsx(pdfs_df_edit_me)

print(f""" editable excel file has been posted as /data/PDF_initial_queue/ """)

### <span style="color: green;">Add the metadata by hand in the spreadsheet and save it under a new name that starts with pdfs_edited_ (for example pdfs_edited_&lt;timestamp&gt;.xlsx)</span>

">>>>>> EDIT THE SPREADSHEET pdfs_df_edit_me NOW <<<<<<<

### <span style="color: green;">Import the completed Excel file back in as a dataframe</span>

In [None]:
def load_xlsx_to_df(directory='../data/PDF_initial_queue'):
    """Load the hand-edited Excel file into a DataFrame.

    Looks in `directory` for files named pdfs_edited*.xlsx (this matches both
    pdfs_edited.xlsx and pdfs_edited_<anything>.xlsx). When several files
    match, the most recently modified one is loaded.

    Args:
        directory: folder to search for the edited spreadsheet.

    Returns:
        The loaded DataFrame, indexed by the first spreadsheet column, or an
        empty DataFrame when no matching file exists.
    """
    candidates = glob.glob(os.path.join(directory, 'pdfs_edited*.xlsx'))

    if not candidates:
        print("No matching Excel file found!")
        return pd.DataFrame()  # Return an empty DataFrame

    # glob returns matches in arbitrary order, so pick the newest explicitly
    # instead of whichever file happens to be listed first.
    file_path = max(candidates, key=os.path.getmtime)
    if len(candidates) > 1:
        print(f"Found {len(candidates)} matching files; loading the most recently modified one.")

    # index_col=0 because the sheet was written with index=True.
    pdfs_processed_metadata_df = pd.read_excel(file_path, index_col=0)

    if not pdfs_processed_metadata_df.empty:
        print(f"Successfully loaded the file contents from {file_path} into a DataFrame.")
        print("The DataFrame has values in it:")
        print(pdfs_processed_metadata_df)
    else:
        print(f"Loaded the file from {file_path}, but the DataFrame is empty.")

    return pdfs_processed_metadata_df

# Call the function and store the result in the specified DataFrame
pdfs_processed_metadata_df = load_xlsx_to_df()

# Check the head of the dataframe to make sure the keys look correct
print(pdfs_processed_metadata_df.head())

## Inspect metadata in a few ways before creating all the new pdfs

In [None]:
# If the head of the dataframe doesn't look correct (the index ended up in an
# 'Unnamed: 0' column), uncomment the next line:
#pdfs_processed_metadata_df.set_index('Unnamed: 0', inplace=True)
# Check the head of the dataframe to make sure the keys look correct
print(pdfs_processed_metadata_df.head())

In [None]:
# Convert the DataFrame back into a dictionary of dictionaries: each index
# label (one per PDF) maps to that row's {column: value} metadata dictionary.
pdfs_processed_metadata_dict = pdfs_processed_metadata_df.to_dict(orient='index')

In [None]:
# Show the top-level keys (the DataFrame index labels) to confirm every PDF is present.
pdfs_processed_metadata_dict.keys()

## <span style="color: red;">Save to new pdf files</span>

In [None]:
from pypdf import PdfWriter, PdfReader
import os

def write_metadata_to_pdfs(pdfs_processed_metadata_dict, output_dir):
    '''Write the edited metadata into new copies of the PDFs.

    For each entry, the source PDF named by its 'pdf_path' field is copied
    page by page into output_dir (same base name) and the remaining fields
    are stored as document metadata.

    Args:
        pdfs_processed_metadata_dict: mapping whose values are metadata
            dictionaries; each must contain a 'pdf_path' entry.
        output_dir: folder that receives the new PDFs; created when missing.
    '''
    os.makedirs(output_dir, exist_ok=True)

    print("Files to write...")

    # Counter for the number of files written
    files_written_count = 0

    for metadata in pdfs_processed_metadata_dict.values():
        pdf_path = metadata['pdf_path']

        # The updated PDF keeps the source file's base name.
        output_path = os.path.join(output_dir, os.path.basename(pdf_path))
        print(output_path)

        with open(pdf_path, 'rb') as f:
            reader = PdfReader(f)
            writer = PdfWriter()

            # Copy all the pages to the writer object
            for page in reader.pages:
                writer.add_page(page)

            # Prepare the metadata (everything except the pdf_path field).
            # Keys get the leading '/' that PDF info keys require. Values are
            # written as text: blank spreadsheet cells come back as NaN and are
            # stored as an empty string rather than the literal text "nan".
            metadata_to_write = {}
            for key, value in metadata.items():
                if key == 'pdf_path':
                    continue
                pdf_key = key if key.startswith('/') else f"/{key}"
                metadata_to_write[pdf_key] = "" if pd.isna(value) else str(value)
            print(metadata_to_write)
            writer.add_metadata(metadata_to_write)

            with open(output_path, 'wb') as out:
                writer.write(out)

        files_written_count += 1

    print(f"\nTotal number of files written: {files_written_count}:\n")
                
# Call the function to write metadata to PDFs.
# output_dir is relative to the notebook's working directory.
output_dir = '../data/PDF_metadata_complete'
write_metadata_to_pdfs(pdfs_processed_metadata_dict, output_dir)



### Check the metadata of the files to be sure it worked

In [None]:
def extract_metadata_from_pdf(pdf_path):
    '''Extract metadata from a single PDF and return it as a dictionary.

    Keys have their leading '/' removed. A PDF without any document
    metadata yields an empty dictionary.
    '''
    with open(pdf_path, 'rb') as f:
        # reader.metadata is None when the PDF carries no metadata at all.
        metadata = PdfReader(f).metadata or {}
        # Convert the metadata to a cleaner dictionary format
        return {
            key[1:] if key.startswith('/') else key: value
            for key, value in metadata.items()
        }

def extract_metadata_from_directory(directory):
    '''Return one metadata dictionary per PDF found anywhere below directory.

    Files are matched by a case-insensitive ".pdf" suffix and processed in
    the order os.walk yields them.
    '''
    collected = []
    for folder, _subdirs, names in os.walk(directory):
        pdf_names = (name for name in names if name.lower().endswith('.pdf'))
        collected.extend(
            extract_metadata_from_pdf(os.path.join(folder, name))
            for name in pdf_names
        )
    return collected

# Extract metadata from all PDFs in the directory
directory = '../data/PDF_metadata_complete'  # Folder the updated PDFs were written to. Modify this to point to your desired directory.
metadata_list = extract_metadata_from_directory(directory)

# Convert the list of metadata dictionaries to a pandas DataFrame
metadata_complete_df = pd.DataFrame(metadata_list)

# Show the first rows as a quick check of the written metadata.
print(metadata_complete_df.head())


In [None]:


def print_pdf_metadata(directory, file_name):
    '''Print the metadata of one PDF as "key: value" lines.

    Args:
        directory: folder containing the PDF.
        file_name: name of the PDF inside that folder.
    '''
    pdf_path = os.path.join(directory, file_name)
    with open(pdf_path, 'rb') as f:
        doc_info = PdfReader(f).metadata
        # reader.metadata is None when the PDF carries no metadata at all.
        if not doc_info:
            print(f"No metadata found in {pdf_path}")
            return
        # Print while the file is still open so every value can be read.
        for key, value in doc_info.items():
            print(f"{key}: {value}")

# Specify the folder and file name of the PDF to inspect.
# NOTE(review): file_name is a hard-coded sample; change it to a file that exists in `directory`.
directory = '../data/PDF_metadata_complete'
file_name = '026_20_SOLICITATION_FOR_RESEARCH_DEVELOPMENT_TEST_AND_EVALUATION__RDT_E__IDEAS copy.pdf'

print_pdf_metadata(directory, file_name)