In [12]:
pip install validators

Note: you may need to restart the kernel to use updated packages.


In [None]:
'''
Defines a Pydantic model class, DocumentMetadata, which is used for documenting metadata of documents. 
It includes fields like file_size_bytes, num_pages, s3_grobid_text_link, file_path, encryption, and date_updated. 
The model also includes validators for ensuring that certain fields meet specific criteria, 
such as being positive integers, having a specific date format, and not containing HTML or quote characters.
These validators help ensure that the metadata is valid and can be used reliably in your application.

'''

In [38]:
from pydantic import BaseModel, validator
import re
from datetime import datetime

class DocumentMetadata(BaseModel):
    file_size_bytes: int
    num_pages: int
    s3_grobid_text_link: str | None = None  # Default None values for optional fields
    file_path: str | None = None
    encryption: str | None = None
    date_updated: str | None

    @validator('num_pages', 'file_size_bytes')
    @classmethod
    def validate_positive_integer(cls, v, values, **kwargs):
        if v <= 0:
            raise ValueError("Must be a positive integer")
        return v

    @validator('date_updated')
    @classmethod
    def validate_date_format(cls, v, values, **kwargs):
        if v is not None:  # Check if 'v' is not None before validation
            try:
                datetime.strptime(v, '%m/%d/%Y')
            except ValueError:
                raise ValueError("date_updated must be in MM/DD/YYYY format")
        return v

    @validator('s3_grobid_text_link', 'file_path', 'encryption', 'date_updated', pre=True)
    @classmethod
    def validate_no_html_or_quotes(cls, v, values, **kwargs):
        if v and re.search('[\'"‘’”“]|<.*?>', v):
            raise ValueError("Fields may not contain HTML or quote characters")
        return v



/var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_22761/212386787.py:13: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('num_pages', 'file_size_bytes')
/var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_22761/212386787.py:20: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('date_updated')
/var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_22761/212386787.py:30: PydanticDeprecatedSince20:

In [None]:
'''reads metadata from PDF files using PyPDF2, creates DocumentMetadata instances, 
and then writes the validated metadata to a CSV file using pandas. 
It defines a function get_pdf_metadata that reads the metadata from PDF files and returns a list of metadata dictionaries. 
Then, it iterates over the metadata dictionaries, validates them using the DocumentMetadata class, and appends them to validated_metadata.
Finally, it creates a pandas DataFrame from validated_metadata and writes it to a CSV file specified by csv_file_path.'''

In [34]:
import PyPDF2
import pandas as pd
import csv
import os

# Function to read PDF metadata and create DocumentMetadata instances
def get_pdf_metadata(content):
    all_metadata = []
    for pdf_path, s3_grobid_text_link, level in content:
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"File not found: {pdf_path}")
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                metadata = DocumentMetadata(
                    level=level,
                    file_size_bytes=os.path.getsize(pdf_path),
                    num_pages=len(pdf_reader.pages),
                    s3_grobid_text_link=s3_grobid_text_link,
                    file_path=pdf_path,
                    encryption="Yes" if pdf_reader.is_encrypted else "No",
                    date_updated=datetime.today().strftime("%m/%d/%Y")
                )
                all_metadata.append(metadata.dict())
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
    return all_metadata

content = [
    ("/Users/Aneesh/Downloads/Archive 2 2/2024-l1-topics-combined-2.pdf",
     "s3://bigdatacasestudy2/GrobidFiles/Grobid_RR_2024_l1_combined.txt", "Level I"),
    ("/Users/Aneesh/Downloads/Archive 2 2/2024-l2-topics-combined-2.pdf",
     "s3://bigdatacasestudy2/GrobidFiles/Grobid_RR_2024_l2_combined.txt", "Level II"),
    ("/Users/Aneesh/Downloads/Archive 2 2/2024-l3-topics-combined-2.pdf",
     "s3://bigdatacasestudy2/GrobidFiles/Grobid_RR_2024_l3_combined.txt", "Level III")
]

all_metadata = get_pdf_metadata(content)

validated_metadata = []
for metadata in all_metadata:
    try:
        doc_metadata = DocumentMetadata(
            file_size_bytes=metadata['file_size_bytes'],
            num_pages=metadata['num_pages'],
            s3_grobid_text_link=metadata['s3_grobid_text_link'],
            file_path=metadata['file_path'],
            encryption=metadata['encryption'],
            date_updated=metadata['date_updated']
        )
        validated_metadata.append(doc_metadata.dict())
    except Exception as e:
        print(f"Validation error for {metadata['file_path']}: {e}")

# Writing validated metadata to CSV

# Assuming 'validated_metadata' is your list of dictionaries containing the metadata
df = pd.DataFrame(validated_metadata)

# Specify the full path including the filename
csv_file_path = '/Users/Aneesh/Downloads/MetaData Clean CSV/metadata.csv'

# Write the DataFrame to a CSV file
df.to_csv(csv_file_path, index=False)


In [None]:
'''defines several test cases using pytest to validate the behavior of the DocumentMetadata class when provided with invalid data.
Each test case uses pytest.raises to check that the class raises a ValueError when encountering specific invalid data. 
These test cases help ensure that the DocumentMetadata class behaves as expected and validates its inputs correctly.'''

In [39]:
import pytest


# Failed Test Cases
def test_file_size_bytes_negative_failure():
    """Ensure negative file size bytes raise a ValueError."""
    with pytest.raises(ValueError):
        DocumentMetadata(file_size_bytes=-1, num_pages=100, date_updated="12/31/2023")

def test_num_pages_zero_failure():
    """Ensure zero number of pages raises a ValueError."""
    with pytest.raises(ValueError):
        DocumentMetadata(file_size_bytes=5000, num_pages=0, date_updated="12/31/2023")

def test_date_updated_invalid_format_failure():
    """Ensure incorrect date format raises a ValueError."""
    with pytest.raises(ValueError):
        DocumentMetadata(file_size_bytes=5000, num_pages=100, date_updated="31/12/2023")

def test_encryption_html_tags_failure():
    """Ensure HTML tags in encryption field raise a ValueError."""
    with pytest.raises(ValueError):
        DocumentMetadata(file_size_bytes=5000, num_pages=100, encryption="<script>alert('hack')</script>", date_updated="12/31/2023")

def test_file_path_contains_quotes_failure():
    """Ensure quotes in file path raise a ValueError."""
    with pytest.raises(ValueError):
        DocumentMetadata(file_size_bytes=5000, num_pages=100, file_path="\"/user/data/file.pdf\"", date_updated="12/31/2023")


In [40]:
# Passed Test Cases

def test_valid_file_size_and_num_pages_success():
    """Verify valid file size and number of pages are accepted."""
    obj = DocumentMetadata(file_size_bytes=1024, num_pages=10, date_updated="12/31/2023")
    assert obj.file_size_bytes == 1024 and obj.num_pages == 10

def test_valid_date_updated_format_success():
    """Verify correctly formatted date_updated is accepted."""
    obj = DocumentMetadata(file_size_bytes=2048, num_pages=50, date_updated="01/01/2024")
    assert obj.date_updated == "01/01/2024"

def test_optional_fields_none_success():
    """Verify optional fields can be None without error."""
    obj = DocumentMetadata(file_size_bytes=3072, num_pages=75, date_updated="02/02/2024")
    assert obj.s3_grobid_text_link is None and obj.file_path is None and obj.encryption is None

def test_valid_encryption_field_success():
    """Verify encryption field accepts valid input without HTML or quotes."""
    obj = DocumentMetadata(file_size_bytes=4096, num_pages=100, encryption="AES256", date_updated="03/03/2024")
    assert obj.encryption == "AES256"

def test_valid_file_path_success():
    """Verify file_path field accepts valid input without HTML or quotes."""
    obj = DocumentMetadata(file_size_bytes=5120, num_pages=125, file_path="/data/files/document.pdf", date_updated="04/04/2024")
    assert obj.file_path == "/data/files/document.pdf"


In [31]:
import ipytest

In [41]:
ipytest.run('-vv')

platform darwin -- Python 3.10.9, pytest-7.1.2, pluggy-1.0.0 -- /Users/Aneesh/anaconda3/bin/python
cachedir: .pytest_cache
rootdir: /Users/Aneesh
plugins: anyio-3.5.0, cov-4.1.0, mock-3.12.0
[1mcollecting ... [0mcollected 10 items

t_469bbbef4b9945baac40a1ec145c2e67.py::test_file_size_bytes_negative_failure <- ../../var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_22761/735227127.py [32mPASSED[0m[32m [ 10%][0m
t_469bbbef4b9945baac40a1ec145c2e67.py::test_num_pages_zero_failure <- ../../var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_22761/735227127.py [32mPASSED[0m[32m [ 20%][0m
t_469bbbef4b9945baac40a1ec145c2e67.py::test_date_updated_invalid_format_failure <- ../../var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_22761/735227127.py [32mPASSED[0m[32m [ 30%][0m
t_469bbbef4b9945baac40a1ec145c2e67.py::test_encryption_html_tags_failure <- ../../var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_22761/735227127.py [32mPASSED[0m[32m [ 40%][

<ExitCode.OK: 0>